In [None]:
!pip install tensorflow-text==2.5
!pip install tf-models-official==2.5

In [None]:
import os
import shutil
from collections import Counter, defaultdict

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

import matplotlib.pyplot as plt


# Set the TensorFlow logger's level to ERROR.
tf.get_logger().setLevel('ERROR')

In [None]:
# Learning-rate schedule parameters used to rebuild the optimizer.
# NOTE(review): the factor 100 looks like "steps per epoch" -- confirm against the training run.
epochs = 5
num_train_steps = epochs * 100
num_warmup_steps = int(num_train_steps * 0.1)  # first 10% of the steps are warm-up

init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

# The optimizer is registered under the name 'AdamWeightDecay' so Keras can
# resolve that name while deserializing the saved model.
classifier_model = tf.keras.models.load_model(
    'bert_imdb_sentiment',
    custom_objects={'AdamWeightDecay': optimizer},
)

In [None]:
# Print the Keras summary of the loaded classifier.
classifier_model.summary()

In [None]:
# Sample sentences: two short review-style lines followed by fragments of a
# drug-experience narrative.
# NOTE: the scoring loop further down rebinds `words` to the raw text of the
# testimonial it is reading, so this list is only current until that cell runs.
words = [
    'depressingly bad.',
    'The movie was terrible',
    'The overall effect is very pleasant',
    'the desire is strong to repeat the experience on both smaller doses',
    '(to perhaps permit one to be alone in safety) and on larger doses. ',
    'Also to repeat the experience under a variety of external stimuli',
    'with music available for contemplation - at the beach - at night.',
    'There is complete recall of all experiences and thoughts, after the drug has been dissipated.',
    # The comma after the next item was missing, which silently glued it to the
    # following sentence via implicit string concatenation.
    'The feeling of comfort, introspection, and tranquility',
    'One feels that this feeling may be prolonged and further enjoyed upon',
    'repeated usages. Such a change would be most desirable in many personalities - and may bespeak a',
]

In [None]:
examples = [
    'this is such an amazing movie!',  # this is the same sentence tried earlier
    'The movie was great!',
    'The movie was meh.',
    'The movie was okish.',
    'The movie was terrible...',
]

# Feed the sentences to the model as one string tensor and pass the raw
# outputs through a sigmoid.
example_batch = tf.constant(examples)
original_results = tf.sigmoid(classifier_model(example_batch))

print('Results from the saved model:')
def print_my_examples(inputs, results):
  """Print one line per input with its score, followed by a blank line.

  Each input is left-aligned in a 20-character field and ``results[i][0]``
  is shown with six decimals as the score of ``inputs[i]``.
  """
  lines = []
  for idx in range(len(inputs)):
    sentence = inputs[idx]
    score = results[idx][0]
    lines.append(f'input: {sentence:<20} : score: {score:.6f}')
  print(*lines, sep='\n')
  print()
# Print each example sentence next to its sigmoid score.
print_my_examples(examples, original_results)

In [None]:
#drugs= './split_drugs/train/'
# Root directory walked by the scoring loop: one sub-directory per drug, each
# holding the testimonial .txt files.
drugs= './all_drugs/train/'
# Sliding-window length in characters; the loop advances by half of it.
# NOTE: the last cell of the notebook reassigns step_size to 4 for sentence batches.
step_size = 256
# scores[drug][testimonial] -> list of per-window sentiment scores.
scores = defaultdict(dict)
# Number of testimonials processed so far, per drug (used for progress output).
stats = Counter()
def read_file(in_file, path, cur_testimonial, cur_sentence, cur_words, test_split):
    """Write one snippet of words to a file in a random train/test split.

    Despite its name, this function only writes. The snippet is stored at
    ``<path>/<train|test>/<drug>/<testimonial>_<sentence>.txt`` where ``<drug>``
    is ``in_file`` lower-cased with ``.txt`` removed.

    Args:
        in_file: Source file name; determines the per-drug sub-directory.
        path: Root output directory.
        cur_testimonial: Integer testimonial index (zero-padded to 5 digits).
        cur_sentence: Integer snippet index (zero-padded to 3 digits).
        cur_words: Iterable of strings, joined with single spaces.
        test_split: Threshold on a uniform [0, 1) draw; the snippet goes to
            ``test`` when the draw is <= test_split, otherwise to ``train``.
    """
    # Imported locally because the notebook only imports numpy in a later cell.
    import numpy as np

    subd = 'train' if np.random.random() > test_split else 'test'
    out_dir = os.path.join(path, subd, in_file.replace(".txt", "").lower())
    # Create the target directory on demand instead of failing with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f'{cur_testimonial:05d}_{cur_sentence:03d}.txt')
    with open(out_path, 'w') as out_file:
        out_file.write(' '.join(cur_words).strip())

# Score every testimonial with a sliding character window: windows are
# `step_size` characters long and start every `step_size // 2` characters.
# Texts that yield no window get no entry in `scores`.
for drug in os.listdir(drugs):
    drug_dir = os.path.join(drugs, drug)
    if not os.path.isdir(drug_dir):
        # Stray files next to the drug folders would make os.listdir() raise.
        continue
    for testimonial in sorted(os.listdir(drug_dir)):
        if not testimonial.endswith('.txt'):
            continue

        # The handle is deliberately not called `text`: that name is the
        # tensorflow_text module alias imported at the top of the notebook.
        # `words` keeps its name because the sentence-splitting cell further
        # down reads it after this loop.
        with open(os.path.join(drug_dir, testimonial)) as handle:
            words = handle.read()

        for index in range(0, len(words)-step_size, step_size//2):
            cur_word = words[index:index+step_size]
            results = tf.sigmoid(classifier_model(tf.constant([cur_word])))
            # The per-testimonial list is created on the first scored window only.
            scores[drug].setdefault(testimonial, []).append(results[0][0].numpy())

        # Progress output for every fifth testimonial of a drug.
        if stats[drug] % 5 == 0:
            print(f'Got:{drug}, n={stats[drug]}')
        stats[drug] += 1

#         if stats[drug] > 400:
#             break
#     if len(stats) > 4:
#         break


In [None]:
# For every drug, list how many windows were scored in each testimonial.
for d, per_testimonial in scores.items():
    window_counts = [len(window_scores) for window_scores in per_testimonial.values()]
    print(f'{d}: len: {window_counts} ')

In [None]:
import numpy as np
import scipy.interpolate as interp
# Resample every testimonial's score series to `new_size` points by linear
# interpolation, then compute the per-drug mean and standard deviation curves.
standardized = defaultdict(dict)
means = defaultdict()
stds = defaultdict()
new_size = 100
for drug in scores:
    for t in scores[drug]:
        try:
            interpolator = interp.interp1d(np.arange(len(scores[drug][t])), scores[drug][t])
            standardized[drug][t] = interpolator(np.linspace(0, len(scores[drug][t])-1, new_size))
        except ValueError:
            # NOTE(review): presumably raised for series too short to interpolate
            # (a single window) -- confirm. Such testimonials are left out.
            print(f'vall err {drug} {t}')
    if drug not in standardized:
        # No testimonial of this drug could be resampled; there is nothing to average.
        continue
    # Stack into fresh (n_testimonials, new_size) storage. The earlier running
    # sum started from the first testimonial's own array and then modified it
    # in place, which overwrote that entry of `standardized`.
    stacked = np.stack(list(standardized[drug].values()))
    # Averaging over the stacked rows counts only testimonials that were
    # resampled successfully, and std() cannot hit a negative value under a
    # square root the way sqrt(E[x^2] - E[x]^2) can through rounding.
    means[drug] = stacked.mean(axis=0)
    stds[drug] = stacked.std(axis=0)

In [None]:
import numpy as np
from collections import OrderedDict
from matplotlib.lines import Line2D
# Hand-picked subset of drugs; it is immediately overridden by the assignment
# below, so it currently has no effect.
keep_drugs = ['psilocin', 'lsd', 'mdma', 'cocaine', 'thc', 'haloperidol', 'sertraline',
              'salvia', 'risperidone', 'bupropion']
# Keep every drug that has scores; plot_sentiment skips drugs not in this list.
keep_drugs = list(scores.keys())
def plot_sentiment(scores, means, stds):
    """Plot each drug's mean sentiment curve with a shaded +/- one-std band.

    Args:
        scores: Mapping keyed by drug name. Only the keys are used: they
            decide which drugs are drawn and, once sorted, in which order.
        means: Mapping from drug name to its mean curve; must support
            element-wise arithmetic (NumPy arrays in this notebook).
        stds: Mapping from drug name to the matching standard-deviation curve.

    Drugs that are not in the module-level ``keep_drugs`` list are skipped.
    The figure is drawn on a new matplotlib figure; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(16, 12.5), dpi=300)
    markers = "v.,o1^2>348<spdxh*PH+XD|"
    o_drugs = sorted(scores.keys())
    # One viridis colour per drug, assigned in sorted order so a drug keeps its
    # colour and marker even when other drugs are filtered out.
    colors = plt.get_cmap('viridis')(np.linspace(0, 1, len(o_drugs)))
    for i, (drug, color) in enumerate(zip(o_drugs, colors)):
        if drug not in keep_drugs:
            continue
        xs = range(len(means[drug]))
        # Cycle through the marker set and alternate hollow/filled markers to
        # help tell neighbouring colours apart.
        ax.plot(xs, means[drug], c=color, marker=markers[i%len(markers)], label=drug,
                fillstyle='none' if i%2==0 else 'full')
        ax.fill_between(xs, means[drug] - stds[drug], means[drug] + stds[drug],
                        color=color, alpha=0.05)
    ax.set_title('Sentiment Trajectory of Trip')
    ax.set_xlabel('Time in Narrative')
    ax.set_ylabel('Sentiment')
    # Collapse repeated labels so each drug appears once in the legend.
    handles, labels = ax.get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys())

# Draw the per-drug mean trajectories computed from the resampled scores.
plot_sentiment(standardized, means, stds)
      

In [None]:
import numpy as np
from collections import OrderedDict
from matplotlib.lines import Line2D
# Hand-picked subset of drugs; it is immediately overridden by the assignment
# below, so it currently has no effect.
keep_drugs = ['psilocin', 'lsd', 'mdma', 'cocaine', 'thc', 'haloperidol', 'sertraline',
              'salvia', 'risperidone', 'bupropion']
# Keep every drug that has scores; plot_sentiment skips drugs not in this list.
keep_drugs = list(scores.keys())
def plot_sentiment(scores, means, stds):
    """Plot each drug's mean sentiment curve with a shaded +/- one-std band.

    Args:
        scores: Mapping keyed by drug name. Only the keys are used: they
            decide which drugs are drawn and, once sorted, in which order.
        means: Mapping from drug name to its mean curve; must support
            element-wise arithmetic (NumPy arrays in this notebook).
        stds: Mapping from drug name to the matching standard-deviation curve.

    Drugs that are not in the module-level ``keep_drugs`` list are skipped.
    The figure is drawn on a new matplotlib figure; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(16, 12.5), dpi=300)
    markers = "v.,o1^2>348<spdxh*PH+XD|"
    o_drugs = sorted(scores.keys())
    # One viridis colour per drug, assigned in sorted order so a drug keeps its
    # colour and marker even when other drugs are filtered out.
    colors = plt.get_cmap('viridis')(np.linspace(0, 1, len(o_drugs)))
    for i, (drug, color) in enumerate(zip(o_drugs, colors)):
        if drug not in keep_drugs:
            continue
        xs = range(len(means[drug]))
        # Cycle through the marker set and alternate hollow/filled markers to
        # help tell neighbouring colours apart.
        ax.plot(xs, means[drug], c=color, marker=markers[i%len(markers)], label=drug,
                fillstyle='none' if i%2==0 else 'full')
        ax.fill_between(xs, means[drug] - stds[drug], means[drug] + stds[drug],
                        color=color, alpha=0.05)
    ax.set_title('Sentiment Trajectory of Trip')
    ax.set_xlabel('Time in Narrative')
    ax.set_ylabel('Sentiment')
    # Collapse repeated labels so each drug appears once in the legend.
    handles, labels = ax.get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys())

# Draw the per-drug mean trajectories computed from the resampled scores.
plot_sentiment(standardized, means, stds)


In [None]:
import pickle
# Write the raw per-window scores (the `scores` dict) to a binary pickle file.
# The context manager closes the file even if pickling raises.
with open("sentiment_scores.pkl", "wb") as f:
    pickle.dump(scores, f)
# f = open("sentiment_scores.txt","w")

# # write file
# f.write( str(scores) )

# # close file
# f.close()


In [None]:
import re 
# `words` is a list of sentences in the sample-sentences cell but is rebound
# to the raw text of a testimonial by the scoring loop. re.split() needs a
# string, so accept both forms instead of depending on which cell ran last.
source_text = words if isinstance(words, str) else ' '.join(words)
# Split after every period (plus any following whitespace) and drop fragments
# of two characters or fewer.
word_array = [w for w in re.split(r"\.\s*", source_text) if len(w) > 2]

In [None]:
# Number of sentence fragments kept after splitting and filtering.
len(word_array)

In [None]:
# Preview the first four fragments.
word_array[:4]

In [None]:
def print_my_examples(inputs, results, limit=70):
  """Print one line per input with its score, followed by a blank line.

  Each input is cut to at most ``limit`` characters and ``results[i][0]`` is
  shown with three decimals as the score of ``inputs[i]``.
  """
  lines = []
  for idx in range(len(inputs)):
    snippet = inputs[idx][:limit]
    score = results[idx][0]
    lines.append(f'input: {snippet} : score: {score:.3f}')
  print(*lines, sep='\n')
  print()

In [None]:
# Score the sentence fragments in consecutive batches of `step_size` and print
# each batch with its scores.
step_size = 4
# Start at 0 and slice forward so the final batch, including a shorter
# remainder, is scored too. The slice uses step_size rather than a literal 4,
# so changing the batch size keeps the scored and the printed text in sync.
for i in range(0, len(word_array), step_size):
    batch = word_array[i:i+step_size]
    mescaline = tf.sigmoid(classifier_model(tf.constant(batch)))
    print_my_examples(batch, mescaline)