In [None]:
# Install the two pinned packages (both at version 2.5) before the import cell runs.
!pip install tensorflow-text==2.5
!pip install tf-models-official==2.5

In [None]:
import os
import shutil

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

import matplotlib.pyplot as plt
from ml4h.TensorMap import TensorMap, Interpretation

# NOTE(review): `drug_folder` is assigned twice, so only the second value survives;
# the name is not referenced anywhere else in the visible notebook.
drug_folder = 'split_drugs'
drug_folder = 'split_small_test_all'

# TF Hub handle loaded as the text-preprocessing layer further down.
preprocess_model = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# TF Hub handle loaded as the BERT encoder; the commented line is an alternative handle.
#base_model = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
base_model = "https://tfhub.dev/google/experts/bert/wiki_books/sst2/2"

# Restrict TensorFlow's logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

In [None]:
#df = pd.read_csv(f'./test_sentences_128_meta_data.csv')
df = pd.read_csv('/home/sam/unify-emotion-datasets/data/full_dataset/goemotions_1.csv')
in_cols = ['text']

# Every column from position 9 onward is used as an output label column.
output_cols = list(df.columns[9:])
for label_col in output_cols:
    # Class balance of each label column.
    print(df[label_col].value_counts())

print(df.info())
print(output_cols)

In [None]:
# One categorical TensorMap per label column; its channel_map sends
# 'no_<column>' to index 0 and '<column>' to index 1.
tensor_maps_out = [
    TensorMap(
        f'{oc}',
        Interpretation.CATEGORICAL,
        shape=(1,),
        channel_map={f'no_{oc}': 0, f'{oc}': 1},
    )
    for oc in output_cols
]
def make_dataset(csv, in_cols, out_cols, batch_size=32, shuffle_buffer_size=70000):
    """Build a shuffled, batched dataset of (inputs, outputs) pairs from one CSV.

    The CSV is read twice, in file order (``shuffle=False``) and one row at a
    time (``batch_size=1``): once selecting ``in_cols`` and once selecting
    ``out_cols``. The two streams are zipped so each element pairs the inputs
    and outputs of the same row, then shuffled, batched and prefetched.

    Args:
        csv: File pattern / path handed to ``make_csv_dataset``.
        in_cols: Column names selected for the model inputs.
        out_cols: Column names selected for the model outputs.
        batch_size: Number of zipped rows per emitted batch.
        shuffle_buffer_size: Size of the shuffle buffer. Defaults to 70000,
            the value that was previously hard-coded.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, outputs)`` batches.

    NOTE(review): ``make_csv_dataset`` is called without ``num_epochs``; per
    its documentation that repeats the data indefinitely, which would explain
    why callers pass explicit step counts -- confirm.
    """
    def read_columns(columns):
        # Row-at-a-time and unshuffled so both reads stay aligned row-for-row.
        return tf.data.experimental.make_csv_dataset(
            csv, select_columns=columns, batch_size=1, shuffle=False)

    ds = tf.data.Dataset.zip((read_columns(in_cols), read_columns(out_cols)))
    ds = ds.shuffle(shuffle_buffer_size)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds


def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    """Split a dataset into train / validation / test partitions.

    Args:
        ds: Dataset-like object exposing ``shuffle``, ``take`` and ``skip``.
        ds_size: Total number of elements in ``ds``.
        train_split: Fraction of elements assigned to training.
        val_split: Fraction of elements assigned to validation.
        test_split: Fraction of elements assigned to test.
        shuffle: Whether to shuffle (with a fixed seed) before splitting.
        shuffle_size: Shuffle buffer size used when ``shuffle`` is true.

    Returns:
        ``(train_ds, val_ds, test_ds, train_size, val_size, test_size)``.

    Raises:
        AssertionError: If the three fractions do not sum to 1.
    """
    import math  # local import: `math` is not imported before this cell

    # Compare with a tolerance: e.g. 0.6 + 0.3 + 0.1 is 0.9999999999999999 in
    # floating point, which an exact `== 1` check would wrongly reject.
    assert math.isclose(train_split + test_split + val_split, 1.0), \
        'train_split, val_split and test_split must sum to 1'

    if shuffle:
        # Fixed seed gives the same split between runs.
        # reshuffle_each_iteration=False keeps the order fixed across
        # iterations too; otherwise every epoch reshuffles and take()/skip()
        # hand different elements to each partition, leaking data between them.
        ds = ds.shuffle(shuffle_size, seed=12, reshuffle_each_iteration=False)

    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    # Test takes whatever remains so the three sizes always add up to ds_size.
    test_size = int(ds_size - (train_size+val_size))

    train_ds = ds.take(train_size)
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)

    print(f'Partition {ds_size} examples into train:{train_size} val:{val_size} test:{test_size}')
    return train_ds, val_ds, test_ds, train_size, val_size, test_size

# Each of the three GoEmotions CSV files is used as one split: file 1 for
# training, file 2 for validation, file 3 for testing.
train_ds = make_dataset('/home/sam/unify-emotion-datasets/data/full_dataset/goemotions_1.csv', in_cols, output_cols)
valid_ds = make_dataset('/home/sam/unify-emotion-datasets/data/full_dataset/goemotions_2.csv', in_cols, output_cols)
test_ds = make_dataset('/home/sam/unify-emotion-datasets/data/full_dataset/goemotions_3.csv', in_cols, output_cols)
# NOTE(review): hard-coded example counts, presumably the row counts of the
# three CSV files -- confirm. They are used below to derive step counts.
train_size, val_size, test_size = 70000, 70000, 71225

In [None]:
from ml4h.TensorMap import TensorMap, Interpretation

In [None]:
# Peek at a single validation batch: print its labels, then the first example
# of every input feature.
for feature_batch, label in valid_ds.take(1):
    print(f"label {label}")
    for key, value in feature_batch.items():
        print(f"\n\n\n Key is  {key:20s}: {value[0]}")

In [None]:
# Dropout rate read (as a global) by build_classifier_model below.
dropout_rate = 0.1

# NOTE(review): this module-level `text_input` is not referenced again in the
# visible notebook; build_classifier_model creates its own input layer.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
bert_preprocess_model = hub.KerasLayer(
    preprocess_model)

bert_model = hub.KerasLayer(
    base_model,
    trainable=True)

# Smoke test: push one sentence through preprocessing and the encoder and
# print the resulting tensors.
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

# Only the first 12 positions of each preprocessed tensor are printed.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

bert_results = bert_model(text_preprocessed)

print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

def build_classifier_model(tfhub_handle_preprocess, tfhub_handle_encoder, tensor_maps_out):
    """Assemble a multi-head text classifier on top of a TF Hub BERT encoder.

    A string input is passed through the preprocessing layer and the trainable
    encoder; dropout (rate taken from the global ``dropout_rate``) is applied
    to the encoder's ``pooled_output``, and one linear Dense head is attached
    per entry of ``tensor_maps_out``.

    Args:
        tfhub_handle_preprocess: TF Hub handle of the preprocessing model.
        tfhub_handle_encoder: TF Hub handle of the encoder model.
        tensor_maps_out: Output descriptors; each supplies ``name`` (the head's
            layer name) and ``channel_map`` (its length sets the head width).

    Returns:
        A ``tf.keras.Model`` mapping the text input to a list of logits, one
        tensor per output descriptor, in the order given.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)
    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    pooled = bert(tokenized)['pooled_output']
    shared = tf.keras.layers.Dropout(dropout_rate)(pooled)
    # activation=None: the heads emit raw logits.
    heads = [
        tf.keras.layers.Dense(len(tm.channel_map), activation=None, name=tm.name)(shared)
        for tm in tensor_maps_out
    ]
    return tf.keras.Model(raw_text, heads)

classifier_model = build_classifier_model(preprocess_model, base_model, tensor_maps_out)
# Sanity check: run the untrained model on the sample sentence and show the
# last head's class probabilities.
bert_raw_result = classifier_model(tf.constant(text_test))
# The heads emit logits over their channels and are trained with the
# from_logits sparse categorical cross-entropy defined below, so softmax over
# the last axis (not an element-wise sigmoid) gives the matching probabilities.
print(tf.nn.softmax(bert_raw_result[-1], axis=-1))
tf.keras.utils.plot_model(classifier_model)

# A single loss and metric object are passed to compile() for the whole model.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.metrics.SparseCategoricalAccuracy()

In [None]:
epochs = 8
batch_size=32
steps_per_epoch = train_size//batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up the learning rate over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)
# The earlier message printed steps_per_epoch under the label "warm up" and
# never showed the warm-up step count; report each value under its own name.
print(f'steps per epoch {steps_per_epoch}, train steps {num_train_steps}, warm up steps {num_warmup_steps}')
init_lr = 1e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
print(f'Training model with bert')
# Explicit step counts are passed for both training and validation.
# NOTE(review): Keras documents `shuffle` as ignored when `x` is a
# tf.data.Dataset, so `shuffle=True` likely has no effect here -- confirm.
history = classifier_model.fit(x=train_ds, steps_per_epoch=steps_per_epoch, 
                               validation_data=valid_ds, validation_steps=val_size//batch_size,
                               epochs=epochs, shuffle=True
                              )

In [None]:
# Store the evaluation output under its own name. Rebinding `loss` here would
# replace the loss *function* that the compile cell passes to
# `classifier_model.compile`, so re-running that cell afterwards would break.
test_results = classifier_model.evaluate(test_ds, steps=test_size//batch_size)

print(f'Loss: {test_results}')

In [None]:
from ml4h.plots import plot_metric_history

# Plot the training history with the project helper.
# NOTE(review): the second argument is the total number of training steps;
# confirm that is what plot_metric_history expects in that position.
plot_metric_history(history, num_train_steps, 'bert sentiment training')

In [None]:
history_dict = history.history
print(history_dict.keys())

# The accuracy series are logged per output head under keys of the form
# '<head>_sparse_categorical_accuracy' (validation keys carry a 'val_' prefix).
# Average them across heads so `acc` / `val_acc` are always defined, instead
# of depending on one hard-coded head name.
acc_suffix = 'sparse_categorical_accuracy'
train_acc_keys = [k for k in history_dict
                  if k.endswith(acc_suffix) and not k.startswith('val_')]
val_acc_keys = [k for k in history_dict
                if k.endswith(acc_suffix) and k.startswith('val_')]
assert train_acc_keys and val_acc_keys, \
    f'no *{acc_suffix} series found in history: {list(history_dict)}'
acc = np.mean([history_dict[k] for k in train_acc_keys], axis=0)
val_acc = np.mean([history_dict[k] for k in val_acc_keys], axis=0)

# Distinct names: rebinding `loss` or `epochs` would clobber the loss function
# and the epoch count that the compile / fit cells read on a re-run.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']
epoch_axis = range(1, len(train_loss) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# r is for "solid red line", b is for "solid blue line"
loss_ax.plot(epoch_axis, train_loss, 'r', label='Training loss')
loss_ax.plot(epoch_axis, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_axis, acc, 'r', label='Training acc')
acc_ax.plot(epoch_axis, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# Lay out only after both axes are populated; doing it on an empty figure has no effect.
fig.tight_layout()

In [None]:
# Show the validation-accuracy series; `val_acc` is expected to have been
# defined by the history-plotting cell above.
print(val_acc)

In [None]:
# Print the layer-by-layer summary of the classifier.
classifier_model.summary()

In [None]:
# Save the model under a name that embeds the number of output heads.
classifier_model.save(f'bert_{len(tensor_maps_out)}_sentiment_classifier')

In [None]:
# NOTE(review): this loads from a path ending in `_v2022_04_22`, which is not
# the path the save cell writes to -- confirm which checkpoint is intended.
classifier_model.load_weights(f'bert_{len(tensor_maps_out)}_sentiment_classifier_v2022_04_22')

In [None]:
from collections import defaultdict

# Stop once more than this many test examples have been collected.
# NOTE(review): presumably needed because test_ds repeats indefinitely
# (make_csv_dataset's default) -- confirm, and confirm the intended count.
MAX_EVAL_EXAMPLES = 9965

predictions = defaultdict(list)   # output name -> list of per-example model outputs
truths = defaultdict(list)        # label column -> list of integer labels

# The batch variable is named `features`, not `text`, so it no longer shadows
# the `tensorflow_text` module that the notebook imports as `text`.
for features, labels in test_ds.as_numpy_iterator():
    for label_name, label_values in labels.items():
        # Flatten so each label becomes a plain Python int.
        truths[label_name].extend(int(v) for v in np.ravel(label_values))

    batch_preds = classifier_model.predict(features)
    if not isinstance(batch_preds, (list, tuple)):
        # A single-output model returns a bare array rather than a list.
        batch_preds = [batch_preds]
    # NOTE(review): these are the heads' raw outputs (logits, no softmax);
    # confirm the downstream ROC code is meant to receive unnormalised scores.
    for output_name, output_pred in zip(classifier_model.output_names, batch_preds):
        predictions[output_name].extend(list(output_pred))

    # All label columns grow in lockstep; use the shortest to be safe.
    num_seen = min((len(v) for v in truths.values()), default=0)
    if num_seen > MAX_EVAL_EXAMPLES:
        break

In [None]:
from ml4h.plots import plot_roc, subplot_rocs
def make_one_hot(y, num_labels):
    """Convert integer class labels into a one-hot float matrix.

    Args:
        y: Array-like of class indices. It is flattened first, so a 1-D vector
            of length n and an ``(n, 1)`` column both produce n rows. (Sizing
            the result from ``y.shape[-1]`` gave a single row for a column.)
        num_labels: Number of classes, i.e. the number of output columns.

    Returns:
        ``np.ndarray`` of shape ``(n, num_labels)`` with a 1.0 at each row's
        class index and 0.0 elsewhere.
    """
    label_idx = np.asarray(y).ravel().astype(int)
    ohy = np.zeros((label_idx.shape[0], num_labels))
    # One vectorised assignment replaces the per-row Python loop.
    ohy[np.arange(label_idx.shape[0]), label_idx] = 1.0
    return ohy

rocs = []
perfs = {}
for otm in tensor_maps_out:
    # Build each head's prediction and one-hot truth arrays once, then reuse
    # them for both the individual ROC plot and the combined subplot figure.
    y_pred = np.array(predictions[otm.name])
    print(f' otm {otm} {y_pred.shape}')
    y_true = make_one_hot(np.array(truths[otm.name]), len(otm.channel_map))
    perfs[otm] = plot_roc(y_pred, y_true, otm.channel_map, otm.name)
    rocs.append((y_pred, y_true, otm.channel_map))
subplot_rocs(rocs)

In [None]:
import math
# Gather (output name, value stored under that name in its plot_roc result) pairs.
tag_auc = []
for tm, roc_result in perfs.items():
    print(f'per, {roc_result} ')
    tag_auc.append((tm.name, roc_result[tm.name]))
#         p = perfs[tm][itags[tm.name]]
#         t = itags[tm.name]
#         if math.isnan(p):
#             continue
#         tag_auc.append((t, p))

In [None]:
# Order the (name, score) pairs by score, ascending.
tag_auc = sorted(tag_auc, key=lambda x: x[1])

In [None]:
# Display the sorted (name, score) pairs as the cell output.
tag_auc

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Horizontal bar chart with one bar per entry of `tag_auc`, in its current order.
_ = plt.figure(figsize=(5, 11), dpi=300)
plt.barh(range(len(tag_auc)), [t[1] for t in tag_auc])
# Dashed reference line at 0.5.
plt.axvline(0.5, linestyle='dashed', c='orange')
# Tick labels are the names with underscores turned into spaces and capitalised.
plt.yticks(np.arange(len(tag_auc)), [t[0].replace('_', ' ').capitalize() for t in tag_auc], ha='right')
plt.ylabel('Go Emotion Taxonomy')
plt.xlabel('Test Set ROC AUC')
# Hide the axes frame.
plt.box(False)

