# Used Packages

In [383]:
import os
import shutil
import random

import pandas as pd
import spacy
import networkx as nx
import numpy as np
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from official.nlp import optimization

import matplotlib.pyplot as plt

# Model Creation

In [384]:
PATH = 'data'

##  Dataset

In [385]:
# downloading the imdb dataset (if not already done)
# removing the unnecessary unsup folder because this is a supervised ml task
if not os.path.isdir('data'):
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                        untar=True, cache_dir=PATH,
                                        cache_subdir='')

    shutil.rmtree('unsup')

In [386]:
# setting the directory for the training and test data
train_dir = os.path.join(PATH, 'train')
test_dir = os.path.join(PATH, 'test')

### Dataset Parameters

In [387]:
# setting model parameters
# autotune allows the automatic setting of the number of prefetched data ahead
# of time they are requested in the learning process
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 16
seed = 42

### Splitting Dataset

In [388]:
# training set 80 percent of all files with 20 left for validation
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

val_ds = raw_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir, 
    batch_size=batch_size)

test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


## Model Training

In [389]:
# setting the bert encoder and preprocessor
if not os.listdir('models'):
    tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
    tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [390]:
# generating the bert encoder and preprocess layer for the model
# (save model error can be fixed by deleting temp folder)
if not os.listdir('models'):
    bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
    bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [391]:
# function for building the classifiert model 
# text input -> preprocessing -> encode -> droput -> dense 
if not os.listdir('models'):
    def build_classifier_model():
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
        encoder_inputs = preprocessing_layer(text_input)
        encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
        outputs = encoder(encoder_inputs)
        net = outputs['pooled_output']
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
        return tf.keras.Model(text_input, net)

In [392]:
# initialize classifier model
if not os.listdir('models'):
    classifier_model = build_classifier_model()

In [393]:
# sett loss and metric functions
if not os.listdir('models'):
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = tf.metrics.BinaryAccuracy()

In [394]:
# set epochs and learning rate
if not os.listdir('models'):
    epochs = 5
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = 3e-5
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            optimizer_type='adamw')

In [395]:
if not os.listdir('models'):
    # compile the model
    classifier_model.compile(optimizer=optimizer,
                            loss=loss,
                            metrics=metrics)

In [396]:
if not os.listdir('models'):
    history = classifier_model.fit(x=train_ds,
                                validation_data=val_ds,
                                epochs=epochs)

In [397]:
if not os.listdir('models'):
    loss, accuracy = classifier_model.evaluate(test_ds)

In [398]:
if not os.listdir('models'):
    dataset_name = 'imdb'
    saved_model_path = './models/{}_bert'.format(dataset_name.replace('/', '_'))

In [399]:
if not os.listdir('models'):
    classifier_model.save(saved_model_path, include_optimizer=False)

# Loading the model and working with it

In [400]:
if "saved_model_path" not in locals():
    saved_model_path = f'models/{os.listdir("models")[0]}'

saved_model_path

'models/imdb_bert'

In [401]:
model = tf.keras.models.load_model(saved_model_path, compile=False)

In [402]:
test_data_unbatched = list(test_ds.unbatch().as_numpy_iterator())

In [403]:
rand_ind = random.randint(0, len(test_data_unbatched))
rand_sen_label_pair = test_data_unbatched[rand_ind]
rand_sen_label_pair[0]

b"It is one of the best of Stephen Chow. I give it a nine out of ten.<br /><br />I was surprised to see that Shaolin Soccer was rated on top of all singsing's movies. Unbelievable."

In [424]:
def classify_text(model, text, parent_ind=None, child_ind=None):
    score = tf.sigmoid(model(tf.constant([text])))[0][0].numpy()
    pred_label = np.where(score > 0.5, 1, 0).item()

    if parent_ind or child_ind:
        return (score, pred_label, parent_ind, child_ind)
    else:
        return (score, pred_label)

In [425]:
org_text_pred = classify_text(model, rand_sen_label_pair[0])
org_text_pred

(0.99781907, 1)

## Dependecy Parser

In [426]:
depend_parser = spacy.load('en_core_web_sm')

In [427]:
parsed_text = depend_parser(str(rand_sen_label_pair[0]))
sentence_spans = list(parsed_text.sents)[0]

In [428]:
spacy.displacy.render(sentence_spans, jupyter=True, options={"compact": True})

### Leave-n-out

In [429]:
new_texts = []
for token in parsed_text:
    tok_ind = token.i
    
    if token.children:
        for child in token.children:
            new_text = []
            child_ind = child.i

            for item in parsed_text:
                if item.i == tok_ind or item.i == child.i:
                    pass
                else:
                    new_text.append(item.text)

            new_texts.append((" ".join(new_text), tok_ind, child_ind))

In [430]:
new_texts_classifications_df = pd.DataFrame([
    classify_text(
        model, text_triple[0], text_triple[1], text_triple[2])
    for text_triple in new_texts],
    columns=['Output Strength',
             'Predicted Label',
             'Parent Index',
             'Child Index'])

new_texts_classifications_df


Unnamed: 0,Output Strength,Predicted Label,Parent Index,Child Index
0,0.995221,1,1,0
1,0.981516,1,1,2
2,0.991132,1,1,9
3,0.994501,1,2,3
4,0.877591,1,3,5
5,0.826551,1,5,4
6,0.86105,1,5,6
7,0.996627,1,6,8
8,0.997003,1,8,7
9,0.996007,1,11,10


#### Remove Pairs with wrong Predicted Label

In [431]:
new_texts_classifications_df = new_texts_classifications_df.drop(new_texts_classifications_df.index[new_texts_classifications_df['Predicted Label'] != org_text_pred[1]] )
new_texts_classifications_df

Unnamed: 0,Output Strength,Predicted Label,Parent Index,Child Index
0,0.995221,1,1,0
1,0.981516,1,1,2
2,0.991132,1,1,9
3,0.994501,1,2,3
4,0.877591,1,3,5
5,0.826551,1,5,4
6,0.86105,1,5,6
7,0.996627,1,6,8
8,0.997003,1,8,7
9,0.996007,1,11,10


#### Calculate output strength difference

In [432]:
new_texts_classifications_df['Strength Difference'] = org_text_pred[1] - new_texts_classifications_df['Output Strength']
new_texts_classifications_df

Unnamed: 0,Output Strength,Predicted Label,Parent Index,Child Index,Strength Difference
0,0.995221,1,1,0,0.004779
1,0.981516,1,1,2,0.018484
2,0.991132,1,1,9,0.008868
3,0.994501,1,2,3,0.005499
4,0.877591,1,3,5,0.122409
5,0.826551,1,5,4,0.173449
6,0.86105,1,5,6,0.13895
7,0.996627,1,6,8,0.003373
8,0.997003,1,8,7,0.002997
9,0.996007,1,11,10,0.003993


#### Sort by output strength difference

In [433]:
new_texts_classifications_df = new_texts_classifications_df.sort_values(by=['Strength Difference'])
new_texts_classifications_df

Unnamed: 0,Output Strength,Predicted Label,Parent Index,Child Index,Strength Difference
36,0.997734,1,37,39,0.002266
35,0.997013,1,37,38,0.002987
8,0.997003,1,8,7,0.002997
7,0.996627,1,6,8,0.003373
20,0.996296,1,20,21,0.003704
19,0.996204,1,20,19,0.003796
22,0.996086,1,23,22,0.003914
9,0.996007,1,11,10,0.003993
28,0.995989,1,28,29,0.004011
16,0.995955,1,14,13,0.004045


In [444]:
gr1 = new_texts_classifications_df.copy()
gr1[['Child Index', 'Parent Index']] = new_texts_classifications_df[['Parent Index', 'Child Index']]
gr2 = pd.concat([new_texts_classifications_df, gr1]).reset_index(drop=True)
gr3 = gr2.loc[gr2.groupby(['Parent Index'])["Strength Difference"].idxmax()]
gr3

Unnamed: 0,Output Strength,Predicted Label,Parent Index,Child Index,Strength Difference
56,0.995221,1,0,1,0.004779
33,0.981516,1,1,2,0.018484
70,0.981516,1,2,1,0.018484
34,0.877591,1,3,5,0.122409
73,0.826551,1,4,5,0.173449
36,0.826551,1,5,4,0.173449
72,0.86105,1,6,5,0.13895
39,0.997003,1,7,8,0.002997
40,0.996627,1,8,6,0.003373
69,0.991132,1,9,1,0.008868
