# Used Packages

In [86]:
import os
import shutil
import random

import pandas as pd
import spacy
import numpy as np
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from official.nlp import optimization

import matplotlib.pyplot as plt

# Model Creation

In [11]:
PATH = 'data'

##  Dataset

In [12]:
# downloading the imdb dataset (if not already done)
# removing the unnecessary unsup folder because this is a supervised ml task
if not os.path.isdir('data'):
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                        untar=True, cache_dir=PATH,
                                        cache_subdir='')

    shutil.rmtree('unsup')

In [13]:
# setting the directory for the training and test data
train_dir = os.path.join(PATH, 'train')
test_dir = os.path.join(PATH, 'test')

### Dataset Parameters

In [29]:
# setting model parameters
# autotune allows the automatic setting of the number of prefetched data ahead
# of time they are requested in the learning process
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 16
seed = 42

### Splitting Dataset

In [30]:
# training set 80 percent of all files with 20 left for validation
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

val_ds = raw_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir, 
    batch_size=batch_size)

test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


## Model

In [16]:
# setting the bert encoder and preprocessor
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [17]:
# generating the bert encoder and preprocess layer for the model
# (save model error can be fixed by deleting temp folder)
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [18]:
# function for building the classifiert model 
# text input -> preprocessing -> encode -> droput -> dense 
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [19]:
# initialize classifier model
classifier_model = build_classifier_model()

In [22]:
# sett loss and metric functions
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [31]:
# set epochs and learning rate
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [32]:
# compile the model
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

### Training

In [33]:
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
loss, accuracy = classifier_model.evaluate(test_ds)



In [35]:
dataset_name = 'imdb'
saved_model_path = './models/{}_bert'.format(dataset_name.replace('/', '_'))

In [56]:
classifier_model.save(saved_model_path, include_optimizer=False)



INFO:tensorflow:Assets written to: ./models/imdb_bert\assets


INFO:tensorflow:Assets written to: ./models/imdb_bert\assets


# Loading the model and working with it

In [36]:
model = tf.keras.models.load_model(saved_model_path, compile=False)

In [84]:
test_data_unbatched = list(test_ds.unbatch().as_numpy_iterator())

In [198]:
rand_ind = random.randint(0, len(test_data_unbatched))
rand_sen_label_pair = test_data_unbatched[rand_ind]
rand_sen_label_pair[0]

b"I have seen so many bad reviews on Supervivientes de los Andes that I felt compelled to stand for it (or at least I'll try). First of all, of course that it looks dated, it was made in the seventies with very low budget, but that's part of it's charm. I like contemporary films but also dig the old ones for what they worth. I'm not the one to feel the urge to only see or like movies with modern treatments and effects; besides, almost every movie buff likes old fashioned motion pictures (who doesn't like films from El Santo or Plan 9 from outer space, no matter it's overall quality?). In the aspect of pace, is just a tool for covering (again) it's low cost, and I think the constant dialogs are in order of a better character and situations development. Sure, Alive has better FX, but I won't despise the old one just because of that, and I don't feel quite attracted to English speakers in an event involving people from Uruguay and for me, that gives a plus to Supervivientes de los Andes. 

In [199]:
def classify_text(model, text):
    score = tf.sigmoid(model(tf.constant([text])))[0][0].numpy()
    pred_label = np.where(score > 0.5, 1, 0).item()
    return (score, pred_label)

In [200]:
classify_text(model, rand_sen_label_pair[0])

(0.83365536, 1)

## Dependecy Parser

In [201]:
depend_parser = spacy.load('en_core_web_sm')

In [202]:
parsed_text = depend_parser(str(rand_sen_label_pair[0]))
sentence_spans = list(parsed_text.sents)[0]

In [203]:
spacy.displacy.render(sentence_spans, jupyter=True, options={"compact": True})

### Leave-n-out

In [204]:
new_texts = []
for token in parsed_text:
    tok_ind = token.i
    
    if token.children:
        for child in token.children:
            new_text = []
            child_ind = child.i

            for item in parsed_text:
                if item.i == tok_ind or item.i == child.i:
                    pass
                else:
                    new_text.append(item.text)

            new_texts.append((" ".join(new_text)))

In [207]:
new_texts_classifications = [classify_text(model, text) for text in new_texts]
new_texts_classifications

[(0.6818715, 1),
 (0.65880877, 1),
 (0.60381085, 1),
 (0.8073026, 1),
 (0.7126456, 1),
 (0.7594188, 1),
 (0.6838556, 1),
 (0.43834484, 0),
 (0.78849816, 1),
 (0.534134, 1),
 (0.7266864, 1),
 (0.6159707, 1),
 (0.7131819, 1),
 (0.5454787, 1),
 (0.6220037, 1),
 (0.7058664, 1),
 (0.8123967, 1),
 (0.8067444, 1),
 (0.79686856, 1),
 (0.7528397, 1),
 (0.7639578, 1),
 (0.7460597, 1),
 (0.7648676, 1),
 (0.734621, 1),
 (0.78007853, 1),
 (0.7828777, 1),
 (0.74738836, 1),
 (0.7468988, 1),
 (0.7331514, 1),
 (0.72787976, 1),
 (0.7677414, 1),
 (0.83321947, 1),
 (0.8197409, 1),
 (0.81910944, 1),
 (0.8596195, 1),
 (0.7213638, 1),
 (0.72718936, 1),
 (0.74637985, 1),
 (0.70255756, 1),
 (0.6975753, 1),
 (0.75904, 1),
 (0.7404403, 1),
 (0.6403055, 1),
 (0.6931861, 1),
 (0.62103814, 1),
 (0.7759081, 1),
 (0.8682085, 1),
 (0.86217946, 1),
 (0.67837083, 1),
 (0.7799625, 1),
 (0.8216156, 1),
 (0.7799865, 1),
 (0.6327552, 1),
 (0.84721905, 1),
 (0.7852516, 1),
 (0.31773797, 0),
 (0.6396585, 1),
 (0.25040853, 0),