# Connect to drive & import libraries

In [2]:
# Mount Google Drive under ./drive; the training cell later saves its plots
# beneath 'drive/My Drive/...'.
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [3]:
# Colab-only magic selecting the TensorFlow 1.x runtime; the ELMo cells below
# use the TF1-style `hub.Module` API.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import os
from keras.engine import Layer
from keras.layers import Lambda, Input, Dense
from keras.models import Model
import matplotlib.pyplot as plt
import keras
import numpy as np
import nltk
import tqdm
import tensorflow_hub as hub
import string
from keras import backend as K
import tensorflow as tf
import tensorflow.keras.backend as B
import tensorflow_hub as hub
import re
from tensorflow.python.keras.engine import Layer

Using TensorFlow backend.


In [5]:
# Display the TensorFlow version that was actually imported.
tf.__version__

'1.15.2'

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics

In [7]:
# Stop right away unless TensorFlow reports a GPU under the first-device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [8]:
# NLTK resources used by clean_text: the English stop-word list and the
# 'punkt' data for word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Constants

In [0]:
# data parameters
# Archive that download_and_load_datasets() fetches.
dataset_address = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# NOTE(review): not referenced by any cell visible in this part of the notebook.
test_data_percentage = 0.2
# model parameters
learning_rate = 0.0002
batch_size = 32
# Maximum number of whitespace-separated tokens kept per review before the
# text is handed to ELMo.
max_sequence_length = 128
# A list: the training cell builds and trains one model per entry.
number_of_epochs = [10]
# model links
# NOTE(review): the BERT link is not used by the cells visible here.
bert_model_hub_link = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
elmo_model_hub_link = 'https://tfhub.dev/google/elmo/3'

# Reading & Pre-processing the data

In [10]:
def load_directory_data(directory):
    """Read every review file in ``directory`` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``. The rating
    captured from the name is stored, as a string, in the ``sentiment``
    column and the file contents in the ``sentence`` column. Directory entries
    whose name does not match the pattern are skipped instead of crashing on
    a failed regex match.

    Args:
        directory: Path of the folder holding the review text files.

    Returns:
        pandas.DataFrame with the columns ``sentence`` and ``sentiment``.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    file_name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_path in os.listdir(directory):
        matched = file_name_pattern.match(file_path)
        if matched is None:
            # Not a review file (e.g. a stray hidden file); ignore it.
            continue
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(matched.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
    """Load the ``pos`` and ``neg`` sub-folders of ``directory`` as one frame.

    Positive reviews get ``polarity`` 1 and negative reviews ``polarity`` 0.
    The two frames are concatenated, shuffled and re-indexed from 0.

    Args:
        directory: Folder that contains the ``pos`` and ``neg`` sub-folders.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            original unseeded shuffle.

    Returns:
        pandas.DataFrame with the loaded columns plus ``polarity``.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    merged = pd.concat([pos_df, neg_df])
    # frac=1 returns every row in random order, i.e. a shuffle.
    shuffled = merged.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the review archive and load its train and test splits.

    The archive at ``dataset_address`` is obtained through
    ``tf.keras.utils.get_file`` with ``extract=True``. Both splits are then
    read from the ``aclImdb`` folder that sits next to the returned path.

    Args:
        force_download: Accepted for interface compatibility; the body never
            reads it.

    Returns:
        Tuple ``(train_df, test_df)`` as produced by ``load_dataset``.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin=dataset_address,
        extract=True)
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df = load_dataset(os.path.join(corpus_root, "train"))
    test_df = load_dataset(os.path.join(corpus_root, "test"))
    return train_df, test_df
 
# Load both splits once; the following cells read `train` and `test`.
train, test = download_and_load_datasets()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [11]:
# Preview the first seven rows of the training frame.
train.head(7)

Unnamed: 0,sentence,sentiment,polarity
0,"Yes, a true classic! This is what British dram...",9,1
1,I didn't expect much when I first saw the DVD ...,8,1
2,"Thanks to a dull, dimensionless screenplay by ...",1,0
3,"First off, I had my doubts just looking at the...",1,0
4,this was a fantastic episode. i saw a clip fro...,10,1
5,I saw this 25 years ago on PBS. It was very di...,10,1
6,"While the original titillates the intellect, t...",2,0


In [12]:
# Pull the raw review text (X) and the 0/1 polarity label (Y) out of each
# frame as NumPy arrays.
train_X = train.sentence.values
train_Y = train.polarity.values

test_X = test.sentence.values
test_Y = test.polarity.values
# Sanity check: number of training and test reviews.
print(len(train_X), len(test_X))

25000 25000


## Clean and tokenize 

In [0]:
def clean_text(all_X, tokenizer_mod=None):
  """Turn raw texts into token lists.

  Args:
    all_X: Iterable of raw text strings.
    tokenizer_mod: Optional object exposing ``tokenize(str)``. When omitted,
      each text is only split on whitespace. When given, punctuation is
      replaced by spaces, English stop words (NLTK list) are dropped, and the
      remaining text is passed to ``tokenizer_mod.tokenize``.

  Returns:
    List with one token sequence per input text.
  """
  if tokenizer_mod is None:
    # Plain whitespace split; no NLTK resources are needed on this path.
    return [this_x.split() for this_x in all_X]

  # A set makes each stop-word membership test O(1) instead of a list scan.
  nltk_eng_stopwords = set(nltk.corpus.stopwords.words('english'))
  # The original referenced `this_x_no_punc` without ever defining it, which
  # raised NameError. Punctuation is mapped to spaces (rather than deleted)
  # so words joined by a comma or full stop are not glued together.
  punctuation_to_space = str.maketrans(
      string.punctuation, ' ' * len(string.punctuation))

  converted_X = []
  for this_x in all_X:
    this_x_no_punc = this_x.translate(punctuation_to_space)
    this_x_no_stopword = " ".join(
        val for val in nltk.tokenize.word_tokenize(this_x_no_punc)
        if val not in nltk_eng_stopwords)
    converted_X.append(tokenizer_mod.tokenize(this_x_no_stopword))
  return converted_X

def convert_y_to_0_and_1(all_Y):
  """One-hot encode binary labels: 0 -> [1, 0] and 1 -> [0, 1].

  Args:
    all_Y: Iterable of labels, each equal to 0 or 1.

  Returns:
    List with one freshly created two-element list per label.

  Raises:
    KeyError: If a label is neither 0 nor 1.
  """
  label_number_mapping = {0: (1, 0), 1: (0, 1)}
  # Build a new list per sample. Returning the mapping's own list would make
  # every row with the same label share one object, so editing a single row
  # would silently edit all of them.
  return [list(label_number_mapping[val]) for val in all_Y]

In [0]:
# No tokenizer is passed, so each training review is only split on whitespace.
all_train_X_cleaned = clean_text(train_X)

In [0]:
# Same whitespace-only splitting for the test reviews.
all_test_X_cleaned = clean_text(test_X)

In [0]:
# One-hot encode the 0/1 polarity labels to match the 2-unit softmax output.
all_train_y_converted = convert_y_to_0_and_1(train_Y)
all_test_y_converted = convert_y_to_0_and_1(test_Y)

### Checking the pre-processing output (input prepared for ELMo)

In [17]:
# Show the first training sample before and after pre-processing, and its
# label before and after one-hot encoding.
print('X[0] => before: {} and after: {}'.format(train_X[0], all_train_X_cleaned[0]))
print('Y[0] => before: {} and after: {}'.format(train_Y[0], all_train_y_converted[0]))

X[0] => before: Yes, a true classic! This is what British drama is all about,realism and the minimal use of special effects (and over inflated budgets). I last saw this drama when it was last screened on British terrestial TV in 1994. It truly should be viewed by everyone who likes a scary plot,no big names but non-the-less great acting.Sadly the copywrite is now owned by someone unknown and as such this great drama is unlikely to be aired anytime soon.I myself recently acquired The Woman In Black on VHS,so now once again I shall be able to enjoy this truly great British drama. You should try and enjoy it too!<br /><br />Mark R. Horobin and after: ['Yes,', 'a', 'true', 'classic!', 'This', 'is', 'what', 'British', 'drama', 'is', 'all', 'about,realism', 'and', 'the', 'minimal', 'use', 'of', 'special', 'effects', '(and', 'over', 'inflated', 'budgets).', 'I', 'last', 'saw', 'this', 'drama', 'when', 'it', 'was', 'last', 'screened', 'on', 'British', 'terrestial', 'TV', 'in', '1994.', 'It', '

# Creating Model

## Using Elmo

In [0]:
# Keep at most `max_sequence_length` tokens per training review and join them
# back into one space-separated string. A comprehension replaces the manual
# append loop and no longer leaves scratch loop variables in the namespace.
all_X_preped_for_elmo_train = [
    ' '.join(tokens[:max_sequence_length]) for tokens in all_train_X_cleaned
]

In [0]:
# Keep at most `max_sequence_length` tokens per test review and join them
# back into one space-separated string. A comprehension replaces the manual
# append loop and no longer leaves scratch loop variables in the namespace.
all_X_preped_for_elmo_test = [
    ' '.join(tokens[:max_sequence_length]) for tokens in all_test_X_cleaned
]

In [0]:
# Pack the prepared sentences and the one-hot training labels into NumPy
# arrays for model.fit / model.predict.
X_train_elmo = np.asarray(all_X_preped_for_elmo_train, dtype="str")
X_test_elmo = np.asarray(all_X_preped_for_elmo_test, dtype="str")
# No test-label array is built here: evaluate_model_elmo reads
# all_test_y_converted directly.
y_train_elmo = np.asarray(all_train_y_converted, dtype=np.int32)

In [0]:
# TF-Hub ELMo module shared by every model built in this notebook.
# NOTE(review): the model summary printed further down reports 0 parameters
# for the Lambda layer, so Keras does not appear to track this module's
# weights even though trainable=True is requested here. Confirm whether
# fine-tuning ELMo is actually intended.
elmo = hub.Module(elmo_model_hub_link, trainable = True)
def get_elmo_embedding_for_each_input_using_lambda_function(inp):
  """Return the ELMo 'default' output for a batch of sentences.

  Args:
    inp: String tensor coming from the Keras input layer, which is declared
      with shape (1,) per sample, i.e. (batch, 1) overall.

  Returns:
    The tensor stored under the 'default' key of the module's output dict.
  """
  # Squeeze only the trailing singleton axis. Without `axis`, a batch holding
  # a single sentence would collapse to a scalar and lose its batch dimension.
  sentences = tf.squeeze(tf.cast(inp, tf.string), axis=1)
  return elmo(sentences, signature='default', as_dict=True)['default']

In [0]:
def create_elmo_using_model():
  """Assemble the ELMo-based classifier.

  The graph is: string input of shape (1,) -> Lambda wrapping the ELMo
  embedding function -> Dense(1024, relu) -> Dense(2, softmax).

  Returns:
    An uncompiled Keras ``Model``.
  """
  text_in = Input(shape=(1,), dtype="string")
  sentence_vectors = Lambda(
      get_elmo_embedding_for_each_input_using_lambda_function,
      output_shape=(1024, ),
  )(text_in)
  hidden = Dense(1024, activation='relu')(sentence_vectors)
  class_probabilities = Dense(2, activation='softmax')(hidden)
  return Model(inputs=[text_in], outputs=class_probabilities)

In [0]:
def evaluate_model_elmo(model):
  """Print test-set metrics for a trained ELMo classifier.

  Predictions are made on the global ``X_test_elmo`` and compared with the
  global ``all_test_y_converted`` (one-hot labels). Accuracy, the
  classification report, and micro/macro/weighted F1, recall and precision are
  computed on the arg-max class. The ROC curve / AUC is computed from the
  predicted score of class 1.

  Args:
    model: Object whose ``predict`` returns one row of two class scores per
      test sample.
  """
  predicted_y = np.asarray(model.predict(X_test_elmo))
  y_test_conv = [np.argmax(val) for val in all_test_y_converted]
  predicted_y_conv = [np.argmax(val) for val in predicted_y]

  print('Accuracy:', accuracy_score(y_test_conv, predicted_y_conv))
  print('Classification report:', classification_report(y_test_conv, predicted_y_conv))

  scorers = (('F1', f1_score), ('Recall', recall_score), ('Precision', precision_score))
  for metric_label, metric_fn in scorers:
    for average in ('micro', 'macro', 'weighted'):
      print('{} ({}):'.format(metric_label, average.capitalize()),
            metric_fn(y_test_conv, predicted_y_conv, average=average))

  # ROC needs a continuous score. Feeding the hard arg-max labels (as before)
  # leaves a single operating point, so the "AUC" was not a real
  # area-under-curve over thresholds.
  positive_class_scores = predicted_y[:, 1]
  fpr, tpr, _ = metrics.roc_curve(y_test_conv, positive_class_scores, pos_label=1)
  print('AUC:', metrics.auc(fpr, tpr))

In [0]:
def plot_accuracy_and_loss(file_save_name, history):
  """Save the per-epoch training accuracy and loss curves as PNG files.

  Two files are written: ``<file_save_name>_accuracy.png`` and
  ``<file_save_name>_loss.png``. The current axes are cleared after each save.

  Args:
    file_save_name: Path prefix for the output images.
    history: Object with a ``history`` dict holding the ``'loss'`` series and
      the accuracy series under ``'accuracy'`` (or ``'acc'``).

  Raises:
    KeyError: If the required series is missing from ``history.history``.
  """
  logged = history.history
  # Older Keras releases report the accuracy metric as 'acc'; fall back to
  # that key only when 'accuracy' is absent.
  accuracy_key = 'accuracy'
  if 'accuracy' not in logged and 'acc' in logged:
    accuracy_key = 'acc'

  for curve_name, history_key in (('accuracy', accuracy_key), ('loss', 'loss')):
    plt.plot(logged[history_key])
    plt.title('model ' + curve_name)
    plt.ylabel(curve_name)
    plt.xlabel('epoch')
    plt.savefig(file_save_name + '_' + curve_name + '.png')
    # Clear the axes so the next curve starts from an empty plot.
    plt.cla()

In [25]:
# Adam optimizer using the configured learning rate. This single instance is
# passed to model.compile for every model built in the training loop.
this_optimizer = keras.optimizers.Adam(lr = learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# Train, plot and evaluate one ELMo classifier per entry of number_of_epochs.
for epochs in number_of_epochs:
  print('Number of epochs:', epochs)
  image_save_file_name = 'drive/My Drive/CA5_Sabri_810198312/Results/Q2_elmo_nopreprocess_model_{}_epochs'.format(epochs)
  # Make sure the results folder exists before the expensive training starts;
  # otherwise savefig could fail only after all epochs have finished.
  os.makedirs(os.path.dirname(image_save_file_name), exist_ok=True)
  model = create_elmo_using_model()
  # summary() prints the table itself and returns None, so wrapping it in
  # print() only added a stray "None" line to the output.
  model.summary()
  # NOTE(review): a 2-unit softmax with one-hot targets is usually paired with
  # 'categorical_crossentropy'; the loss is kept unchanged so the experiment
  # setup stays as it was.
  model.compile(loss='binary_crossentropy', optimizer=this_optimizer, metrics=['accuracy'])
  history = model.fit(
      X_train_elmo,
      y_train_elmo,
      epochs=epochs,
      batch_size=batch_size
  )
  print('Done training')
  plot_accuracy_and_loss(image_save_file_name, history)
  evaluate_model_elmo(model)

Number of epochs: 10
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 2050      
Total params: 1,051,650
Trainable params: 1,051,650
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




















Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

**10 Epochs**

```

```