# Connect to drive & import libraries

In [3]:
from google.colab import drive
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


In [0]:
%tensorflow_version 2.x

In [0]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import os
from keras.engine import Layer
import matplotlib.pyplot as plt
import keras
import numpy as np
import nltk
import tqdm
import tensorflow_hub as hub
import string
import re
from keras import backend as K

In [6]:
tf.__version__

'2.2.0-rc3'

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics

In [8]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [9]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/35/5c/6439134ecd17b33fe0396fb0b7d6ce3c5a120c42a4516ba0e9a2d6e43b25/bert-for-tf2-0.14.4.tar.gz (40kB)
[K     |████████                        | 10kB 30.9MB/s eta 0:00:01[K     |████████████████▏               | 20kB 1.6MB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 1.8MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
import bert

# Constants

In [0]:
# data parameters
dataset_address = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
test_data_percentage = 0.2
# model parameters
learning_rate = 0.0002
batch_size = 32
max_sequence_length = 128
number_of_epochs = [1, 10, 20, 50]
# model links
bert_model_hub_link = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
elmo_model_hub_link = 'https://tfhub.dev/google/elmo/3'

# Reading & Pre-processing the data

In [0]:
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in os.listdir(directory):
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    dataset = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz", 
        origin=dataset_address, 
        extract=True)
  
    train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                         "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                          "aclImdb", "test"))
    return train_df, test_df
 
train, test = download_and_load_datasets()

In [24]:
train_X = train.sentence.values
train_Y = train.polarity.values

test_X = test.sentence.values
test_Y = test.polarity.values
print(len(train_X), len(test_X))

25000 25000


## Create bert tokenizer

In [0]:
bert_layer = hub.KerasLayer(bert_model_hub_link)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
bert_tokenizer_instance = bert.bert_tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

### Bert functions for mask and id and segment id

In [0]:
def convert_text_to_number_and_fix_length(this_x, tokenizer_mod):
  this_x_ids = tokenizer_mod.convert_tokens_to_ids(this_x,)
  if len(this_x_ids) >= max_sequence_length:
    this_x_fix_len = this_x_ids[:max_sequence_length]
  else:
    this_x_fix_len = this_x_ids + [0]*(max_sequence_length - len(this_x_ids)) 
  return this_x_fix_len

def create_mask_for_input(this_x):
  if len(this_x) >= max_sequence_length:
    this_x_masks = [1]*max_sequence_length
  else:
    this_x_masks = [1]*len(this_x) + [0] * (max_sequence_length - len(this_x))
  return this_x_masks

def create_req_bert_inputs(all_X, tokenizer_mod):
  X_input_ids, X_input_masks, X_segment_ids = [], [], []

  for this_x in tqdm.tqdm(all_X, position=0, leave=True):
    X_input_ids.append(convert_text_to_number_and_fix_length(this_x, tokenizer_mod))
    X_input_masks.append(create_mask_for_input(this_x))
    X_segment_ids.append([0 for _ in range(max_sequence_length)])

  return X_input_ids, X_input_masks, X_segment_ids

## Clean and tokenize 

In [0]:
def clean_text(all_X, tokenizer_mod=None):
  nltk_eng_stopwords = nltk.corpus.stopwords.words('english')
  converted_X = []
  for this_x in all_X:
    this_x_lower = this_x.lower()
    this_x_no_punc = this_x_lower.translate(str.maketrans('', '', string.punctuation))
    if tokenizer_mod == None:
      this_x_tokenized = nltk.tokenize.word_tokenize(this_x_no_punc)
      this_x_no_stopword = [val for val in this_x_tokenized if not val in nltk_eng_stopwords]
      converted_X.append(this_x_no_stopword)
    else:
      this_x_no_stopword = " ".join([val for val in nltk.tokenize.word_tokenize(this_x_no_punc) if not val in nltk_eng_stopwords])
      this_x_tokenized = tokenizer_mod.tokenize(this_x_no_stopword)
      converted_X.append(this_x_tokenized)
  return converted_X

def convert_y_to_0_and_1(all_Y):
  label_number_mapping = {0: [1, 0], 1: [0, 1]}
  converted_y = [label_number_mapping[val] for val in all_Y]
  return converted_y

In [0]:
all_train_X_cleaned_using_bert = clean_text(train_X, bert_tokenizer_instance)

In [0]:
all_test_X_cleaned_using_bert = clean_text(test_X, bert_tokenizer_instance)

In [0]:
all_train_y_converted = convert_y_to_0_and_1(train_Y)
all_test_y_converted = convert_y_to_0_and_1(test_Y)

### Testing BERT results

In [31]:
print('X[0] => before: {} and after: {}'.format(train_X[0], all_train_X_cleaned_using_bert[0]))
print('Y[0] => before: {} and after: {}'.format(train_Y[0], all_train_y_converted[0]))

X[0] => before: Unlike many other films, which are disturbing either by dint of their naked unpleasantness (Man Bites Dog) or their sheer violence (most Peckinpah films), Deliverance shocks by its plausibility. Certainly, the buggery scene is pretty straightforward in its unpleasantness, but the film's effect derives far more from its slow build-up and the tangible sense of isolation surrounding the four leads, both before and after everything starts to go wrong. The moment when the canoes pass under the child on the bridge, who does not even acknowledge the men he had earlier played music with, let alone show any sign of human affection towards them, is among the most sinister in modern film. The tension increases steadily throughout the canoe trip, and perseveres even after the final credits - the ending makes the significance of the characters' ordeals horrifically real. The movie's plausibility is greatly aided by the playing of the leads, particularly Ned Beatty and Jon Voight as 

# Creating Model

## Using Bert

In [37]:
train_input_ids, train_input_masks, train_segment_ids = create_req_bert_inputs(all_train_X_cleaned_using_bert, bert_tokenizer_instance)
test_input_ids, test_input_masks, test_segment_ids = create_req_bert_inputs(all_test_X_cleaned_using_bert, bert_tokenizer_instance)

100%|██████████| 25000/25000 [00:01<00:00, 15414.33it/s]
100%|██████████| 25000/25000 [00:00<00:00, 27253.93it/s]


In [38]:
bert_model_preped_input = [np.asarray(train_input_ids, dtype=np.int32), np.asarray(train_input_masks, dtype=np.int32), np.asarray(train_segment_ids, dtype=np.int32)]
len(bert_model_preped_input), bert_model_preped_input[0].shape, bert_model_preped_input[0][1].shape

(3, (25000, 128), (128,))

In [0]:
this_optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
    name='Adam')

In [0]:
def create_model():
  input_word_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32,name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32,name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32,name="segment_ids")
  bert_layer = hub.KerasLayer(bert_model_hub_link,trainable=True)
  pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
  pooling_lay = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
  dense = tf.keras.layers.Dense(768, activation='relu')(pooling_lay)
  pred = tf.keras.layers.Dense(2, activation='softmax')(dense)
  model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=pred)
  return model 

In [0]:
def plot_accuracy_and_loss(file_save_name, history):
  plt.plot(history.history['accuracy'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.savefig(file_save_name+'_accuracy.png')
  plt.cla()
  plt.plot(history.history['loss'])
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.savefig(file_save_name+'_loss.png')
  plt.cla()

In [0]:
def evaluate_model(model):
  bert_model_test_input = [np.asarray(test_input_ids, dtype=np.int32), np.asarray(test_input_masks, dtype=np.int32), np.asarray(test_segment_ids, dtype=np.int32)]
  predicted_y = model.predict(bert_model_test_input)
  y_test_conv = [np.argmax(val) for val in all_test_y_converted]
  predicted_y_conv = [np.argmax(val) for val in predicted_y]
  print('Accuracy:', accuracy_score(y_test_conv, predicted_y_conv))
  print('Classification report:', classification_report(y_test_conv, predicted_y_conv))
  print('F1 (Micro):', f1_score(y_test_conv, predicted_y_conv, average='micro'))
  print('F1 (Macro):', f1_score(y_test_conv, predicted_y_conv, average='macro'))
  print('F1 (Weighted):', f1_score(y_test_conv, predicted_y_conv, average='weighted'))
  print('Recall (Micro):', recall_score(y_test_conv, predicted_y_conv, average='micro'))
  print('Recall (Macro):', recall_score(y_test_conv, predicted_y_conv, average='macro'))
  print('Recall (Weighted):', recall_score(y_test_conv, predicted_y_conv, average='weighted'))
  print('Precision (Micro):', precision_score(y_test_conv, predicted_y_conv, average='micro'))
  print('Precision (Macro):', precision_score(y_test_conv, predicted_y_conv, average='macro'))
  print('Precision (Weighted):', precision_score(y_test_conv, predicted_y_conv, average='weighted'))
  fpr, tpr, thresholds = metrics.roc_curve(y_test_conv, predicted_y_conv, pos_label=1)
  print('AUC:', metrics.auc(fpr, tpr))

**1 Epoch**


```
Accuracy: 0.9730941704035875

Classification report:               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       0.99      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115

F1 (Micro): 0.9730941704035875
F1 (Macro): 0.9371920627290753
F1 (Weighted): 0.9719016923313616
Recall (Micro): 0.9730941704035875
Recall (Macro): 0.9028151986183074
Recall (Weighted): 0.9730941704035875
Precision (Micro): 0.9730941704035875
Precision (Macro): 0.9812994238357023
Precision (Weighted): 0.9736217182706349
AUC: 0.9028151986183074
```

**10 Epochs**

```
Accuracy: 0.8654708520179372
Classification report:               precision    recall  f1-score   support

         ham       0.87      1.00      0.93       965
        spam       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

F1 (Micro): 0.8654708520179372
F1 (Macro): 0.46394230769230765
F1 (Weighted): 0.803057088651259
Recall (Micro): 0.8654708520179372
Recall (Macro): 0.5
Recall (Weighted): 0.8654708520179372
Precision (Micro): 0.8654708520179372
Precision (Macro): 0.4327354260089686
Precision (Weighted): 0.7490397956926541
AUC: 0.5
```

**20 Epochs**

```
Accuracy: 0.8654708520179372
Classification report:               precision    recall  f1-score   support

         ham       0.87      1.00      0.93       965
        spam       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

F1 (Micro): 0.8654708520179372
F1 (Macro): 0.46394230769230765
F1 (Weighted): 0.803057088651259
Recall (Micro): 0.8654708520179372
Recall (Macro): 0.5
Recall (Weighted): 0.8654708520179372
Precision (Micro): 0.8654708520179372
Precision (Macro): 0.4327354260089686
Precision (Weighted): 0.7490397956926541
AUC: 0.5
```

**50 Epochs**

```
Accuracy: 0.8654708520179372
Classification report:               precision    recall  f1-score   support

         ham       0.87      1.00      0.93       965
        spam       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

F1 (Micro): 0.8654708520179372
F1 (Macro): 0.46394230769230765
F1 (Weighted): 0.803057088651259
Recall (Micro): 0.8654708520179372
Recall (Macro): 0.5
Recall (Weighted): 0.8654708520179372
Precision (Micro): 0.8654708520179372
Precision (Macro): 0.4327354260089686
Precision (Weighted): 0.7490397956926541
AUC: 0.5
```

In [44]:
val_counts_in_train_y = {}
for val in all_train_y_converted:
  if not str(val) in val_counts_in_train_y:
    val_counts_in_train_y[str(val)] = 0
  val_counts_in_train_y[str(val)] += 1
val_counts_in_train_y

{'[0, 1]': 12500, '[1, 0]': 12500}

In [0]:
all_train_y_converted = np.asarray(all_train_y_converted)

In [0]:
for epochs in number_of_epochs:
  print('Number of epochs:', epochs)
  model = create_model()
  print(model.summary())
  model.compile(loss='binary_crossentropy', optimizer=this_optimizer, metrics=['accuracy'])
  history = model.fit(
      bert_model_preped_input, 
      all_train_y_converted,
      epochs=epochs,
      batch_size=batch_size
  )
  print('Done training')
  image_save_file_name = 'drive/My Drive/CA5_Sabri_810198312/Results/Q2_bert_model_{}_epochs'.format(epochs)
  plot_accuracy_and_loss(image_save_file_name, history)
  evaluate_model(model)

Number of epochs: 1
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_m

  _warn_prf(average, modifier, msg_start, len(result))


Recall (Weighted): 0.5
Precision (Micro): 0.5
Precision (Macro): 0.25
Precision (Weighted): 0.25
AUC: 0.5
Number of epochs: 10


  _warn_prf(average, modifier, msg_start, len(result))


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer_3 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           