# Imports and utility functions

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt # plotting
import matplotlib.image as mpimg # images
import numpy as np #numpy
import seaborn as sns
import tensorflow.compat.v2 as tf #use tensorflow v2 as a main 
import tensorflow.keras as keras # required for high level applications
from sklearn.model_selection import train_test_split # split for validation sets
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import normalize # normalization of the matrix
import scipy
import pandas as pd
import re

In [None]:
def cleanTexts(texts):
    """Normalise raw review strings before vectorisation.

    Each character outside ``[a-zA-Z0-9]`` is replaced by one space (runs of
    replaced characters are not collapsed), the result is lower-cased, and
    leading/trailing whitespace is stripped.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one cleaned string per input item, in the same order.
    """
    non_alnum = re.compile("[^a-zA-Z0-9]")
    return [non_alnum.sub(" ", raw).lower().strip() for raw in texts]

# Load data

In [None]:
from sklearn.utils import shuffle

# Load the labelled IMDB training reviews and shuffle the rows.
# A fixed random_state keeps the row order -- and therefore every slice and
# train/validation split derived from `dataset` -- identical across
# Restart & Run All; without it each run trains on a different subset.
dataset = pd.read_csv('train_data_imdb.csv')
dataset = shuffle(dataset, random_state=42)
dataset.tail()

Unnamed: 0.1,Unnamed: 0,text,label
12341,23856,Why would any legitimate actor having read the...,0
17921,6432,"And my children love it now! Granted, I can wa...",1
1233,4408,Using Buster Keaton in the twilight of his car...,1
13700,14169,"Minimal script, minimal character development,...",0
10062,8556,"Not really a big box office draw, but I was pl...",1


In [None]:
# Cleaned review texts and their labels as plain Python lists.
# The slice keeps at most the first 25000 rows (shorter data is kept whole).
x_train = list(cleanTexts(dataset['text']))[:25000]
y_train = list(dataset['label'])[:25000]

In [None]:
# Held-out test reviews, cleaned with the same function as the training texts.
test_data = pd.read_csv('test_data_imdb.csv')

# Keep at most the first 10000 test examples.
x_test = list(cleanTexts(test_data['text']))[:10000]
y_test = list(test_data['label'])[:10000]

# Train model and predict on test dataset

In [None]:
from tensorflow import string as tf_string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.compat.v1.keras.layers import CuDNNGRU, CuDNNLSTM
from tensorflow.keras.layers import LSTM, GRU, Bidirectional

# Import FastText embeddings

In [None]:
# Download the gzip-compressed fastText vector file cc.en.300.vec.gz into the
# current working directory.
# NOTE(review): nothing here checks whether the file is already present, so a
# re-run presumably downloads it again -- consider guarding this cell.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz

--2022-04-17 10:54:43--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2022-04-17 10:55:53 (18.3 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]



In [None]:
import gzip

In [None]:
# Decompress the downloaded archive; the loader cell below reads the
# resulting ./cc.en.300.vec text file.
!gzip -d cc.en.300.vec.gz

In [None]:
path_to_fasttext_file = './cc.en.300.vec'

# Parse the fastText text-format file into {word: float32 vector}.
# Layout of a .vec file: an optional header line "<n_words> <dim>", then one
# line per word of the form "<word> <v1> <v2> ... <v_dim>", separated by
# single spaces.
embeddings_index = {}
# utf-8 + newline='\n' + errors='ignore' mirrors the loader published with the
# fastText vectors: it avoids platform-dependent decoding and stops lines from
# being broken on exotic line-separator characters inside tokens.
with open(path_to_fasttext_file, encoding='utf-8', newline='\n', errors='ignore') as f:
    header = f.readline().split()
    if len(header) != 2:
        # No "<n_words> <dim>" header: the first line is already a vector.
        f.seek(0)
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        # Split on the first plain space only. A bare split() would treat
        # tokens that are themselves Unicode whitespace as separators and
        # mis-read the first coefficient as the word.
        word, coefs = line.split(' ', 1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 2000000 word vectors.


# Vectorizer parameters for experiments 1-4

In [None]:
from tensorflow import string as tf_string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Settings used by the vectorizer and the embedding matrix built below.
embedding_dim = 300  # number of columns in each embedding-matrix row
vocab_size = 30000  # passed to TextVectorization as max_tokens
sequence_length = 128  # passed as output_sequence_length (token ids per text)

# Integer-encoding layer; adapt() builds its vocabulary from the training texts.
vect_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
)
vect_layer.adapt(x_train)

In [None]:
# Vocabulary learned by the vectorizer, plus a token -> list-position lookup.
voc = vect_layer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Weight matrix for the frozen Embedding layer: row i receives the pre-trained
# vector of vocabulary token i. Tokens missing from embeddings_index, and the
# two extra rows allocated beyond len(voc), stay all-zero.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = misses = 0
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 28300 words (1700 misses)


In [None]:
# Hold out 10% of the training data for validation, stratified by label.
# The split is derived from `dataset` instead of from the current
# x_train / y_train, so this cell is idempotent: re-running it always yields
# the same 90/10 partition rather than splitting an already-split training
# set a second time.
x_full = list(cleanTexts(dataset['text']))[:25000]
y_full = list(dataset['label'])[:25000]
x_train, x_val, y_train, y_val = train_test_split(x_full, y_full, test_size=0.10, random_state=69, stratify=y_full)

## 1.Experiment

In [None]:
# Experiment 1: frozen pre-trained embeddings -> LSTM(64) returning the full
# sequence -> dropout -> flatten -> dropout -> single sigmoid unit.
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = LSTM(64, activation='relu', return_sequences=True)(emb)
x = keras.layers.Dropout(rate=0.6)(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dropout(rate=0.6)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_17 (Embedding)    (None, 128, 300)          9000600   
                                                                 
 bidirectional_1 (Bidirectio  (None, 128, 128)         186880    
 nal)                                                            
                                                                 
 dropout_36 (Dropout)        (None, 128, 128)          0         
                                                                 
 flatten_17 (Flatten)        (None, 16384)             0  

In [None]:
# Train experiment 1, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (14), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 14
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

Accuracy score is 78.79% 
f1-score is 0.8091424457842168% 
              precision    recall  f1-score   support

           0       0.87      0.68      0.76      4984
           1       0.74      0.90      0.81      5016

    accuracy                           0.79     10000
   macro avg       0.80      0.79      0.79     10000
weighted avg       0.80      0.79      0.79     10000



## 2.Experiment

In [None]:
# Experiment 2: same layout as experiment 1 but with a bidirectional LSTM(64)
# and a smaller learning rate (1e-4).
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = Bidirectional(
    LSTM(64, activation='relu', return_sequences=True)
)(emb)
x = keras.layers.Dropout(rate=0.6)(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dropout(rate=0.6)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_18 (Embedding)    (None, 128, 300)          9000600   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128, 128)         186880    
 nal)                                                            
                                                                 
 dropout_38 (Dropout)        (None, 128, 128)          0         
                                                                 
 flatten_18 (Flatten)        (None, 16384)             0  

In [None]:
# Train experiment 2, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (14), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 14
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)



[0.5521315336227417, 0.7289000153541565]

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

Accuracy score is 72.89% 
f1-score is 0.726078609679701% 
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      4984
           1       0.74      0.72      0.73      5016

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



## 3.Experiment

In [None]:
# Experiment 3: frozen embeddings -> LSTM(128) -> dropout -> GRU(128), both
# returning full sequences -> flatten -> Dense 128/64/32 (each followed by
# dropout 0.2) -> single sigmoid unit.
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = LSTM(128, activation='relu', return_sequences=True)(emb)
x = keras.layers.Dropout(rate=0.2)(x)
x = GRU(128, activation='relu', return_sequences=True)(x)
x = keras.layers.Flatten()(x)
# Fully connected head: three ReLU blocks of decreasing width.
for units in (128, 64, 32):
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_3 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_20 (Embedding)    (None, 128, 300)          9000600   
                                                                 
 lstm_21 (LSTM)              (None, 128, 128)          219648    
                                                                 
 dropout_41 (Dropout)        (None, 128, 128)          0         
                                                                 
 gru_10 (GRU)                (None, 128, 128)          99072     
                                                          

In [None]:
# Train experiment 3, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (8), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 8
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)



[0.35831525921821594, 0.8446999788284302]

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

Accuracy score is 84.47% 
f1-score is 0.8460089241447694% 
              precision    recall  f1-score   support

           0       0.85      0.84      0.84      4984
           1       0.84      0.85      0.85      5016

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



## 4.Experiment

In [None]:
# Experiment 4: as experiment 3, but both recurrent layers are wrapped in
# Bidirectional.
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = Bidirectional(
    LSTM(128, activation='relu', return_sequences=True)
)(emb)
x = keras.layers.Dropout(rate=0.2)(x)
x = Bidirectional(
    GRU(128, activation='relu', return_sequences=True)
)(x)
x = keras.layers.Flatten()(x)
# Fully connected head: three ReLU blocks of decreasing width.
for units in (128, 64, 32):
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_27 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_3 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_26 (Embedding)    (None, 128, 300)          9000600   
                                                                 
 bidirectional_13 (Bidirecti  (None, 128, 256)         439296    
 onal)                                                           
                                                                 
 dropout_65 (Dropout)        (None, 128, 256)          0         
                                                                 
 bidirectional_14 (Bidirecti  (None, 128, 256)         296

In [None]:
# Train experiment 4, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (6), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 6
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)



[0.3652898073196411, 0.840499997138977]

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

# Vectorizer parameters for experiments 5-6

In [None]:
from tensorflow import string as tf_string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Larger vectorizer settings: bigger vocabulary cap and longer sequences.
embedding_dim = 300  # number of columns in each embedding-matrix row
vocab_size = 100000  # passed to TextVectorization as max_tokens
sequence_length = 250  # passed as output_sequence_length (token ids per text)

# Replaces the previous vect_layer; adapt() builds the vocabulary from x_train.
vect_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
)
vect_layer.adapt(x_train)

In [None]:
# Vocabulary of the re-adapted vectorizer, plus a token -> list-position lookup.
voc = vect_layer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Rebuild the embedding weight matrix for the new vocabulary: row i receives
# the pre-trained vector of token i; tokens without a vector, and the two
# extra rows allocated beyond len(voc), remain all-zero.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits, misses = 0, 0
for token in word_index:
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        misses += 1
    else:
        embedding_matrix[word_index[token]] = pretrained
        hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 59576 words (14907 misses)


In [None]:
# Hold out 10% of the training data for validation, stratified by label.
# By this point x_train / y_train have already been reduced to their 90%
# training part by the earlier split; splitting them again would shrink the
# training set to 81% and overwrite x_val. Deriving the split from `dataset`
# with the same seed reproduces the same 90/10 partition and keeps this cell
# idempotent.
x_full = list(cleanTexts(dataset['text']))[:25000]
y_full = list(dataset['label'])[:25000]
x_train, x_val, y_train, y_val = train_test_split(x_full, y_full, test_size=0.10, random_state=69, stratify=y_full)

## 5.Experiment

In [None]:
# Experiment 5: frozen embeddings -> bidirectional LSTM(64) returning the full
# sequence -> flatten -> Dense 128/64/32 (each followed by dropout 0.2) ->
# single sigmoid unit.
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = Bidirectional(
    LSTM(64, activation='relu', return_sequences=True)
)(emb)
x = keras.layers.Flatten()(x)
# Fully connected head: three ReLU blocks of decreasing width.
for units in (128, 64, 32):
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_9 (TextV  (None, 300)              0         
 ectorization)                                                   
                                                                 
 embedding_18 (Embedding)    (None, 300, 300)          22345500  
                                                                 
 bidirectional_27 (Bidirecti  (None, 300, 128)         186880    
 onal)                                                           
                                                                 
 flatten_15 (Flatten)        (None, 38400)             0         
                                                                 
 dense_53 (Dense)            (None, 128)               491

In [None]:
# Train experiment 5, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (6), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 6
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

## 6.Experiment

In [None]:
# Experiment 6: frozen embeddings -> bidirectional LSTM(128) returning the
# full sequence -> dropout -> second bidirectional LSTM(128) returning only
# its final output (so no Flatten is needed) -> Dense 128/64/32 (each
# followed by dropout 0.2) -> single sigmoid unit.
input_layer = keras.layers.Input(shape=(1,), dtype=tf_string)
x_v = vect_layer(input_layer)
emb = keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors fixed during training
)(x_v)
x = Bidirectional(
    LSTM(128, activation='relu', return_sequences=True)
)(emb)
x = keras.layers.Dropout(rate=0.2)(x)
x = Bidirectional(LSTM(128))(x)
# Fully connected head: three ReLU blocks of decreasing width.
for units in (128, 64, 32):
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dropout(rate=0.2)(x)
output_layer = keras.layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.summary()

# Binary cross-entropy on the sigmoid output, tracked with accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Model: "model_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_29 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_4 (TextV  (None, 250)              0         
 ectorization)                                                   
                                                                 
 embedding_28 (Embedding)    (None, 250, 300)          22345500  
                                                                 
 bidirectional_17 (Bidirecti  (None, 250, 256)         439296    
 onal)                                                           
                                                                 
 dropout_73 (Dropout)        (None, 250, 256)          0         
                                                                 
 bidirectional_18 (Bidirecti  (None, 256)              394

In [None]:
# Train experiment 6, validating on the held-out split after every epoch.
# NOTE(review): patience (70) is larger than `epochs` (6), so this callback
# cannot stop training early -- confirm whether that is intended.
batch_size = 768
epochs = 6
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=70,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Evaluate the current model on the test set; reports the compiled loss
# (binary cross-entropy) and the 'accuracy' metric.
model.evaluate(x_test,y_test)



[0.3598681688308716, 0.8489000201225281]

In [None]:
# Model outputs for the test texts; round() turns the sigmoid outputs into
# 0/1 labels, and ravel() flattens the result to one label per example.
y_pred=model.predict(x_test)
y_pred_labels = y_pred.round().ravel()

# Both scores are scaled to percentages so the "%" in the messages is
# accurate (f1_score itself returns a fraction between 0 and 1).
accuracy_sc = accuracy_score(y_pred=y_pred_labels,y_true=y_test)*100
f1_sc = f1_score(y_pred=y_pred_labels,y_true=y_test)*100

print("Accuracy score is {}% ".format(accuracy_sc))
print("f1-score is {}% ".format(f1_sc))
print(classification_report(y_pred=y_pred_labels,y_true=y_test))

Accuracy score is 84.89% 
f1-score is 0.8502626102467544% 
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      4984
           1       0.85      0.86      0.85      5016

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# Save model

In [None]:
# Persist the current `model` to the directory 'IMDB_FT_fourth' (relative to
# the working directory).
# NOTE(review): when the notebook runs top to bottom, `model` is the network
# built for experiment 6 -- confirm that the "fourth" name is intended.
model.save('IMDB_FT_fourth')

INFO:tensorflow:Assets written to: FR_LSTM_sixth/assets




In [None]:
!zip -r /content/IMDB_FT_first.zip /content/IMDB_FT_first/

  adding: content/FR_LSTM_sixth/ (stored 0%)
  adding: content/FR_LSTM_sixth/variables/ (stored 0%)
  adding: content/FR_LSTM_sixth/variables/variables.index (deflated 67%)
  adding: content/FR_LSTM_sixth/variables/variables.data-00000-of-00001 (deflated 8%)
  adding: content/FR_LSTM_sixth/assets/ (stored 0%)
  adding: content/FR_LSTM_sixth/keras_metadata.pb (deflated 90%)
  adding: content/FR_LSTM_sixth/saved_model.pb (deflated 78%)
