# Machine Learning Project Series 1:
# IMDB Movie Review Sentiment Classification 
# Episode 3: Embedding Word Matrix
This episode focuses on fitting and testing a model that combines a word-embedding matrix, as commonly used in NLP, with a densely connected layer.

## I. Importing Libraries

In [1]:
import numpy as np
import os
import pathlib
import tensorflow as tf
from tensorflow.keras import regularizers
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Embedding, Reshape, Flatten, Dropout, GRU
from keras.layers import RepeatVector, Dense, Activation, Lambda, Softmax, Conv1D
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import keras

## II. Extracting Data

In [2]:
# Keras returns the IMDB reviews already divided into a training and a test
# set, so no manual slicing is required after this call.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data()


In [3]:
# Fetch the IMDB word index that ships with Keras, using the given cache file
# name. Presumably a word -> integer-index mapping; confirm in the Keras docs.
word_dict = tf.keras.datasets.imdb.get_word_index(path="imdb_word_index.json")

In [4]:
# Number of entries in the word index, reported as the total vocabulary size.
vocab_len = len(word_dict)
print("Total words count:", vocab_len)

Total words count: 88584


## III. Data Preprocessing

In [51]:
# Every review is truncated or zero-padded to exactly this many tokens.
chosen_cmt_len = 2000
# Token indices greater than or equal to this value are replaced by 0.
max_index = 25000


def padding(initial_x):
    """Return a fixed-length, vocabulary-clipped copy of one review.

    Args:
        initial_x: Sequence of integer token indices for a single review.

    Returns:
        A float array of shape ``(chosen_cmt_len,)``. Position ``i`` holds
        ``initial_x[i]`` when that token exists and is below ``max_index``;
        every other position (out-of-range tokens and the tail of reviews
        shorter than ``chosen_cmt_len``) is 0.
    """
    # Keep at most chosen_cmt_len tokens; longer reviews are truncated.
    tokens = np.asarray(initial_x)[:chosen_cmt_len]

    output = np.zeros((chosen_cmt_len))
    # Vectorized equivalent of the per-token check: zero out any index that
    # falls outside the [0, max_index) vocabulary kept by the model.
    output[:len(tokens)] = np.where(tokens < max_index, tokens, 0)
    return output

In [52]:
# Pad/clip every training review into one dense matrix with one row per
# review and chosen_cmt_len columns.
x_train_padded = np.zeros((len(x_train), chosen_cmt_len))
for row, review in enumerate(x_train):
    x_train_padded[row] = padding(review)

In [53]:
# Pad/clip every test review into one dense matrix with one row per review
# and chosen_cmt_len columns.
x_test_padded = np.zeros((len(x_test), chosen_cmt_len))
for row, review in enumerate(x_test):
    x_test_padded[row] = padding(review)

## IV. Machine Learning Model:

In [54]:
# Embedding size: length of the vector learned for each token index
# (passed as the second argument of the Embedding layer).
e_s = 20

In [55]:
# Creating model:
def model():
    """Build the embedding + single-unit dense sentiment classifier.

    The network reads ``chosen_cmt_len`` token indices, embeds each one into
    an ``e_s``-dimensional vector, applies dropout, flattens the result and
    feeds it to one sigmoid unit.

    Returns:
        An uncompiled Keras ``Model``.
    """
    # Token indices of one padded review
    tokens = Input(shape=(chosen_cmt_len,))

    # Trainable e_s-dimensional vector for each of the max_index token indices
    embedded = Embedding(max_index, e_s)(tokens)

    # Heavy dropout (rate 0.9) on the embedded sequence
    regularized = Dropout(0.9)(embedded)

    # Collapse (chosen_cmt_len, e_s) into one feature vector per review
    features = Flatten()(regularized)

    # Single sigmoid unit with kernel, bias and activity penalties
    sentiment = Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
        bias_regularizer=regularizers.l2(1e-4),
        activity_regularizer=regularizers.l2(1e-5),
    )(features)

    return Model(inputs=tokens, outputs=sentiment)

In [56]:
# NOTE: this rebinds `model` from the builder function to the built network,
# so the builder is no longer callable by that name after this line runs.
model = model()
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 2000)]            0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 2000, 20)          500000    
_________________________________________________________________
dropout_7 (Dropout)          (None, 2000, 20)          0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 40001     
Total params: 540,001
Trainable params: 540,001
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Optimizer for the model
learning_rate = 5e-3
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
opt = Adam(learning_rate=learning_rate, decay=1e-5)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.BinaryAccuracy(name="binary_accuracy", threshold=0.5)])

In [58]:
# Per-epoch fit histories and [loss, accuracy] test evaluations
histories = []
testings = []

# Test accuracy of the previous and of the current epoch. curr_acc starts
# slightly above zero and serves as the baseline for the first comparison.
prev_acc = 0
curr_acc = 0.01

# Best test accuracy seen so far
max_acc = 0

# Labels as column vectors; -1 lets NumPy infer the number of samples
y_train_col = np.array(y_train).reshape(-1, 1)
y_test_col = np.array(y_test).reshape(-1, 1)

num_epochs = 20

# Train for a fixed number of epochs. After each epoch the model is evaluated
# on the test set, and the learning rate is divided by 10 whenever the test
# accuracy dropped compared to the previous epoch.
# NOTE(review): the test set drives both the learning-rate schedule and
# max_acc, so the reported figure is optimistic; a validation split would
# give a cleaner estimate.
for epoch in range(1, num_epochs + 1):
    # Fitting
    print("Epoch:", epoch)
    print("Fitting data:")
    history = model.fit(x=x_train_padded, y=y_train_col, epochs=1, batch_size=1000)

    # Evaluating: testing is [loss, binary_accuracy]
    print("Testing data:")
    testing = model.evaluate(x_test_padded, y_test_col)

    # Track the best and the two most recent test accuracies
    max_acc = max(max_acc, testing[1])
    prev_acc, curr_acc = curr_acc, testing[1]

    # Adjust learning rate. Recompiling with a new Adam instance also
    # discards the optimizer state accumulated so far.
    if prev_acc > curr_acc:
        learning_rate /= 10
        opt = Adam(learning_rate=learning_rate, decay=1e-5)
        model.compile(optimizer=opt,
                      loss='binary_crossentropy',
                      metrics=[tf.keras.metrics.BinaryAccuracy(name="binary_accuracy", threshold=0.5)])

    # Storing
    histories.append(history)
    testings.append(testing)

    print('\n')

print("Optimal testing accuracy is: {:.2f}%".format(max_acc * 100))

Epoch: 1
Fitting data:
Testing data:


Epoch: 2
Fitting data:
Testing data:


Epoch: 3
Fitting data:
Testing data:


Epoch: 4
Fitting data:
Testing data:


Epoch: 5
Fitting data:
Testing data:


Epoch: 6
Fitting data:
Testing data:


Epoch: 7
Fitting data:
Testing data:


Epoch: 8
Fitting data:
Testing data:


Epoch: 9
Fitting data:
Testing data:


Epoch: 10
Fitting data:
Testing data:


Epoch: 11
Fitting data:
Testing data:


Epoch: 12
Fitting data:
Testing data:


Epoch: 13
Fitting data:
Testing data:


Epoch: 14
Fitting data:
Testing data:


Epoch: 15
Fitting data:
Testing data:


Epoch: 16
Fitting data:
Testing data:


Epoch: 17
Fitting data:
Testing data:


Epoch: 18
Fitting data:
Testing data:


Epoch: 19
Fitting data:
Testing data:


Epoch: 20
Fitting data:
Testing data:


Optimal testing accuracy is: 88.19%


In [59]:
# Loss and accuracy on the training set, for comparison with the test figures.
# reshape(-1, 1) infers the sample count instead of hard-coding it.
model.evaluate(x_train_padded, np.array(y_train).reshape(-1, 1))



[0.15848954021930695, 0.9601200222969055]

## V. Summary:
The embedding matrix performed quite well, given that the number of trained parameters was relatively small compared to the model used in the previous episode.

|-|Loss|Accuracy|Sample size|
|-|-|-|-|
|Training|0.16|96.1%|25,000|
|Testing |0.31|88.2%|25,000|

## VI. Thank you:
Thank you for viewing my project. See you in the next episode.