In [1]:
from tensorflow.keras.datasets import imdb
import numpy as np
import tensorflow as tf

In [None]:
# Vocabulary size: passed as `num_words` to `imdb.load_data` and reused as the
# width of the encoded vectors and of the model input layer.
NUM_WORDS = 10000
# Maximum number of leading word indexes kept per review when encoding.
# (The name misspells "length" but later cells reference it under this name.)
MAX_LENGHT = 25

In [None]:
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=NUM_WORDS)

# Inspect data

Train data is an array of reviews; each review is a sequence of integer word indexes

The label is 1 for positive and 0 for negative sentiment expressed in the review

In [None]:
train_data.shape

In [None]:
train_data[0][:10]

In [None]:
train_labels[0]

# translate sentence to words

In [None]:
# Invert the word -> index vocabulary so integer indexes can be looked up as words.
index_to_word = {index: word for word, index in imdb.get_word_index().items()}

In [None]:
index_to_word[1]

In [None]:
# `imdb.load_data` shifts every word index by 3 by default (0 = padding,
# 1 = start of sequence, 2 = out-of-vocabulary), so undo the shift before the
# lookup; the reserved indexes have no word and are rendered as "?".
" ".join(index_to_word.get(index - 3, "?") for index in train_data[0])

# 1st approach: one-hot encoding

In [None]:
type(train_data)

In [None]:
for s in train_data:
    print(type(s))
    break

In [None]:
def padding(sentence, max_lenght):
    """Truncate ``sentence`` to at most ``max_lenght`` items.

    Despite its name this helper never pads: sequences shorter than
    ``max_lenght`` are returned unchanged. A falsy ``max_lenght`` (``None`` or
    ``0``) disables truncation.

    Args:
        sentence: Any sliceable sequence (e.g. a list of word indexes).
        max_lenght: Maximum number of leading items to keep.

    Returns:
        ``sentence[:max_lenght]`` when ``max_lenght`` is truthy, otherwise
        ``sentence`` itself.
    """
    if not max_lenght:
        return sentence
    return sentence[:max_lenght]


# Kept only to demonstrate the original bug: padding was not working.
def onehot_v1(sentences, num_words, max_lenght):
    """First (buggy) encoder, kept for comparison with ``onehot_v2``.

    ``padding`` is applied to the list of sentences instead of to each
    sentence, so only the first ``max_lenght`` sentences are encoded (the
    remaining rows stay all-zero) and no sentence is actually truncated.

    Args:
        sentences: Sequence of sentences, each a sequence of integer word indexes.
        num_words: Number of columns of the output (size of the vocabulary).
        max_lenght: Intended maximum sentence length (misapplied, see above).

    Returns:
        Array of shape ``(len(sentences), num_words)`` filled with 0.0 / 1.0.
    """
    zeros_array = np.zeros((len(sentences), num_words))
    # BUG (intentional): this truncates the batch, not the individual sentences.
    for sentence_index, sentence in enumerate(padding(sentences, max_lenght=max_lenght)):
        for word_index in sentence:
            zeros_array[sentence_index, word_index] = 1

    return zeros_array


def onehot_v2(sentences, num_words, max_lenght):
    """Encode each sentence as a vector with a 1 at every word index it contains.

    Each sentence is first limited to its first ``max_lenght`` word indexes
    (see ``padding``); then a 1 is written at the column of every remaining
    index. Columns are zero-based, so a sentence with indexes ``4 7 43 314``
    yields ones at columns 4, 7, 43 and 314 and zeros everywhere else.
    Repeated words and word order are not represented.

    Args:
        sentences: Sequence of sentences, each a sequence of integer word
            indexes smaller than ``num_words``.
        num_words: Number of columns of the output (size of the vocabulary).
        max_lenght: Maximum number of leading word indexes used per sentence;
            a falsy value keeps the whole sentence.

    Returns:
        Dense float64 array of shape ``(len(sentences), num_words)`` filled
        with 0.0 / 1.0.
    """
    zeros_array = np.zeros((len(sentences), num_words))
    for sentence_index, sentence in enumerate(sentences):
        for word_index in padding(sentence, max_lenght=max_lenght):
            zeros_array[sentence_index, word_index] = 1

    return zeros_array

In [None]:
one_hot_train_data = onehot_v2(train_data, max_lenght=MAX_LENGHT, num_words=NUM_WORDS)

In [None]:
one_hot_train_data.shape

# Index 0 is reserved for padding (in the Keras IMDB encoding, 1 marks the start of a review and 2 is the unknown/OOV token)

In [None]:
one_hot_test_data = onehot_v2(test_data, max_lenght=MAX_LENGHT, num_words=NUM_WORDS)

In [None]:
one_hot_test_data.shape

# one hot labels

In [None]:
# Cast the 0/1 sentiment labels to float32 arrays for training and evaluation.
one_hot_train_label = np.asarray(train_labels).astype("float32")
one_hot_test_label = np.asarray(test_labels).astype("float32")

# Test one-hot encoding

In [None]:
MAX_LENGHT

In [None]:
type(train_data)

In [None]:
type(train_data[1])

In [None]:
padding(train_data[1], 5)

In [None]:
train_data[1][:5]

# Bug: padding was not working in `onehot_v1`

In [None]:
onehot_v1([train_data[1]], NUM_WORDS, 5)

In [None]:
onehot_v2([train_data[1]], NUM_WORDS, 5)

# Model

In [None]:
# Fully connected binary classifier:
# NUM_WORDS-wide encoded review -> 32 -> 16 -> 8 -> 4 -> 1 (sigmoid probability).
# Each hidden layer consumes the previous layer's output (`X`); applying every
# layer to `X_inputs` would leave only the last hidden layer connected to the output.
X_inputs = tf.keras.Input(shape=(NUM_WORDS,))
X = tf.keras.layers.Dense(units=32, activation="relu")(X_inputs)
X = tf.keras.layers.Dense(units=16, activation="relu")(X)
X = tf.keras.layers.Dense(units=8, activation="relu")(X)
X = tf.keras.layers.Dense(units=4, activation="relu")(X)
X_output = tf.keras.layers.Dense(units=1, activation="sigmoid")(X)

model = tf.keras.Model(inputs=X_inputs, outputs=X_output)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3),
    metrics=["accuracy", "Precision", "Recall", "AUC"],
)

# Callbacks

In [None]:
# Stop training once `val_loss` has gone 5 consecutive epochs without improving.
early_stop = tf.keras.callbacks.EarlyStopping(
                monitor="val_loss",
                patience = 5
            )

def scheduler(epoch, lr):
    """Learning-rate schedule: constant at first, then exponential decay.

    The rate is left unchanged while ``epoch < 15``; from then on the current
    rate is multiplied by ``exp(-0.05)`` on every call.

    Args:
        epoch: Index of the epoch about to run.
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 15``, otherwise the decayed rate as a
        plain Python ``float``.
    """
    if epoch < 15:
        return lr
    # Return a Python float instead of a tensor: the scheduler callback in
    # newer Keras releases rejects non-float return values.
    return float(lr * np.exp(-0.05))

# Let `scheduler` choose the learning rate for each epoch; verbose=1 logs every update.
learning_rate_exp_reduce = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

# Save the model to "checkpoint", overwriting it only when the monitored metric improves.
# NOTE(review): recent Keras versions require the path to end in `.keras` or
# `.weights.h5` — confirm against the installed version.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(save_best_only=True, filepath="checkpoint")

In [None]:
# Train for at most 500 epochs; `early_stop` may end the run sooner.
# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection are tuned on the test set — consider holding out
# part of the training data for validation instead.
model.fit(one_hot_train_data,
         one_hot_train_label,
         epochs=500,
         validation_data=(one_hot_test_data, one_hot_test_label),
         callbacks=[early_stop, learning_rate_exp_reduce, model_checkpoint])