In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Load the labelled training examples (path is relative to the working directory).
training_data = pd.read_csv(filepath_or_buffer="bonus_train_data.csv")


## Create training sets

In [2]:
# Split the frame by position: column 0 holds the raw texts, column 1 the labels.
X_batch, y_train = training_data.iloc[:, 0], training_data.iloc[:, 1]

## Process the Data

In [3]:
# Tensor copy of the labels.
# NOTE(review): none of the cells shown here read `y_batch`; the fit calls use `y_train` directly.
y_batch = tf.convert_to_tensor(value=y_train)

In [4]:
def preprocess(X_batch):
    """Clean and tokenize a batch of raw text strings.

    Steps, in order:
      1. keep only the first 300 units of each string (``tf.strings.substr``),
      2. replace HTML ``<br>`` / ``<br />`` tags with a space,
      3. replace every character that is not an ASCII letter or apostrophe
         with a space,
      4. split on whitespace into tokens,
      5. pad the ragged result into a dense tensor using ``b"<pad>"``.

    Args:
        X_batch: batch of strings accepted by ``tf.strings`` ops.

    Returns:
        A dense string tensor with one row per input text, each row padded
        with ``b"<pad>"`` to the length of the longest row in the batch.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>")

In [5]:
# Tokenize and pad the raw training texts into a dense tensor of string tokens.
X_train = preprocess(X_batch)

In [6]:
from collections import Counter

# Count how often each token occurs across all padded training rows.
# The b"<pad>" filler tokens are counted as well.
vocabulary = Counter(
    token
    for review in X_train
    for token in review.numpy()
)
len(vocabulary)

23994

In [7]:
vocab_size = 10000
# Keep only the `vocab_size` most common tokens.
# The padding token is pinned to index 0 explicitly: the Embedding layer is
# built with mask_zero=True, so ID 0 must always mean "padding". Relying on
# b"<pad>" merely happening to be the most frequent token would silently mask
# a real word on a dataset where that is not the case. When b"<pad>" already
# is the most common token, the resulting list is unchanged.
pad_token = b"<pad>"
truncated_vocabulary = [pad_token] + [
    word for word, count in vocabulary.most_common() if word != pad_token
][:vocab_size - 1]

In [8]:
# Build the word -> ID lookup table.
# Each kept word maps to its position in `truncated_vocabulary`; words outside
# that list are assigned by the table to one of 1000 out-of-vocabulary buckets.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [9]:
def encode_words(X_batch):
    """Translate a tensor of string tokens into integer word IDs.

    Every token is looked up in the module-level vocabulary ``table``.

    Args:
        X_batch: tensor of string tokens, e.g. the output of ``preprocess``.

    Returns:
        Whatever ``table.lookup`` returns for ``X_batch`` (the token IDs).
    """
    token_ids = table.lookup(X_batch)
    return token_ids

In [10]:
# Replace the string tokens with their integer IDs.
# NOTE: this rebinds X_train, so the cell is not idempotent — re-running it
# would feed integer IDs back into the string lookup table.
X_train = encode_words(X_train)

In [11]:
# Inspect the encoded training tensor (one row of token IDs per text).
X_train

<tf.Tensor: shape=(6090, 32), dtype=int64, numpy=
array([[ 7225,    10,  4578, ...,     0,     0,     0],
       [ 7228,  7229,  7230, ...,     0,     0,     0],
       [ 2256,  3408,     6, ...,     0,     0,     0],
       ...,
       [10042, 10670,   126, ...,     0,     0,     0],
       [  879,   136,     5, ...,     0,     0,     0],
       [    9,  3390,     6, ...,     0,     0,     0]], dtype=int64)>

In [12]:
# Inspect the training labels.
y_train

0       1
1       0
2       1
3       1
4       0
       ..
6085    0
6086    0
6087    0
6088    1
6089    1
Name: target, Length: 6090, dtype: int64

## Build and Train Model

In [13]:
embed_size = 128
# mask_zero=True makes the Embedding layer treat token ID 0 as padding so the
# layers after it ignore those time steps.
# The embedding input size covers the kept vocabulary plus the OOV buckets.
model = keras.models.Sequential([
    keras.layers.Embedding(
        input_dim=vocab_size + num_oov_buckets,
        output_dim=embed_size,
        mask_zero=True,
        input_shape=[None],
    ),
    keras.layers.GRU(units=128, return_sequences=True),
    keras.layers.GRU(units=128),
    keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)
history = model.fit(x=X_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


We see that our model reaches a very high accuracy on the training set, i.e. it has fit the training data well.

## Using Pretrained Embedding for comparison

In [14]:
!pip install tensorflow_hub
import tensorflow_hub as hub

model2 = keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
                   dtype=tf.string, input_shape=[], output_shape=[50]),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])
model2.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])



In [15]:
# Train the pretrained-embedding model on the raw (untokenized) texts.
# NOTE: this rebinds `history`, replacing the training history of the first model.
history = model2.fit(x=X_batch, y=y_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


The pretrained embedding doesn't reach as high a training accuracy, but I was able to run 
many more epochs in a much shorter amount of time. 

## Evaluating Models

In [16]:
# Load the held-out test set and split it by position, mirroring the training
# split: column 0 holds the raw texts, column 1 the labels.
testing_data = pd.read_csv(filepath_or_buffer="bonus_test_data.csv")

X_test, y_test = testing_data.iloc[:, 0], testing_data.iloc[:, 1]

In [18]:
# Score the pretrained-embedding model on the raw test texts.
# This must run before X_test is replaced by its encoded form further down.
model2.evaluate(x=X_test, y=y_test)



[0.48978060483932495, 0.7767564058303833]

In [19]:
# Convert the test labels to a tensor (rebinds y_test).
y_test = tf.convert_to_tensor(value=y_test)

In [20]:
# Tokenize the raw test texts and map the tokens to IDs with the vocabulary
# table built from the training data.
# NOTE: this rebinds X_test, so the cell is not idempotent and the raw texts
# are no longer available under this name afterwards.
X_test = encode_words(preprocess(X_test))

In [21]:
# Score the GRU model on the encoded test set.
model.evaluate(x=X_test, y=y_test)



[0.9823856353759766, 0.7222586870193481]

## Conclusion

We see that our model has a lower accuracy and a much higher loss on the test set than the pretrained model, which suggests that it overfit the training data. 
The model built on the pretrained embedding performs better and has a lower loss, which shows how using pretrained modules can improve results.

Recommendations to improve the model might be to add regularization or to change how the texts are tokenized and preprocessed to improve training.
Because of my limited knowledge of NLP and sentiment analysis, I had a difficult time exploring alternative methods of tokenizing and encoding my data.

