## Model 2 - RNN Based With Embedding Layer

In [39]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1 - Read the data 

In [4]:
# Load the IMDB reviews CSV; the cells below read its `review` and `sentiment` columns.
df = pd.read_csv("../data/IMDB Dataset.csv")

def preprocess(x):
    """Return the review text ``x`` with HTML line-break tags replaced by spaces.

    Every ``<br>``, ``<br/>`` or ``<br />`` tag (lower-case only, optional
    whitespace before the slash) becomes a single space; all other text is
    returned unchanged.
    """
    return re.sub("<br\\s*/?>", " ", x)

# Strip HTML line breaks from every review, then turn the text label into a
# boolean that is True for "positive" reviews.
df["review"] = df["review"].apply(preprocess)
df["sentiment"] = df["sentiment"].eq("positive")

In [5]:
# Inputs are the cleaned review texts; targets are the sentiment flags cast to 0/1.
# The whole frame is used here -- no hold-out split is made in this cell.
x_train = df["review"].values
y_train = df["sentiment"].astype(int).values

### 2 - Build the model

In [2]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Masking, GRU
from tensorflow.keras.models import Model

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [6]:
# Preprocess x_train: limits used when tokenizing and padding the reviews
MAX_SEQ_LEN = 200  # `maxlen` given to pad_sequences and the length of the model's Input
MAX_WORDS_VOCAB = 10000  # `num_words` of the Tokenizer and `input_dim` of the Embedding layer

In [7]:
# Read the texts from `df` rather than from `x_train`: this cell rebinds `x_train`
# to the padded integer matrix, so reading `x_train` here would feed integer
# arrays to the tokenizer if the cell were run a second time.
review_texts = df.review.values

# Use a string OOV token. Keras stores the token as an ordinary `word_index`
# entry, so a non-string token such as the integer 0 still encodes correctly
# but breaks `tokenizer.sequences_to_texts`, which joins words as strings.
tokenizer = Tokenizer(num_words=MAX_WORDS_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(review_texts)

# Encode each review as a list of word indices, then force every list to
# MAX_SEQ_LEN entries so the reviews form one rectangular array.
x_train_tokenized = tokenizer.texts_to_sequences(review_texts)
x_train = pad_sequences(x_train_tokenized, maxlen=MAX_SEQ_LEN)

In [9]:
def make_model(embedding_dim = 300, n_hidden_lstm = 128, n_hidden_dense = 128):
    """Build the (uncompiled) embedding + LSTM binary classifier.

    The network takes sequences of ``MAX_SEQ_LEN`` word indices and stacks:
    Embedding -> LSTM -> Dense(relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        embedding_dim: Output size of the embedding layer.
        n_hidden_lstm: Number of units in the LSTM layer.
        n_hidden_dense: Number of units in the hidden dense layer.

    Returns:
        A Keras ``Model`` mapping the index sequence to a single sigmoid output.
        The caller is responsible for compiling it.
    """
    token_ids = Input(shape = [MAX_SEQ_LEN])

    embedded = Embedding(input_dim=MAX_WORDS_VOCAB, output_dim=embedding_dim)(token_ids)
    encoded = LSTM(units=n_hidden_lstm)(embedded)

    hidden = Dense(units=n_hidden_dense, activation='relu')(encoded)
    hidden = Dropout(rate=0.3)(hidden)

    probability = Dense(1, activation='sigmoid')(hidden)
    return Model(token_ids, probability)

In [43]:
# Build the network with its default sizes, print the layer table, and
# compile it for binary classification.
model = make_model()
model.summary()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 300)          3000000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 3,236,289
Trainable params: 3,236,289
Non-trainable params: 0
_________________________________________________

### 3 - Train

In [52]:
# Train on successive chunks of the data (one epoch per chunk), recording the accuracy after each chunk
def train(x_train, y_train, model, batch_size=1000):
    """Fit ``model`` incrementally on consecutive chunks of the data.

    The arrays are walked front to back in slices of ``batch_size`` rows. Each
    slice gets one ``model.fit`` call with ``epochs=1``. The full
    ``x_train``/``y_train`` arrays are passed as ``validation_data`` on every
    call, so the recorded metric covers rows the model has already been fitted
    on as well as rows it has not reached yet.

    Every row is used for fitting: the last chunk is included, and a trailing
    chunk shorter than ``batch_size`` is fitted as well.

    Note that ``batch_size`` is only the chunk size handed to each ``fit``
    call; it is not forwarded to ``fit``, which therefore uses its own default
    mini-batch size inside each chunk.

    Args:
        x_train: Array of inputs, indexable by row slice and exposing ``shape``.
        y_train: Array of targets aligned with ``x_train``.
        model: Compiled model whose ``fit`` returns an object with a
            ``history`` dict containing ``'val_accuracy'``.
        batch_size: Number of rows per incremental ``fit`` call.

    Returns:
        A list with one entry per chunk; each entry is the ``'val_accuracy'``
        list reported by that chunk's ``fit`` call (one value, since a single
        epoch is run).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_samples = x_train.shape[0]
    scores = []
    # Stepping by batch_size from 0 visits every chunk, including the final
    # (possibly shorter) one; slicing past the end is clipped by the array.
    for start in range(0, n_samples, batch_size):
        end = start + batch_size
        history = model.fit(
            x_train[start:end],
            y_train[start:end],
            validation_data=(x_train, y_train),
            epochs=1,
        )
        scores.append(history.history['val_accuracy'])
    return scores

In [51]:
# Run the chunk-by-chunk training and keep the accuracy recorded after each chunk.
scores = train(x_train=x_train, y_train=y_train, model=model)

In [None]:
# Accuracy recorded after each training chunk, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(scores, label='Train Accuracy')
ax.set(
    xlabel='# of data points (1000)',
    ylabel='Training Accuracy',
    title='Training curve on data points',
)
plt.show()