## Model 3 - RNN-Based Model with spaCy Embeddings

In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import spacy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1 - Read the data

In [5]:
# Location of the raw IMDB reviews CSV, relative to this notebook.
IMDB_CSV_PATH = "../data/IMDB Dataset.csv"
df = pd.read_csv(IMDB_CSV_PATH)

def preprocess(x):
    """Replace HTML line-break tags in a review with a single space.

    Matches lowercase ``<br>``, ``<br/>`` and ``<br />`` (any amount of
    whitespace before the optional slash). Each tag becomes one space so the
    words on either side of it stay separated.

    Args:
        x: The review text.

    Returns:
        The text with every matched tag replaced by ``" "``.
    """
    return re.sub("<br\\s*/?>", " ", x)

# Strip the HTML line breaks from every review, then turn the textual label
# into a boolean flag (True only where the label equals "positive").
df["review"] = df["review"].apply(preprocess)
df["sentiment"] = df["sentiment"].eq("positive")

In [38]:
# Inputs are the review strings; targets are the sentiment column cast to integers.
x_train = df["review"].values
y_train = df["sentiment"].astype(int).values

### 2 - Fetch the embedding vectors

In [6]:
# Name of the spaCy pipeline that produces the per-token tensors used below.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Both dimensions are defined here so the cell runs on a fresh kernel.
MAX_SEQ_LEN = 200      # tokens kept per review (matches the model input shape in the summary below)
EMBEDDING_LAYER = 96   # expected width of each token row in spaCy's doc.tensor

# One (MAX_SEQ_LEN, EMBEDDING_LAYER) matrix per review, zero-initialised so that
# unused trailing timesteps stay all-zero ("post" padding).
# float32 halves the memory of NumPy's default float64.
x_train_embeedding = np.zeros((x_train.shape[0], MAX_SEQ_LEN, EMBEDDING_LAYER), dtype='float32')

# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp(text) once per review.
for idx, doc in enumerate(nlp.pipe(x_train)):
    tensorized = doc.tensor  # expected shape: (n_tokens, EMBEDDING_LAYER)
    n_tokens = min(tensorized.shape[0], MAX_SEQ_LEN)
    if n_tokens == 0:
        # Nothing to copy for an empty document; its rows remain zero.
        continue
    if tensorized.shape[1] != EMBEDDING_LAYER:
        raise ValueError(
            "doc.tensor has width %d, expected EMBEDDING_LAYER=%d"
            % (tensorized.shape[1], EMBEDDING_LAYER)
        )
    # Copy token rows directly, keeping the first MAX_SEQ_LEN tokens ("post" truncation).
    # Reshaping a (features, time) array into (time, features) is NOT a transpose:
    # it would mix values from different tokens into the same timestep.
    x_train_embeedding[idx, :n_tokens] = tensorized[:n_tokens]
    

### 3 - Build the model

In [18]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Masking, GRU
from tensorflow.keras.models import Model

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [20]:
def make_model(input_shape, n_hidden_lstm = 128, n_hidden_dense = 64):
    """Build an (uncompiled) LSTM binary classifier.

    The layers are chained in this order: Input -> Masking -> LSTM ->
    Dense (ReLU) -> Dropout (rate 0.2) -> Dense (1 unit, sigmoid).

    Args:
        input_shape: Shape of one sample, passed straight to ``Input``.
        n_hidden_lstm: Number of units in the LSTM layer.
        n_hidden_dense: Number of units in the hidden Dense layer.

    Returns:
        A Keras ``Model`` mapping the input tensor to the sigmoid output.
    """
    inputs = Input(shape=input_shape)
    # Masking is created with its default arguments.
    masked = Masking()(inputs)
    encoded = LSTM(units=n_hidden_lstm)(masked)
    hidden = Dense(units=n_hidden_dense, activation='relu')(encoded)
    regularized = Dropout(rate=0.2)(hidden)
    prediction = Dense(1, activation='sigmoid')(regularized)
    return Model(inputs, prediction)

In [41]:
# Each sample fed to the network is a (MAX_SEQ_LEN, EMBEDDING_LAYER) matrix.
model = make_model(input_shape=[MAX_SEQ_LEN, EMBEDDING_LAYER])
model.summary()
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 200, 96)]         0         
_________________________________________________________________
masking_1 (Masking)          (None, 200, 96)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               115200    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 123,521
Trainable params: 123,521
Non-trainable params: 0
_____________________________________________________

### 4 - Train

In [42]:
# Train in consecutive chunks so that a validation score is recorded after each chunk.
def train(x_train, y_train, model, batch_size=1000, validation_data=None):
    """Fit ``model`` on consecutive slices of the training data.

    The data is walked from start to end in slices of ``batch_size`` samples.
    Every sample is used exactly once, including a final, shorter slice when
    the number of samples is not a multiple of ``batch_size``. Each slice is
    passed to ``model.fit`` for one epoch.

    Args:
        x_train: Array of inputs; ``x_train.shape[0]`` is the sample count.
        y_train: Targets, sliced with the same indices as ``x_train``.
        model: Object exposing a Keras-style ``fit`` method.
        batch_size: Number of samples handed to each ``fit`` call.
        validation_data: Optional ``(x_val, y_val)`` tuple forwarded to
            ``fit``. When omitted, the full training set is used, so the
            reported scores are then training-set scores rather than a
            held-out estimate.

    Returns:
        A list with one entry per slice; each entry is the
        ``history.history['val_accuracy']`` list returned by that ``fit`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if validation_data is None:
        validation_data = (x_train, y_train)

    scores = []
    # Stepping by batch_size over the whole range covers the last slice too.
    for start in range(0, x_train.shape[0], batch_size):
        end = start + batch_size
        history = model.fit(x_train[start:end], y_train[start:end], validation_data=validation_data, epochs=1)
        scores.append(history.history['val_accuracy'])
    return scores

In [None]:
# Run the chunked training loop on the padded embeddings; keep the per-chunk scores.
scores = train(
    x_train=x_train_embeedding,
    y_train=y_train,
    model=model,
)