# RNN with pre-trained embeddings

* Make an RNN based model, but use word embeddings from word2vec or document tensor from Spacy (see hint below) as input (i.e. do not start with an embedding layer!)

## Spacy document tensor

Feeding a Spacy document tensor into an RNN can yield very nice results. Spacy is part of the conda environment, and should already be installed. Make sure that you download at least the medium size english model (taken from the quickstart from the Spacy docs)

~~~sh
python -m spacy download en_core_web_md
~~~

Example code for converting to tensors:
~~~py
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
tensorized = doc.tensor
~~~

In [1]:
import pandas as pd
import spacy
import numpy as np

import re

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Masking, Input, LSTM, Flatten, Embedding, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import losses

In [30]:
df = pd.read_csv('../data/IMDB Dataset.csv')

MAX_SEQ_LENGTH = 200
SAMPLES = 10000

def preprocess_imdb_raw_data(x):
    x = re.sub("<br\\s*/?>", " ", x)
    return x 

def reduce_sentence_length(x, max_seq_length=MAX_SEQ_LENGTH):
    return ' '.join(x.split(' ')[:max_seq_length])

X = [reduce_sentence_length(preprocess_imdb_raw_data(x)) for x in df['review'].values]

y = df['sentiment'].apply(lambda x: int(x == 'positive')).values

df.shape

(50000, 2)

In [31]:
nlp = spacy.load("en_core_web_sm")

In [32]:
X_tensors = [nlp(x).tensor for x in X[:SAMPLES]]

X_padded = pad_sequences(X_tensors, maxlen=MAX_SEQ_LENGTH)

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y[:SAMPLES], test_size=0.2, random_state=1)

In [34]:
from tensorflow.keras.layers import Masking, Input, LSTM, Flatten, Embedding, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import losses

doc_vec_size = 96

def make_model(input_size=doc_vec_size, 
               dense_layer_size=128, 
               dropout_probs=0.2):

    inp = Input(shape=[MAX_SEQ_LENGTH, input_size])
    
    x = Masking(mask_value=0)(inp)
        
    x = LSTM(dense_layer_size)(x)
    x = Dense(dense_layer_size, activation="relu")(x)
    
    x = Dropout(dropout_probs)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model(inp, out)
    print(model.summary())
    
    model.compile("adam", loss=losses.binary_crossentropy, metrics=['accuracy'])
    
    return model

model = make_model()


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 200, 96)]         0         
_________________________________________________________________
masking_1 (Masking)          (None, 200, 96)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               115200    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 131,841
Trainable params: 131,841
Non-trainable params: 0
_____________________________________________________

In [35]:
def save_model(model, filedir='../models'):
    
    with open(f"{filedir}/rnn_model_pretrained.json", "w") as json_file:
        json_file.write(model.to_json())

    model.save_weights(f"{filedir}/rnn_model_pretrained.h5")
    
def load_model(filedir='../models'):

    json_file = open(f"{filedir}/rnn_model_pretrained.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(f"{filedir}/rnn_model_pretrained.h5")
    
    return loaded_model

In [36]:
model.fit(X_train, y_train, epochs=5)

Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1dbe42898>

In [37]:
save_model(model)

# Evaluation

In [26]:
from sklearn.metrics import classification_report

y_test_probs = model.predict(x=X_test)
y_test_pred = (y_test_probs >= 0.5).astype(int)

print(f"Test: {classification_report(y_test, y_test_pred)}")

Test:               precision    recall  f1-score   support

           0       0.60      0.26      0.37       114
           1       0.44      0.77      0.56        86

    accuracy                           0.48       200
   macro avg       0.52      0.52      0.46       200
weighted avg       0.53      0.48      0.45       200



In [27]:
y_train_probs = model.predict(x=X_train)
y_train_pred = (y_train_probs >= 0.5).astype(int)

print(f"Train: {classification_report(y_train, y_train_pred)}")

Train:               precision    recall  f1-score   support

           0       0.88      0.64      0.74       385
           1       0.73      0.92      0.82       415

    accuracy                           0.78       800
   macro avg       0.81      0.78      0.78       800
weighted avg       0.80      0.78      0.78       800

