In [51]:
import pandas as pd 
import numpy as np
import re

In [52]:
df = pd.read_csv('../data/IMDB Dataset.csv')

def preprocess_imdb_raw_data(x):
    x = re.sub("<br\\s*/?>", " ", x)
    return x 

X = [preprocess_imdb_raw_data(x) for x in df['review'].values]

y = df['sentiment'].apply(lambda x: int(x == 'positive')).values

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Make an RNN based model and fit the word embeddings yourself

* https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

# Preprocessing

In [53]:
from tensorflow.keras.preprocessing.text import Tokenizer

MAX_SEQ_LENGTH = 200
MAX_WORDS_VOCAB = 10000

tokenizer = Tokenizer(num_words=MAX_WORDS_VOCAB, oov_token=0)
tokenizer.fit_on_texts(X)

X_tokenized = tokenizer.texts_to_sequences(X)

In [54]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_padded = pad_sequences(X_tokenized, maxlen=MAX_SEQ_LENGTH)

X_padded.shape

(50000, 200)

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=1)

# RNN Model

In [56]:
from tensorflow.keras.layers import Masking, Input, LSTM, Flatten, Embedding, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import losses

def make_model(input_size=MAX_SEQ_LENGTH, 
               embedding_dim=MAX_WORDS_VOCAB, 
               dense_layer_size=128, 
               dropout_probs=0.2):

    inp = Input(shape=[input_size])  

    x = Masking(mask_value=0)(inp)
    
    x = Embedding(input_dim=embedding_dim, output_dim=input_size)(x)
    
    x = LSTM(dense_layer_size)(x)
    x = Dense(dense_layer_size, activation="relu")(x)
    
    x = Dropout(dropout_probs)(x)
    out = Dense(1, activation="sigmoid")(x)

    model = Model(inp, out)
    print(model.summary())
    
    return model

model = make_model()
model.compile("adam", loss=losses.binary_crossentropy, metrics=['accuracy'])

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
masking_3 (Masking)          (None, 200)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 200, 200)          2000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               168448    
_________________________________________________________________
dense_6 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129 

In [57]:
model.fit(x=X_train, y=y_train, validation_split=0.1)

Train on 36000 samples, validate on 4000 samples


<tensorflow.python.keras.callbacks.History at 0x14d362160>

In [63]:
def save_model(model, filedir='../models'):
    
    with open(f"{filedir}/rnn_model.json", "w") as json_file:
        json_file.write(model.to_json())

    model.save_weights(f"{filedir}/rnn_model.h5")
    
def load_model(filedir='../models'):

    json_file = open(f"{filedir}/rnn_model.json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(f"{filedir}/rnn_model.h5")
    
    return loaded_model

save_model(model)

# Evaluation

In [58]:
from sklearn.metrics import classification_report

y_train_probs = model.predict(x=X_train)
y_train_pred = (y_train_probs >= 0.5).astype(int)

print(f"Train: {classification_report(y_train, y_train_pred)}")

Train:               precision    recall  f1-score   support

           0       0.93      0.89      0.91     19956
           1       0.89      0.93      0.91     20044

    accuracy                           0.91     40000
   macro avg       0.91      0.91      0.91     40000
weighted avg       0.91      0.91      0.91     40000



In [59]:
y_test_probs = model.predict(x=X_test)
y_test_pred = (y_test_probs >= 0.5).astype(int)

print(f"Test: {classification_report(y_test, y_test_pred)}")

Test:               precision    recall  f1-score   support

           0       0.89      0.86      0.88      5044
           1       0.86      0.90      0.88      4956

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



## Keras evaluation

In [61]:
scores = model.evaluate(X_train, y_train, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

accuracy: 90.76%
