##Rotten Tomatoes review classification using Keras without pre-trained embedding layer

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
!pip install wordvecpy
from wordvecpy import TextProcessor
import timeit
import spacy
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting wordvecpy
  Downloading https://files.pythonhosted.org/packages/84/c9/5b45b3206183cf59c2045a8d50e7b33e9aa552615dd6107f5a0a1827d8cb/wordvecpy-0.5.tar.gz
Building wheels for collected packages: wordvecpy
  Building wheel for wordvecpy (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/0a/37/8d/929b022daf780d0597ee8aa6eac33e9b69cd4b09215d1944a1
Successfully built wordvecpy
Installing collected packages: wordvecpy
Successfully installed wordvecpy-0.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


Now we load our dataset into a dataframe and preprocess the review text

In [0]:
df = pd.read_csv('/content/drive/My Drive/rotten_tomatoes_reviews.csv')

processor = TextProcessor(df['Review'], lemmatizer='spaCy en')
df['Review'] = processor.transform(combined_strings=True)

Now to create integer sequences for all of the reviews in the dataframe

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df.drop(['Freshness'], axis = 1), df["Freshness"], test_size = 0.2, random_state = 0)

tokenizer = Tokenizer()

tokenizer.fit_on_texts(x_train['Review'])

max_length = max([len(s.split()) for s in x_train['Review']])

vocab_size = len(tokenizer.word_index) + 1

x_train_tokens = tokenizer.texts_to_sequences(x_train['Review'])
x_test_tokens = tokenizer.texts_to_sequences(x_test['Review'])

x_train_pad = pad_sequences(x_train_tokens, maxlen = max_length, padding = 'post')
x_test_pad = pad_sequences(x_test_tokens, maxlen = max_length, padding = 'post')

Using TensorFlow backend.


The first model we'll make will be a simple CNN classifier



In [5]:
from keras.models import Input, Model
from keras.layers import Conv1D, AveragePooling1D, Dropout, SpatialDropout1D
from keras.layers import BatchNormalization, MaxPooling1D, Dense, Flatten
from keras.layers import Embedding

model_input = Input(shape = (max_length,))
layers_cnn = Embedding(vocab_size, 300)(model_input)
layers_cnn = Conv1D(filters = 128, kernel_size = 7, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 5, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = SpatialDropout1D(.35)(layers_cnn)
layers_cnn = AveragePooling1D(pool_size = 5, strides = 1)(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 2, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 1, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = SpatialDropout1D(.35)(layers_cnn)
layers_cnn = AveragePooling1D(pool_size = 3, strides = 1)(layers_cnn)
layers_cnn = Flatten()(layers_cnn)
cnn_model_output = Dense(1, activation = 'sigmoid')(layers_cnn)

cnn_model = Model(inputs = model_input, outputs = cnn_model_output)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
cnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [7]:
t0=timeit.default_timer()
cnn_model.fit(x_train_pad, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 1850.952



In [8]:
score = cnn_model.evaluate(x_test_pad, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.9753
accuracy is: 0.87465625


Not bad.  Now to try a simple RNN.

In [0]:
from keras.layers import Bidirectional, CuDNNLSTM

layers_rnn = Embedding(vocab_size, 300)(model_input)
layers_rnn = Bidirectional(CuDNNLSTM(128))(layers_rnn)
layers_rnn = BatchNormalization()(layers_rnn)
layers_rnn = Dropout(.35)(layers_rnn)
rnn_model_output = Dense(1, activation = 'sigmoid')(layers_rnn)

rnn_model = Model(inputs = model_input, outputs = rnn_model_output)

In [0]:
rnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [11]:
t0=timeit.default_timer()
rnn_model.fit(x_train_pad, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 1674.291



In [12]:
score = cnn_model.evaluate(x_test_pad, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.9753
accuracy is: 0.87465625


**Eh**.  Not bad either, but not even as good as multinomial naive-bayes or logistic regression.  That's a low bar to hold an LSTM or CNN to.  Let's combine them in parallel and see if we can make it a little better.

In [0]:
from keras.layers import concatenate
comb_para_model = concatenate([layers_cnn, layers_rnn])
comb_model_layer = Dense(256, activation = 'selu')(comb_para_model)
comb_model_output = Dense(1, activation = 'sigmoid')(comb_model_layer)

comb_model = Model(inputs = model_input, outputs = comb_model_output)

In [0]:
comb_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [15]:
t0=timeit.default_timer()
comb_model.fit(x_train_pad, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 3386.357



In [16]:
score = comb_model.evaluate(x_test_pad, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 1.0076
accuracy is: 0.8743229166666666


Ok, so no improvement.  Maybe with a few extra epochs we could break 90% but the model is already pretty overfit.  Let's try combining CNN and RNN layers together

In [0]:
layers_crnn = Embedding(vocab_size, 300)(model_input)
layers_crnn = Bidirectional(CuDNNLSTM(128, return_sequences=True))(layers_crnn)
layers_crnn = SpatialDropout1D(.2)(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 7, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 5, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 2, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = CuDNNLSTM(128)(layers_crnn)
layers_crnn = Dropout(.2)(layers_crnn)
layers_crnn = Dense(128, activation = 'relu')(layers_crnn)
crnn_model_output = Dense(1, activation = 'sigmoid')(layers_crnn)

crnn_model = Model(inputs = model_input, outputs = crnn_model_output)

In [0]:
crnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [19]:
t0=timeit.default_timer()
crnn_model.fit(x_train_pad, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2420.76



In [20]:
score = crnn_model.evaluate(x_test_pad, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.8362
accuracy is: 0.8773020833333334


Wow, that's even worse.

In [0]:
layers_mini = Embedding(vocab_size, 300)(model_input)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = True))(layers_mini)
layers_mini = Conv1D(filters = 64, kernel_size = 7, strides = 1, activation = 'relu')(layers_mini)
layers_mini = MaxPooling1D(3, strides=1)(layers_mini)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = True))(layers_mini)
layers_mini = Conv1D(filters = 64, kernel_size = 5, strides = 1, activation = 'relu')(layers_mini)
layers_mini = MaxPooling1D(3, strides=1)(layers_mini)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = False))(layers_mini)
layers_mini = BatchNormalization()(layers_mini)
layers_mini = Dropout(.25)(layers_mini)
layers_mini = Dense(256, activation = 'relu')(layers_mini)
layers_mini = BatchNormalization()(layers_mini)
layers_mini = Dropout(.25)(layers_mini)
layers_mini = Dense(256, activation = 'relu')(layers_mini)
mini_model_output = Dense(1, activation = 'sigmoid')(layers_mini)

mini_model = Model(inputs = model_input, outputs = mini_model_output)

mini_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [22]:
t0=timeit.default_timer()
mini_model.fit(x_train_pad, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2475.07



In [23]:
score = mini_model.evaluate(x_test_pad, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.9492
accuracy is: 0.8753541666666667
