##Rotten Tomatoes review classification using Keras without pre-trained embedding layer

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
!pip install wordvecpy
from wordvecpy import TextProcessor, FastVectokenizer
!pip install pymagnitude
import pymagnitude
import timeit

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting wordvecpy
  Downloading https://files.pythonhosted.org/packages/84/c9/5b45b3206183cf59c2045a8d50e7b33e9aa552615dd6107f5a0a1827d8cb/wordvecpy-0.5.tar.gz
Building wheels for collected packages: wordvecpy
  Building wheel for wordvecpy (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/0a/37/8d/929b022daf780d0597ee8aa6eac33e9b69cd4b09215d1944a1
Successfully built wordvecpy
Installing collected packages: wordvecpy
Successfully installed wordvecpy-0.5
Collecting pymagnitude
[?25l  Downloading https://files.pythonhosted.org/packages/0a/a3/b9a34d22ed8c0ed59b00ff55092129641cdfa09d82f9abdc5088051a5b0c/pymagnitude-0.1.120.tar.gz (5.4MB)
[K     |████████████████████████████████| 5.4MB 40.8MB/s

In [7]:
import spacy
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


Upload our Rotten Tomatoes reviews and the 200-dimensional twitter pre-trained word embedding .magnitude file for use with pymagnitude.

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Now we load our dataset into a dataframe and preprocess the review text

In [0]:
df = pd.read_csv("/content/drive/My Drive/rotten_tomatoes_reviews.csv")
vectors = pymagnitude.Magnitude("/content/drive/My Drive/glove.6B.200d.magnitude")

In [0]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df.Review, df.Freshness, test_size = 0.2, random_state = 0)

x_train_processed = TextProcessor(x_train, lemmatizer='spaCy en')
x_test_processed = TextProcessor(x_test, lemmatizer='spaCy en')

Now we create generate the integer embeddings and vector dictionary to use as input to, and the weights of, our Keras embedding layer

In [11]:
vectokenizer = FastVectokenizer(x_train_processed.transform(), vectors, x_test_processed.transform())
x_train, x_test, vector_dict = vectokenizer.to_keras()
max_sentence_length = vectokenizer.max_sentence_length
vocab_size = vectokenizer.max_words

Using TensorFlow backend.


The first model we'll make will be a simple CNN classifier



In [12]:
from keras.models import Input, Model
from keras.layers import Conv1D, AveragePooling1D, Dropout, SpatialDropout1D
from keras.layers import BatchNormalization, MaxPooling1D, Dense, Flatten
from keras.layers import Embedding
from keras.initializers import Constant

model_input = Input(shape = (max_sentence_length,))
layers_cnn = Embedding(vocab_size, 200, embeddings_initializer=Constant(vector_dict), 
                       input_length=max_sentence_length, trainable=False)(model_input)
layers_cnn = Conv1D(filters = 128, kernel_size = 7, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 5, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 2, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Conv1D(filters = 128, kernel_size = 1, strides = 1, activation = 'relu')(layers_cnn)
layers_cnn = BatchNormalization()(layers_cnn)
layers_cnn = Flatten()(layers_cnn)
cnn_model_output = Dense(1, activation = 'sigmoid')(layers_cnn)

cnn_model = Model(inputs = model_input, outputs = cnn_model_output)


Instructions for updating:
Colocations handled automatically by placer.


In [0]:
cnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [14]:
t0=timeit.default_timer()
cnn_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Instructions for updating:
Use tf.cast instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 906.007



In [15]:
score = cnn_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.7529
accuracy is: 0.84875


Not bad.  Now to try a simple RNN.

In [16]:
from keras.layers import Bidirectional, CuDNNLSTM

layers_rnn = Embedding(vocab_size, 300)(model_input)
layers_rnn = Bidirectional(CuDNNLSTM(128))(layers_rnn)
layers_rnn = BatchNormalization()(layers_rnn)
layers_rnn = Dropout(.35)(layers_rnn)
rnn_model_output = Dense(1, activation = 'sigmoid')(layers_rnn)

rnn_model = Model(inputs = model_input, outputs = rnn_model_output)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
rnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [18]:
t0=timeit.default_timer()
rnn_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 1676.249



In [19]:
score = cnn_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.7529
accuracy is: 0.84875


**Eh**.  Not good.

In [0]:
from keras.layers import concatenate
comb_para_model = concatenate([layers_cnn, layers_rnn])
comb_model_layer = Dense(256, activation = 'selu')(comb_para_model)
comb_model_output = Dense(1, activation = 'sigmoid')(comb_model_layer)

comb_model = Model(inputs = model_input, outputs = comb_model_output)

In [0]:
comb_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [22]:
t0=timeit.default_timer()
comb_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2356.568



In [23]:
score = comb_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.84
accuracy is: 0.8801041666666667


Well, 88% is the best we've gotten out of the NNs so far.  Maybe it's time to train it for another 10 epochs and see if it improves

In [24]:
t0=timeit.default_timer()
comb_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

score = comb_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2331.475


loss is: 0.8939
accuracy is: 0.880625


About the same as before.  Let's try one more


In [0]:
layers_crnn = Embedding(vocab_size, 300)(model_input)
layers_crnn = Bidirectional(CuDNNLSTM(128, return_sequences=True))(layers_crnn)
layers_crnn = SpatialDropout1D(.2)(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 7, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 5, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 3, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = Conv1D(filters = 128, kernel_size = 2, strides = 1, activation = 'relu')(layers_crnn)
layers_crnn = BatchNormalization()(layers_crnn)
layers_crnn = CuDNNLSTM(128)(layers_crnn)
layers_crnn = Dropout(.2)(layers_crnn)
layers_crnn = Dense(128, activation = 'relu')(layers_crnn)
crnn_model_output = Dense(1, activation = 'sigmoid')(layers_crnn)

crnn_model = Model(inputs = model_input, outputs = crnn_model_output)

In [0]:
crnn_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [27]:
t0=timeit.default_timer()
crnn_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2346.747



In [28]:
score = crnn_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 0.7913
accuracy is: 0.8748125


So, still not coming close to 90%

In [0]:
layers_mini = Embedding(vocab_size, 300)(model_input)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = True))(layers_mini)
layers_mini = Conv1D(filters = 64, kernel_size = 7, strides = 1, activation = 'relu')(layers_mini)
layers_mini = MaxPooling1D(3, strides=1)(layers_mini)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = True))(layers_mini)
layers_mini = Conv1D(filters = 64, kernel_size = 5, strides = 1, activation = 'relu')(layers_mini)
layers_mini = MaxPooling1D(3, strides=1)(layers_mini)
layers_mini = Bidirectional(CuDNNLSTM(64, return_sequences = False))(layers_mini)
layers_mini = BatchNormalization()(layers_mini)
layers_mini = Dropout(.25)(layers_mini)
layers_mini = Dense(256, activation = 'relu')(layers_mini)
layers_mini = BatchNormalization()(layers_mini)
layers_mini = Dropout(.25)(layers_mini)
layers_mini = Dense(256, activation = 'relu')(layers_mini)
mini_model_output = Dense(1, activation = 'sigmoid')(layers_mini)

mini_model = Model(inputs = model_input, outputs = mini_model_output)

mini_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [30]:
t0=timeit.default_timer()
mini_model.fit(x_train, y_train, epochs = 20, batch_size = 128)
t1=timeit.default_timer()
print('\nTime to train: {}\n'.format(round(t1-t0, 3)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Time to train: 2324.79



In [31]:
score = mini_model.evaluate(x_test, y_test)

print('\nloss is: ' + str(score[0].round(4)))
print('accuracy is: ' + str(score[1]))


loss is: 1.0435
accuracy is: 0.8735


Total failure.  Worse than before.