<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [81]:
import os
import csv

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.layers import BatchNormalization, Flatten, Conv1D, MaxPooling1D
from keras.layers import Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from tqdm import tqdm
from keras_tqdm import TQDMNotebookCallback
# Hook tqdm into pandas; every bar created through that integration is labelled "progress-bar".
tqdm.pandas(desc="progress-bar")

# Don't Show Warning Messages
# NOTE: the 'ignore' action applies to every warning category, so deprecation
# notices from the libraries imported above are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [69]:
# Fixed seed: feeds NumPy's global RNG here and is reused as `random_state`
# for the train/eval split further down.
seed = 10
np.random.seed(seed)

# Locations of the three tab-separated review files.
dataPath = os.path.join('.', 'datasets', 'imdb_movie_reviews')
labeledTrainData = os.path.join(dataPath, 'labeledTrainData.tsv')
unlabeledTrainData = os.path.join(dataPath, 'unlabeledTrainData.tsv')
testData = os.path.join(dataPath, 'testData.tsv')

# csv.QUOTE_NONE (== 3): quote characters are kept as literal text instead of
# being interpreted as field delimiters.
labDat = pd.read_csv(labeledTrainData, sep='\t', header=0, quoting=csv.QUOTE_NONE)
unlabDat = pd.read_csv(unlabeledTrainData, sep='\t', header=0, quoting=csv.QUOTE_NONE)
testDat = pd.read_csv(testData, sep='\t', header=0, quoting=csv.QUOTE_NONE)

# Sanity check (shapes as read from disk, before the placeholder column is added)
print('labDat.shape :', labDat.shape)
print('unlabDat.shape :', unlabDat.shape)
print('testDat.shape :', testDat.shape)

# Give the unlabeled and test frames an empty `sentiment` column so all three
# frames expose the same set of columns.
unlabDat['sentiment'] = None
testDat['sentiment'] = None

# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own line; wrapping it in print() would append a stray "None".
for frameName, frame in (('labDat', labDat), ('unlabDat', unlabDat), ('testDat', testDat)):
    print("\n")
    print(frameName + '.info() :')
    frame.info()

labDat.shape : (25000, 3)
unlabDat.shape : (50000, 2)
testDat.shape : (25000, 2)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB
labDat.info() : None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
id           50000 non-null object
review       50000 non-null object
sentiment    0 non-null object
dtypes: object(3)
memory usage: 1.1+ MB
unlabDat.info() : None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
review       25000 non-null object
sentiment    0 non-null object
dtypes: object(3)
memory usage: 586.0+ KB
testDat.info() : None


In [70]:
# Stack labeled, unlabeled and test reviews (in exactly this order) into one
# frame; the padded matrix built from it is later split back by row position.
# The third frame is `testDat` -- no `trainDat` is defined anywhere in this notebook.
combinedDat = pd.concat(objs=[labDat, unlabDat, testDat], axis=0).reset_index(drop=True)

# Guard against silently dropping or duplicating rows during the concat.
assert len(combinedDat) == len(labDat) + len(unlabDat) + len(testDat)

print(combinedDat.shape)
print("\n")
# info() prints its own report and returns None, so it is not wrapped in print().
combinedDat.info()

(100000, 3)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
id           100000 non-null object
review       100000 non-null object
sentiment    25000 non-null object
dtypes: object(3)
memory usage: 2.3+ MB
None


In [71]:
# Every review sequence is padded (at the end) to this many token ids.
maxSeqLength = 500

# Reviews from all three datasets, forced to plain strings.
allReviews = combinedDat['review'].astype(str)

# Build the vocabulary over the whole corpus; the +1 leaves room for
# index 0, which no word in `word_index` is mapped to.
t = Tokenizer()
t.fit_on_texts(allReviews)
vocabSize = 1 + len(t.word_index)

# Words -> integer ids, then fix every sequence to `maxSeqLength` entries.
seqs = t.texts_to_sequences(allReviews)
allReviewsPadded = pad_sequences(
    seqs,
    maxlen=maxSeqLength,
    padding='post',
)

In [72]:
# Split the padded matrix back into the three datasets. The boundaries follow
# the concatenation order (labeled, unlabeled, test) and are derived from the
# source frames instead of hard-coded row counts.
nLabeled = len(labDat)
nUnlabeled = len(unlabDat)

labDatPad = allReviewsPadded[:nLabeled]
unlabDatPad = allReviewsPadded[nLabeled:nLabeled + nUnlabeled]
testDatPad = allReviewsPadded[nLabeled + nUnlabeled:]

# The remainder must line up one-to-one with the test frame, otherwise
# predictions would later be paired with the wrong ids.
assert len(testDatPad) == len(testDat)

print(len(labDatPad))
print(len(unlabDatPad))
print(len(testDatPad))

25000
50000
25000


In [38]:
# Load the GloVe embedding: maps each word to its float32 coefficient vector.
embeddingsIndex = {}

glovePath = os.path.join('.', 'datasets', 'glove.6B')
gloveData = os.path.join(glovePath, 'glove.6B.300d.txt')

# The context manager closes the file even if a line fails to parse.
with open(gloveData, encoding="utf8") as f:
    for line in tqdm(f):
        # Each line is split on whitespace: first field is the word, the rest its coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddingsIndex[word] = coefs

print('Loaded %s word vectors.' % len(embeddingsIndex))

400000it [00:41, 9580.24it/s] 


Loaded 400000 word vectors.


In [39]:
# Build the embedding weight matrix for the tokenizer's vocabulary only.
# Row r holds the GloVe vector of the word whose tokenizer index is r;
# words without a GloVe entry keep their all-zero row.
embeddingMatrix = np.zeros((vocabSize, 300))

for token, row in tqdm(t.word_index.items()):
    vector = embeddingsIndex.get(token)
    if vector is None:
        continue
    embeddingMatrix[row] = vector

print("len(embeddingMatrix):", len(embeddingMatrix))

100%|██████████| 177726/177726 [00:00<00:00, 493688.05it/s]


len(embeddingMatrix): 177727


In [53]:
# Features: padded token ids of the labeled reviews; target: their sentiment.
X = labDatPad
y = labDat['sentiment']

print(X.shape)
print(y.shape)

# Hold out 25% of the labeled data for evaluation, stratified on the label
# and shuffled with the notebook-wide seed.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=seed,
    stratify=y,
)

(25000, 500)
(25000,)


In [57]:
# Model v1: frozen GloVe embedding -> one Conv1D/MaxPooling1D stage -> small dense head.
e = Embedding(
    vocabSize,
    300,
    weights=[embeddingMatrix],
    input_length=maxSeqLength,
    trainable=False,  # embedding weights are not updated during training
)

model = Sequential([
    e,
    Conv1D(filters=100, kernel_size=6, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

Adam_opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=Adam_opt, loss='binary_crossentropy', metrics=['acc'])

# Stop once val_loss has not improved for 5 epochs; only the best epoch
# (lowest val_loss) is written to the checkpoint file.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
save_best = ModelCheckpoint('NLP-CNN-1.hdf', save_best_only=True, monitor='val_loss', mode='min')

# verbose=0 because progress is reported through the tqdm notebook callback.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_eval, y_eval),
    epochs=100,
    verbose=0,
    callbacks=[
        early_stopping,
        save_best,
        TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ],
)

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 6', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 7', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 8', max=18750, style=ProgressStyle(description_width='i…

In [75]:
# Restore the checkpointed weights before scoring the test reviews.
model.load_weights('NLP-CNN-1.hdf')

# Keep column 0 of the prediction array as a flat vector of scores.
predictions = model.predict(testDatPad)
yHat = predictions[:, 0]

In [76]:
# Round the scores to hard labels. The builtin `int` is used as the dtype
# because the `np.int` alias was deprecated in NumPy 1.20 and removed in
# 1.24, where it raises AttributeError.
yHat = np.round(yHat).astype(int)
print("** First 10 predictions:")
print(yHat[:10])

** First 10 predictions:
[1 0 1 1 1 1 0 0 0 1]


In [82]:
# Pair each test review id with its predicted label, indexed by id.
submissionColumns = {'id': testDat['id'], 'sentiment': yHat}
df_results = pd.DataFrame(submissionColumns).set_index('id')
print(df_results.head())

# create a submission csv file
# QUOTE_NONE: the writer adds no quoting of its own (the ids were read with
# quoting disabled, so they already carry their literal quote characters).
df_results.to_csv('kaggle_submission.csv', quoting=csv.QUOTE_NONE)
# Keras cnn + GloVe + Early Stopping v1.0


            sentiment
id                   
"12311_10"          1
"8348_2"            0
"5828_4"            1
"7186_2"            1
"12128_7"           1


KS 0.83356

---------------------------------------------------------------------

In [83]:
# Model v1.1: same architecture as v1, trained with Adam at lr=0.001 and
# checkpointed to its own file.
e = Embedding(
    vocabSize,
    300,
    weights=[embeddingMatrix],
    input_length=maxSeqLength,
    trainable=False,  # embedding weights are not updated during training
)

model = Sequential()
for layer in (
    e,
    Conv1D(filters=100, kernel_size=6, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

Adam_opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=Adam_opt, loss='binary_crossentropy', metrics=['acc'])

# Early stopping after 5 epochs without val_loss improvement; best epoch only is saved.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
save_best = ModelCheckpoint('NLP-CNN-1.1.hdf', save_best_only=True, monitor='val_loss', mode='min')

progressCallback = TQDMNotebookCallback(leave_inner=True, leave_outer=True)
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_eval, y_eval),
    epochs=100,
    verbose=0,
    callbacks=[early_stopping, save_best, progressCallback],
)

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=18750, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=18750, style=ProgressStyle(description_width='i…

KeyboardInterrupt: 

---------------------------------------------------------------------

In [None]:
# Model v1.2: two Conv1D/MaxPooling1D stages (dropout after the first) on top
# of the frozen GloVe embedding, followed by the same small dense head.
e = Embedding(
    vocabSize,
    300,
    weights=[embeddingMatrix],
    input_length=maxSeqLength,
    trainable=False,  # embedding weights are not updated during training
)

model = Sequential([
    e,
    # stage 1
    Conv1D(filters=100, kernel_size=6, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
    # stage 2
    Conv1D(filters=80, kernel_size=6, activation='relu'),
    MaxPooling1D(pool_size=2),
    # classifier head
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

Adam_opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=Adam_opt, loss='binary_crossentropy', metrics=['acc'])

# Early stopping after 5 epochs without val_loss improvement; best epoch only is saved.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
save_best = ModelCheckpoint('NLP-CNN-1.2.hdf', save_best_only=True, monitor='val_loss', mode='min')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_eval, y_eval),
    epochs=100,
    verbose=0,
    callbacks=[
        early_stopping,
        save_best,
        TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ],
)

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=18750, style=ProgressStyle(description_width='i…

---------------------------------------------------------------------

In [None]:
# Three-stage Conv1D model on top of the frozen GloVe embedding, with dropout
# after every convolution stage and a single sigmoid output unit.
model = Sequential()
e = Embedding(vocabSize, 300, weights=[embeddingMatrix], input_length = maxSeqLength, trainable = False)
model.add(e)
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Dropout(0.2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Dropout(0.2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
Adam_opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=Adam_opt, loss='binary_crossentropy', metrics=['acc'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
# Use a checkpoint file of its own: 'NLP-CNN-1.hdf' already belongs to the
# first model and is what the prediction cell loads, so writing there would
# replace that checkpoint with weights from a different architecture.
save_best = ModelCheckpoint('NLP-CNN-1.3.hdf', save_best_only=True, monitor='val_loss', mode='min')

history = model.fit(X_train, y_train, validation_data=(X_eval, y_eval), epochs=100, verbose=0, callbacks=[early_stopping,save_best,TQDMNotebookCallback(leave_inner = True, leave_outer = True)])

In [74]:
# Preview the first five test rows (the `sentiment` column is the empty placeholder).
testDat.head(n=5)

Unnamed: 0,id,review,sentiment
0,"""12311_10""","""Naturally in a film who's main themes are of ...",
1,"""8348_2""","""This movie is a disaster within a disaster fi...",
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...",
3,"""7186_2""","""Afraid of the Dark left me with the impressio...",
4,"""12128_7""","""A very accurate depiction of small time mob l...",
