In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /drive; the dataset path used below lives under this mount point.
# This only works inside a Google Colab runtime.
drive.mount('/drive')

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dropout
#from tensorflow.keras.layers import Dense
#from tensorflow.keras.layers import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# The dataset is read with lines=True, i.e. one JSON record per line of the file.
reviews_path = '/drive/MyDrive/Colab Notebooks/CS3244 Machine Learning/CS3244 Group Project/Original_data/IMDB_reviews.json'
df = pd.read_json(reviews_path, lines=True)

In [None]:
# Preview the first five reviews to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


# Before Pre-trained Word Embeddings (hashed word indices, embedding learned from scratch)

In [None]:
# Binary target: 1 when the review is flagged as a spoiler, 0 for anything else.
df['spoiler'] = df['is_spoiler'].apply(lambda flag: int(flag is True))
docs, labels = df['review_text'], df['spoiler']

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    docs, labels, test_size=0.2, random_state=0
)
print("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))

Number of training instances:  459130 
Number of test instances:  114783


In [None]:
# Convert every review into a list of integer word indices.
# NOTE(review): `one_hot` is a hashing trick rather than a fitted vocabulary, so
# different words may share an index. Keras documents its default hash function as
# not stable across interpreter runs, so the indices are presumably only consistent
# within one kernel session - confirm before persisting a model trained on them.
vocab_size = 5000

# Punctuation removed before hashing. The backslash is spelled `\\` so the literal
# contains a valid escape sequence (a bare `\]` triggers an invalid-escape warning
# on current Python versions); the resulting string value is unchanged.
punctuation_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'


def _hash_encode(texts):
    """Return one list of hashed word indices per text in `texts`."""
    return [
        one_hot(text, vocab_size, filters=punctuation_filters, lower=True, split=' ')
        for text in texts
    ]


X_train = _hash_encode(X_train)
X_test = _hash_encode(X_test)

In [None]:
# Reviews differ in length, but the network needs fixed-width input: sequences
# shorter than `max_length` are zero-padded at the end, longer ones are cut down.
# NOTE(review): no `truncating` argument is given, so the Keras default applies
# (documented as 'pre', i.e. long reviews lose their beginning); the GloVe section
# below uses 'post' - confirm which one is intended.

max_length = 100 # adjust this until optimal
X_train, X_test = (
    pad_sequences(encoded, maxlen=max_length, padding='post')
    for encoded in (X_train, X_test)
)

In [None]:
# Architecture, in order:
#   Embedding          - maps each word index to a trainable 8-dimensional vector
#   Conv1D             - 128 filters, kernel width 5, ReLU activation
#   GlobalMaxPooling1D - keeps the maximum of each filter's output over the sequence
#   Dense              - 10-unit fully connected layer, ReLU activation
#   Dropout            - rate 0.5
#   Dense              - single sigmoid unit, because the target is binary
model = Sequential([
    Embedding(vocab_size, 8, input_length=max_length),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss minimised with Adam; accuracy is tracked under the name 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Stop training once the validation loss stops decreasing (no `patience` is set, so
# the Keras default applies) and roll the model back to the weights of the best
# epoch. Without `restore_best_weights` the model would keep the weights from the
# last epoch, i.e. the one whose validation loss had already got worse.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   restore_best_weights=True)

In [None]:
# Hold out the last 10% of the training data for validation / early stopping.
# The test set is deliberately NOT used as `validation_data`: letting it decide
# when training stops leaks test information into model selection and makes the
# test accuracy reported afterwards optimistic.
# Labels are converted to a NumPy array so features and labels are sliced alike.
history = model.fit(X_train, np.asarray(y_train), epochs=20,
                    validation_split=0.1, callbacks=[es])

Epoch 1/20


In [None]:
# Score the trained network on the held-out test split and report accuracy in percent.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print('Testing Accuracy is {} '.format(100 * accuracy))

Testing Accuracy is 73.3845591545105 


# With Pre-trained Word Embeddings (GloVe)

In [None]:
from sklearn.model_selection import train_test_split

# Binary target: 1 when the review is flagged as a spoiler, 0 otherwise.
df['spoiler'] = df['is_spoiler'].apply(lambda x: 1 if x is True else 0)
X = df['review_text']
y = df['spoiler']

# Fixed seed: makes the split reproducible and identical to the one used for the
# first model in this notebook (same test_size and random_state), so both test
# accuracies are measured on the same reviews.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
print ("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))

Number of training instances:  459130 
Number of test instances:  114783


In [None]:
# Word-level tokenizer that turns text into integer word ids.
#   vocab_size - passed as `num_words`, the cap on how many words are kept
#   oov_token  - placeholder used for words that are not in the vocabulary

vocab_size = 5000
oov_token = "<OOV>"
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)

# The word index is built from the training reviews only.
tokenizer.fit_on_texts(X_train)

In [None]:
# Replace every review by its sequence of integer word ids (train first, then test).
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Force every sequence to exactly `max_length` ids:
#   padding_type   = "post" -> shorter reviews get zeros appended at the end
#   trunction_type = "post" -> longer reviews are cut off at the end

max_length = 100
padding_type = "post"
trunction_type = "post"
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length,
                  padding=padding_type, truncating=trunction_type)
    for sequences in (X_train_sequences, X_test_sequences)
)

In [None]:
#Download glove
!wget --no-check-certificate \
http://nlp.stanford.edu/data/glove.6B.zip \
-O /tmp/glove.6B.zip

--2021-10-29 01:57:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-10-29 01:57:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-10-29 01:57:22--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/tmp/glove.6B.zip’


2

In [None]:
#### Change this directory
import os
import zipfile

# Unpack the whole downloaded archive into /tmp/glove.
with zipfile.ZipFile('/tmp/glove.6B.zip', mode='r') as glove_archive:
    glove_archive.extractall(path='/tmp/glove')

In [None]:
#### Change this directory
import numpy as np

# Parse the GloVe text file into a {word: vector} dictionary. Each line is split on
# whitespace: the first field is the word, the remaining fields are its coefficients.
embeddings_index = {}

# `with` closes the file even if parsing raises. The encoding is given explicitly
# because GloVe is distributed as UTF-8 text and the platform default encoding is
# not guaranteed to be UTF-8.
with open('/tmp/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Build the weight matrix for the Embedding layer: row i holds the GloVe vector of
# the word whose tokenizer index is i. One extra row is allocated (index 0), which
# stays all-zero. Words without a GloVe vector also keep an all-zero row.

# The number of columns must be the size of a GloVe vector, not the padded sequence
# length: the two only coincide here because both happen to be 100.
embedding_dim = len(next(iter(embeddings_index.values())))
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding layer initialised with the pre-trained matrix built above.
#   input_dim    - number of rows of the matrix (vocabulary size incl. index 0)
#   output_dim   - number of columns of the matrix, i.e. the embedding dimension.
#                  Taken from the matrix itself rather than from `max_length`, so
#                  changing the sequence length cannot break the weight shape.
#   input_length - length of the padded input sequences
#   trainable    - False keeps the pre-trained weights frozen during training


embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [None]:
# CNN classifier on top of the frozen pre-trained embedding layer.
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# loss      - the quantity minimised during training (binary cross-entropy)
# optimizer - the algorithm that updates the weights (Adam)
# metrics   - extra values reported during training and evaluation (accuracy)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with the early-stopping callback `es`, which ends training once the
# validation loss stops improving.
# Validation uses the last 10% of the training data. The test set is deliberately
# NOT used as `validation_data`: letting it decide when training stops leaks test
# information and makes the test accuracy reported afterwards optimistic.
# Labels are converted to a NumPy array so features and labels are sliced alike.
# Takes time to run !
history = model.fit(X_train_padded, np.asarray(y_train), epochs=20,
                    validation_split=0.1, callbacks=[es])

Epoch 1/20


In [None]:
# Score the GloVe-based network on the held-out test split and report accuracy in percent.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test)
print('Testing Accuracy is {} '.format(100 * accuracy))

Testing Accuracy is 75.24807453155518 


# Optimising Hyperparameters

In [None]:
def model_to_optimize(num_filters, kernel_size):
    """Build and compile the CNN for one hyperparameter combination.

    Args:
        num_filters: number of filters of the Conv1D layer.
        kernel_size: kernel width of the Conv1D layer.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
        accuracy metric) whose first layer is the module-level `embedding_layer`.
    """
    cnn = Sequential()
    cnn.add(embedding_layer)
    cnn.add(Conv1D(num_filters, kernel_size, activation='relu'))
    cnn.add(GlobalMaxPooling1D())
    cnn.add(Dense(10, activation='relu'))
    cnn.add(Dropout(0.5))
    cnn.add(Dense(1, activation='sigmoid'))
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn

In [None]:
# Change these
# Hyperparameter grid: the keys match the argument names of `model_to_optimize`.
params = dict(
    num_filters=[32, 64, 128],
    kernel_size=[3, 5, 7],
)

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Wrap the model factory as a scikit-learn estimator so it can be grid-searched.
# Every fit runs 20 epochs with a batch size of 10; training output is silenced.
model = KerasClassifier(
    build_fn=model_to_optimize,
    batch_size=10,
    epochs=20,
    verbose=False,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params` with 5-fold cross-validation on the training set.
# change cv = 10 for 10-fold val.
search = GridSearchCV(model, params, cv=5, verbose=1)
search_result = search.fit(X_train_padded, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Score the fitted grid search on the held-out test split.
test_accuracy = search.score(X=X_test_padded, y=y_test)

In [None]:
# Best hyperparameter combination found by the grid search
# (displayed as the cell output).
search.best_params_

In [None]:
# Cross-validation results of the grid search, shown as a table.
pd.DataFrame(data=search.cv_results_)