In [0]:
#importing libraries

import numpy as np #for mathematical calculations if any needed 
import pandas as pd #for playing with dataframe
import tensorflow as tf 
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer #For tokenizing the words
from keras.preprocessing.sequence import pad_sequences #for padding the words of same length
from sklearn.model_selection import train_test_split
import csv, os
import re #for manipulating with regex
import nltk #importing stopwords to be removed from the dataset
from gensim.models import Word2Vec #for creating word embeddings using Word2Vec model(CBAG or Skipgram)/ Loading pretrained Word2Vec or Glove Model.

In [0]:
#importing the dataset
train_df = pd.read_csv("/content/drive/My Drive/Movie_Reviews/RNN/train_cleaned.csv")
test_df = pd.read_csv("/content/drive/My Drive/Movie_Reviews/RNN/test_cleaned.csv")

In [0]:
#input declearations 
embedding_size = 50
sequence_length = 100
corpus_size = 5000

In [0]:
#setting up stopwords from the nltk library
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
#Cleaning the Train Dataset
#train_df['reviews'] = train_df['reviews'].apply(lambda x: re.sub('<br />',' ',x)) #removing nextline. If it is in another format, please check and remove it.
train_df['reviews'] = train_df['reviews'].apply(lambda x: re.sub('<br />',' ',x)) #removing nextline. If it is in another format, please check and remove it.
train_df['reviews'] = train_df['reviews'].apply(lambda x: re.sub('[^a-zA-z\s]','',x)) #removing all charecters except alphabets
train_df['reviews'] = train_df['reviews'].apply(lambda x: re.sub(r'\s+',' ',x)) #removing extra spaces
train_df['reviews'] = train_df['reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #removing stopwords

In [0]:
#Cleaning the test Dataset
#test_df['reviews'] = test_df['reviews'].apply(lambda x: re.sub('<br />',' ',x)) #removing nextline. If it is in another format, please check and remove it.
test_df['reviews'] = test_df['reviews'].apply(lambda x: re.sub('<br />',' ',x)) #removing nextline. If it is in another format, please check and remove it.
test_df['reviews'] = test_df['reviews'].apply(lambda x: re.sub('[^a-zA-z\s]','',x)) #removing all charecters except alphabets
test_df['reviews'] = test_df['reviews'].apply(lambda x: re.sub(r'\s+',' ',x)) #removing extra spaces
test_df['reviews'] = test_df['reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #removing stopwords

In [0]:

#Creating_Embedding_matix
sentences = train_df['reviews'].tolist() #converting reviews in dataframe to list.
sentences = [line.lower().split(' ') for line in sentences] #Again convert each sentence in a list to list of lists. #Also each word to lowercase 

In [0]:

"""
sentences - It should be list of lists
min_count - minimum number of times a word should be present to create wordvectors
window - number of words taken to train on either side of the word.
size - embedding layer size
workers - Number of cores it can parallely work on.
sg - #1 for skipgram or #0 for Contionous Bag of words Model
"""
embed_model = Word2Vec(sentences, window=5, min_count=1, size=embedding_size, workers=2, sg=0) 

In [0]:
#Training Embedding Matrix
embed_model.train(sentences,total_examples=len(sentences),epochs=10)

(28591087, 30332070)

In [0]:
"""# save model
embed_model.save('drive/My Drive/Movie_Reviews/RNN/skip_gram_model.bin')
# load model
skipgram_embed_model = Word2Vec.load('drive/My Drive/Movie_Reviews/RNN/skip_gram_model.bin')"""

embed_model.save('drive/My Drive/Movie_Reviews/RNN/CBAG_model.bin')
# load model
embed_model = Word2Vec.load('drive/My Drive/Movie_Reviews/RNN/CBAG_model.bin')


In [0]:
print(embed_model)

Word2Vec(vocab=108991, size=50, alpha=0.025)


In [0]:
"""The num_words in Tokenizer, the maximum number of words to keep, based on word frequency. 
If num_words = 10000 Only the most common 99999 words will be kept. all he extra words will be removed."""
tokenizer = Tokenizer(num_words = corpus_size) #setting up tokenizer
b=tokenizer.fit_on_texts(train_df['reviews']) #fitting tokenizer on dataframe 
X_train = tokenizer.texts_to_sequences(train_df['reviews']) # removing least repeated words and converting them into sequence of numbers.

In [0]:
print('Maximum review length: {}'.format(len(max(X_train, key=len)))) #checking maximum review length
print('Minimum review length: {}'.format(len(min(X_train, key=len)))) #checking minimum review length

Maximum review length: 933
Minimum review length: 3


In [0]:
"""Pads sequences to the same length.
This function transforms a list of num_samples sequences (lists of integers) into a 2D Numpy array of 
shape (num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided, or the 
length of the longest sequence otherwise. Sequences that are shorter than num_timesteps are padded with value at the end.
Sequences longer than num_timesteps are truncated so that they fit the desired length. 
The position where padding or truncation happens is determined by the arguments padding and truncating, respectively.
Pre-padding is the default."""

X_train = pad_sequences(X_train,maxlen = sequence_length)
print(X_train.shape)

(25000, 100)


In [0]:
embedding_matrix = np.zeros((corpus_size, embedding_size))


In [0]:
for word, i in tokenizer.word_index.items():
	embedding_vector = embed_model.wv[word]
	if embedding_vector is not None and i <=(corpus_size-1):
		embedding_matrix[i] = embedding_vector

In [0]:
word="idnt"
if word in list(embedding_matrix_50_loaded.index):
  print (word)

NameError: ignored

In [0]:
#loading Glove model
#File to Required Embedding Matrix
embedding_matrix_50_loaded = pd.read_table("drive/My Drive/EmbeddingDownloaded/glove.6B.50d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
for word, i in tokenizer.word_index.items():
  if i <=(corpus_size-1):
    if word in list(embedding_matrix_50_loaded.index):
      embedding_vector = embedding_matrix_50_loaded.loc[word]
      embedding_matrix[i] = np.asarray(embedding_vector)
    else:
      print (word)


In [0]:
embedding_matrix.shape

(5000, 50)

In [0]:
Y_train = train_df['pos_or_neg']
Y_test = test_df['pos_or_neg']

In [0]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train,Y_train, random_state=0, test_size=0.2)
print(X_train.shape, X_valid.shape, Y_train.shape, Y_valid.shape )

(20000, 100) (5000, 100) (20000,) (5000,)


In [0]:
print(Y_train.shape)
print(Y_test.shape)

(20000,)
(25000,)


In [0]:
X_test = tokenizer.texts_to_sequences(test_df['reviews']) #tokenizing the test data
X_test = pad_sequences(X_test,maxlen = sequence_length)

In [0]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=corpus_size,output_dim=embedding_size,input_length=sequence_length,trainable=False),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128)),
  tf.keras.layers.Dense(1,activation="sigmoid")
])
model.compile(optimizer="adam", loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 50)           250000    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               183296    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 433,553
Trainable params: 183,553
Non-trainable params: 250,000
_________________________________________________________________


In [0]:
#Declaring filepath for saving weigths
filepath = "weigths.best.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath,monitor='val_acc',verbose=1,save_best_only = True, mode = 'max')

In [0]:
model.fit(X_train,Y_train, validation_data = [X_valid,Y_valid], epochs = 10, batch_size = 200, callbacks=[checkpoint])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 00001: val_acc did not improve from 0.57720
Epoch 2/10
Epoch 00002: val_acc did not improve from 0.57720
Epoch 3/10
Epoch 00003: val_acc improved from 0.57720 to 0.58400, saving model to weigths.best.hdf5
Epoch 4/10
Epoch 00004: val_acc improved from 0.58400 to 0.62400, saving model to weigths.best.hdf5
Epoch 5/10
Epoch 00005: val_acc improved from 0.62400 to 0.62940, saving model to weigths.best.hdf5
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.62940
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.62940
Epoch 8/10
Epoch 00008: val_acc improved from 0.62940 to 0.63620, saving model to weigths.best.hdf5
Epoch 9/10
Epoch 00009: val_acc did not improve from 0.63620
Epoch 10/10
Epoch 00010: val_acc did not improve from 0.63620


<tensorflow.python.keras.callbacks.History at 0x7ff6fdd15f60>

In [0]:
  scores = model.evaluate(X_test,Y_test,verbose=0)
print(model.metrics_names[1], scores[1]*100)

acc 63.39600086212158


In [0]:
os.mknod("model.json")

In [0]:
#Saving the model
model_json = model.to_json()
with open("model.json",'w') as json_file:
  json_file.write(model_json)

In [0]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = tf.keras.models.model_from_json(loaded_model_json)
print("Loaded model from disk")

In [0]:
# load weights into new model
loaded_model.load_weights("weigths.best.hdf5")
print("Loaded weigths from disk")

Loaded model from disk


In [0]:
# evaluate loaded model on test data
loaded_model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])
score = loaded_model.evaluate(X_test, Y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

acc: 63.40%
