This notebook is divided into 3 parts. 



1.   In the first part I have trained the model on data_0 file(5M reviews) which we generated in data_separation.ipynb.
2.   In the second part I have trained the model on remaining files (data_1 ---- data_7, each of 5M reviews)
3.   In the 3rd part I will load the test data and check our model performance on the test data.

## Setup and imports

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# get all imports
import re
from tqdm import tqdm     # to show progress of a loop
import bz2                # to extract data from file
from sklearn.utils import shuffle
import numpy as np
import gensim
import sys
import pickle
from keras.layers import *
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import string
# Load stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load the GloVE Embedding and the training data

In [0]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map
  
# load glove pretrained model
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('/content/drive/My Drive/Machine Learning Projects/Word Embedding Models/glove/glove.6B.200d.txt')

In [0]:
# make directory to store data
!mkdir data
# get the splitted data
!cp -r '/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis/data/seperated_data' /content/data

## ALL HELPER METHODS



In [0]:
# define all helper methods
def getReviewX(review):
  # seperate label
  review = review.split(' ', 1)[1]
  # replace all numbers
  review = re.sub('[0-9]+', '0', review)
  # replace all urls
  if 'http' in review or 'www.' in review:
    regex_url = '((http(s)+(\:\/\/))?(www\.)?([\w\-\.\/])*(\.[a-zA-Z]{2,3}\/?))[^\s\b\n|]*[^.,;:\?\!\@\^\$ -]'
    review = re.sub(regex_url, '<url>', review)
  return review

def getReviewY(review):
  #seperate the labels
  review = review.split(' ', 1)[0]
  return [1, 0] if review.split(' ', 1)[0] == '__label__1' else [0, 1]  

def convertToNumpyY(y):
  return [1, 0] if y == 0 else [0, 1]

def splitLabelsReviews(lines):
  reviews=[]
  labels=[]
  for line in tqdm(lines):
    review = getReviewX(line)
    label = getReviewY(line)
    reviews.append(review[:1024])   # restrict the size of sent to 1024 chars
    labels.append(label)
  return reviews, labels

def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    # Make translation table to replace puctuation
    replace_char = {key: None for key in string.punctuation}
    replace_char['"'] = None
    table = str.maketrans(replace_char)
    stop_words = set(stopwords.words('english'))
    
    
    m = len(X)                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros(shape=(m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Remove punctuation
        X[i] = X[i].translate(table)
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].lower().split()        
        
        # Initialize j to 0
        j = 0
        
        # Store indices of unknown words in list and then replace it by
        # the average of all words
        unknown_words_index = []
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Skip Stopwords
            if w in stop_words:
              continue
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in word_to_index and j < max_len:
              X_indices[i, j] = word_to_index[w]
            else:
              # Handle unknown key (keep it as zeros)
              pass
            # Increment j to j + 1
            j += 1            
    
    return X_indices
  
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros(shape=(vocab_len, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    
    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

## STEP 1

In [0]:
train_file = open('/content/data/seperated_data/data_0', 'r')

In [7]:
# test getReviewX
dummy = 'this is this www.google.com https://www.google.com 456789' 
print(getReviewX(dummy))
#test getReviewY
dummy = '__label__1 asasddasd asd'
print(getReviewY(dummy))

is this <url> <url> 0
[1, 0]


In [0]:
# ----IGNORE----- 

# train_lines = train_file.readlines()
# test_lines = test_file.readlines()
# print(type(train_lines))
# print(type(train_lines[0]))
# convert list of bytes to list of string 
# train_lines = [x.decode('utf-8') for x in train_file.readlines()]
# test_lines = [x.decode('utf-8') for x in test_file.readlines()]

In [11]:
# seperate data and labels 
# no need to decode train file as we have already done it while seperation
reviews_train, y_train = splitLabelsReviews([x for x in train_file.readlines() if x != '\n'])

100%|██████████| 500000/500000 [00:09<00:00, 55274.57it/s]


In [12]:
# shuffle data
reviews_train, y_train = shuffle(reviews_train, y_train)
# convert y to numpy
y_train = np.array(y_train)
# data stats
print('Number of train reviews ' + str(len(reviews_train)))

Number of train reviews 500000


In [14]:
max_len = len(max(reviews_train, key=len).split())
print(max_len)

# FIX THE MAX LENGTH OF SENTENCE
max_len = 80

195


In [17]:
# Build the model

input_sent = Input(shape=(max_len,))

# Create the embedding layer pretrained with glove Vectors (≈1 line)
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
# Propagate sentence_indices through your embedding layer, you get back the embeddings
embeddings = embedding_layer(input_sent)

X = Bidirectional(GRU(128))(embeddings)
X = Dropout(0.2)(X)
X = Dense(2)(X)
X = Activation('softmax')(X)

model = Model(inputs = input_sent, outputs = X)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 200)           80000200  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               252672    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 80,253,386
Trainable params: 253,186
Non-trainable params: 80,000,200
__________________________________________________________

In [0]:
# convert word to indices
reviews_train = sentences_to_indices(reviews_train, word_to_index, max_len)

In [0]:
model.fit(reviews_train, y_train, epochs = 2, batch_size = 64)
model.save('/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis/checkpoint_2_epoch.h5')

Epoch 1/2
Epoch 2/2


## STEP 2

In [0]:
# load previous trained model
model = load_model('/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis/checkpoint_2_epoch.h5')

In [0]:
# code to train model on multiple small data files
for i in range(1, 8):
  print('Starting training on data '+str(i))
  train_file = open('/content/data/seperated_data/data_'+str(i), 'r')
  reviews_train, y_train = splitLabelsReviews([x for x in train_file.readlines() if x != '\n'])
  train_file.close()
  # shuffle data
  reviews_train, y_train = shuffle(reviews_train, y_train)
  # convert y to numpy
  y_train = np.array(y_train)
  # data stats
  print('Number of train reviews ' + str(len(reviews_train)))
  max_len = 80
  # convert word to indices
  reviews_train = sentences_to_indices(reviews_train, word_to_index, max_len)
  
  model.fit(reviews_train, y_train, epochs = 2, batch_size = 64)
  model.save('checkpoint_2_epoch_data_'+str(i)+'.h5')

Starting training on data 3


100%|██████████| 500000/500000 [00:09<00:00, 55275.84it/s]


Number of train reviews 500000
Epoch 1/2
Epoch 2/2
Starting training on data 4


100%|██████████| 500000/500000 [00:08<00:00, 58338.26it/s]


Number of train reviews 500000
Epoch 1/2
Epoch 2/2
Starting training on data 5


100%|██████████| 500000/500000 [00:08<00:00, 55993.70it/s]


Number of train reviews 500000
Epoch 1/2
Epoch 2/2
Starting training on data 6


100%|██████████| 500000/500000 [00:08<00:00, 60098.04it/s]


Number of train reviews 500000
Epoch 1/2
Epoch 2/2

In [0]:
# copy all the checkpoints to drive
!cp /content/checkpoint_2_epoch_data_*.h5 "/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis"

## **STEP 3**

In [19]:
# LOAD THE TESTING DATA
!cp '/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis/data/test.ft.txt.bz2.zip' /content/data
# extract data
!unzip /content/data/test.ft.txt.bz2.zip
# move extracted file
!mv /content/test.ft.txt.bz2 /content/data/test.ft.txt.bz2
test_file = bz2.BZ2File('/content/data/test.ft.txt.bz2')

reviews_test, y_test = splitLabelsReviews([x.decode('utf-8') for x in test_file.readlines()])
reviews_test, y_test = shuffle(reviews_test, y_test)
y_test = np.array(y_test)
print('Number of test reviews ' + str(len(reviews_test)))

Archive:  /content/data/test.ft.txt.bz2.zip
  inflating: test.ft.txt.bz2         


100%|██████████| 400000/400000 [00:07<00:00, 57113.06it/s]


Number of test reviews 400000


In [0]:
# convert word to indices
reviews_test = sentences_to_indices(reviews_test, word_to_index, max_len)

In [0]:
# load the trained model
model = load_model('/content/drive/My Drive/Machine Learning Projects/Amazon Sentiment Analysis/checkpoint_2_epoch_data_7.h5')

In [26]:
predictions = model.predict(reviews_test, verbose=1)



In [0]:
predictions_list = np.array([convertToNumpyY(x) for x in np.argmax(predictions, axis=1)])
predict_diff = [np.array_equal(x, y) for (x, y) in zip(y_test, predictions_list)]

In [64]:
print("ACCURACY on Test Data ---- {}".format(np.mean(predict_diff)))

ACCURACY on Test Data ---- 0.91658
