# Movie Sentiment Analysis with Keras

In [56]:
# Uncomment these for Google Colab; they will already be installed in a local environment
# if 'pip install -r requirements.txt' has been run.
#!pip install nltk
#!pip install --upgrade gensim

import numpy as np
import os
import os.path

from pdb import set_trace
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[nltk_data] Downloading package punkt to /home/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('../aclImdb'):
    if not os.path.isfile('../aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('../aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [58]:
# Timestamp taken when this cell runs (presumably for a total-runtime report
# later in the notebook).
time_beginning_of_notebook = time.time()

# Number of reviews per sentiment class; used to size the label vectors.
SAMPLE_SIZE = 12500

# Glob patterns for the positive / negative training reviews.
positive_pattern = os.path.join('../aclImdb/train/pos', "*.txt")
negative_pattern = os.path.join('../aclImdb/train/neg', "*.txt")

# To work on a subset, slice each list with [:SAMPLE_SIZE] after globbing.
positive_sample_file_list = glob.glob(positive_pattern)
negative_sample_file_list = glob.glob(negative_pattern)

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    """Read a UTF-8 text file and replace markup tags with spaces.

    Args:
        filename: path of the review file to read.

    Returns:
        The file content as a string, with every ``<...>`` span replaced by a
        single space.
    """
    # The context manager closes the handle even if reading or decoding fails.
    with open(filename, 'r', encoding='utf8') as file:
        raw_text = file.read()
    return re.sub('<[^>]*>', ' ', raw_text)


In [59]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Seed for the shuffle and the train/test partition so the split is the same
# on every Restart & Run All.
SPLIT_SEED = 42

# Load the review texts once per class.
positive_reviews = [load_doc(path) for path in positive_sample_file_list]
negative_reviews = [load_doc(path) for path in negative_sample_file_list]

# Size each label vector from the reviews actually loaded rather than from
# SAMPLE_SIZE: a different number of files on disk (or a sliced file list)
# would otherwise raise a length-mismatch error in the DataFrame constructor.
df_positives = pd.DataFrame({'reviews': positive_reviews, 'sentiment': np.ones(len(positive_reviews))})
df_negatives = pd.DataFrame({'reviews': negative_reviews, 'sentiment': np.zeros(len(negative_reviews))})

# Show the first 100 characters of one example per class as a sanity check.
print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df, random_state=SPLIT_SEED)

# 75% of the reviews for training, 25% held out for validation.
X_train, X_test, y_train, y_test = train_test_split(
    df['reviews'], df['sentiment'], test_size=0.25, random_state=SPLIT_SEED
)


Positive review(s): There are few really hilarious films about science fiction but this one will knock your sox off. The
Negative review(s): I can find very little thats good to say about this film. I am sure the idea and script looked good 


### Logic to compute DAN model

In [60]:
#ML STUDY GROUP
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

class PreProcessor:
    """Prepare review texts, labels and a word-embedding matrix for the model.

    Typical call order: ``tokenize()`` -> ``make_data()`` ->
    ``get_word_embedding_matrix()``. ``make_data`` needs the sequences built by
    ``tokenize``; ``get_word_embedding_matrix`` needs ``word_index`` from it.
    """

    def __init__(self,REVIEWS,REVIEWS_VAL,LABELS,LABELS_VAL,WE_FILE):
        """Store the raw inputs.

        Args:
            REVIEWS: training review texts.
            REVIEWS_VAL: validation review texts.
            LABELS: training labels.
            LABELS_VAL: validation labels.
            WE_FILE: path of a text file with one ``word v1 v2 ...`` entry per
                line, or the literal string ``"rand"`` to skip loading vectors.
        """
        self.reviews = REVIEWS
        self.reviews_val = REVIEWS_VAL
        self.labels = LABELS
        self.labels_val = LABELS_VAL
        self.we_file = WE_FILE

    def tokenize(self):
        """Fit a Tokenizer on the training reviews and index both review sets.

        Sets ``self.sequences``, ``self.sequences_val`` and ``self.word_index``.
        The vocabulary comes from the training reviews only.
        """
        # Show one raw review as a sanity check of the input.
        print(self.reviews[0])

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.reviews)

        self.sequences = tokenizer.texts_to_sequences(self.reviews)
        self.sequences_val = tokenizer.texts_to_sequences(self.reviews_val)

        self.word_index = tokenizer.word_index
        print("Found %s unique tokens" %(len(self.word_index)))

    def make_data(self):
        """Pad the sequences to a common length and encode the labels.

        Sets ``self.MAX_SEQUENCE_LENGTH`` to the longest training sequence;
        validation sequences are padded/truncated to that same length.

        Returns:
            Tuple ``(review, review_val, labels, labels_val)``: the padded
            training and validation sequences followed by the labels passed
            through ``to_categorical``.
        """
        self.MAX_SEQUENCE_LENGTH = max(len(sequence) for sequence in self.sequences)
        print("self.MAX_SEQUENCE_LENGTH: {}".format(self.MAX_SEQUENCE_LENGTH))

        review = pad_sequences(self.sequences,maxlen=self.MAX_SEQUENCE_LENGTH)
        review_val = pad_sequences(self.sequences_val,maxlen=self.MAX_SEQUENCE_LENGTH)

        labels = to_categorical(self.labels)
        labels_val = to_categorical(self.labels_val)

        print("Shape of data tensor: " +str(review.shape))
        print("Shape of label tensor: " +str(labels.shape))

        return review, review_val, labels, labels_val

    def get_word_embedding_matrix(self,EMBEDDING_DIM=100):
        """Build the embedding matrix for the fitted vocabulary.

        Args:
            EMBEDDING_DIM: length of each word vector; must match the vectors
                stored in the embedding file.

        Returns:
            ``None`` when ``we_file`` is ``"rand"``; otherwise an array of
            shape ``(len(word_index) + 1, EMBEDDING_DIM)`` whose row ``i``
            holds the vector of the word with index ``i``. Words missing from
            the embedding file keep an all-zero row.
        """
        if self.we_file == "rand":
            return None

        embeddings_index = {}

        # Read the vectors as UTF-8 explicitly so parsing does not depend on
        # the platform's default encoding; `with` closes the file on errors.
        with open(self.we_file, encoding='utf8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Skip blank lines (e.g. a trailing newline at end of file).
                    continue
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(embeddings_index))

        # One extra row because rows are addressed by word_index values
        # (presumably index 0 is reserved for padding — TODO confirm).
        self.embedding_matrix = np.zeros((len(self.word_index)+1, EMBEDDING_DIM))

        for word, i in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        return self.embedding_matrix


In [61]:
import argparse
import numpy as np

from dan.custom_layers import AverageWords, WordDropout

from keras.layers import Embedding, Dense, Input, BatchNormalization, Activation, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad, Adam
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping

from pdb import set_trace

# --- Network and training hyper-parameters ---------------------------------
embedding_dim = 300        # width of each word vector
num_hidden_layers = 3      # hidden blocks added when the 'dan' model is selected
num_hidden_units = 300     # units per hidden Dense layer
num_epochs = 100           # upper bound on training epochs
batch_size = 512
dropout_rate = 0.2         # dropout applied after the Dense blocks
word_dropout_rate = 0.3    # rate handed to the WordDropout layer
activation = 'relu'

# --- Run options ------------------------------------------------------------
args = {
    'We': './glove.6B.300d.txt',   # embedding file handed to PreProcessor
    'Wels': '',                    ### rand or ''
    'model': 'dan',                ### nbow OR dan
    'wd': 'y',                     # 'y' inserts the WordDropout layer
}

# Plain arrays of the raw texts / labels from the train/test split.
reviews, reviews_val = X_train.values, X_test.values
labels, labels_val = y_train.values, y_test.values

In [62]:
# reviews_val.values

In [63]:
# When random embeddings are requested the model never consumes pretrained
# vectors, so hand PreProcessor its "rand" sentinel: get_word_embedding_matrix
# then returns None instead of reading and parsing the embedding file.
we_source = "rand" if args['Wels'] == "rand" else args['We']

pp = PreProcessor(reviews,reviews_val,labels,labels_val,we_source)
pp.tokenize()

# NOTE: this rebinds reviews / labels from raw texts to padded index arrays and
# encoded labels. Re-run the cell that assigns them from X_train / X_test
# before re-running this one.
reviews,reviews_val,labels,labels_val = pp.make_data()

embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)


What can one say about Elvira that hasn't already been said in the world's press? The classic comedienne that IS Elvira delivers in her first full-length big budget comedy masterpiece.  From the very first movie frame thingy, Elvira packs an acting punch that clearly says Film Great....eat your heart out, Bette Davis! See a forlorn Elvira, see an excitable Elvira, see a jealous Elvira, see a murderous Elvira. You can do nothing but marvel at her acting prowess!  At the heart of this comedy masterpiece is Elvira's desire for Las Vegas show stardom. Despite putting "the boob back in the boobtube" as a horror hostess (with the mostest), Elvira finds the small screen constrictive emotionally....and PHYSICALLY! Nuff said, she packs up her kitbag and heads East....a hotdog in one hand and a letter from her Aunt's lawyer outlining her inheritance 'windfall' in the other.  I've seen this movie so many times, I can almost recite it verbatim....(verbatim would just be showing off)!  Grab a copy,

In [64]:
model = Sequential()

# Options shared by both embedding variants.
# NOTE(review): the embedding is frozen even in "rand" mode, so randomly
# initialised vectors are never trained — confirm this is intended.
embedding_kwargs = {'input_length': pp.MAX_SEQUENCE_LENGTH, 'trainable': False}
if args['Wels'] != "rand":
    # Initialise the layer from the matrix built by the preprocessor.
    embedding_kwargs['weights'] = [embedding_matrix]

model.add(Embedding(len(pp.word_index) + 1, embedding_dim, **embedding_kwargs))

# Optional word-level dropout, then average the word vectors of each review.
if args['wd'] == 'y':
    model.add(WordDropout(word_dropout_rate))
model.add(AverageWords())


In [65]:
# Report the shape of every array that goes into training / validation.
shape_report = (
    ('reviews.shape: ', reviews),
    ('reviews_val.shape: ', reviews_val),
    ('labels.shape: ', labels),
    ('labels_val.shape: ', labels_val),
)
for prefix, array in shape_report:
    print(prefix + str(array.shape))

# Number of training examples, shown as the cell's output.
labels.shape[0]

reviews.shape: (18750, 1841)
reviews_val.shape: (6250, 1841)
labels.shape: (18750, 2)
labels_val.shape: (6250, 2)


18750

In [66]:
# Hidden stack: only the 'dan' variant gets the feed-forward blocks; any other
# value goes straight from the averaged embedding to the output head.
if args['model'] == 'dan':
    for layer_idx in range(num_hidden_layers):
        hidden_block = (
            Dense(num_hidden_units),
            BatchNormalization(),
            Activation(activation),
            Dropout(dropout_rate),
        )
        for layer in hidden_block:
            model.add(layer)

# Output head: one unit per label column, normalised, then softmax.
# NOTE(review): dropout is applied to the class scores right before softmax —
# confirm this is intended.
output_head = (
    Dense(labels.shape[1]),
    BatchNormalization(),
    Dropout(dropout_rate),
    Activation('softmax'),
)
for layer in output_head:
    model.add(layer)

adam = Adam()
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy', 'categorical_accuracy'],
)

model.summary()

# Save whenever validation loss improves; stop after 10 epochs without
# improvement.
model_checkpoint = ModelCheckpoint('best.weights', monitor='val_loss', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)

callbacks = [model_checkpoint, early_stopping]

history = model.fit(
    reviews,
    labels,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(reviews_val, labels_val),
    callbacks=callbacks,
)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1841, 300)         23457000  
_________________________________________________________________
word_dropout_6 (WordDropout) (None, 1841, 300)         0         
_________________________________________________________________
average_words_6 (AverageWord (None, 300)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 300)               90300     
_________________________________________________________________
batch_normalization_20 (Batc (None, 300)               1200      
_________________________________________________________________
activation_20 (Activation)   (None, 300)               0         
_________________________________________________________________
dropout_20 (Dropout)         (None, 300)               0         
__________


Epoch 00020: val_loss did not improve from 0.36183
Epoch 21/100

Epoch 00021: val_loss did not improve from 0.36183
Epoch 22/100

Epoch 00022: val_loss did not improve from 0.36183
Epoch 23/100

Epoch 00023: val_loss did not improve from 0.36183
Epoch 24/100

Epoch 00024: val_loss did not improve from 0.36183
Epoch 25/100

Epoch 00025: val_loss did not improve from 0.36183
Epoch 26/100

Epoch 00026: val_loss did not improve from 0.36183
Epoch 27/100

Epoch 00027: val_loss did not improve from 0.36183
Epoch 28/100

Epoch 00028: val_loss did not improve from 0.36183
Epoch 29/100

Epoch 00029: val_loss did not improve from 0.36183
Epoch 00029: early stopping


In [67]:
# One row per epoch, one column per tracked metric.
history_df = pd.DataFrame(history.history)

# Keras releases differ in the metric key ('val_acc' vs 'val_accuracy').
val_acc_col = 'val_acc' if 'val_acc' in history_df.columns else 'val_accuracy'

# idxmax returns the FIRST epoch reaching the best validation accuracy, so the
# summary stays a single row even when several epochs tie (filtering on
# `== max()` would select every tied row and break the one-element column
# assignments below).
best_epoch = history_df[val_acc_col].idxmax()

df = history_df.loc[[best_epoch]].reset_index(drop=True)
df["title"] = "Keras DAN"
df["sample_size"] = SAMPLE_SIZE
df["nb_epochs"] = best_epoch + 1  # epochs are 0-indexed in the history frame
print(df)

# Persist the summary as '<title>.csv'.
df.to_csv(path_or_buf=df.loc[0, "title"] + ".csv")

   val_loss  val_acc  val_categorical_accuracy      loss       acc  \
0  0.362056  0.84176                   0.84176  0.405207  0.819573   

   categorical_accuracy      title  sample_size  nb_epochs  
0              0.819573  Keras DAN        12500         12  
