# Movie Sentiment Analysis with Keras

In [1]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
#!pip install nltk
#!pip install --upgrade gensim

import numpy as np
import os
import os.path

from pdb import set_trace
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[nltk_data] Downloading package punkt to /home/jeremie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('../aclImdb'):
    if not os.path.isfile('../aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('../aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [3]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=3000
positive_sample_file_list = glob.glob(os.path.join('../aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('../aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [4]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)


Positive review(s): NYC model Alison Parker (Cristina Raines) rents a room in an old brownstone where she meets a few bi
Negative review(s): The reason the DVD releases of this film are in black and white is because nobody can get their hand


In [5]:
#ML STUDY GROUP
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

class PreProcessor:
    def __init__(self,REVIEWS,REVIEWS_VAL,LABELS,LABELS_VAL,WE_FILE):
        self.reviews = REVIEWS
        self.reviews_val = REVIEWS_VAL
        self.labels = LABELS
        self.labels_val = LABELS_VAL
        self.we_file = WE_FILE

    def tokenize(self):
#         set_trace()
        print(self.reviews[0])

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.reviews)

        self.sequences = tokenizer.texts_to_sequences(self.reviews)
        self.sequences_val = tokenizer.texts_to_sequences(self.reviews_val)

        self.word_index = tokenizer.word_index
        print("Found %s unique tokens" %(len(self.word_index)))

    def make_data(self):
        self.MAX_SEQUENCE_LENGTH = max([len(self.sequences[i]) for i in range(len(self.sequences))])
        print("self.MAX_SEQUENCE_LENGTH: {}".format(self.MAX_SEQUENCE_LENGTH))

        review = pad_sequences(self.sequences,maxlen=self.MAX_SEQUENCE_LENGTH)
        review_val = pad_sequences(self.sequences_val,maxlen=self.MAX_SEQUENCE_LENGTH)
        
        labels = to_categorical(self.labels)
        labels_val = to_categorical(self.labels_val)

        print("Shape of data tensor: " +str(review.shape))
        print("Shape of label tensor: " +str(labels.shape))

        return review, review_val, labels, labels_val
        
    def get_word_embedding_matrix(self,EMBEDDING_DIM=100):
        embeddings_index = {}

        if self.we_file == "rand":
            return None

        f = open(self.we_file)

        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(embeddings_index))

        self.embedding_matrix = np.zeros((len(self.word_index)+1, EMBEDDING_DIM))

        for word, i in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        return self.embedding_matrix


Using TensorFlow backend.


In [6]:
import argparse
import numpy as np

from dan.custom_layers import AverageWords, WordDropout

from keras.layers import Embedding, Dense, Input, BatchNormalization, Activation, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad, Adam
from keras import backend as K

from pdb import set_trace

embedding_dim = 300
num_hidden_layers = 3
num_hidden_units = 300
num_epochs = 100
batch_size = 512
dropout_rate = 0.2
word_dropout_rate = 0.3
activation = 'relu'

args = {}
args['We']='data/glove.6B.300d.txt'
args['Wels']='' ### rand or ''
args['model']='dan'  ### nbow OR dan
args['wd']='y'

# reviews=X_train.values
# reviews_val=X_test.values
# labels=y_train.values
# labels_val=y_test.values

In [7]:
# reviews_val.values

In [8]:
pp = PreProcessor(X_train,X_test,y_train,y_test,args['We'])
pp.tokenize()

encoded_X_train,encoded_X_test,y_train,y_test = pp.make_data()

embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)


I thought I should qualify my position after reading other reviews. The movie is not great, but it has a lot of great elements. The lighting and scenes along with the camera work are great. The story is slow and weak, but entertaining. The acting is bad, but no worse than you will find on the SyFy Channel. The music is pretty good and the gore is good. It has the great Leather Face in the film and is produced by Bruce Campbell. I watched the complete movie and while mostly predictable, it was still enjoyable. The women are attractive enough and the lead actor does a good job of being brooding and creepy. The movie was remarkably clean for a modern film and the violence appropriate for children 13 and up. There was no sex scenes. I gave it 7 out of 10 and I think that is fair. I would watch it again if I had nothing better to do. The gay sounding angel was the most annoying aspect of the film, the devil is quite creepy.
Found 40729 unique tokens
self.MAX_SEQUENCE_LENGTH: 1580
Shape of d

In [22]:
embedding_matrix.shape
# pp.MAX_SEQUENCE_LENGTH
len(pp.word_index)+1
embedding_dim

40730

https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/

In [11]:
%pdb

Automatic pdb calling has been turned ON


In [47]:
from keras.layers import Concatenate
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
# create the model

channels = []
inputs = []
encoded_X_trains= []
encoded_X_tests = []
for filter_len in [3,4,5]:
    inputs1 = Input(shape=(pp.MAX_SEQUENCE_LENGTH,))
    inputs.append(inputs1)
    embedding1 = Embedding(len(pp.word_index)+1,embedding_dim,weights=[embedding_matrix],\
                           input_length=pp.MAX_SEQUENCE_LENGTH,trainable=True)(inputs1)
    conv1 = Conv1D(filters=128, kernel_size=filter_len, padding='same', activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    channels.append(flat1)
    encoded_X_trains.append(encoded_X_train)
    encoded_X_tests.append(encoded_X_test)
    
# merge
merged = concatenate(channels)
# interpretation
outputs = Dense(2, activation='softmax')(merged)
model = Model(inputs=inputs, outputs=outputs)
# compile
    
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy','categorical_accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 1580)         0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 1580)         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 1580)         0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, 1580, 300)    12219000    input_16[0][0]                   
__________________________________________________________________________________________________
embedding_

In [None]:
%pdb off
batch_size = 128
num_epochs = 5

model.fit(encoded_X_trains,y_train,batch_size=batch_size,epochs=num_epochs,\
          validation_data=(encoded_X_tests,y_test))


Automatic pdb calling has been turned OFF
Train on 4500 samples, validate on 1500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5