# Movie Sentiment Analysis with Keras

In [1]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from pdb import set_trace
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/6c93685bed0026b6a1cce55ab173f6b617f6db0d1325d25489c2fd43e711/gensim-3.7.1-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K    100% |████████████████████████████████| 24.2MB 1.8MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.7.1
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('./aclImdb'):
    if not os.path.isfile('./aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('./aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

--2019-02-06 18:04:44--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2019-02-06 18:04:50 (13.2 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [0]:
time_beginning_of_notebook = time.time()
# SAMPLE_SIZE=12500
SAMPLE_SIZE=4000

positive_sample_file_list = glob.glob(os.path.join('./aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('./aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [25]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)


Positive review(s): Japanese indie film with humor and philosophy where the three main characters run literally almost t
Negative review(s): The film is severely awful and is demeaning to rape victims. On the surface, it may be a daring film


In [0]:
#ML STUDY GROUP
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

class PreProcessor:
    def __init__(self,REVIEWS,REVIEWS_VAL,LABELS,LABELS_VAL,WE_FILE):
        self.reviews = REVIEWS
        self.reviews_val = REVIEWS_VAL
        self.labels = LABELS
        self.labels_val = LABELS_VAL
        self.we_file = WE_FILE

    def tokenize(self):
#         set_trace()
        print(self.reviews[0])

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.reviews)

        self.sequences = tokenizer.texts_to_sequences(self.reviews)
        self.sequences_val = tokenizer.texts_to_sequences(self.reviews_val)

        self.word_index = tokenizer.word_index
        print("Found %s unique tokens" %(len(self.word_index)))

    def make_data(self):
        self.MAX_SEQUENCE_LENGTH = max([len(self.sequences[i]) for i in range(len(self.sequences))])
        print("self.MAX_SEQUENCE_LENGTH: {}".format(self.MAX_SEQUENCE_LENGTH))

        review = pad_sequences(self.sequences,maxlen=self.MAX_SEQUENCE_LENGTH)
        review_val = pad_sequences(self.sequences_val,maxlen=self.MAX_SEQUENCE_LENGTH)
        
        labels = to_categorical(self.labels)
        labels_val = to_categorical(self.labels_val)

        print("Shape of data tensor: " +str(review.shape))
        print("Shape of label tensor: " +str(labels.shape))

        return review, review_val, labels, labels_val
        
    def get_word_embedding_matrix(self,EMBEDDING_DIM=100):
        embeddings_index = {}

        if self.we_file == "rand":
            return None

        f = open(self.we_file)

        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(embeddings_index))

        self.embedding_matrix = np.zeros((len(self.word_index)+1, EMBEDDING_DIM))

        for word, i in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        return self.embedding_matrix


In [0]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isfile('./glove.6B.300d.txt'):
    if not os.path.isfile('./glove.6B.zip'):
      !wget http://nlp.stanford.edu/data/glove.6B.zip 

    if not os.path.isfile('./glove.6B.300d.txt'):  
      !unzip glove.6B.zip 
      

In [0]:
import argparse
import numpy as np

from keras.layers import Embedding, Dense, Input, BatchNormalization, Activation, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad, Adam
from keras import backend as K

from pdb import set_trace

embedding_dim = 300
num_hidden_layers = 3
num_hidden_units = 300
num_epochs = 100
batch_size = 512
dropout_rate = 0.2
word_dropout_rate = 0.3
activation = 'relu'

args = {}
args['We']='./glove.6B.300d.txt'
args['Wels']='' ### rand or ''
args['model']='dan'  ### nbow OR dan
args['wd']='y'

# reviews=X_train.values
# reviews_val=X_test.values
# labels=y_train.values
# labels_val=y_test.values

In [29]:
pp = PreProcessor(X_train,X_test,y_train,y_test,args['We'])
pp.tokenize()

encoded_X_train,encoded_X_test,y_train,y_test = pp.make_data()

embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)


First, nobody can understand why this movie is rated so poorly. Not only is this the first real horrific movie since a very long time for me who am pretty hard-boiled with a decades long experience of horror starting with driving through dark rides (ghost trains) as a child. Second, the main actress Cheri Christian has a face that lets you hope she will be the leading actress in major pictures of the future. Third, this woman is that tremendously beautiful that I suggest the directors retire all those Cameron Diazes, Eva Mendezes, and how ever the names of these ephemeral bulb-lights are. Mrs. Christian is not a light, but a sun.  However, "Dark remains" is also of considerable metaphysical importance. They idea that photographs shows creatures of the intermediary reign between reality and "imagination" that are not visible with one' own eyes is not new. But I have never seen in a movie before that those creatures are visible on the photographs only for certain people and only to certa

In [30]:
embedding_matrix.shape
# pp.MAX_SEQUENCE_LENGTH
len(pp.word_index)+1
embedding_dim

300

https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/

In [31]:
from keras.layers import Concatenate
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
# create the model

channels = []
inputs = []
encoded_X_trains= []
encoded_X_tests = []
for filter_len in [3,4,5]:
    inputs1 = Input(shape=(pp.MAX_SEQUENCE_LENGTH,))
    inputs.append(inputs1)
    embedding1 = Embedding(len(pp.word_index)+1,embedding_dim,weights=[embedding_matrix],\
                           input_length=pp.MAX_SEQUENCE_LENGTH,trainable=True)(inputs1)
    conv1 = Conv1D(filters=128, kernel_size=filter_len, padding='same', activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    channels.append(flat1)
    encoded_X_trains.append(encoded_X_train)
    encoded_X_tests.append(encoded_X_test)
    
# merge
merged = concatenate(channels)
# interpretation
outputs = Dense(2, activation='softmax')(merged)
model = Model(inputs=inputs, outputs=outputs)
# compile
    
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy','categorical_accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 2473, 300)    13812000    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_

In [0]:
%pdb off
batch_size = 64
# num_epochs = 3
num_epochs = 4

history = model.fit(encoded_X_trains,y_train,batch_size=batch_size,epochs=num_epochs,\
          validation_data=(encoded_X_tests,y_test))


Automatic pdb calling has been turned OFF
Train on 6000 samples, validate on 2000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
 576/6000 [=>............................] - ETA: 31s - loss: 0.0205 - acc: 0.9965 - categorical_accuracy: 0.9965

y_train

In [59]:
df = pd.DataFrame(history.history)
df=df[df['val_acc']==df.val_acc.max()]
df.reset_index(inplace=True)
df["title"]=["Keras CNN with pretrained embedding"]
df["sample_size"]=[SAMPLE_SIZE]
df["nb_epochs"]=[df.iloc[0]["index"]+1]
df.drop(labels="index",axis=1,inplace=True)
print(df)
df.to_csv(path_or_buf=df.iloc[0].title+".csv")

{'val_loss': [0.5589282789230346, 0.3646219997406006], 'val_acc': [0.7025, 0.8445], 'val_categorical_accuracy': [0.7025, 0.8445], 'loss': [0.775305295308431, 0.31206544399261477], 'acc': [0.5896666668256124, 0.8736666668256123], 'categorical_accuracy': [0.5896666668256124, 0.8736666668256123]}
[0, 1]
{'batch_size': 64, 'epochs': 2, 'steps': None, 'samples': 6000, 'verbose': 1, 'do_validation': True, 'metrics': ['loss', 'acc', 'categorical_accuracy', 'val_loss', 'val_acc', 'val_categorical_accuracy']}


Unnamed: 0,index,acc,categorical_accuracy,loss,val_acc,val_categorical_accuracy,val_loss
0,1,0.873667,0.873667,0.312065,0.8445,0.8445,0.364622


In [67]:
df

Unnamed: 0,index,acc,categorical_accuracy,loss,val_acc,val_categorical_accuracy,val_loss,title,sample_size,nb_epochs
0,1,0.873667,0.873667,0.312065,0.8445,0.8445,0.364622,Keras CNN with pretrained embedding,4000,2
