# Movie Sentiment Analysis with Keras

In [35]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from pdb import set_trace
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

Requirement already up-to-date: gensim in /usr/local/lib/python3.6/dist-packages (3.7.1)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('./aclImdb'):
    if not os.path.isfile('./aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('./aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [0]:
time_beginning_of_notebook = time.time()
# SAMPLE_SIZE=12500
SAMPLE_SIZE=50000 #4000

positive_sample_file_list_test = glob.glob(os.path.join('./aclImdb/test/pos', "*.txt"))
positive_sample_file_list = glob.glob(os.path.join('./aclImdb/train/pos', "*.txt"))
#positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list_test = glob.glob(os.path.join('./aclImdb/test/neg', "*.txt"))
negative_sample_file_list = glob.glob(os.path.join('./aclImdb/train/neg', "*.txt"))
#negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [39]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(len(positive_sample_file_list))}) #(SAMPLE_SIZE)
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(len(negative_sample_file_list))})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

##

df_positives_test = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list_test], 'sentiment': np.ones(len(positive_sample_file_list))})
df_negatives_test = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list_test], 'sentiment': np.zeros(len(negative_sample_file_list))})

print("Positive review(s)_test:", df_positives_test['reviews'][1][:100])
print("Negative review(s)_test:", df_negatives_test['reviews'][1][:100])

df_test = pd.concat([df_positives_test, df_negatives_test], ignore_index=True)

df_test = shuffle(df_test)


#X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)
X_train, y_train = df['reviews'], df['sentiment']
X_test, y_test = df_test['reviews'], df_test['sentiment']




Positive review(s): My watch came a little too late but am glad i watched both this and the sequel together...which make
Negative review(s): This is one of those movies that appears on cable at like two in the afternoon to entertain bored ho
Positive review(s)_test: i see there are great reviews of this film already, i've got a few points to comment on, reasons i t
Negative review(s)_test: One would think that with the incredible backdrop of WWII Stalingrad that the writers would come up 


In [0]:
X_train

In [0]:
X_test


In [0]:
#ML STUDY GROUP
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

class PreProcessor:
    def __init__(self,REVIEWS,REVIEWS_VAL,LABELS,LABELS_VAL,WE_FILE):
        self.reviews = REVIEWS
        self.reviews_val = REVIEWS_VAL
        self.labels = LABELS
        self.labels_val = LABELS_VAL
        self.we_file = WE_FILE

    def tokenize(self):
#         set_trace()
        print(self.reviews[0])

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.reviews)

        self.sequences = tokenizer.texts_to_sequences(self.reviews)
        self.sequences_val = tokenizer.texts_to_sequences(self.reviews_val)

        self.word_index = tokenizer.word_index
        print("Found %s unique tokens" %(len(self.word_index)))

    def make_data(self):
        self.MAX_SEQUENCE_LENGTH = max([len(self.sequences[i]) for i in range(len(self.sequences))])
        print("self.MAX_SEQUENCE_LENGTH: {}".format(self.MAX_SEQUENCE_LENGTH))

        review = pad_sequences(self.sequences,maxlen=self.MAX_SEQUENCE_LENGTH)
        review_val = pad_sequences(self.sequences_val,maxlen=self.MAX_SEQUENCE_LENGTH)
        
        labels = to_categorical(self.labels)
        labels_val = to_categorical(self.labels_val)

        print("Shape of data tensor: " +str(review.shape))
        print("Shape of label tensor: " +str(labels.shape))

        return review, review_val, labels, labels_val
        
    def get_word_embedding_matrix(self,EMBEDDING_DIM=100):
        embeddings_index = {}

        if self.we_file == "rand":
            return None

        f = open(self.we_file)

        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        print('Found %s word vectors.' % len(embeddings_index))

        self.embedding_matrix = np.zeros((len(self.word_index)+1, EMBEDDING_DIM))

        for word, i in self.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector

        return self.embedding_matrix


In [0]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isfile('./glove.6B.300d.txt'):
    if not os.path.isfile('./glove.6B.zip'):
      !wget http://nlp.stanford.edu/data/glove.6B.zip 

    if not os.path.isfile('./glove.6B.300d.txt'):  
      !unzip glove.6B.zip 
      

In [0]:
import argparse
import numpy as np

from keras.layers import Embedding, Dense, Input, BatchNormalization, Activation, Dropout
from keras.models import Sequential
from keras.optimizers import Adagrad, Adam
from keras import backend as K

from pdb import set_trace

embedding_dim = 300
num_hidden_layers = 3
num_hidden_units = 300
num_epochs = 100
batch_size = 512
dropout_rate = 0.2
word_dropout_rate = 0.3
activation = 'relu'

args = {}
args['We']='./glove.6B.300d.txt'
args['Wels']='' ### rand or ''
args['model']='dan'  ### nbow OR dan
args['wd']='y'

# reviews=X_train.values
# reviews_val=X_test.values
# labels=y_train.values
# labels_val=y_test.values

In [19]:
X_train

5641    Christ. A sequel to one of the most cloying fi...
5335    Supposedly a "social commentary" on racism and...
6706    We know from other movies that the actors are ...
7764    bad acting, bad southern accents, inconsistent...
535     The only other film besides Soylent Green that...
5027    I have to congratulate the genius who approved...
580     Contrary to popular belief, this title , to me...
2966    After an anonymous phone call about a spacecra...
3516    This was a pretty good film. I'm not sure if t...
1184    I don't quite know how to explain "Darkend Roo...
5740    I gave this movie a 5 out of pure pity. My int...
2055    Hayao Miyazaki has no equal when it comes to u...
6729    Okay, let me break it down for you guys...IT'S...
3986    Read Eric's review again. He perfectly describ...
3612    After some difficulty, Johnny Yuma arrives at ...
173     So, neighbor was killing neighbor. Reminds me ...
5264    I saw the movie as a child when it was release...
4297    Not su

In [42]:
pp = PreProcessor(X_train,X_test,y_train,y_test,args['We'])
pp.tokenize()

encoded_X_train,encoded_X_test,y_train,y_test = pp.make_data()

embedding_matrix = pp.get_word_embedding_matrix(embedding_dim)


a bit slow and boring, the tale of an old man and his wife living a delapidated building and interacting with a fixed cast of characters like the mailman, the brothers sitting on the porch, the wealthy cigar smoking man. The photography of the river is marvelous, as is the interior period decoration. If you like decoration of Banana Republic stores, this is a must.
Found 88576 unique tokens
self.MAX_SEQUENCE_LENGTH: 2473
Shape of data tensor: (25000, 2473)
Shape of label tensor: (25000, 2)
Found 400000 word vectors.


In [43]:
embedding_matrix.shape
# pp.MAX_SEQUENCE_LENGTH
len(pp.word_index)+1
embedding_dim

300

https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/

In [44]:
from keras.layers import Concatenate
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
# create the model

channels = []
inputs = []
encoded_X_trains= []
encoded_X_tests = []
for filter_len in [3,4,5]:
    inputs1 = Input(shape=(pp.MAX_SEQUENCE_LENGTH,))
    inputs.append(inputs1)
    embedding1 = Embedding(len(pp.word_index)+1,embedding_dim,weights=[embedding_matrix],\
                           input_length=pp.MAX_SEQUENCE_LENGTH,trainable=True)(inputs1)
    conv1 = Conv1D(filters=128, kernel_size=filter_len, padding='same', activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    channels.append(flat1)
    encoded_X_trains.append(encoded_X_train)
    encoded_X_tests.append(encoded_X_test)
    
# merge
merged = concatenate(channels)
# interpretation
outputs = Dense(2, activation='softmax')(merged)
model = Model(inputs=inputs, outputs=outputs)
# compile
    
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy','categorical_accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 2473)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 2473, 300)    26573100    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_

In [45]:
%pdb off
batch_size = 64
# num_epochs = 3
num_epochs = 4

history = model.fit(encoded_X_trains,y_train,batch_size=batch_size,epochs=num_epochs,\
          validation_data=(encoded_X_tests,y_test))


Automatic pdb calling has been turned OFF
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


y_train

In [46]:
df = pd.DataFrame(history.history)
df=df[df['val_acc']==df.val_acc.max()]
df.reset_index(inplace=True)
df["title"]=["Keras CNN with pretrained embedding"]
df["sample_size"]=[SAMPLE_SIZE]
df["nb_epochs"]=[df.iloc[0]["index"]+1]
df.drop(labels="index",axis=1,inplace=True)
print(df)
df.to_csv(path_or_buf=df.iloc[0].title+".csv")

       acc  categorical_accuracy      loss  val_acc  val_categorical_accuracy  \
0  0.97548               0.97548  0.072373  0.88444                   0.88444   

   val_loss                                title  sample_size  nb_epochs  
0  0.314249  Keras CNN with pretrained embedding        50000          3  


In [47]:
df

Unnamed: 0,acc,categorical_accuracy,loss,val_acc,val_categorical_accuracy,val_loss,title,sample_size,nb_epochs
0,0.97548,0.97548,0.072373,0.88444,0.88444,0.314249,Keras CNN with pretrained embedding,50000,3
