# Movie Sentiment Analysis with Keras

In [1]:
# uncomment these for Google collab, will have already been installed in local environment 
# if 'pip install -r requirements.txt' has been run
#!pip install nltk
#!pip install --upgrade gensim

import numpy as np
import os
import os.path

from pdb import set_trace
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[nltk_data] Downloading package punkt to /home/jeremie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
if not os.path.isdir('../aclImdb'):
    if not os.path.isfile('../aclImdb_v1.tar.gz'):
      !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

    if not os.path.isdir('../aclImdb'):  
      !tar -xf aclImdb_v1.tar.gz 

In [3]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=3000
positive_sample_file_list = glob.glob(os.path.join('../aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('../aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [4]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1][:100])
print("Negative review(s):", df_negatives['reviews'][1][:100])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)


Positive review(s): NYC model Alison Parker (Cristina Raines) rents a room in an old brownstone where she meets a few bi
Negative review(s): The reason the DVD releases of this film are in black and white is because nobody can get their hand


In [5]:
import tensorflow as tf

# def lstm_keras():
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM, Dropout
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing import sequence

Using TensorFlow backend.


In [6]:


vocab_size = 1000

# Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',\
#           lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

tokenize = Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(X_train)

tokenized_X_train = tokenize.texts_to_sequences(X_train)
tokenized_X_test = tokenize.texts_to_sequences(X_test)

max_document_length = max([len(x) for x in np.concatenate((tokenized_X_train,tokenized_X_test),axis=0)])
encoded_X_train = sequence.pad_sequences(tokenized_X_train, maxlen=max_document_length)
encoded_X_test = sequence.pad_sequences(tokenized_X_test, maxlen=max_document_length)


encoder = LabelBinarizer()
encoder.fit(y_train)
encoded_y_train = encoder.transform(y_train)
encoded_y_test = encoder.transform(y_test)



https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/

In [9]:
from keras.layers import Concatenate
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils import to_categorical
# create the model


channels = []
inputs = []

encoded_X_trains= []
encoded_X_tests = []
for filter_len in [3,4,5]:
# for filter_len in [3,4]:
    inputs1 = Input(shape=(max_document_length,))
    inputs.append(inputs1)
    embedding1 = Embedding(vocab_size, 128, input_length=max_document_length)(inputs1)
    conv1 = Conv1D(filters=128, kernel_size=filter_len, padding='same', activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    channels.append(flat1)
    encoded_X_trains.append(encoded_X_train)
    encoded_X_tests.append(encoded_X_test)
    
# merge
merged = concatenate(channels)
# interpretation
outputs = Dense(2, activation='softmax')(merged)
model = Model(inputs=inputs, outputs=outputs)
# compile
    
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy','categorical_accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1228)         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1228)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1228, 128)    128000      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1228, 128)    128000      input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (

In [10]:

batch_size = 128
num_epochs = 5

model.fit(encoded_X_trains,to_categorical(y_train),batch_size=batch_size,epochs=num_epochs,\
          validation_data=(encoded_X_tests,to_categorical(y_test)))


Train on 4500 samples, validate on 1500 samples
Epoch 1/1


<keras.callbacks.History at 0x7fb2800f9cf8>