# Sentimental Analysis Of Movie Reviews

## Importing Libraries 

In [1]:
import numpy as np
import pandas as pd
import gc

## Loading Data

In [2]:
train = pd.read_csv('train.tsv', delimiter = '\t')
print(train.shape)
train.sample(5)

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
133399,133400,7198,at last count,2
54873,54874,2733,Snacks,2
15027,15028,646,while it may not rival the filmmaker 's period...,2
109195,109196,5782,"conventional , but",2
47891,47892,2335,that would be foreign in American teen comedies,2


In [3]:
test = pd.read_csv('test.tsv', delimiter = '\t')
print(test.shape)
test.sample(5)

(66292, 3)


Unnamed: 0,PhraseId,SentenceId,Phrase
62134,218195,11624,the first sci-fi comedy
41380,197441,10501,"as an intense , brooding character study"
21488,177549,9505,bored or frustrated by the film
6969,163030,8817,Too leisurely
25976,182037,9726,this sad-sack waste


In [4]:
samples = pd.read_csv('samplesubmission.csv')
print(samples.shape)
samples.sample(5)

(66292, 2)


Unnamed: 0,PhraseId,Sentiment
37170,193231,2
58938,214999,2
15371,171432,2
59049,215110,2
61669,217730,2


## Adding trian and test for data preprocessing

In [5]:
test['Sentiment'] = -1
df = pd.concat((train,test))
print(df.shape)
df.sample(5)

(222352, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
23893,179954,9620,deepen,-1
4322,4323,164,bet there is,2
132775,132776,7159,world dichotomy,2
110218,110219,5837,proverbial,2
120427,120428,6439,Anna Mouglalis,2


In [6]:
del train,test
gc.collect()

10

## Importing libraries for text cleaning

In [7]:
import re
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk import FreqDist

In [8]:
stemmer = SnowballStemmer('english')

## Creating function to clean reviews

In [9]:
def review_clean(all_reviews):
    corpus = []
    for i in range(0, len(all_reviews)):
        review = str(all_reviews[i])
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = [stemmer.stem(word) for word in word_tokenize(str(review).lower())]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [10]:
df['cleaned_review'] = review_clean(df.Phrase.values)

In [11]:
df.sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
145016,145017,7876,during the first hour,2,dure the first hour
146672,146673,7979,of watching this film with an audience full of...,1,of watch this film with an audienc full of tee...
121777,121778,6525,deserves more,2,deserv more
18357,174418,9356,to any actress I can remember to personifying ...,-1,to ani actress i can rememb to personifi indep...
31342,187403,9992,"lend its imprimatur to , then perhaps",-1,lend it imprimatur to then perhap


## Splitting data again

In [12]:
df_train = df[df.Sentiment != -1]
df_test = df[df.Sentiment == -1]
df_test.drop('Sentiment', axis = 1).sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,cleaned_review
62744,218805,11657,` urban comedy ',urban comedi
64442,220503,11746,are so heavy-handed that they instead pummel t...,are so heavi hand that they instead pummel the...
14750,170811,9179,on the small screen,on the small screen
34480,190541,10155,of coming-of-age cliches,of come of age clich
65634,221695,11812,it 's now coming true ' bad,it s now come true bad


In [13]:
print(df_train.shape)
df_train.sample(5)

(156060, 5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
132725,132726,7155,just copies,1,just copi
24053,24054,1093,can thank me for this,2,can thank me for this
119010,119011,6361,the love scenes all end in someone screaming .,2,the love scene all end in someon scream
13470,13471,580,This is n't a stand up and cheer flick ;,2,this is n t a stand up and cheer flick
26743,26744,1226,subtle ironies,3,subtl ironi


In [14]:
print(df_test.shape)
df_test.sample(5)

(66292, 5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
59855,215916,11502,heard since Macy Gray 's game of Chinese whisp...,-1,heard sinc maci gray s game of chines whisper ...
34744,190805,10168,difficult-to-swallow,-1,difficult to swallow
19849,175910,9424,modern context,-1,modern context
23601,179662,9607,correctness,-1,correct
11131,167192,9005,is n't a weak or careless performance amongst ...,-1,is n t a weak or careless perform amongst them


In [15]:
del df
gc.collect()

30

## Importing libraries to process further

In [16]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

Using TensorFlow backend.


In [17]:
train = df_train.cleaned_review.values
test = df_test.cleaned_review
y = to_categorical(df_train.Sentiment.values)

In [18]:
print(train.shape)
print(test.shape)
print(y.shape)

(156060,)
(66292,)
(156060, 5)


## Spliting training set into train and validation set

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, y, stratify = y, test_size = 0.2)

In [21]:
print(X_train.shape, X_val.shape, test.shape)

(124848,) (31212,) (66292,)


## Total number of unique words 

In [22]:
total_words = ' '.join(X_train)
total_words = word_tokenize(total_words)
unique_words = FreqDist(total_words)
max_features = len(unique_words)
print(max_features)

10341


## Maximum length of review

In [23]:
review_len = []
for text in X_train:
    word = word_tokenize(text)
    l = len(word)
    review_len.append(l)
max_review_len = np.max(review_len)
print(max_review_len)

48


## Tokenizing texts

In [24]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test)

## Sequences padding

In [25]:
X_train = sequence.pad_sequences(X_train, maxlen=max_review_len)
X_val = sequence.pad_sequences(X_val, maxlen=max_review_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_len)

## Building CNN

In [26]:
batch_size = 128
epochs = 3
num_classes = 5

In [27]:
classifier= Sequential()
classifier.add(Embedding(max_features, 100, input_length=max_review_len))
classifier.add(Dropout(0.2))

classifier.add(Conv1D(64, kernel_size=3, padding='same', activation='relu', strides=1))
classifier.add(GlobalMaxPooling1D())

classifier.add(Dense(128, activation='relu'))
classifier.add(Dropout(0.2))

classifier.add(Dense(num_classes, activation='softmax'))

classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [28]:
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 48, 100)           1034100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 100)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 645       
Total para

In [29]:
classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x159a31c25c0>

In [30]:
y_pred = classifier.predict_classes(X_test, verbose = 1)



In [31]:
y_pred

array([3, 3, 2, ..., 2, 2, 1], dtype=int64)