# Sentimental Analysis Of Movie Reviews

## Importing Libraries 

In [1]:
import numpy as np
import pandas as pd
import gc

## Loading Data

In [2]:
train = pd.read_csv('train.tsv', delimiter = '\t')
print(train.shape)
train.sample(5)

(156060, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
119145,119146,6368,are so believable that you feel what they feel,3
36149,36150,1707,is not as well-conceived as either of those films,1
79010,79011,4065,`` Spider-Man '' certainly,3
854,855,32,"-- no , paralyzed --",2
155559,155560,8514,conceivable mistake,0


In [3]:
test = pd.read_csv('test.tsv', delimiter = '\t')
print(test.shape)
test.sample(5)

(66292, 3)


Unnamed: 0,PhraseId,SentenceId,Phrase
38939,195000,10370,her hair and her lips
35077,191138,10181,is popular and powerful in this high-tech age ...
60660,216721,11548,tissues
6721,162782,8805,"sophisticated , funny and good-natured treat"
25667,181728,9713,"if not more so ,"


In [4]:
samples = pd.read_csv('samplesubmission.csv')
print(samples.shape)
samples.sample(5)

(66292, 2)


Unnamed: 0,PhraseId,Sentiment
38494,194555,2
16942,173003,2
4611,160672,2
65148,221209,2
55896,211957,2


## Adding trian and test for data preprocessing

In [5]:
test['Sentiment'] = -1
df = pd.concat((train,test))
print(df.shape)
df.sample(5)

(222352, 4)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
70628,70629,3600,pure finesse,4
25926,181987,9723,An absurdist comedy,-1
57602,57603,2905,", its true colors come out in various wet T-sh...",1
95478,95479,4986,mock the kind of folks they do n't understand ...,2
31186,31187,1456,this fascinating portrait of a modern Lothario,3


In [6]:
del train,test
gc.collect()

10

## Importing libraries for text cleaning

In [7]:
import re
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk import FreqDist

In [8]:
stemmer = SnowballStemmer('english')

## Creating function to clean reviews

In [9]:
def review_clean(all_reviews):
    corpus = []
    for i in range(0, len(all_reviews)):
        review = str(all_reviews[i])
        review = re.sub('[^a-zA-Z]', ' ', review)
        review = [stemmer.stem(word) for word in word_tokenize(str(review).lower())]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [10]:
df['cleaned_review'] = review_clean(df.Phrase.values)

In [44]:
df.shape

NameError: name 'df' is not defined

In [12]:
df.sample(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
37386,37387,1774,Hollywood bio-pic,2,hollywood bio pic
44343,200404,10658,'s first quarter .,-1,s first quarter
115346,115347,6144,as a good spaghetti western,2,as a good spaghetti western
4464,4465,168,whose,2,whose
101728,101729,5355,to merely bad rather than painfully awful,1,to mere bad rather than pain aw


## Splitting data again

In [45]:
df_train = df[df.Sentiment != -1]
df_test = df[df.Sentiment == -1]
df_test.drop('Sentiment', axis = 1).sample(5)

NameError: name 'df' is not defined

In [46]:
print(df_train.shape)
df_train.sample(5)

(156060, 5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
108620,108621,5749,"skillfully assembled , highly polished and",4,skill assembl high polish and
100585,100586,5278,some surprises,3,some surpris
34097,34098,1602,so many distracting special effects and visual...,0,so mani distract special effect and visual par...
136880,136881,7403,director Tuck Tucker,2,director tuck tucker
142744,142745,7747,"a text to ` lick , ' despite the efforts of a ...",1,a text to lick despit the effort of a first ra...


In [15]:
print(df_test.shape)
df_test.sample(5)

(66292, 5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_review
28443,184504,9849,is worth the price of admission,-1,is worth the price of admiss
22063,178124,9533,indie film,-1,indi film
5709,161770,8759,lad,-1,lad
38776,194837,10361,of said behavior,-1,of said behavior
39759,195820,10416,of Two Weddings and a Funeral,-1,of two wed and a funer


In [16]:
del df
gc.collect()

40

## Importing libraries to process further

In [36]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

In [37]:
train = df_train.cleaned_review.values
test = df_test.cleaned_review
y = to_categorical(df_train.Sentiment.values)

In [38]:
print(train.shape)
print(test.shape)
print(y.shape)

(156060,)
(66292,)
(156060, 5)


## Spliting training set into train and validation set

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, y, stratify = y, test_size = 0.2)

In [43]:
print(X_train.shape, X_val.shape, X_test.shape)

(124848,) (31212,) (66292, 48)


## Total number of unique words 

In [47]:
total_words = ' '.join(X_train)
total_words = word_tokenize(total_words)
unique_words = FreqDist(total_words)
max_features = len(unique_words)
print(max_features)

10342


## Maximum length of review

In [48]:
review_len = []
for text in X_train:
    word = word_tokenize(text)
    l = len(word)
    review_len.append(l)
max_review_len = np.max(review_len)
print(max_review_len)

48


## Tokenizing texts

In [49]:
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test)

## Sequences padding

In [50]:
X_train = sequence.pad_sequences(X_train, maxlen=max_review_len)
X_val = sequence.pad_sequences(X_val, maxlen=max_review_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_len)

## Building CNN

In [51]:
batch_size = 128
epochs = 3
num_classes = 5

In [52]:
classifier= Sequential()
classifier.add(Embedding(max_features, 100, input_length=max_review_len))
classifier.add(Dropout(0.2))

classifier.add(Conv1D(64, kernel_size=3, padding='same', activation='relu', strides=1))
classifier.add(GlobalMaxPooling1D())

classifier.add(Dense(128, activation='relu'))
classifier.add(Dropout(0.2))

classifier.add(Dense(num_classes, activation='softmax'))

classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [53]:
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 48, 100)           1034200   
_________________________________________________________________
dropout_10 (Dropout)         (None, 48, 100)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 645       
Total para

In [54]:
classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 124848 samples, validate on 31212 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2d4bb108a58>

In [55]:
y_pred = classifier.predict_classes(X_test, verbose = 1)



In [60]:
y_pred

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)