In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
#import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
#from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

In [14]:
##Reading the input files

train = pd.read_csv('train.tsv', sep="\t")


In [15]:
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what is good for the goose,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is good for the goose,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for the goose,2


In [16]:
train.loc[train.SentenceId == 2]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
63,64,2,"This quiet , introspective and entertaining independent is worth seeking .",4
64,65,2,"This quiet , introspective and entertaining independent",3
65,66,2,This,2
66,67,2,"quiet , introspective and entertaining independent",4
67,68,2,"quiet , introspective and entertaining",3
68,69,2,quiet,2
69,70,2,", introspective and entertaining",3
70,71,2,introspective and entertaining,3
71,72,2,introspective and,3
72,73,2,introspective,2


Cleaning the data

In [17]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
from string import punctuation
import re

In [18]:
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [19]:
def clean_review(review_col):
    review_corpus=[]
    for i in range(0,len(review_col)):
        review=str(review_col[i])
        review=re.sub('[^a-zA-Z]',' ',review)
        
        review=[lemma.lemmatize(w) for w in word_tokenize(str(review).lower())]
        review=' '.join(review)
        review_corpus.append(review)
    return review_corpus
   

In [20]:
train['clean_review']=clean_review(train.Phrase.values)

In [21]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of a story
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series


In [22]:
##Balancing the data by Resampling
from sklearn.utils import resample
train_2 = train[train['Sentiment']==2]
train_1 = train[train['Sentiment']==1]
train_3 = train[train['Sentiment']==3]
train_4 = train[train['Sentiment']==4]
train_5 = train[train['Sentiment']==0]
train_2_sample = resample(train_2,replace=True,n_samples=75000,random_state=123)
train_1_sample = resample(train_1,replace=True,n_samples=75000,random_state=123)
train_3_sample = resample(train_3,replace=True,n_samples=75000,random_state=123)
train_4_sample = resample(train_4,replace=True,n_samples=75000,random_state=123)
train_5_sample = resample(train_5,replace=True,n_samples=75000,random_state=123)

df_upsampled = pd.concat([train_2, train_1_sample,train_3_sample,train_4_sample,train_5_sample])

In [23]:
df_upsampled.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what is good for the goose,2,of escapade demonstrating the adage that what is good for the goose


In [24]:
text = ' '.join(df_upsampled.loc[df_upsampled.Sentiment == 4, 'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]

In [25]:
Counter(text_trigrams).most_common(30)

[(('one', 'of', 'the'), 1644),
 (('of', 'the', 'year'), 832),
 (('of', 'the', 'best'), 677),
 (('of', 'the', 'most'), 612),
 (('is', 'one', 'of'), 407),
 (('One', 'of', 'the'), 370),
 ((',', 'and', 'the'), 333),
 (('the', 'year', "'s"), 326),
 (('It', "'s", 'a'), 323),
 (('the', 'edge', 'of'), 300),
 (('it', "'s", 'a'), 299),
 (('a', 'movie', 'that'), 297),
 (('of', 'your', 'seat'), 273),
 (('the', 'film', 'is'), 267),
 (('the', 'kind', 'of'), 267),
 (('.', 'is', 'a'), 264),
 (('the', 'film', "'s"), 264),
 (('as', 'one', 'of'), 254),
 ((',', 'the', 'film'), 253),
 (('edge', 'of', 'your'), 249),
 ((',', 'this', 'is'), 236),
 (('as', 'well', 'as'), 231),
 ((',', 'it', "'s"), 226),
 (('film', 'that', 'is'), 223),
 (('.', 'It', "'s"), 218),
 (('a', 'film', 'that'), 211),
 ((',', 'funny', ','), 208),
 (('some', 'of', 'the'), 206),
 (('year', "'s", 'best'), 188),
 (('a', 'solid', 'cast'), 178)]

In [26]:
tokenizer = TweetTokenizer()

Applying ML algorithm

In [27]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(df_upsampled['clean_review'].values)
vectorizer.fit(full_text)
df_upsampled_vectorized = vectorizer.transform(df_upsampled['clean_review'])
test_vectorized = vectorizer.transform(train['clean_review'])
test1 = train['clean_review']



In [28]:
y = df_upsampled['Sentiment']

In [29]:
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

In [30]:
%%time
ovr.fit(df_upsampled_vectorized, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 58.9 s, sys: 1min 4s, total: 2min 3s
Wall time: 1min 2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [31]:
scores = cross_val_score(ovr, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 72.04%, std 0.43.


In [32]:

%%time
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 77.44%, std 0.47.
CPU times: user 76.4 ms, sys: 40.7 ms, total: 117 ms
Wall time: 59.2 s


In [33]:

%%time
model = MultinomialNB()
#model.fit(train_vectorized, y)
scores =  cross_val_score(model, df_upsampled_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 59.51%, std 0.16.
CPU times: user 72.9 ms, sys: 34.9 ms, total: 108 ms
Wall time: 580 ms


In [34]:
from keras.utils import to_categorical
X = df_upsampled['clean_review']
#test_set = test['clean review']
#Y = train['Sentiment']
Y = to_categorical(df_upsampled['Sentiment'].values)
print(Y)

[[0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


In [37]:

from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=123)

In [36]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(284686,) (284686, 5)
(94896,) (94896, 5)


In [38]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [39]:
##Total number of words/features
all_words=' '.join(X_train)
all_words=word_tokenize(all_words)
#print(all_words)
dist=FreqDist(all_words)

num_unique_word=len(dist)
num_unique_word
#X_train.head()

13728

In [40]:
##Number of words for each phrase/text
r_len=[]
for text in X_train:
    word=word_tokenize(text)
  #  print(text)
    l=len(word)
    r_len.append(l)
    
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN

48

In [41]:

max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=5

In [43]:

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

Tokenizing the words

In [44]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

X_test = tokenizer.texts_to_sequences(test1)
#X_test

In [45]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

In [46]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
#print(X_train.shape,X_val.shape)
X_test

array([[    0,     0,     0, ...,     4,     2,    41],
       [    0,     0,     0, ...,    14,     1,  4890],
       [    0,     0,     0, ...,     0,     2,   334],
       ...,
       [    0,     0,     0, ...,     0, 10416, 10417],
       [    0,     0,     0, ...,     0,     0, 10416],
       [    0,     0,     0, ...,     0,     0, 10417]], dtype=int32)

CNN model

In [47]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.models import Sequential

In [48]:
model2 = Sequential()

# Input / Embdedding
model2.add(Embedding(max_features, 150, input_length=max_words))

# CNN
model2.add(SpatialDropout1D(0.2))

model2.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))

model2.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model2.add(MaxPooling1D(pool_size=2))

model2.add(Flatten())

# Output layer
model2.add(Dense(5, activation='sigmoid'))

In [49]:

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f49c51397b8>

predictions 


In [51]:
pred2=model2.predict_classes(X_test,verbose=1)


#sub.head()

  92/4877 [..............................] - ETA: 8s





In [52]:
pred2

array([1, 2, 2, ..., 3, 2, 2])

In [53]:
###testingfor new reviews
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [55]:
tokens_pad = pad_sequences(tokens, maxlen=MAX_REVIEW_LEN)
tokens_pad.shape

(8, 48)

In [56]:
pred3=model2.predict_classes(tokens_pad,verbose=1)





In [57]:
print(pred3)

[4 4 2 2 1 0 2 0]
