# Step-1:Import Libraries

In [32]:
import numpy as np
import pandas as pd
import nltk
import re # Regular expressions
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [33]:
stop = stopwords.words('english')
stop.remove('not')

# Step-2 :Load Dataset

In [34]:
dataset = pd.read_csv('train.tsv',delimiter='\t',encoding='latin-1')

In [35]:
dataset.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


# Step-3: Extracting Data Frames into List

In [36]:
print(dataset.keys())
print(dataset.info())

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
None


In [37]:
x = dataset['Phrase']
y = dataset['Sentiment']

In [38]:
x

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
5         of escapades demonstrating the adage that what...
6                                                        of
7         escapades demonstrating the adage that what is...
8                                                 escapades
9         demonstrating the adage that what is good for ...
10                                  demonstrating the adage
11                                            demonstrating
12                                                the adage
13                                                      the
14                                                    adage
15                          that what is good for the goose
16                                      

In [39]:
Review = x[0]
Review

'A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'

In [40]:
corpus = []
for Review in x:
    #  Step-1: Converting into Lower case
    review = Review.lower()
    # Step-2: Removing Special Characters and numbers
    review = re.sub('[^a-zA-Z]',' ',review)
    # Step-3:Converting into list -> since we need to work on words
    review = review.split()
    # Step-4: Stemming ->Identifying root words and removing stopwords
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stop)]
    review = ' '. join(review)
    corpus.append(review)
print(review)

chortl


# Stemming

In [41]:
corpus

['seri escapad demonstr adag good goos also good gander occasion amus none amount much stori',
 'seri escapad demonstr adag good goos',
 'seri',
 '',
 'seri',
 'escapad demonstr adag good goos',
 '',
 'escapad demonstr adag good goos',
 'escapad',
 'demonstr adag good goos',
 'demonstr adag',
 'demonstr',
 'adag',
 '',
 'adag',
 'good goos',
 '',
 'good goos',
 '',
 'good goos',
 '',
 'good goos',
 'good',
 'goos',
 '',
 'goos',
 'goos',
 'also good gander occasion amus none amount much stori',
 'also good gander occasion amus none amount much stori',
 'also',
 'also',
 'good gander occasion amus none amount much stori',
 'gander occasion amus none amount much stori',
 'gander occasion amus none amount much stori',
 'gander',
 'gander',
 'gander',
 '',
 'occasion amus none amount much stori',
 '',
 '',
 '',
 '',
 'occasion amus none amount much stori',
 'occasion',
 'amus none amount much stori',
 'amus',
 'none amount much stori',
 '',
 'none amount much stori',
 'none',
 'amount much

# Bag of word model
**Count vectorization -Frequency Based model**

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
cv = CountVectorizer()

In [44]:
X_new = cv.fit_transform('Phrase').toarray()
y = dataset['Sentiment'].values

ValueError: Iterable over raw text documents expected, string object received.

In [None]:
cv.get_feature_names()

In [None]:
print(len(cv.get_feature_names()))
print(cv.get_feature_names())

# Step-5: Spliting Data

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_new,y,test_size = 0.2,random_state = 0)

# Step - 6: Buliding Machine Learing Model

In [None]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [None]:
model_M = MultinomialNB()
model_G = GaussianNB()

In [None]:
# Training model
model_M.fit(x_train,y_train)
model_G.fit(x_train,y_train)

In [None]:
y_pred_M = model_M.predict(x_test)
y_pred_G = model_G.predict(x_test)

# Evaluation

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
cm_M = confusion_matrix(y_test,y_pred_M)
cm_G = confusion_matrix(y_test,y_pred_G)

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
sns.heatmap(cm_M,annot = True , cbar = False,cmap ='summer')
plt.title('MultinodalNB')
plt.show()
sns.heatmap(cm_G,annot = True , cbar = False,cmap ='winter')
plt.title('GaussianNB')
plt.show()

In [None]:
cr_M = classification_report(y_test,y_pred_M)
cr_G = classification_report(y_test,y_pred_G)

In [None]:
print(cr_M)
print('='*50)
print(cr_G)

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.dump(model_M,'nlp_multinodal.pkl')
joblib.dump(model_G,'nlp_gaussian.pkl')

# Application

In [None]:
text = 'I love Machine Learning Class'

In [None]:
#  Step-1: Converting into Lower case
review = Review.lower()
# Step-2: Removing Special Characters and numbers
review = re.sub('[^a-zA-Z]',' ',review)
# Step-3:Converting into list -> since we need to work on words
review = review.split()
# Step-4: Stemming ->Identifying root words and removing stopwords
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stop)]
review = ' '. join(review)
print(review)

In [None]:
review = [review]

In [None]:
review

In [None]:
test = cv.transform(review).toarray()

In [None]:
model_M.predict(test)

In [None]:
review