In [178]:
import pandas as pd
import pickle
import re
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suresha.bc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [198]:
# Load the tab-separated restaurant review dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Restaurant_Reviews.tsv', sep='\t')

In [199]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [200]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],  # every column except the last one -> model inputs
    df.iloc[:, -1],   # the last column -> target label
    random_state=0,
    test_size=0.20,
)

In [201]:
# Build the cleaned training corpus: one normalised, stemmed string per review.
corpus = []
ps = PorterStemmer()

# Build the stop-word set once, outside the loop. Rebuilding it for every
# single word (as a comprehension condition would) re-reads the NLTK word list
# each time and dominates the run time of this cell.
# 'not' is removed from the stop words so it stays in the text -- presumably
# because negations such as "not good" carry the sentiment.
stop_words = set(stopwords.words('english')) - {'not'}

for review_text in X_train['Review']:
    # Replace every non-letter with a space, lower-case, and split into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()

    # Drop stop words, stem the remaining tokens, and re-join them into one string.
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))

In [202]:
# Bag-of-words encoder whose vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the training corpus, then encode that same corpus
# as a dense count matrix.
cv.fit(corpus)
X_trn = cv.transform(corpus).toarray()

# Training labels as a plain NumPy array.
y_trn = y_train.to_numpy()

In [203]:
# Persist the fitted vectoriser so the same vocabulary can be reused later.
# The `with` block guarantees the file is flushed and closed even if pickling
# fails; the bare open() used before left the handle to the garbage collector.
with open('cv-transform.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

In [151]:
# Model building -- single BernoulliNB classifier (disabled; the ensemble cells below are used instead)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
#classifier = BernoulliNB()
#classifier.fit(X_train, y_train)

In [204]:
# Ensemble model building: the three base learners that the voting ensemble combines.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Every learner gets the same fixed seed so repeated runs give the same models.
svm_clf = SVC(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
log_clf = LogisticRegression(random_state=42)

In [205]:
# Hard voting: the ensemble predicts the class chosen by the majority of the base models.
hard_voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
)

In [206]:
# Build the cleaned test corpus using the same normalisation steps as the
# training corpus: letters only, lower-case, stop words removed, stemmed.
corpus_tst = []
ps = PorterStemmer()

# Build the stop-word set once, outside the loop. Rebuilding it for every
# single word (as a comprehension condition would) re-reads the NLTK word list
# each time and dominates the run time of this cell.
# 'not' is removed from the stop words so it stays in the text -- presumably
# because negations such as "not good" carry the sentiment.
stop_words = set(stopwords.words('english')) - {'not'}

for review_text in X_test['Review']:
    # Replace every non-letter with a space, lower-case, and split into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', review_text).lower().split()

    # Drop stop words, stem the remaining tokens, and re-join them into one string.
    corpus_tst.append(' '.join(ps.stem(word) for word in tokens if word not in stop_words))

In [207]:
# X_test preparation: encode the test corpus with the vocabulary already
# learned by `cv` (transform only -- the vectoriser is not refit here).
X_tst = cv.transform(corpus_tst).toarray()

# Test labels as a plain NumPy array.
y_tst = y_test.to_numpy()

In [208]:
from sklearn.metrics import accuracy_score

# Fit each base model and the hard-voting ensemble on the training matrix,
# then print "<ClassName> <accuracy>" measured on the held-out test matrix.
for clf in [log_clf, rnd_clf, svm_clf, hard_voting_clf]:
    y_pred = clf.fit(X_trn, y_trn).predict(X_tst)
    print(type(clf).__name__, accuracy_score(y_tst, y_pred))

LogisticRegression 0.775
RandomForestClassifier 0.755
SVC 0.78
VotingClassifier 0.78


In [210]:
# Fresh, unfitted base learners for the soft-voting ensemble.
# SVC is created with probability=True because soft voting combines the
# class probabilities predicted by each estimator.
svm_clf = SVC(probability=True, random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
log_clf = LogisticRegression(random_state=42)

soft_voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
)


In [194]:
from sklearn.metrics import accuracy_score

# Fit each base model and the soft-voting ensemble on the training matrix,
# then print "<ClassName> <accuracy>" measured on the held-out test matrix.
for clf in [log_clf, rnd_clf, svm_clf, soft_voting_clf]:
    y_pred = clf.fit(X_trn, y_trn).predict(X_tst)
    print(type(clf).__name__, accuracy_score(y_tst, y_pred))

LogisticRegression 0.775
RandomForestClassifier 0.755
SVC 0.78
VotingClassifier 0.795


In [212]:
# Fit the soft-voting ensemble on the training data; the fitted estimator is
# the cell's last expression, so its repr is displayed.
soft_voting_clf.fit(X=X_trn, y=y_trn)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(probability=True, random_state=42))],
                 voting='soft')

In [214]:
# Score the fitted soft-voting ensemble on the held-out test matrix.
y_pred = soft_voting_clf.predict(X_tst)
accuracy_score(y_true=y_tst, y_pred=y_pred)

0.795

In [215]:
# Persist the fitted soft-voting ensemble (LogisticRegression + RandomForest + SVC).
# NOTE: the file name still says "mnb" although the object saved is not a
# Multinomial Naive Bayes model; the name is left unchanged so that anything
# loading this path keeps working.
filename = 'restaurant-sentiment-mnb-model.pkl'

# The `with` block guarantees the file is flushed and closed even if pickling
# fails; the bare open() used before left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(soft_voting_clf, model_file)


In [195]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the most recent predictions.
cm = confusion_matrix(y_true=y_tst, y_pred=y_pred)

In [197]:
# Display the confusion matrix (scikit-learn's convention: rows are true
# labels, columns are predicted labels).
cm

array([[84, 13],
       [28, 75]], dtype=int64)

In [227]:
# Apply the same preprocessing used for the training corpus to a single piece
# of real input text held in `message`.
# NOTE(review): `message` is not assigned in any visible cell before this
# point; it must already hold the raw text (a str) when this cell runs,
# otherwise a fresh kernel raises NameError here.

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word set once instead of once per word. 'not' is removed
# from the stop words so it stays in the text.
stop_words = set(stopwords.words('english')) - {'not'}

# Replace every non-letter with a space, lower-case, and split into tokens.
tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()

# Drop stop words, stem the remaining tokens, and re-join them into one string.
message = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

# Printed as a one-element list, i.e. the shape later passed to a vectoriser.
print([message])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\suresha.bc\AppData\Roaming\nltk_data...


['not possibl']


[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
    
    #create the corpus
    #corpus_tst.append(review)