In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re

In [2]:
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

# Cleaning Data

In [5]:
from nltk.stem.porter import PorterStemmer

In [6]:
import nltk.corpus
from nltk.corpus import stopwords

In [7]:
input = re.sub(pattern = '[^a-zA-Z]', repl=' ', string=df['Review'][0])
input

'Wow    Loved this place '

In [8]:
L = []

for i in range(0,len(df)):
    feedbacks = re.sub(pattern = '[^a-zA-Z]', repl=' ', string=df['Review'][i])
    feedbacks = feedbacks.lower() 
    feedbacks_words = feedbacks.split()
    feedback_words = [word for word in feedbacks_words if not word in set(stopwords.words('english'))]
    
    ps = PorterStemmer()
    feedbacks = [ps.stem(word) for word in feedbacks_words]
    
    feedbacks = ' '.join(feedbacks)
    
    L.append(feedbacks)

In [9]:
from sklearn.feature_extraction .text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [10]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(L).toarray()
y = df.iloc[:, 1].values

In [11]:
filename = 'cv_fit.pkl'
pickle.dump(cv, open(filename, 'wb'))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [13]:
accuracies = {}

In [14]:
from sklearn.svm import SVC 
classifier = SVC()
classifier.fit(X_train,y_train)

acc = classifier.score(X_test, y_test)*100
accuracies['SVM'] = acc
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

Test Accuracy of SVM Algorithm: 81.50%


In [15]:
from sklearn.naive_bayes import GaussianNB 
model1 = GaussianNB() 
model1.fit(X_train, y_train)

acc = model1.score(X_test, y_test)*100
accuracies['Naive Bayes'] = acc
print("Test Accuracy of Naive Bayes Algorithm: {:.2f}%".format(acc))

Test Accuracy of Naive Bayes Algorithm: 73.00%


In [16]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier (n_neighbors=5)
classifier.fit(X_train, y_train)

acc = classifier.score(X_test, y_test)*100
accuracies['KNN'] = acc
print("Test Accuracy of Naive Bayes Algorithm: {:.2f}%".format(acc))             

Test Accuracy of Naive Bayes Algorithm: 67.00%


In [17]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.2)
model.fit(X_train, y_train)

acc = model.score(X_test, y_test)*100
accuracies['Multinomial'] = acc
print("Test Accuracy of Naive Bayes Algorithm: {:.2f}%".format(acc))

Test Accuracy of Naive Bayes Algorithm: 79.50%


In [18]:
from sklearn.linear_model import LogisticRegression 
classifier = LogisticRegression() 
classifier.fit(X_train,y_train)

acc = model.score(X_test, y_test)*100
accuracies['Logistic'] = acc
print("Test Accuracy of Naive Bayes Algorithm: {:.2f}%".format(acc))

Test Accuracy of Naive Bayes Algorithm: 79.50%


In [19]:
filename2 = 'multinb_model.pkl'
pickle.dump(model1, open(filename, 'wb'))