In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the reviews CSV into a DataFrame (absolute path on the author's machine).
df = pd.read_csv(filepath_or_buffer=r"D:\project\zomato\zomato_reviews.csv")

In [3]:
# Peek at the first ten rows.
df.head(n=10)

Unnamed: 0,review,rating
0,' A beautiful place to dine in.The interiors ...,1
1,' I was here for dinner with my family on a w...,1
2,' Its a restaurant near to Banashankari BDA. ...,0
3,' We went here on a weekend and one of us had...,1
4,' The best thing about the place is itÃ\x83Ã...,1
5,' Great food and pleasant ambience. Expensive...,1
6,' Good ambience with tasty food.Cheese chilli...,1
7,' You canÃ\x83Ã\x83Ã\x82Ã\x82Ã\x83Ã\x82...,1
8,' Overdelighted by the service and food provi...,1
9,' The place is nice and comfortable. Food wis...,1


In [4]:
# Peek at the last ten rows.
df.tail(n=10)

Unnamed: 0,review,rating
1319958,' Thought of a perfect dinner party and gone ...,0
1319959,' Yet to start full fledged menu')],1
1319960,' Nice and friendly place and staff is awesom...,1
1319961,' The service is bad. We ordered 4 Stella Art...,0
1319962,' Nice bar inside Sheraton. The hosts are fri...,1
1319963,""" My review based on food drink serviceFood:-...",1
1319964,' I visited chime bar at Sheraton hotel it wa...,1
1319965,' This restaurant is situated inside grand Sh...,1
1319966,"' Great ambience , looking nice good selectio...",1
1319967,' The nest is one of the best bar in Whitefie...,1


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(1319968, 2)

In [6]:
# Count missing values per column.
df.isna().sum()

review    0
rating    0
dtype: int64

In [7]:
# Class balance of the target column.
df['rating'].value_counts()

1    890676
0    429292
Name: rating, dtype: int64

In [8]:
# Importing essential libraries for performing Natural Language Processing on the reviews loaded from 'zomato_reviews.csv'
import nltk
import re
# Fetch the NLTK stop-word corpus used when cleaning the reviews
# (NLTK reports "already up-to-date" when it is present locally).
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Cleaning the reviews
#
# Builds `corpus`: one cleaned string per review for the first 20000 rows
# (the same rows whose labels are taken with df.rating[:20000] when the
# Bag of Words model is created).
#
# Loop-invariant objects are created once up front. Previously the stop-word
# set was rebuilt for every single word and a new stemmer was created for
# every review, which dominated the running time without changing the result.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for raw_review in df['review'].iloc[:20000]:

    # Replace every non-letter character with a space and lower-case the text
    review = non_letters.sub(' ', raw_review).lower()

    # Tokenize on whitespace (split() already ignores leading/trailing blanks)
    review_words = review.split()

    # Drop English stop words, then stem what is left
    stem_word = [ps.stem(word) for word in review_words if word not in stop_words]

    # Re-join the stemmed tokens and add the cleaned review to the corpus
    corpus.append(' '.join(stem_word))

In [10]:
# Bag of Words: learn the vocabulary from the cleaned corpus and encode every
# review as a dense row of token counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# Labels for the same first 20000 rows that were cleaned into `corpus`.
y = df['rating'][:20000]

In [12]:
import pickle
# Creating a pickle file for the CountVectorizer
# The context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); a bare open(...) left the handle open.
with open('cv-transform.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

In [13]:
# Model Building
# Hold out 20% of the rows for evaluation; the split is stratified on the
# label and seeded so it is repeatable.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)


In [14]:
# Gaussian Naive Bayes fitted on the training split.
from sklearn import naive_bayes
clf = naive_bayes.GaussianNB().fit(X_train, y_train)
clf

GaussianNB()

In [15]:
from sklearn.metrics import accuracy_score
# Accuracy of the Gaussian NB model on the held-out split.
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

0.6735

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with additive smoothing alpha=0.1.
# alpha is passed by keyword: positional constructor arguments are deprecated
# in scikit-learn 0.23 and rejected with a TypeError from 1.0 onwards.
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)



MultinomialNB(alpha=0.1)

In [17]:
from sklearn.metrics import accuracy_score
# Accuracy of the Multinomial NB model on the held-out split.
accuracy_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.89375

In [18]:
# Score an ad-hoc review with the trained Multinomial NB model.
#
# The vectorizer's vocabulary was learned from cleaned text (letters only,
# lower-cased, English stop words removed, Porter-stemmed), so new text has to
# go through the same steps before it is vectorized. Without this, a word
# whose surface form differs from its stem (e.g. "service" vs. "servic")
# is not found in the vocabulary and contributes nothing to the prediction.
a = ["bad food but fast service"]

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
a_clean = [
    ' '.join(
        ps.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', text).lower().split()
        if word not in stop_words
    )
    for text in a
]

vect = cv.transform(a_clean).toarray()
classifier.predict(vect)

array([0], dtype=int64)

In [19]:
# Creating a pickle file for the Multinomial Naive Bayes model
import pickle
filename = 'restaurant_model.pkl'
# The context manager guarantees the file is flushed and closed once the dump
# finishes (or fails); a bare open(...) left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)