In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [51]:
# Tab-separated reviews file. quoting=3 is csv.QUOTE_NONE: quote characters
# inside a review are treated as ordinary text instead of field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [52]:
# (number of rows, number of columns) of the loaded frame
dataset.shape

(1000, 2)

### Cleaning the Texts

In [53]:
import re
import nltk
# Fetch the NLTK stop-word corpus used by stopwords.words('english') during text cleaning.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
# Build the cleaned corpus: one normalised, stemmed string per review.
# The stemmer and the stop-word set do not depend on the review, so they are
# created once here rather than once per review (and, for the set, once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep the negation word: "not good" must differ from "good"
stopword_set = set(all_stopwords)
# NOTE(review): contractions such as "didn't" are split into "didn" + "t" by the
# regex below and both pieces are stop words, so that negation is dropped
# (see review 6 -> 'honeslti tast fresh').

corpus = []
# Iterate over the values directly instead of dataset['Review'][i], which is a
# label lookup and would fail if the frame's index were not 0..n-1.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace every non-letter character with a space
    review = review.lower()  # convert all letters to lowercase
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = " ".join(review)
    corpus.append(review)


In [55]:
# Spot-check the first ten cleaned reviews.
print(corpus[:10])

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch']


### Create the Bag of Words Model

In [56]:
# tokenization: turn the cleaned corpus into bag-of-words count features
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at 1500 terms.
cv = CountVectorizer(max_features=1500)
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split, so test reviews influence which terms become features —
# confirm this is acceptable for the reported accuracy.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()  # dense feature matrix for the classifier
# Labels are taken from the last column of the frame.
y = dataset.iloc[:, -1].values

In [57]:
X.shape[1] # number of feature columns (vocabulary size, capped by max_features), not a raw word count

1500

### Build the model

In [58]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [59]:
# classification using the Naive Bayes Classifier
# NOTE(review): GaussianNB treats every feature as a continuous, normally
# distributed value; for word-count features MultinomialNB is the usual
# choice — consider comparing the two.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB()

In [60]:
# Predicted labels for the held-out test reviews.
y_pred = classifier.predict(X_test)

In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows are the true labels, columns are the predicted labels (scikit-learn convention).
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[55 42]
 [12 91]]


In [62]:
# Fraction of test reviews classified correctly, reported as a percentage.
acc = accuracy_score(y_test, y_pred)
accuracy_pct = acc * 100
print('Accuracy of predictions = {:.2f} %'.format(accuracy_pct))

Accuracy of predictions = 73.00 %


In [63]:
# predict for a single review
def review_sentiment(review):
    """Print the predicted sentiment of a single free-text review.

    The text is cleaned with the same steps used to build the training corpus
    (non-letters replaced by spaces, lowercased, stop words removed except
    'not', Porter-stemmed), encoded with the already-fitted ``cv`` vectorizer
    and classified with the trained ``classifier``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    None
        Prints 'Positive review' when the predicted label is 1, otherwise
        'Negative review'.
    """
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')  # keep the negation word, as in the training corpus
    stopword_set = set(all_stopwords)  # built once, not once per word

    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    cleaned = " ".join(ps.stem(word) for word in words if word not in stopword_set)

    # transform (not fit_transform): reuse the vocabulary learned from the training corpus.
    new_features = cv.transform([cleaned]).toarray()
    # predict returns one label per input row; take the single label explicitly
    # rather than truth-testing an array.
    label = classifier.predict(new_features)[0]
    if label == 1:
        print('Positive review')
    else:
        print('Negative review')

In [64]:
# Sanity check on an unseen review.
review1 = 'I love this Restaurant so much'
review_sentiment(review1)

Positive review


In [65]:
# Sanity check on a second unseen review with different wording.
review2 = 'I hate this restaurant so much'
review_sentiment(review2)

Negative review
