### Import the libraries.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
df = pd.read_csv("D:\Restaurant_Reviews.tsv",delimiter = '\t',quoting = 3)
df.head(3)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0


### Cleaning the texts in order to provide accurate results and cut down noise. 

In [3]:
# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
christi = []
for i in range(0, 1000):
  review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  christi.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\johnm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(christi)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

In [5]:
len(christi)

1000

### Bag of Words

In [29]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(christi).toarray()
y = df.iloc[:, -1].values

In [30]:
cv.vocabulary_

{'wow': 1548,
 'love': 802,
 'place': 1029,
 'crust': 324,
 'not': 923,
 'good': 594,
 'tasti': 1363,
 'textur': 1375,
 'nasti': 900,
 'stop': 1312,
 'late': 762,
 'may': 834,
 'bank': 92,
 'holiday': 667,
 'rick': 1150,
 'steve': 1305,
 'recommend': 1116,
 'select': 1206,
 'menu': 852,
 'great': 608,
 'price': 1063,
 'get': 578,
 'angri': 33,
 'want': 1498,
 'damn': 333,
 'pho': 1017,
 'honeslti': 670,
 'tast': 1361,
 'fresh': 553,
 'potato': 1055,
 'like': 785,
 'rubber': 1164,
 'could': 297,
 'tell': 1368,
 'made': 813,
 'ahead': 15,
 'time': 1397,
 'kept': 745,
 'warmer': 1500,
 'fri': 554,
 'touch': 1415,
 'servic': 1215,
 'prompt': 1075,
 'would': 1546,
 'go': 588,
 'back': 83,
 'cashier': 213,
 'care': 205,
 'ever': 461,
 'say': 1191,
 'still': 1307,
 'end': 445,
 'wayyy': 1508,
 'overpr': 968,
 'tri': 1425,
 'cape': 201,
 'cod': 259,
 'ravoli': 1106,
 'chicken': 236,
 'cranberri': 310,
 'mmmm': 872,
 'disgust': 384,
 'pretti': 1062,
 'sure': 1347,
 'human': 687,
 'hair': 628,
 

In [31]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Train Test Split

In [32]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [33]:
# Training the Naive Bayes model on the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

### Confusion Matrix

In [34]:
# Confusion Matrix
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[55 42]
 [12 91]]


0.73

### We achieved an accuarcy of 73% in determining positive and negative reviews of an restaurant online. This can be a difficult task and does not always result in a high accuracy as words that are considered positive such as 'great' can be used in the sentence 'The food was a great dissappointment', which is a negative comment. Recurrent Neural Networks are better for this and are more complicated. For our Bag of Words model we achieved a good accuracy.