## Business Problem

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [8]:
data = pd.read_csv("Restaurant_Reviews.tsv", sep = '\t', quoting = 3)

In [9]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
data.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [11]:
data['Liked'].value_counts()
# balanced dataset

1    500
0    500
Name: Liked, dtype: int64

### Cleaning Text Data

In [12]:
import nltk
import re

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.corpus import stopwords

In [15]:
data['Review'][0]

'Wow... Loved this place.'

In [16]:
review = re.sub('[^a-zA-Z]', ' ', data['Review'][0])
review

'Wow    Loved this place '

In [17]:
review = review.lower()
review
# everything become lowercase

'wow    loved this place '

In [18]:
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [19]:
preview = []
for word in review:
  if word not in stopwords.words('english'):
    preview.append(word)

In [20]:
preview

['wow', 'loved', 'place']

In [21]:
review = [word for word in review if word not in stopwords.words('english')]

In [22]:
review

['wow', 'loved', 'place']

In [23]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [24]:
review = [ps.stem(word) for word in review]
review

['wow', 'love', 'place']

In [25]:
review = " ".join(review)

In [26]:
print(review)

wow love place


In [27]:
corpus = []
ps = PorterStemmer()

for i in range(len(data)):
  review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
  review = " ".join(review)

  corpus.append(review)

In [28]:
print(corpus)

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could happier', 'seem like go

### Bag of word model

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer(max_features = 1500)

In [32]:
x = cv.fit_transform(corpus).toarray()

In [33]:
x.shape

(1000, 1500)

In [34]:
y = data.iloc[:, 1].values

In [35]:
y.shape

(1000,)

In [36]:
y[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1])

## Apply Naive Bayes Algorithm

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = .2, random_state = 123)

In [38]:
X_train.shape, X_test.shape

((800, 1500), (200, 1500))

In [39]:
y_train.shape, y_test.shape

((800,), (200,))

In [40]:
from sklearn.naive_bayes import GaussianNB

In [41]:
classifier = GaussianNB()

In [42]:
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [44]:
y_pred = classifier.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score(y_test, y_pred)

0.675