# Restaurant Reviews - 2

### Imports

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### The Data

In [2]:
# Load the tab-separated reviews file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# literal text instead of being interpreted as field delimiters.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


### NLP Classification Task

###### Cleaning the texts

In [5]:
# Importing the libraries

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
ps = PorterStemmer()

###### Work on a single review first to clean it

###### Remove special characters

In [7]:
rvw = re.sub('[^a-z A-Z]', ' ', dataset['Review'][0])
print(rvw)

Wow    Loved this place 


###### change all texts to lower case

In [8]:
rvw = rvw.lower()
print(rvw)

wow    loved this place 


###### split the sentence into words

In [9]:
rvw = rvw.split()
print(rvw)

['wow', 'loved', 'this', 'place']


###### stem the words

In [10]:
# Load the English stop-word list once and keep it as a set: the original
# re-read the list for every word and searched it linearly.
english_stopwords = set(stopwords.words('english'))

# Drop stop words, then reduce each remaining word to its Porter stem.
rvw = [ps.stem(word) for word in rvw if word not in english_stopwords]
print(rvw)

['wow', 'love', 'place']


###### Join the words back into a sentence

In [11]:
rvw = ' '.join(rvw)
print(rvw)

wow love place


##### Now work on the whole dataset

In [12]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review, in the same order as the rows of `dataset`.

# Loop-invariant: build the stop-word set once instead of once per word.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the stop-word filter also removes negations such as "not"
# ("Crust is not good." becomes "crust good"), which discards sentiment
# information - consider keeping negation words.
corpus = []
# Iterate over the column itself rather than range(0, 1000) so the cell works
# for any number of rows and does not depend on a default integer index.
for raw_review in dataset['Review']:
    # Replace every character that is not an ASCII letter or a space.
    review = re.sub('[^a-z A-Z]', ' ', raw_review)
    words = review.lower().split()
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [13]:
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

#### Creating the Bag of Words model

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: each review becomes a vector of token counts, with the
# vocabulary capped at the 1500 most frequent tokens in the corpus.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split further down, so test-set words influence the feature
# space - consider fitting on the training reviews only.
cv = CountVectorizer(max_features=1500)

# Dense count matrix, one row per review.
X = cv.fit_transform(corpus).toarray()
# Select the label by column name instead of by position so the cell keeps
# working if the column order of the input file changes.
y = dataset['Liked']

In [15]:
X[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
y[:10]

0    1
1    0
2    0
3    1
4    1
5    0
6    0
7    0
8    1
9    1
Name: Liked, dtype: int64

In [17]:
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

#### Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# shuffle (and therefore the reported metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

#### Training the Model

In [19]:
X_train[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
X_test[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
y_train[:10]

382    1
994    0
982    0
47     1
521    1
636    0
19     0
366    0
340    0
952    0
Name: Liked, dtype: int64

In [22]:
y_test[:10]

507    0
818    0
452    0
368    1
242    0
929    0
262    1
810    0
318    1
49     1
Name: Liked, dtype: int64

In [23]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier1 = GaussianNB()
classifier1.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

#### Predictions and Evaluations

In [24]:
y_pred = classifier1.predict(X_test)

In [25]:
print(y_pred[:10])

[0 0 0 1 1 1 1 1 1 1]


In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# Print each evaluation metric for the held-out predictions under its own
# heading, with a separator line before every section.
separator = '--------------------------------------------------------'
report_sections = (
    ('Confusion Matrix \n', confusion_matrix),
    ('Accuracy Score \n', accuracy_score),
    ('Classification Report \n', classification_report),
)
for heading, metric in report_sections:
    print(separator)
    print(heading, metric(y_test, y_pred))

--------------------------------------------------------
Confusion Matrix 
 [[60 48]
 [15 77]]
--------------------------------------------------------
Accuracy Score 
 0.685
--------------------------------------------------------
Classification Report 
               precision    recall  f1-score   support

           0       0.80      0.56      0.66       108
           1       0.62      0.84      0.71        92

   micro avg       0.69      0.69      0.69       200
   macro avg       0.71      0.70      0.68       200
weighted avg       0.72      0.69      0.68       200



##### Accuracy can be improved by trying different models and tuning