<a href="https://colab.research.google.com/github/obeabi/NaturalLanguageProcessing/blob/master/MainRestaurantReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written by Abiola Obembe
## Restaurant Reviews
### Objective: To predict the sentiment of customers based on reviews
### Bag of Word Model
### Date: 30th August 2020

In [143]:
# install libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 

In [144]:
# load dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter= '\t', quoting= 3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [145]:
# Evaluate dataset structure and data type
print(dataset.shape)
dataset.dtypes

(1000, 2)


Review    object
Liked      int64
dtype: object

In [146]:
dataset['Review'][0]

'Wow... Loved this place.'

## Data Cleaning

In [147]:
comment = ['I am here and will be back tomorrow;however, if you don''t see me be calm ', 'Ayo Oyebisi, I love you so much and it drives me crazy. Why so?']
df = pd.DataFrame(comment, columns=['Review'])
df.head()



Unnamed: 0,Review
0,"I am here and will be back tomorrow;however, i..."
1,"Ayo Oyebisi, I love you so much and it drives ..."


In [148]:
# install libraies
import re     
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []

for i in range(dataset.shape[0]):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()


  ps     = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review = ' '.join(review)
  corpus.append(review)

print(corpus)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashi

In [149]:
# improve the data cleaning to ensure useful words are not removed
# install libraies

corpus = []

for i in range(dataset.shape[0]):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
  review = review.lower()
  review = review.split()
  ps     = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

## Create Bag of Word Model (Tokenization)

In [150]:
# tokenization
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features= 1350)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:,-1].values

In [151]:
# Examine X and y
print(X.shape)
print(y.shape)
print(X[0:2])

(1000, 1350)
(1000,)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [152]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(800, 1350)
(800,)
(200, 1350)
(200,)


## Machine learning Classification Models

### Logistic Regression

In [153]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [154]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 96.375%


In [155]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[80 17]
 [28 75]]
The accuracy score for this model on the test set is : 77.5%


### Naive Bayes

In [156]:
# Training the Naive Bayes model on the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [157]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 91.125%


In [158]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')


[[55 42]
 [12 91]]
The accuracy score for this model on the test set is : 73.0%


### KNN

In [159]:
# Training the K-NN model on the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [160]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 76.75%


In [161]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[86 11]
 [65 38]]
The accuracy score for this model on the test set is : 62.0%


### Decison Tree

In [162]:
# Training the Decision Tree Classification model on the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [163]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 99.5%


In [164]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[79 18]
 [33 70]]
The accuracy score for this model on the test set is : 74.5%


### Random Forest

In [165]:
# Training the Random Forest Classification model on the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [166]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 98.875%


In [167]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[82 15]
 [26 77]]
The accuracy score for this model on the test set is : 79.5%


### SVM

In [168]:
# Training the Kernel SVM model on the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [169]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 97.0%


In [170]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[89  8]
 [36 67]]
The accuracy score for this model on the test set is : 78.0%


### XGBOOST

In [171]:
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators= 500)
classifier.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [172]:
# Evaluate training accuracy
y_hat = classifier.predict(X_train)
from sklearn.metrics import confusion_matrix, accuracy_score
score = accuracy_score(y_hat,  y_train)
print("The accuracy score for the model on the training set is :", str(score*100)+'%')

The accuracy score for the model on the training set is : 86.375%


In [173]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
score = accuracy_score(y_test, y_pred)
print("The accuracy score for this model on the test set is :", str(score*100)+'%')

[[79 18]
 [40 63]]
The accuracy score for this model on the test set is : 71.0%


### Grid Search on Random Forest Model ( since it has the largest accuracy score)

In [175]:
# Training the Random Forest Classification model on the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [176]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 79.25 %
Standard Deviation: 3.63 %


In [177]:
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [{'bootstrap': ['True', 'False'], 'ccp_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
               'n_estimators': [10, 50, 100, 150, 200, 250]}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 80.88 %
Best Parameters: {'bootstrap': 'True', 'ccp_alpha': 0.0, 'n_estimators': 50}
