In [1]:
#import libraries
import os
#Natural Language Tool Kit
import nltk
import nltk.corpus
#to convert data into dataframe
import pandas as pd
import numpy as np
import string
#to clean data
import re
#for modeling 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
#read the data from 
dataset = pd.read_csv(r"E:\bridgelabz\Restaurant_Reviews (1).tsv", sep='\t', encoding='utf-8')
#dataset.Review
dataset = dataset.iloc[:800]
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
795,So good I am going to have to review this plac...,1
796,"The chips and salsa were really good, the sals...",1
797,This place is great!!!!!!!!!!!!!!,1
798,Mediocre food.,0


In [3]:
dataset.isnull().any()

Review    False
Liked     False
dtype: bool

In [4]:
dataset.drop_duplicates(keep='first', inplace=True)

In [5]:
dataset.describe()

Unnamed: 0,Liked
count,800.0
mean,0.565
std,0.496067
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [6]:
#Data cleaning and proccesing 
#Remove Punctuations, Numbers
#for remove stopword 
import re
from nltk.corpus import stopwords   
#for Stemming propose  
from nltk.stem.porter import PorterStemmer 
#Initialize empty List
corpus = []
for i in range(0, 800):
    # column : "Review", row ith 
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    #converting all characters to lowercase
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
   #loop for stemming each word in string array at ith row
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #rejoin all string array elements to create back into a string
    review = ' '.join(review)
    #append each string to create array of clean text
    corpus.append(review)

In [7]:
df = pd.DataFrame(corpus)
df 

Unnamed: 0,0
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price
...,...
795,good go review place twice herea tribut place ...
796,chip salsa realli good salsa fresh
797,place great
798,mediocr food


In [8]:
count_vect =  CountVectorizer(lowercase=True,stop_words='english',min_df=2,max_features = 10000 )
print(count_vect)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=10000, min_df=2,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [9]:
x_count_vect = count_vect.fit_transform(corpus)
print(x_count_vect)

  (0, 529)	1
  (0, 279)	1
  (0, 356)	1
  (1, 107)	1
  (1, 199)	1
  (2, 471)	1
  (2, 476)	1
  (2, 312)	1
  (3, 279)	1
  (3, 454)	1
  (3, 262)	1
  (3, 387)	1
  (4, 418)	1
  (4, 298)	1
  (4, 202)	1
  (4, 370)	1
  (5, 512)	1
  (5, 111)	1
  (5, 350)	1
  (6, 469)	1
  (6, 188)	1
  (7, 367)	1
  (7, 268)	1
  (7, 473)	1
  (7, 482)	1
  :	:
  (793, 85)	1
  (794, 426)	1
  (794, 209)	1
  (794, 318)	1
  (794, 17)	1
  (794, 114)	1
  (794, 525)	1
  (794, 285)	1
  (795, 356)	2
  (795, 199)	1
  (795, 318)	1
  (795, 394)	1
  (795, 157)	1
  (795, 495)	1
  (796, 199)	1
  (796, 188)	1
  (796, 383)	1
  (796, 83)	1
  (797, 356)	1
  (797, 202)	1
  (798, 187)	1
  (798, 295)	1
  (799, 356)	1
  (799, 248)	1
  (799, 243)	1


In [10]:
x_count_vect.shape

(800, 535)

In [11]:
x_names = count_vect.get_feature_names()
x_names

['absolut',
 'acknowledg',
 'actual',
 'ad',
 'ago',
 'alway',
 'amaz',
 'ambianc',
 'ambienc',
 'anoth',
 'anyon',
 'anytim',
 'appet',
 'area',
 'arriv',
 'ask',
 'assur',
 'ate',
 'atmospher',
 'attack',
 'attent',
 'attitud',
 'authent',
 'avoid',
 'away',
 'awesom',
 'babi',
 'bachi',
 'bacon',
 'bad',
 'bakeri',
 'bar',
 'bare',
 'bartend',
 'basic',
 'bathroom',
 'batter',
 'bay',
 'bean',
 'beat',
 'beauti',
 'beef',
 'beer',
 'belli',
 'best',
 'better',
 'big',
 'bisqu',
 'bit',
 'bite',
 'black',
 'bland',
 'blow',
 'boot',
 'bowl',
 'boy',
 'bread',
 'break',
 'breakfast',
 'brick',
 'bring',
 'brought',
 'brunch',
 'buffet',
 'bunch',
 'burger',
 'busi',
 'butter',
 'cafe',
 'cake',
 'came',
 'car',
 'care',
 'cashier',
 'char',
 'cheap',
 'check',
 'chees',
 'cheeseburg',
 'chef',
 'chewi',
 'chicken',
 'chines',
 'chip',
 'choos',
 'classic',
 'clean',
 'close',
 'cocktail',
 'coffe',
 'cold',
 'combin',
 'come',
 'comfort',
 'compani',
 'complaint',
 'complet',
 'consid

In [12]:
x_count_name=pd.DataFrame(x_count_vect.toarray(), columns=x_names)

In [13]:
x_count_name.shape

(800, 535)

In [14]:
x_count_name.head()

Unnamed: 0,absolut,acknowledg,actual,ad,ago,alway,amaz,ambianc,ambienc,anoth,...,work,world,worst,worth,wow,wrap,wrong,year,yummi,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
x=x_count_name
y=dataset['Liked']
x_train,x_test,y_train,y_test=train_test_split(x_count_name,y,test_size=0.2,)

In [16]:
support_vector_classifier = SVC()
support_vector_classifier.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
random_forest_classifier = RandomForestClassifier(n_estimators = 450)
random_forest_classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=450,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [20]:
#prediction
svc_prediction = support_vector_classifier.predict(x_test)
rfc_prediction = random_forest_classifier.predict(x_test)
lr_prediction = logistic_regression.predict(x_test)
decision_tree_prediction = decision_tree_classifier.predict(x_test)

In [21]:
#Accuracy
print("SVC")
print(accuracy_score(svc_prediction, y_test)*100)
print("Random forest:")
print(accuracy_score(rfc_prediction, y_test)*100)
print("Logistic Regression:")
print(accuracy_score(lr_prediction, y_test)*100)
print("Decision Tree:")
print(accuracy_score(decision_tree_prediction, y_test)*100)

SVC
76.875
Random forest:
77.5
Logistic Regression:
76.875
Decision Tree:
74.375


In [22]:
#Confusion Matrix
print("SVC Algorithm:")
print(confusion_matrix(svc_prediction, y_test))
print("Random forest Algorithm:")
print(confusion_matrix(rfc_prediction, y_test))
print("Logistic Regression Algorithm:")
print(confusion_matrix(lr_prediction, y_test))
print("Decision Tree Algorithm:")
print(confusion_matrix(decision_tree_prediction, y_test))

SVC Algorithm:
[[54 10]
 [27 69]]
Random forest Algorithm:
[[65 20]
 [16 59]]
Logistic Regression Algorithm:
[[59 15]
 [22 64]]
Decision Tree Algorithm:
[[59 19]
 [22 60]]


In [23]:
#classification report
print("Support Vector:")
print(classification_report(svc_prediction, y_test))
print("Random forest:")
print(classification_report(rfc_prediction, y_test))
print("Logistic Regression:")
print(classification_report(lr_prediction, y_test))
print("Decision Tree:")
print(classification_report(decision_tree_prediction, y_test))

Support Vector:
              precision    recall  f1-score   support

           0       0.67      0.84      0.74        64
           1       0.87      0.72      0.79        96

    accuracy                           0.77       160
   macro avg       0.77      0.78      0.77       160
weighted avg       0.79      0.77      0.77       160

Random forest:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78        85
           1       0.75      0.79      0.77        75

    accuracy                           0.78       160
   macro avg       0.77      0.78      0.77       160
weighted avg       0.78      0.78      0.78       160

Logistic Regression:
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        74
           1       0.81      0.74      0.78        86

    accuracy                           0.77       160
   macro avg       0.77      0.77      0.77       160
weighted avg       0.7

In [27]:
#testing
def review_prediction(review):
    new_corpus = []
    review = re.sub('[^a-zA-Z]', ' ' ,review)
    #converting all characters to lowercase
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
   #loop for stemming each word in string array at ith row
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #rejoin all string array elements to create back into a string
    review = ' '.join(review)
    #append each string to create array of clean text
    corpus.append(review)
    count_vect = CountVectorizer(lowercase=True,stop_words='english',min_df=2,max_features = 10000)
    x_count_vect = count_vect.fit_transform(corpus + new_corpus).toarray()
    X = x_count_vect[-1].reshape(1, -1)
    prediction = logistic_regression.predict(X)
    if prediction == 1:
        return "Positive Review"
    else:
        return "Negative Review"
    

In [28]:
print(review_prediction(input()))

service of this restaurant is worst
Negative Review


In [30]:
print(review_prediction(input()))

Food quality is too good
Positive Review
