In [664]:
#Feature extraction for Natural Language Processing
import numpy as np
import pandas as pd


In [665]:
#Loading the dataset
#The reviews are stored as tab-separated values: a tab is far less likely to appear inside
#a user-written review than a comma, so the columns split more reliably.
#quoting=3 tells the parser to treat double quotes as ordinary characters instead of field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)


In [666]:
#Peek at the first five rows (head() returns five rows by default)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [667]:
#Peek at the last five rows (tail() returns five rows by default)
dataset.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [668]:
#We are going to keep only uppercase and lowercase letters and replace everything else (digits, punctuation) with spaces - numbers could mean anything from a time or date to an arbitrary rating. Words have much more precise learning value.
#regular expressions operations - will use this to keep only alphabet and spaces
import re 


In [669]:
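#Stop words - articles, prepositions, etc. - won't add anything of value to help our model learn, so the plan is to remove them.
#NOTE: nltk is imported here for that purpose, but the preprocessing loop in this notebook does not actually filter stop words yet.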
import nltk


In [670]:
#Stemming - Words like loved,love or sucked,sucks, etc essentially express the same sentiment so we will reduce them to their root word


from nltk.stem.porter import PorterStemmer 

In [671]:
corpus=[] #empty list that will collect one cleaned, stemmed review string per row

In [672]:
#(number of rows, number of columns) of the loaded dataset
dataset.shape

(1000, 2)

In [673]:
#Build the corpus: one cleaned, lower-cased, stemmed string per review.
#A single stemmer is reused for every review; stemming is done to reduce columns in our feature matrix - reducing sparsity
ps=PorterStemmer()
#Start from an empty list every time so that re-running this cell does not append the reviews a second time
corpus=[]
#Iterate over the column itself instead of range(0,1000) so the loop follows the actual number of rows
for raw_review in dataset['Review']:
    #'[^a-zA-Z]' matches every character that is NOT a letter; each match is replaced by a space
    review=re.sub('[^a-zA-Z]',' ',raw_review)
    #lower-case the text, then split on whitespace to get the individual words
    words=review.lower().split()
    #stem every word and join the stems back into a single space-separated string
    corpus.append(' '.join(ps.stem(word) for word in words))


In [674]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Count 1- to 4-word n-grams, ignore terms that occur in more than 20% of the reviews,
# and cap the vocabulary at 3000 features.
cv = CountVectorizer(
    ngram_range=(1, 4),
    max_df=0.2,
    max_features=3000,
)

# Learn the vocabulary from the corpus and return the term-document matrix as a dense array.
X = cv.fit_transform(corpus).toarray()

# Labels: the second column of the dataset; .values drops the pandas index and returns a plain array.
y = dataset.iloc[:, 1].values

In [675]:
#Print the dense term-document count matrix (large arrays are abbreviated with "..." in the output)
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [676]:

#(number of reviews, number of vocabulary terms kept by the vectorizer)
X.shape

(1000, 3000)

In [677]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [678]:

#Train a support vector classifier on the bag-of-words training features
from sklearn import svm
classifier = svm.SVC(gamma='scale') #only gamma is set explicitly; every other hyper-parameter keeps its default
classifier.fit(X_train, y_train) #last expression of the cell, so the fitted estimator's repr is displayed

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [679]:
from sklearn.metrics import confusion_matrix

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix: rows are the true labels, columns are the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[88  9]
 [25 78]]


In [680]:
#Same confusion matrix as a labelled table, with row/column totals added by margins=True
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,88,9,97
1,25,78,103
All,113,87,200


In [681]:
#The diagonal of the confusion matrix holds the correctly classified reviews:
#cm[0][0] = true negatives (true 0, predicted 0), cm[1][1] = true positives (true 1, predicted 1).
#Their sum is therefore the number of correct predictions, not the number of true positives.
print('Correct predictions (TN + TP) = ',cm[0][0]+cm[1][1])

True Positives =  166


In [682]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the test-set predictions against the true labels.
score1 = accuracy_score(y_test, y_pred)   # accuracy
score2 = precision_score(y_test, y_pred)  # precision
score3 = recall_score(y_test, y_pred)     # recall

# Accuracy is reported as a percentage; precision and recall as fractions, all rounded to 2 decimals.
print("\n")
print("Accuracy is ", round(100 * score1, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Accuracy is  83.0 %
Precision is  0.9
Recall is  0.76
