In [1]:
#Importing libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:

# Load the dataset

# The file holds tab-separated values, hence sep='\t'.
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text,
# so the many double quotes inside the reviews cannot break the parsing.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [4]:
# Preview the first rows to confirm the Review and Liked columns parsed correctly.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
# Cleaning the texts

import re
import nltk  # supplies the English stop-word list
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stemming strips suffixes so that variants of one word (e.g. "loved" and
# "love") collapse to a single root. Without it the bag-of-words matrix built
# later would get a separate, redundant column for every variant.
from nltk.stem.porter import PorterStemmer

# Loop-invariant setup: built once here instead of once per review.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation changes the sentiment
stopword_set = set(all_stopwords)  # set gives constant-time membership tests


def clean_review(text, stemmer=ps, stop_words=stopword_set):
    """Turn one raw review into a space-separated string of stemmed words.

    Steps: replace every non-letter character with a space (so that words
    separated only by punctuation do not fuse together), lower-case, split
    into words, drop stop words and stem what remains.

    Parameters
    ----------
    text : str
        The raw review text.
    stemmer : object with a ``stem(word)`` method
        Stemmer applied to every kept word.
    stop_words : collection of str
        Lower-case words to discard.

    Returns
    -------
    str
        The cleaned review; empty if no word survives the filtering.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# Clean every review in the column, whatever the number of rows.
corpus = [clean_review(raw_text) for raw_text in dataset['Review']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Creating the bag of words model

# CountVectorizer learns the vocabulary of the corpus and counts how often each
# term occurs in each review (one row per review, one column per term).

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)  # sparse document-term matrix
X = term_counts.toarray()  # dense 2-D array, as used by the Naive Bayes model trained later
label_column = dataset.iloc[:, -1]  # last column holds the labels
Y = label_column.values


In [10]:
X.shape[1]  # number of elements in one row, i.e. the vocabulary size

1566

In [11]:
X.shape[0]  # number of rows, i.e. one per review

1000

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features keeps only the 1450 most frequent terms, discarding words that
# appear just a few times and add little signal.
cv = CountVectorizer(max_features=1450)
limited_counts = cv.fit_transform(corpus)
X = limited_counts.toarray()
label_column = dataset.iloc[:, -1]  # last column holds the labels
Y = label_column.values
X.shape[1]  # number of columns after limiting the vocabulary

1450

In [14]:
# train test split

from sklearn.model_selection import train_test_split
# 80% of the rows for training, 20% for testing; the fixed random_state makes
# the shuffle reproducible between runs.
split_parts = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, Y_train, Y_test = split_parts

In [16]:
# training the model naive bayes
# NOTE(review): GaussianNB treats each feature as normally distributed; for
# word-count features MultinomialNB is the more common choice - worth comparing.

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
# Left as the last expression so the notebook displays the fitted estimator.
classifier.fit(X_train , Y_train)


GaussianNB()

In [19]:

# Predict labels for the held-out test set

y_pred = classifier.predict(X_test)
# Debug aid: uncomment to print predicted and true labels side by side (one row per review).
#print(np.concatenate((y_pred.reshape(len(y_pred),1),Y_test.reshape(len(Y_test),1)),1))


In [20]:
from sklearn.metrics import confusion_matrix , accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
# One print call, one value per line: matrix first, then the accuracy.
print(cm, accuracy, sep='\n')

[[55 42]
 [12 91]]
0.73


In [21]:
# predicting a single review
#positive

new_review = 'I love this restaurant so much'

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation changes the sentiment
stopword_set = set(all_stopwords)  # built once instead of once per word

# Apply the same cleaning as the training corpus: letters only, lower-case,
# split into words, drop stop words, stem the rest, re-join with single spaces.
new_review = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_review if word not in stopword_set)
new_corpus = [new_review]

# transform (not fit_transform): reuse the vocabulary the vectorizer already learned.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]


In [22]:
#negative
new_review = 'I hate this restaurant so much'

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation changes the sentiment
stopword_set = set(all_stopwords)  # built once instead of once per word

# Apply the same cleaning as the training corpus: letters only, lower-case,
# split into words, drop stop words, stem the rest, re-join with single spaces.
new_review = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_review if word not in stopword_set)
new_corpus = [new_review]

# transform (not fit_transform): reuse the vocabulary the vectorizer already learned.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


In [23]:
#ambiguity case: the review contains both a positive and a negative statement

new_review = 'I love this restaurant so much and at the same time i hate this'

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation changes the sentiment
stopword_set = set(all_stopwords)  # built once instead of once per word

# Apply the same cleaning as the training corpus: letters only, lower-case,
# split into words, drop stop words, stem the rest, re-join with single spaces.
new_review = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_review if word not in stopword_set)
new_corpus = [new_review]

# transform (not fit_transform): reuse the vocabulary the vectorizer already learned.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


In [24]:
 # trying out a kernel SVM for better accuracy
    
from sklearn.svm import SVC
# Rebinds `classifier`: from here on it refers to the SVM, not the Naive Bayes model.
# kernel='rbf' is spelled out for clarity; the estimator repr shown below the cell
# omits it, which indicates it matches the default.
classifier = SVC(kernel = 'rbf', random_state = 0)
# Left as the last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, Y_train)


SVC(random_state=0)

In [26]:
# Predict test-set labels with the SVM; this overwrites the earlier Naive Bayes predictions in y_pred.
y_pred = classifier.predict(X_test)
# Debug aid: uncomment to print predicted and true labels side by side (one row per review).
#print(np.concatenate((y_pred.reshape(len(y_pred),1),Y_test.reshape(len(Y_test),1)),1))


In [27]:
from sklearn.metrics import confusion_matrix , accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
# One print call, one value per line: matrix first, then the accuracy.
print(cm, accuracy, sep='\n')

[[88  9]
 [36 67]]
0.775


In [28]:
#ambiguity case: same mixed review as before, now scored by the SVM held in `classifier`

new_review = 'I love this restaurant so much and at the same time i hate this'

ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep "not": negation changes the sentiment
stopword_set = set(all_stopwords)  # built once instead of once per word

# Apply the same cleaning as the training corpus: letters only, lower-case,
# split into words, drop stop words, stem the rest, re-join with single spaces.
new_review = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_review if word not in stopword_set)
new_corpus = [new_review]

# transform (not fit_transform): reuse the vocabulary the vectorizer already learned.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]
