In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is later read from
# a path under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import numpy as np
import pandas as pd


In [None]:
# Tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote characters
# that appear inside a review are kept as ordinary text.
reviews_path = '/content/drive/MyDrive/Restaurant_Reviews.tsv'
dataset = pd.read_csv(reviews_path, delimiter='\t', quoting=3)

In [None]:
# Show the first ten rows of the loaded reviews.
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [None]:
# Show the last ten rows of the loaded reviews.
dataset.tail(10)

Unnamed: 0,Review,Liked
990,The refried beans that came with my meal were ...,0
991,Spend your money and time some place else.,0
992,A lady at the table next to us found a live gr...,0
993,the presentation of the food was awful.,0
994,I can't tell you how disappointed I was.,0
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [None]:
# Plain-text dump of the frame; pandas abbreviates the middle rows and
# reports the overall shape on the last line.
print(dataset)

                                                Review  Liked
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]


In [None]:
# Demonstrate the cleaning step on the first review: every character that is
# not a letter (a-z, A-Z) or a space is deleted.
import re

first_review = dataset['Review'][0]
print('Before Cleaning \n')
print(first_review)

check = re.sub('[^a-z A-Z]', '', first_review)
print('\nafter cleaning\n')
print(check)

Before Cleaning 

Wow... Loved this place.

after cleaning

Wow Loved this place


In [None]:
# Lower-case the cleaned review so that differently-cased spellings of the
# same word compare equal in the later steps.
check = check.lower()
print(check)

wow loved this place


In [None]:
# Remove stop words: very common words (articles, pronouns, ...) that carry
# little information about whether a review is positive or negative.
import nltk
# Download NLTK's stop-word lists (skipped by NLTK if already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords

# Split the cleaned, lower-cased review into individual word tokens.
check = check.split()
print('\n',check)

# Build the lookup set once; constructing it inside the comprehension's
# condition would rebuild it for every single token.
english_stopwords = set(stopwords.words('english'))
check = [word for word in check if word not in english_stopwords]
print('\n',check)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

 ['wow', 'loved', 'this', 'place']

 ['wow', 'loved', 'place']


In [None]:
# Stemming reduces each word to its root form, e.g. "loved" -> "love", so
# that inflected variants of a word are treated as the same token.

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Build the stop-word set once instead of once per token inside the
# comprehension's condition.
english_stopwords = set(stopwords.words('english'))
check = [ps.stem(word) for word in check if word not in english_stopwords]
print('\n',check)



 ['wow', 'love', 'place']


In [None]:
# Re-assemble the stemmed tokens into a single space-separated string.
check = ' '.join(check)
print(check)

wow love place


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Stemming keeps only the root of each word (loved -> love, hated -> hate),
# which is enough to capture its meaning and shrinks the vocabulary.
# One stemmer instance is reused for every review.
ps = PorterStemmer()

# English stop words, except "not": it flips the sentiment of a phrase
# ("not good"), so it has to stay in the text.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Set membership is O(1); build it once, outside the loop.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over every review in the column rather than a hard-coded count,
# so corpus always has one entry per row of the dataset.
for raw_review in dataset['Review']:
  # Replace everything that is not a letter (punctuation, digits, ...) with a space.
  review = re.sub('[^a-zA-Z]', ' ', raw_review)
  review = review.lower()  # normalise case
  review = review.split()  # tokenise on whitespace
  # Drop stop words, then stem what is left.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)  # back to a single string per review
  corpus.append(review)

print(corpus)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['wow love thi place', 'crust is not good', 'not tasti and the textur wa just nasti', 'stop by dure the late may bank holiday off rick steve recommend and love it', 'the select on the menu wa great and so were the price', 'now i am get angri and i want my damn pho', 'honeslti it didn t tast that fresh', 'the potato were like rubber and you could tell they had been made up ahead of time be kept under a warmer', 'the fri were great too', 'a great touch', 'servic wa veri prompt', 'would not go back', 'the cashier had no care what so ever on what i had to say it still end up be wayyy overpr', 'i tri the cape cod ravoli chicken with cranberri mmmm', 'i wa disgust becaus i wa pretti sure that wa human hair', 'i wa shock becaus no sign indic cash onli', 'highli recommend', 'waitress wa a littl slow in servic', 'thi place is not worth your time let alon vega', 'did not like at

In [None]:
# Bag-of-words model: CountVectorizer tokenises each review and turns it into
# a row of per-term occurrence counts.
from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the vocabulary at the 1682 most frequent terms across the
# corpus; rarer terms are dropped.
cv = CountVectorizer(max_features=1682)

# fit_transform learns the vocabulary and returns a sparse count matrix;
# toarray() converts it to a dense NumPy array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector taken from the 'Liked' column.
y = dataset['Liked'].values

In [None]:
# Length of the first row of X, i.e. how many term features each review has.
len(X[0])


1682

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
obj = GaussianNB().fit(X_train, y_train)
# Last expression of the cell: displays the fitted estimator.
obj

GaussianNB()

In [None]:
# Predict labels for the held-out test rows with the fitted Naive Bayes model.
y_pred = obj.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Left as the final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73

In [None]:
# Random forest, splits chosen by entropy (information gain)
from sklearn.ensemble import RandomForestClassifier

forest_params = {'n_estimators': 200, 'criterion': 'entropy', 'random_state': 0}
classifier = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Score the model on the held-out test set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.81


In [None]:
# Random forest, splits chosen by Gini impurity
from sklearn.ensemble import RandomForestClassifier

forest_params = {'n_estimators': 200, 'criterion': 'gini', 'random_state': 0}
classifier = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Score the model on the held-out test set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.81


In [None]:
# Support vector machine with an RBF kernel
from sklearn.svm import SVC

svm_params = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**svm_params).fit(X_train, y_train)

# Score the model on the held-out test set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

0.81
