In [1]:
#41403
#importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#loading data set into dataframe
df = pd.read_csv("Restaurant_Reviews.tsv",delimiter="\t",quoting=3)

In [3]:
#printing starting 10 tuples from dataset
df.head(10)                              # 0 - negative response    1 - positive response

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [4]:
size = df.shape
size = size[0]
print("tuples of dataset:-",size)
print("Count of response")
count = df['Liked'].value_counts()
print(count)

tuples of dataset:- 1000
Count of response
1    500
0    500
Name: Liked, dtype: int64


In [5]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords    #Stop wards include the ,a , etc
from nltk.stem.porter import PorterStemmer #This reduces the word to stem
corpus = [] #Clean list of words

all_stopward = stopwords.words('english')
print("total stopwords:\t\t",len(all_stopward))
all_stopward.remove('not')
print("after removing 'not' word from stopword:\t",len(all_stopward))
print("All stopwords:-\n",all_stopward)


[nltk_data] Downloading package stopwords to /home/mint/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


total stopwords:		 179
after removing 'not' word from stopword:	 178
All stopwords:-
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most

In [6]:
#In order to use textual data for predictive modeling,
#the text must be parsed to remove stopwords
#this process is called tokenization.
for i in range(0,size):
    review = re.sub('[^a-zA-Z]',' ',df["Review"][i]) #Convert non character to spaces
    review = review.lower()                          
    words = review.split()
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words if not word in set(all_stopward)]
    words = ' '.join(words)
    corpus.append(words)
print(len(corpus))
print(corpus)

1000
['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid coul

In [7]:
#These words need to then be encoded as integers, or floating-point values,
#for use as inputs in machine learning algorithms.
#This process is called feature extraction (or vectorization).
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_features=1500)
vec.fit(corpus)
print(vec.vocabulary_)
vector = vec.transform(corpus)
print(vector.shape)
print(vector.toarray())

{'wow': 1484, 'love': 764, 'place': 986, 'crust': 308, 'not': 883, 'good': 565, 'tasti': 1306, 'textur': 1318, 'nasti': 861, 'stop': 1257, 'late': 724, 'may': 796, 'bank': 86, 'holiday': 633, 'rick': 1104, 'steve': 1250, 'recommend': 1071, 'select': 1156, 'menu': 813, 'great': 577, 'price': 1019, 'get': 549, 'angri': 30, 'want': 1436, 'damn': 316, 'pho': 974, 'honeslti': 636, 'tast': 1304, 'fresh': 525, 'potato': 1011, 'like': 747, 'rubber': 1117, 'could': 283, 'tell': 1311, 'made': 775, 'ahead': 13, 'time': 1340, 'kept': 708, 'warmer': 1438, 'fri': 526, 'touch': 1357, 'servic': 1165, 'prompt': 1031, 'would': 1483, 'go': 559, 'back': 78, 'cashier': 203, 'care': 195, 'ever': 434, 'say': 1142, 'still': 1252, 'end': 420, 'wayyy': 1446, 'overpr': 926, 'tri': 1366, 'cape': 191, 'cod': 247, 'ravoli': 1061, 'chicken': 226, 'cranberri': 295, 'mmmm': 833, 'disgust': 363, 'pretti': 1018, 'sure': 1290, 'human': 653, 'hair': 596, 'shock': 1172, 'sign': 1183, 'indic': 673, 'cash': 201, 'highli': 62

In [8]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:,-1]

In [9]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [10]:
print(X.shape)

(1000, 1500)


In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train, Y_test = train_test_split(X,y,test_size = 0.2,random_state = 0)

In [17]:
from sklearn.naive_bayes import MultinomialNB
classifier3 = MultinomialNB()
classifier3.fit(X_train,Y_train)

MultinomialNB()

In [18]:
y_pred2 = classifier3.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score,confusion_matrix

print(accuracy_score(Y_test,y_pred2))
print(confusion_matrix(Y_test,y_pred2))

0.78
[[75 22]
 [22 81]]
