# Load the data set : Restaurant_Reviews

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t') #Tab Saperated Value

In [3]:
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


# Clean the dataset

In [4]:
import re #regular expression
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) #Replace
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("[0-9" "]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    return text

clean = lambda x: clean_text(x)


In [5]:
dataset["Review"] = dataset["Review"].apply(clean)

In [6]:
dataset

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
...,...,...
995,i think food should have flavor and texture an...,0
996,appetite instantly gone,0
997,overall i was not impressed and would not go back,0
998,the whole experience was underwhelming and i t...,0


# Stopword

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\myrit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
stop.remove("not")

In [9]:
dataset['New_Review'] = dataset['Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [10]:
dataset

Unnamed: 0,Review,Liked,New_Review
0,wow loved this place,1,wow loved place
1,crust is not good,0,crust not good
2,not tasty and the texture was just nasty,0,not tasty texture nasty
3,stopped by during the late may bank holiday of...,1,stopped late may bank holiday rick steve recom...
4,the selection on the menu was great and so wer...,1,selection menu great prices
...,...,...,...
995,i think food should have flavor and texture an...,0,think food flavor texture lacking
996,appetite instantly gone,0,appetite instantly gone
997,overall i was not impressed and would not go back,0,overall not impressed would not go back
998,the whole experience was underwhelming and i t...,0,whole experience underwhelming think well go n...


# Apply Stemming & Lemmetazation

In [11]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
#list comprehension

#[var for var in seq]

for i in range(1,11):
    print(i)

In [None]:
[i for i in range(1,11)]

In [13]:
word = dataset["New_Review"][0].split()

In [14]:
[ps.stem(w) for w in word]

['wow', 'love', 'place']

In [15]:
result = []

for i in dataset["New_Review"]:
    word = i.split()
    stemmed_tokens = [ps.stem(w) for w in word]
    print(stemmed_tokens)

['wow', 'love', 'place']
['crust', 'not', 'good']
['not', 'tasti', 'textur', 'nasti']
['stop', 'late', 'may', 'bank', 'holiday', 'rick', 'steve', 'recommend', 'love']
['select', 'menu', 'great', 'price']
['get', 'angri', 'want', 'damn', 'pho']
['honeslti', 'didnt', 'tast', 'fresh']
['potato', 'like', 'rubber', 'could', 'tell', 'made', 'ahead', 'time', 'kept', 'warmer']
['fri', 'great']
['great', 'touch']
['servic', 'prompt']
['would', 'not', 'go', 'back']
['cashier', 'care', 'ever', 'say', 'still', 'end', 'wayyy', 'overpr']
['tri', 'cape', 'cod', 'ravoli', 'chicken', 'cranberrymmmm']
['disgust', 'pretti', 'sure', 'human', 'hair']
['shock', 'sign', 'indic', 'cash']
['highli', 'recommend']
['waitress', 'littl', 'slow', 'servic']
['place', 'not', 'worth', 'time', 'let', 'alon', 'vega']
['not', 'like']
['burritto', 'blah']
['food', 'amaz']
['servic', 'also', 'cute']
['could', 'care', 'less', 'interior', 'beauti']
['perform']
['that', 'rightth', 'red', 'velvet', 'cakeohhh', 'stuff', 'good']

['two', 'bite', 'refus', 'eat', 'anymor']
['servic', 'extrem', 'slow']
['minut', 'wait', 'got', 'tabl']
['serious', 'killer', 'hot', 'chai', 'latt']
['allergi', 'warn', 'menu', 'waitress', 'absolut', 'clue', 'meal', 'not', 'contain', 'peanut']
['boyfriend', 'tri', 'mediterranean', 'chicken', 'salad', 'fell', 'love']
['rotat', 'beer', 'tap', 'also', 'highlight', 'place']
['price', 'bit', 'concern', 'mellow', 'mushroom']
['worst', 'thai', 'ever']
['stay', 'vega', 'must', 'get', 'breakfast', 'least']
['want', 'first', 'say', 'server', 'great', 'perfect', 'servic']
['pizza', 'select', 'good']
['strawberri', 'tea', 'good']
['highli', 'unprofession', 'rude', 'loyal', 'patron']
['overal', 'great', 'experi']
['spend', 'money', 'elsewher']
['regular', 'toast', 'bread', 'equal', 'satisfi', 'occasion', 'pat', 'butter', 'mmmm']
['buffet', 'bellagio', 'far', 'anticip']
['drink', 'weak', 'peopl']
['order', 'not', 'correct']
['also', 'feel', 'like', 'chip', 'bought', 'not', 'made', 'hous']
['disappoi

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\myrit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\myrit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

result = []


for i in dataset["New_Review"]:
    word = i.split()
    lemme_tokens = [lemmatizer.lemmatize(w) for w in word]
    r = " ".join(lemme_tokens)
    result.append(r)

In [20]:
dataset["New_Review"] = result

In [None]:
#lemmatizer.lemmatize("runs")

In [21]:
dataset

Unnamed: 0,Review,Liked,New_Review
0,wow loved this place,1,wow loved place
1,crust is not good,0,crust not good
2,not tasty and the texture was just nasty,0,not tasty texture nasty
3,stopped by during the late may bank holiday of...,1,stopped late may bank holiday rick steve recom...
4,the selection on the menu was great and so wer...,1,selection menu great price
...,...,...,...
995,i think food should have flavor and texture an...,0,think food flavor texture lacking
996,appetite instantly gone,0,appetite instantly gone
997,overall i was not impressed and would not go back,0,overall not impressed would not go back
998,the whole experience was underwhelming and i t...,0,whole experience underwhelming think well go n...


# Create BOW: CountVectorizer

In [22]:
X = dataset["New_Review"]
y = dataset["Liked"]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(X)

In [25]:
x.toarray().shape

(1000, 1500)

In [26]:
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# TfIDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf = TfidfVectorizer()
x1 = tfidf.fit_transform(X)

In [29]:
x1.toarray().shape

(1000, 1808)

# Training & Testing

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,train_size=0.7,random_state=0)

In [31]:
X_train.shape,X_test.shape

((700, 1500), (300, 1500))

# Apply Naive Bayes Algorithm: Beroulli Naive Bayes

In [32]:
from sklearn.naive_bayes import BernoulliNB

In [33]:
model = BernoulliNB()
model.fit(X_train,y_train)

BernoulliNB()

# Accuracy Testing

In [34]:
y_pred = model.predict(X_test)

In [35]:
y_pred

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1], d

In [36]:
model.predict_proba(X_test)

array([[4.20765533e-01, 5.79234467e-01],
       [8.80521993e-01, 1.19478007e-01],
       [7.11154996e-01, 2.88845004e-01],
       [5.60022457e-01, 4.39977543e-01],
       [8.33283557e-01, 1.66716443e-01],
       [7.32686982e-01, 2.67313018e-01],
       [1.08121190e-04, 9.99891879e-01],
       [9.22573378e-01, 7.74266215e-02],
       [9.32028708e-01, 6.79712919e-02],
       [7.28757962e-02, 9.27124204e-01],
       [8.09702469e-02, 9.19029753e-01],
       [1.73061621e-02, 9.82693838e-01],
       [2.53764083e-01, 7.46235917e-01],
       [1.18706904e-01, 8.81293096e-01],
       [2.39288157e-02, 9.76071184e-01],
       [9.76257394e-05, 9.99902374e-01],
       [7.94277689e-01, 2.05722311e-01],
       [9.19203422e-01, 8.07965778e-02],
       [8.11028623e-01, 1.88971377e-01],
       [3.49243513e-02, 9.65075649e-01],
       [6.68008224e-01, 3.31991776e-01],
       [5.88821164e-01, 4.11178836e-01],
       [7.58314513e-02, 9.24168549e-01],
       [3.70417359e-01, 6.29582641e-01],
       [8.423869

In [37]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [38]:
confusion_matrix(y_test,y_pred)

array([[109,  34],
       [ 39, 118]], dtype=int64)

In [39]:
accuracy_score(y_test,y_pred)

0.7566666666666667

# Apply User Testing with New Statement

In [40]:
statement = "I went to the restaurant, That food was awesome."

In [41]:
import re #regular expression
import string

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) #Replace
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("[0-9" "]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    return text



In [42]:
st = clean_text(statement)
st

'i went to the restaurant that food was awesome'

In [43]:
len(stop)

178

In [44]:
sl = [x for x in st.split() if x not in stop]
st = " ".join(sl)

In [45]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()



In [46]:
st = " ".join([lemmatizer.lemmatize(x) for x in st.split()])

In [47]:
st

'went restaurant food awesome'

In [50]:
type(X)

pandas.core.series.Series

In [52]:
data = pd.Series(st)

In [54]:
cv_data = cv.transform(data)

In [55]:
cv_data.toarray().shape

(1, 1500)

In [56]:
model.predict(cv_data)

array([1], dtype=int64)

In [61]:
result = model.predict_proba(cv_data)
result

array([[0.05796523, 0.94203477]])

In [63]:
neg_r = result[0][0]*100
neg_r

5.796522647376282

In [64]:
def review_prediction(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) #Replace
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("[0-9" "]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    
    sl = [x for x in text.split() if x not in stop]
    st = " ".join(sl)
    
    st = " ".join([lemmatizer.lemmatize(x) for x in st.split()])
    
    data = pd.Series(st)
    
    cv_data = cv.transform(data)
    
    result = model.predict_proba(cv_data)
    
    print("Negative : {}% \n Positive: {}%".format(result[0][0]*100,result[0][1]*100))

In [66]:
statement = input("Enter your statment: ")

review_prediction(statement)

Enter your statment: i don't like the service...... :(
Negative : 88.74586104081659% 
 Positive: 11.254138959183322%
