In [1]:
import pandas as pd
import numpy as np
import nltk
# Fetch the NLTK stop-word corpus; it is read later when stop words are removed.
nltk.download('stopwords')
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation / chained-assignment warnings that are
# relevant to the cells below. Consider narrowing the filter.
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score


[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loading the data

In [2]:
# Read the tab-separated review file. It has no header row, so pandas labels
# the columns 0 (review text) and 1 (0/1 label).
df = pd.read_csv(
    "yelp_labelled.txt",
    sep="\t",
    header=None,
)
df

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


Preprocess the data

In [4]:
# Lower-case the review text held in column 0.
dp = df.loc[:, 0].str.lower()
dp

0                               wow... loved this place.
1                                     crust is not good.
2              not tasty and the texture was just nasty.
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                             appetite instantly gone.
997    overall i was not impressed and would not go b...
998    the whole experience was underwhelming, and i ...
999    then, as if i hadn't wasted enough of my life ...
Name: 0, Length: 1000, dtype: object

In [5]:
# Remove URLs: anything starting with "http" up to the next whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would turn this step into a silent no-op.
dp = dp.str.replace(r'http\S+', '', regex=True)
dp

0                               wow... loved this place.
1                                     crust is not good.
2              not tasty and the texture was just nasty.
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                             appetite instantly gone.
997    overall i was not impressed and would not go b...
998    the whole experience was underwhelming, and i ...
999    then, as if i hadn't wasted enough of my life ...
Name: 0, Length: 1000, dtype: object

In [6]:
# Remove punctuation: every character that is neither a word character nor
# whitespace. The pattern is a raw string so \w and \s are not parsed as
# (invalid) Python string escapes, and regex=True is explicit because
# pandas >= 2.0 defaults to literal matching.
dp = dp.str.replace(r'[^\w\s]', '', regex=True)
dp

0                                   wow loved this place
1                                      crust is not good
2               not tasty and the texture was just nasty
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                              appetite instantly gone
997    overall i was not impressed and would not go back
998    the whole experience was underwhelming and i t...
999    then as if i hadnt wasted enough of my life th...
Name: 0, Length: 1000, dtype: object

In [7]:
# Load NLTK's English stop-word list and check which container type it is.
stop_words = nltk.corpus.stopwords.words("english")
type(stop_words)


list

In [8]:
# Drop stop words from every review and re-join the remaining tokens with
# single spaces. Membership is tested against a set so each lookup is O(1)
# instead of a linear scan of the stop-word list.
# NOTE(review): the stop-word list contains negations such as "not", so
# "crust is not good" becomes "crust good". For sentiment classification this
# discards signal; consider keeping negation words.
stop_word_set = set(stop_words)
dt = dp.apply(lambda a: " ".join(i for i in a.split() if i not in stop_word_set))
dt

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                            selection menu great prices
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think well go n...
999    hadnt wasted enough life poured salt wound dra...
Name: 0, Length: 1000, dtype: object

In [9]:
# Replace the review text in column 0 with the preprocessed text.
# NOTE: this overwrites df in place, so the earlier display of df is stale.
df[0]=dt

In [10]:
def splitting(features, target, p):
  """Split features and target into leading (train) and trailing (test) parts.

  The cut position is ``int(p * features.shape[0])``. Everything before it is
  the training part and everything from it onward is the test part. Rows are
  not shuffled.

  Parameters:
    features: sliceable object exposing ``.shape`` (e.g. a pandas Series).
    target: sliceable object aligned with ``features``.
    p: fraction of rows assigned to the training part.

  Returns:
    Tuple ``(train_X, train_y, test_X, test_y)``.
  """
  cut = int(p * features.shape[0])
  head = slice(0, cut)
  tail = slice(cut, None)
  return features[head], target[head], features[tail], target[tail]

In [11]:
# First 80% of the rows become the training set, the remaining 20% the test set.
train_X, train_y, test_X, test_y = splitting(features=df[0], target=df[1], p=0.8)

Creation of Vocabulary

In [12]:
def vocab_words(a, vocab_list):
  """Append every not-yet-seen word from ``a`` to ``vocab_list`` in place.

  First-seen order is preserved and duplicates are skipped. A set mirror of
  the vocabulary is used for membership tests so each lookup is O(1) rather
  than a linear scan of the list.

  Parameters:
    a: iterable of (hashable) words, e.g. the result of ``str.split()``.
    vocab_list: list of known words; it is mutated in place.

  Returns:
    None.
  """
  seen = set(vocab_list)
  for word in a:
    if word not in seen:
      seen.add(word)
      vocab_list.append(word)

In [13]:
# Collect the training vocabulary in first-seen order, then print it with its size.
vocab_list = []
for review in train_X:
  vocab_words(review.split(), vocab_list)
print(vocab_list,'\n','length of vocabulary',len(vocab_list))

['wow', 'loved', 'place', 'crust', 'good', 'tasty', 'texture', 'nasty', 'stopped', 'late', 'may', 'bank', 'holiday', 'rick', 'steve', 'recommendation', 'selection', 'menu', 'great', 'prices', 'getting', 'angry', 'want', 'damn', 'pho', 'honeslty', 'didnt', 'taste', 'fresh', 'potatoes', 'like', 'rubber', 'could', 'tell', 'made', 'ahead', 'time', 'kept', 'warmer', 'fries', 'touch', 'service', 'prompt', 'would', 'go', 'back', 'cashier', 'care', 'ever', 'say', 'still', 'ended', 'wayyy', 'overpriced', 'tried', 'cape', 'cod', 'ravoli', 'chickenwith', 'cranberrymmmm', 'disgusted', 'pretty', 'sure', 'human', 'hair', 'shocked', 'signs', 'indicate', 'cash', 'highly', 'recommended', 'waitress', 'little', 'slow', 'worth', 'let', 'alone', 'vegas', 'burrittos', 'blah', 'food', 'amazing', 'also', 'cute', 'less', 'interior', 'beautiful', 'performed', 'thats', 'rightthe', 'red', 'velvet', 'cakeohhh', 'stuff', 'never', 'brought', 'salad', 'asked', 'hole', 'wall', 'mexican', 'street', 'tacos', 'friendly',

Building the training word-count feature matrix

In [14]:
# One row per training review and one column per vocabulary word; counts start at zero.
train_matrice = pd.DataFrame(
    0,
    index=np.arange(len(train_X)),
    columns=vocab_list,
)

In [15]:
# Display the freshly created (all-zero) training count matrix.
train_matrice

Unnamed: 0,wow,loved,place,crust,good,tasty,texture,nasty,stopped,late,...,roll,brother,law,works,hereas,tribute,event,held,salsa,youll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Fill the training matrix with per-review word counts.
# Rows are addressed by position via enumerate (the matrix index is
# 0..n-1), so the loop no longer depends on a hard-coded row count.
# .at[row, word] writes straight into the frame; the previous chained form
# .loc[i][w] += 1 updates a temporary row object and leaves the frame
# unchanged under pandas Copy-on-Write.
# NOTE: re-running this cell without recreating train_matrice double-counts.
for row, review in enumerate(train_X):
  for w in review.split():
    train_matrice.at[row, w] += 1

In [17]:
# Display the training count matrix after it has been filled.
train_matrice

Unnamed: 0,wow,loved,place,crust,good,tasty,texture,nasty,stopped,late,...,roll,brother,law,works,hereas,tribute,event,held,salsa,youll
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,0,2,0,1,0,0,0,0,0,...,0,0,0,0,1,2,1,1,0,0
796,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
797,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Building the test word-count feature matrix

In [18]:
# One row per test review, with the same vocabulary columns as the training matrix; counts start at zero.
test_matrice = pd.DataFrame(
    0,
    index=np.arange(len(test_X)),
    columns=vocab_list,
)

In [19]:
# Display the freshly created (all-zero) test count matrix.
test_matrice

Unnamed: 0,wow,loved,place,crust,good,tasty,texture,nasty,stopped,late,...,roll,brother,law,works,hereas,tribute,event,held,salsa,youll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Assumption: words that appear in the test set but not in the training vocabulary are ignored.

In [21]:
# Fill the test matrix with per-review word counts, skipping any word that is
# not a column of the matrix (i.e. not in the training vocabulary).
# Rows are addressed by position via enumerate (the matrix index is
# 0..n-1), which replaces the hard-coded 800/1000 bounds and the i-800 offset.
# .at[row, word] writes straight into the frame; the previous chained form
# .loc[i-800][w] += 1 updates a temporary row object and leaves the frame
# unchanged under pandas Copy-on-Write.
# NOTE: re-running this cell without recreating test_matrice double-counts.
for row, review in enumerate(test_X):
  for w in review.split():
    if w in test_matrice.columns:
      test_matrice.at[row, w] += 1
test_matrice

Unnamed: 0,wow,loved,place,crust,good,tasty,texture,nasty,stopped,late,...,roll,brother,law,works,hereas,tribute,event,held,salsa,youll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Naive Bayes

In [22]:
# Fit a multinomial Naive Bayes classifier (smoothing parameter alpha=1.0)
# on the training word counts.
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X=train_matrice, y=train_y)

MultinomialNB()

In [23]:
# Mean accuracy on the training data.
nb_model.score(X=train_matrice, y=train_y)

0.95625

In [24]:
# Mean accuracy on the held-out test data.
nb_model.score(X=test_matrice, y=test_y)

0.74

In [25]:
# Predicted labels for the training reviews.
trainy_pred = nb_model.predict(X=train_matrice)
trainy_pred

array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

In [26]:
# F1 score of the training predictions against the true training labels.
f1_score(y_true=train_y, y_pred=trainy_pred)

0.9614961496149614

In [27]:
# Predicted labels for the held-out test reviews.
testy_pred = nb_model.predict(X=test_matrice)
testy_pred

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0])

In [28]:
# F1 score of the test predictions against the true test labels.
f1_score(y_true=test_y, y_pred=testy_pred)

0.6338028169014084