In [1]:
import math                      
import matplotlib.pyplot as plt  
import scipy                     
import cv2                       
import numpy as np               
import glob                     
import os                        
import pandas as pd              
import tensorflow as tf       
import itertools
import random
from random import shuffle       
from tqdm import tqdm           
from PIL import Image
from scipy import ndimage
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
import re

## Reading the dataset from link

In [2]:
path = '/content/sentiment labelled sentences/'

In [3]:
zip_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
import requests, zipfile, io
r = requests.get(zip_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [4]:
df_yelp = pd.read_csv(path+'yelp_labelled.txt', sep = '\t', header = None, names=['review', 'is_positive'])
df_yelp.head()

Unnamed: 0,review,is_positive
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
len(df_yelp)

1000

In [6]:
# Making sure all reviews have lowercase
df_yelp['review'] = df_yelp['review'].str.lower()

In [7]:
df_yelp.head()

Unnamed: 0,review,is_positive
0,wow... loved this place.,1
1,crust is not good.,0
2,not tasty and the texture was just nasty.,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [None]:
print(len(df_yelp[df_yelp['is_positive'] == 1]))
print(len(df_yelp[df_yelp['is_positive'] == 0]))

500
500


### LSTM/RNN Text classification

In [8]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df_yelp['review'].values)
X = tokenizer.texts_to_sequences(df_yelp['review'].values)
X = pad_sequences(X)
X

array([[  0,   0,   0, ..., 165,   8,  15],
       [  0,   0,   0, ...,   7,  12,  16],
       [  0,   0,   0, ...,   4,  46, 430],
       ...,
       [  0,   0,   0, ...,  12,  38,  31],
       [  0,   0,   0, ..., 137, 159,  40],
       [  0,   0,   0, ..., 347,   1, 248]], dtype=int32)

In [None]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 32, 128)           256000    
                                                                 
 spatial_dropout1d_3 (Spatia  (None, 32, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_3 (LSTM)               (None, 196)               254800    
                                                                 
 dense_6 (Dense)             (None, 1)                 197       
                                                                 
Total params: 510,997
Trainable params: 510,997
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
Y = df_yelp['is_positive']

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(800, 32) (800,)
(200, 32) (200,)


In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
25/25 - 3s - loss: 0.6898 - accuracy: 0.5188 - 3s/epoch - 121ms/step
Epoch 2/7
25/25 - 1s - loss: 0.6145 - accuracy: 0.7337 - 1s/epoch - 55ms/step
Epoch 3/7
25/25 - 1s - loss: 0.4344 - accuracy: 0.8400 - 1s/epoch - 55ms/step
Epoch 4/7
25/25 - 1s - loss: 0.2608 - accuracy: 0.9150 - 1s/epoch - 54ms/step
Epoch 5/7
25/25 - 1s - loss: 0.1963 - accuracy: 0.9350 - 1s/epoch - 56ms/step
Epoch 6/7
25/25 - 1s - loss: 0.0979 - accuracy: 0.9725 - 1s/epoch - 54ms/step
Epoch 7/7
25/25 - 1s - loss: 0.0570 - accuracy: 0.9912 - 1s/epoch - 54ms/step


<keras.callbacks.History at 0x7f58f80870a0>

In [None]:
train_score, train_accuracy = model.evaluate(X_train, Y_train, verbose = 2, batch_size = batch_size)
test_loss, test_accuracy= model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print(f'Train Score for the first model: {round(train_score, 4)}')
print(f'Train accuracy for the first model: {round(train_accuracy*100, 2)}%')
print(f'Test loss for the first model: {round(test_loss, 4)}')
print(f'Test accuracy for the first model: {round(test_accuracy*100, 2)}%')

25/25 - 0s - loss: 0.0324 - accuracy: 0.9950 - 284ms/epoch - 11ms/step
7/7 - 0s - loss: 0.6142 - accuracy: 0.7850 - 90ms/epoch - 13ms/step
Train Score for the first model: 0.0324
Train accuracy for the first model: 99.5%
Test loss for the first model: 0.6142
Test accuracy for the first model: 78.5%


In [None]:
results = model.predict(X_test, verbose = 2, batch_size = batch_size)

7/7 - 0s - 91ms/epoch - 13ms/step


In [None]:
# Get predictions on test set

results_pred = [int(np.round(i)) for i in results]

x_test_word = tokenizer.sequences_to_texts(X_test)

test_results = pd.DataFrame({
    'Review': x_test_word,
    'Prediction': results_pred
    , 'Actual': Y_test.reset_index(drop=True)
})

test_results['Correct'] = test_results['Prediction'] == test_results['Actual']
misclassified = test_results[test_results['Correct']==False]
misclassified.head(10)


Unnamed: 0,Review,Prediction,Actual,Correct
0,if you haven't gone here go now,0,1,False
14,the meat was pretty dry i had the sliced brisk...,1,0,False
16,i went to bachi burger on a friend's recommend...,0,1,False
22,they dropped more than the ball,1,0,False
26,the only good thing was our waiter he was very...,0,1,False
28,the buffet at bellagio was far from what i ant...,1,0,False
30,this place is like chipotle but better,0,1,False
40,i go to far too many places and i've never see...,1,0,False
41,will be back again,0,1,False
45,the goat taco didn't skimp on the meat and wow...,0,1,False


Second model

In [None]:
max_fatures2 = 2000
tokenizer2 = Tokenizer(num_words=max_fatures, split=' ')
tokenizer2.fit_on_texts(df_yelp['review'].values)
X2 = tokenizer.texts_to_sequences(df_yelp['review'].values)
X2 = pad_sequences(X2)

In [None]:
embed_dim2 = 140
lstm_out2 = 200

model2 = Sequential()
model2.add(Embedding(max_fatures2, embed_dim2,input_length = X2.shape[1]))
model2.add(SpatialDropout1D(0.2))
model2.add(LSTM(lstm_out2, dropout=0.4, recurrent_dropout=0.2))
model2.add(Dense(20,activation='sigmoid'))
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model2.summary())



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 32, 140)           280000    
                                                                 
 spatial_dropout1d_6 (Spatia  (None, 32, 140)          0         
 lDropout1D)                                                     
                                                                 
 lstm_6 (LSTM)               (None, 200)               272800    
                                                                 
 dense_11 (Dense)            (None, 20)                4020      
                                                                 
 dense_12 (Dense)            (None, 1)                 21        
                                                                 
Total params: 556,841
Trainable params: 556,841
Non-trainable params: 0
________________________________________________

In [None]:
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y, test_size = 0.3, random_state = 50)
print(X_train2.shape,Y_train2.shape)
print(X_test2.shape,Y_test2.shape)

(700, 32) (700,)
(300, 32) (300,)


In [None]:
batch_size2 = 40
model2.fit(X_train2, Y_train2, epochs = 5, batch_size=batch_size2, verbose = 2)

Epoch 1/5
18/18 - 16s - loss: 0.7628 - accuracy: 0.4871 - 16s/epoch - 889ms/step
Epoch 2/5
18/18 - 2s - loss: 0.6980 - accuracy: 0.5071 - 2s/epoch - 132ms/step
Epoch 3/5
18/18 - 2s - loss: 0.6901 - accuracy: 0.5243 - 2s/epoch - 131ms/step
Epoch 4/5
18/18 - 2s - loss: 0.6855 - accuracy: 0.5486 - 2s/epoch - 133ms/step
Epoch 5/5
18/18 - 2s - loss: 0.6659 - accuracy: 0.6086 - 2s/epoch - 132ms/step


<keras.callbacks.History at 0x7f58f7e95040>

In [None]:
train_score2, train_accuracy2 = model2.evaluate(X_train2, Y_train2, verbose = 2, batch_size = batch_size2)
test_loss2, test_accuracy2= model2.evaluate(X_test2, Y_test2, verbose = 2, batch_size = batch_size2)
print(f'Train Score for the second model: {round(train_score2, 4)}')
print(f'Train accuracy for the second model: {round(train_accuracy2*100, 2)}%')
print(f'Test loss for the second model: {round(test_loss2, 4)}')
print(f'Test accuracy for the second model: {round(test_accuracy2*100, 2)}%')

18/18 - 0s - loss: 0.6414 - accuracy: 0.8057 - 207ms/epoch - 11ms/step
8/8 - 0s - loss: 0.6793 - accuracy: 0.5967 - 102ms/epoch - 13ms/step
Train Score for the second model: 0.6414
Train accuracy for the second model: 80.57%
Test loss for the second model: 0.6793
Test accuracy for the second model: 59.67%


In [None]:
validation_size2 = 150

X_validate2 = X_test2[-validation_size2:]
Y_validate2 = Y_test2[-validation_size2:]
X_test2 = X_test2[-validation_size2:]
Y_test2 = Y_test2[-validation_size2:]
score2,acc2 = model2.evaluate(X_test2, Y_test2, verbose = 2, batch_size = batch_size2)
print("score: %.2f" % (score2))
print("acc: %.2f" % (acc2))

4/4 - 0s - loss: 0.6752 - accuracy: 0.6267 - 59ms/epoch - 15ms/step
score: 0.68
acc: 0.63


In [None]:
results2 = model2.predict(X_test2, verbose = 2, batch_size = batch_size2)

4/4 - 0s - 263ms/epoch - 66ms/step


In [None]:
# Get predictions on test set

results_pred2 = [int(np.round(i)) for i in results2]

x_test_word2 = tokenizer.sequences_to_texts(X_test2)

test_results2 = pd.DataFrame({
    'Review': x_test_word2,
    'Prediction': results_pred2
    , 'Actual': Y_test2.reset_index(drop=True)
})

test_results2['Correct'] = test_results2['Prediction'] == test_results2['Actual']
misclassified2 = test_results2[test_results2['Correct']==False]
misclassified2.head(10)

Unnamed: 0,Review,Prediction,Actual,Correct
1,this was my first and only vegas buffet and it...,0,1,False
2,nice blanket of moz over top but i feel like t...,0,1,False
3,their regular toasted bread was equally satisf...,0,1,False
4,both of them were truly unbelievably good and ...,0,1,False
5,it was not good,1,0,False
21,on the up side their cafe serves really good food,0,1,False
22,i love the fact that everything on their menu ...,0,1,False
25,the sweet potato fries were very good and seas...,0,1,False
33,the food is delicious and just spicy enough so...,0,1,False
35,service was excellent and prices are pretty re...,0,1,False


Third model

In [None]:
max_fatures3 = 2000
tokenizer3 = Tokenizer(num_words=max_fatures, split=' ')
tokenizer3.fit_on_texts(df_yelp['review'].values)
X3 = tokenizer.texts_to_sequences(df_yelp['review'].values)
X3 = pad_sequences(X3)

In [None]:
embed_dim3 = 150
lstm_out3 = 190

model3 = Sequential()
model3.add(Embedding(max_fatures3, embed_dim3,input_length = X3.shape[1]))
model3.add(SpatialDropout1D(0.6))
model3.add(LSTM(lstm_out3, dropout=0.8, recurrent_dropout=0.7))
model3.add(Dense(10,activation='sigmoid'))
model3.add(Dense(5,activation='sigmoid'))
model3.add(Dense(1,activation='sigmoid'))
model3.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model3.summary())



Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 32, 150)           300000    
                                                                 
 spatial_dropout1d_7 (Spatia  (None, 32, 150)          0         
 lDropout1D)                                                     
                                                                 
 lstm_7 (LSTM)               (None, 190)               259160    
                                                                 
 dense_13 (Dense)            (None, 10)                1910      
                                                                 
 dense_14 (Dense)            (None, 5)                 55        
                                                                 
 dense_15 (Dense)            (None, 1)                 6         
                                                      

In [None]:
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X, Y, test_size = 0.2, random_state = 50)
print(X_train3.shape,Y_train3.shape)
print(X_test3.shape,Y_test3.shape)

(800, 32) (800,)
(200, 32) (200,)


In [None]:
batch_size3 = 50
model3.fit(X_train3, Y_train3, epochs = 7, batch_size=batch_size3, verbose = 2)

Epoch 1/7
16/16 - 3s - loss: 0.6939 - accuracy: 0.5063 - 3s/epoch - 165ms/step
Epoch 2/7
16/16 - 1s - loss: 0.6929 - accuracy: 0.5063 - 905ms/epoch - 57ms/step
Epoch 3/7
16/16 - 1s - loss: 0.6921 - accuracy: 0.5125 - 903ms/epoch - 56ms/step
Epoch 4/7
16/16 - 1s - loss: 0.6897 - accuracy: 0.5738 - 937ms/epoch - 59ms/step
Epoch 5/7
16/16 - 1s - loss: 0.6850 - accuracy: 0.5938 - 938ms/epoch - 59ms/step
Epoch 6/7
16/16 - 1s - loss: 0.6715 - accuracy: 0.6550 - 907ms/epoch - 57ms/step
Epoch 7/7
16/16 - 1s - loss: 0.6594 - accuracy: 0.6925 - 935ms/epoch - 58ms/step


<keras.callbacks.History at 0x7f590fd56460>

In [None]:
train_score3, train_accuracy3 = model3.evaluate(X_train3, Y_train3, verbose = 2, batch_size = batch_size3)
test_loss3, test_accuracy3= model3.evaluate(X_test3, Y_test3, verbose = 2, batch_size = batch_size3)
print(f'Train Score for the third model: {round(train_score3, 4)}')
print(f'Train accuracy for the third model: {round(train_accuracy3*100, 2)}%')
print(f'Test loss for the third model: {round(test_loss3, 4)}')
print(f'Test accuracy for the third model: {round(test_accuracy3*100, 2)}%')

16/16 - 0s - loss: 0.6405 - accuracy: 0.8150 - 197ms/epoch - 12ms/step
4/4 - 0s - loss: 0.6616 - accuracy: 0.6900 - 57ms/epoch - 14ms/step
Train Score for the third model: 0.6405
Train accuracy for the third model: 81.5%
Test loss for the third model: 0.6616
Test accuracy for the third model: 69.0%


In [None]:
results3 = model3.predict(X_test3, verbose = 2, batch_size = batch_size3)

4/4 - 0s - 265ms/epoch - 66ms/step


In [None]:
# Get predictions on test set

results_pred3 = [int(np.round(i)) for i in results3]

x_test_word3 = tokenizer.sequences_to_texts(X_test3)

test_results3 = pd.DataFrame({
    'Review': x_test_word3,
    'Prediction': results_pred3
    , 'Actual': Y_test3.reset_index(drop=True)
})

test_results3['Correct'] = test_results3['Prediction'] == test_results3['Actual']
misclassified3 = test_results3[test_results3['Correct']==False]
misclassified3.head(10)

Unnamed: 0,Review,Prediction,Actual,Correct
4,i in the bathroom mid lunch,1,0,False
9,i always order from the vegetarian menu during...,0,1,False
14,i hope this place sticks around,0,1,False
16,the sweet potato tots were good but the onion ...,0,1,False
17,i will come back here every time i'm in vegas,0,1,False
24,we had a group of 70 when we claimed we would ...,0,1,False
30,first time there and might just be the last,1,0,False
33,every time i eat here i see caring teamwork to...,0,1,False
36,i can assure you that you won't be disappointed,0,1,False
50,the food came out at a good pace,0,1,False


##Sentiment Prediction using naive Bayes

### Multinomial NaiveBayes

In [None]:
X = df_yelp['review']
Y = df_yelp['is_positive']

In [None]:
# Converting reviews into count vectors 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(X)
print(cv.vocabulary_)



In [None]:
X = cv.transform(X)
print(X)

  (0, 1046)	1
  (0, 1330)	1
  (0, 1798)	1
  (0, 2012)	1
  (1, 427)	1
  (1, 764)	1
  (1, 943)	1
  (1, 1195)	1
  (2, 64)	1
  (2, 967)	1
  (2, 1169)	1
  (2, 1195)	1
  (2, 1761)	1
  (2, 1774)	1
  (2, 1780)	1
  (2, 1940)	1
  (3, 64)	1
  (3, 139)	1
  (3, 264)	1
  (3, 557)	1
  (3, 867)	1
  (3, 945)	1
  (3, 991)	1
  (3, 1046)	1
  (3, 1087)	1
  :	:
  (999, 100)	1
  (999, 236)	1
  (999, 264)	1
  (999, 312)	1
  (999, 537)	1
  (999, 591)	1
  (999, 806)	1
  (999, 904)	1
  (999, 916)	1
  (999, 945)	1
  (999, 1011)	1
  (999, 1162)	1
  (999, 1207)	1
  (999, 1245)	1
  (999, 1363)	1
  (999, 1521)	1
  (999, 1780)	3
  (999, 1785)	1
  (999, 1786)	1
  (999, 1788)	1
  (999, 1809)	1
  (999, 1814)	1
  (999, 1825)	1
  (999, 1944)	1
  (999, 2011)	1


In [None]:
# Spliting trian/test 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=284)

In [None]:
# No hyperparameter tuning
from sklearn.naive_bayes import MultinomialNB
nb= MultinomialNB()
nb.fit(X_train, Y_train)
predictions = nb.predict(X_test)

In [None]:
print(confusion_matrix(Y_test, predictions))
print('/n')
print(classification_report(Y_test, predictions))

[[80 33]
 [17 70]]
/n
              precision    recall  f1-score   support

           0       0.82      0.71      0.76       113
           1       0.68      0.80      0.74        87

    accuracy                           0.75       200
   macro avg       0.75      0.76      0.75       200
weighted avg       0.76      0.75      0.75       200



In [None]:
# Outputting incorrect observations
misclassified_1 = np.where(Y_test != predictions)
df_mis = df_yelp.iloc[misclassified_1]
df_mis.head(10)

Unnamed: 0,review,is_positive
2,not tasty and the texture was just nasty.,0
7,the potatoes were like rubber and you could te...,0
8,the fries were great too.,1
13,"i tried the cape cod ravoli, chicken,with cran...",1
18,"this place is not worth your time, let alone v...",0
20,the burrittos blah!,0
27,this hole in the wall has great mexican street...,1
35,the only redeeming quality of the restaurant w...,1
36,ample portions and good prices.,1
40,the shrimp tender and moist.,1


In [None]:
# Using multinomial naive bayes with hyperparameter tuning
from sklearn.naive_bayes import MultinomialNB
nb= MultinomialNB(alpha=.8)
nb.fit(X_train, Y_train)

MultinomialNB(alpha=0.8)

In [None]:
predictions = nb.predict(X_test)

In [None]:
predictions

array([0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1])

In [None]:
print(confusion_matrix(Y_test, predictions))
print('/n')
print(classification_report(Y_test, predictions))

[[81 32]
 [17 70]]
/n
              precision    recall  f1-score   support

           0       0.83      0.72      0.77       113
           1       0.69      0.80      0.74        87

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.75       200
weighted avg       0.77      0.76      0.76       200



In [None]:
# Outputting incorrect observations
misclassified_2 = np.where(Y_test != predictions)
df_mis = df_yelp.iloc[misclassified_2]
df_mis.head(10)

Unnamed: 0,review,is_positive
8,the fries were great too.,1
13,"i tried the cape cod ravoli, chicken,with cran...",1
18,"this place is not worth your time, let alone v...",0
20,the burrittos blah!,0
27,this hole in the wall has great mexican street...,1
35,the only redeeming quality of the restaurant w...,1
36,ample portions and good prices.,1
40,the shrimp tender and moist.,1
46,it's too bad the food is so damn generic.,0
47,"the burger is good beef, cooked just right.",1


### Gaussain NaiveBayes

In [None]:
# Using Gaussain naive bayes
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
gb.fit(X_train.toarray(), Y_train)

GaussianNB()

In [None]:
# Identifying prior probabilites assigned by default
gb.class_prior_

array([0.48375, 0.51625])

In [None]:
# Train accuracy
gb.score(X_train.toarray(), Y_train)

0.95375

In [None]:
predictions = nb.predict(X_test.toarray())
predictions

array([0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1])

In [None]:
print(confusion_matrix(Y_test, predictions))
print('/n')
print(classification_report(Y_test, predictions))

[[81 32]
 [17 70]]
/n
              precision    recall  f1-score   support

           0       0.83      0.72      0.77       113
           1       0.69      0.80      0.74        87

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.75       200
weighted avg       0.77      0.76      0.76       200



In [None]:
from sklearn.model_selection import cross_validate, GridSearchCV
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=gb, 
                 param_grid=params_NB, 
                 cv=3,  
                 verbose=1, 
                 scoring='accuracy') 

In [None]:
gs_NB.fit(X_train.toarray(), Y_train)
gs_NB.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'var_smoothing': 0.006579332246575682}

In [None]:
gb_new = GaussianNB(priors = [0.7, 0.3], var_smoothing = 0.006)
gb_new.fit(X_train.toarray(), Y_train)
pred_new = gb_new.predict(X_test.toarray())

In [None]:
gb_new.score(X_train.toarray(), Y_train)

0.94625

In [None]:
print(confusion_matrix(Y_test, pred_new))
print('/n')
print(classification_report(Y_test, pred_new))

[[56 57]
 [13 74]]
/n
              precision    recall  f1-score   support

           0       0.81      0.50      0.62       113
           1       0.56      0.85      0.68        87

    accuracy                           0.65       200
   macro avg       0.69      0.67      0.65       200
weighted avg       0.70      0.65      0.64       200



In [None]:
misclassified = np.where(Y_test != pred_new)

In [None]:
misclassified

(array([  0,   2,   4,   7,   8,  13,  27,  30,  35,  36,  40,  41,  46,
         48,  49,  51,  53,  58,  59,  61,  62,  68,  73,  75,  76,  79,
         84,  86,  88,  91,  95,  99, 101, 105, 111, 112, 113, 115, 118,
        119, 123, 125, 126, 129, 132, 135, 136, 145, 146, 150, 152, 155,
        160, 161, 164, 168, 170, 173, 174, 180, 185, 187, 188, 190, 191,
        192, 193, 194, 195, 199]),)

In [None]:
df_yelp['review'][0]

'wow... loved this place.'

In [None]:
df_mis = df_yelp.iloc[misclassified]

In [None]:
df_mis.head(10)

Unnamed: 0,review,is_positive
0,wow... loved this place.,1
2,not tasty and the texture was just nasty.,0
4,the selection on the menu was great and so wer...,1
7,the potatoes were like rubber and you could te...,0
8,the fries were great too.,1
13,"i tried the cape cod ravoli, chicken,with cran...",1
27,this hole in the wall has great mexican street...,1
30,"also there are combos like a burger, fries, an...",1
35,the only redeeming quality of the restaurant w...,1
36,ample portions and good prices.,1
