### Training a model with sklearn for restorent customer sentiment anaylysis (label 1 means rating 4 or 5 and label 0 means rating 1 or 2)

In [27]:
import pandas as pd

In [28]:
df=pd.read_csv('datasets/Reviews.csv',usecols=['Summary','Score'])

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Score    568454 non-null  int64 
 1   Summary  568427 non-null  object
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


In [30]:
df.dropna(how='any',inplace=True)

In [31]:
df=df[df.Score!=3]

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 525789 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Score    525789 non-null  int64 
 1   Summary  525789 non-null  object
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


In [33]:
df0=df[(df.Score ==1) | (df.Score==2)].sample(1000, random_state=1)

df1=df[(df.Score==4) | (df.Score==5)].sample(1000,random_state=1)

df=pd.concat([df0,df1])

In [34]:
df

Unnamed: 0,Score,Summary
230527,2,WEAK
117836,1,too much of a good thing
235676,1,The most taste-less Earl Grey ever
390337,1,very disappointed
402857,1,Changed the ingredients to now include meat pr...
...,...,...
333485,5,Great Stuff!
542541,5,Almost good as real
261703,5,"IMO, the perfect canned beef frank"
517281,5,great taste!


In [35]:
# Labeling data with 0 and 1 
for i,j in enumerate(df.Score):
    if j ==4 or j==5:
        df.iloc[i,0]=1 
    elif j==1 or j==2:
        df.iloc[i,0]=0

In [36]:
df

Unnamed: 0,Score,Summary
230527,0,WEAK
117836,0,too much of a good thing
235676,0,The most taste-less Earl Grey ever
390337,0,very disappointed
402857,0,Changed the ingredients to now include meat pr...
...,...,...
333485,1,Great Stuff!
542541,1,Almost good as real
261703,1,"IMO, the perfect canned beef frank"
517281,1,great taste!


-----

In [37]:
# train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.Summary.values,df.Score.values,random_state=1)

In [38]:
# fucntion to clean list of strings. It will lower the character, keeps only english characters with space and lemmatize by nltk.stem.WordNetLemmatizer.
import re 
from nltk.stem import WordNetLemmatizer
def text_clean(text):
    '''  To clean list of strings. It will lower the character, keeps only english characters with space and lemmatize by nltk.stem.WordNetLemmatizer.'''
    final_text=[]
    wl=WordNetLemmatizer()
    pattern=re.compile('[^a-z ]')
    for i in text.split():
        token= i.lower()
        token=re.sub(pattern,'',token)
        token=wl.lemmatize(token)
        final_text.append(token)
    return ' '.join(final_text)

In [39]:
X_train_cleaned=list(map(text_clean,X_train))

In [40]:
X_train_cleaned

['overprices and faulty weight advertiesed',
 'i like salty',
 'really terrible try another flavor dont waste your money',
 'shipping cost',
 'overpriced',
 'tried several similar fillyourown pod this is the best',
 'almost good a real',
 'disappointed',
 'look somehere else',
 'cranberry lover',
 'the only bar my husband will eati wish they were organic',
 'horrible horrible product  yuck',
 'not for me',
 'coffee',
 'best price on zukes',
 'sensible food tropical',
 'greatest snack to keep weight off',
 'really great tea',
 'expired',
 'taste like licking a charred raspberry',
 'health risk',
 'betty crocker is gluten free pantry',
 'reformulated to remove all taste',
 'deep rich chocolate and mint',
 'a favorite',
 'no happy',
 'rainblo   bubble gum',
 'ginger root beer',
 'sign',
 'mailed garbage',
 'tasty  crunchy with a good balance of chocolate to cereal but a bit crumbly',
 'tasty',
 'appears to be adulterated with grapefruit juice',
 'frequent problem batch of this food',
 'co

In [41]:
# feature extraction
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
sw=list(ENGLISH_STOP_WORDS)
sw.remove('not')
sw.remove('no')
sw.remove('cant')
sw.remove('couldnt')
sw.remove('should')
sw.remove('why')
sw.remove('under')
sw.remove('nor')
sw.remove('very')
sw.remove('well')
sw.remove('more')
sw.remove('off')
sw.remove('but')
sw.remove('found')
sw.remove('top')

cv=CountVectorizer(lowercase=True,stop_words=sw)
X_train_final=cv.fit_transform(X_train_cleaned)  # cv.fit_transform() returns an sparce matrix and we need simple matrix

In [42]:
X_train_final.shape

(1500, 1493)

____

In [43]:
# feature selection wih SequentialFeatureSelector with model=Multinomial to select best features
# if try block does not work this cell can take approx 25 mint to get the output. meaning, if the features not found in local machine (selected before) 
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.naive_bayes import MultinomialNB
# to save/load sparce matrix in the local machine
from scipy.sparse import save_npz,load_npz
# to save an object
import pickle

try:
    with open('sfs.pickle', 'rb') as obj:
        sfs=pickle.load(obj)
    X_train_final=load_npz('X_train_final.npz')
except:
    model = MultinomialNB() 
    sfs=SequentialFeatureSelector(model, n_features_to_select=1400, cv = 5, direction='backward')
    X_train_final = sfs.fit_transform(X_train_final,y_train)
    with open('sfs.pickle','wb') as obj:
        pickle.dump(sfs,obj)
    save_npz('X_train_final.npz', X_train_final)

In [44]:
X_train_final.shape

(1500, 1400)

-----

In [45]:
# model building, and scoring
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB

model=MultinomialNB()   # best suitable with CountVectorizer
model.fit(X_train_final,y_train)
model.score(X_train_final,y_train)

0.944

In [46]:
# confusion matrix of training data
from sklearn.metrics import confusion_matrix
pred_train=model.predict(X_train_final)
cmatrix=confusion_matrix(y_train,pred_train)
cmatrix

array([[691,  54],
       [ 30, 725]], dtype=int64)

In [47]:
# performance on testing data
X_test_cleaned=list(map(text_clean,X_test))
X_test_final=cv.transform(X_test_cleaned)
X_test_final=sfs.transform(X_test_final)
pred_test=model.predict(X_test_final)
model.score(X_test_final,y_test)

0.782

----

In [48]:
# singale sample test
sample=['The dinner good']
s_cleaned=list(map(text_clean,sample))
s_final=cv.transform(s_cleaned)
s_final=sfs.transform(s_final)
model.predict(s_final)

array([1], dtype=int64)

👍 The result is correct as the sample provided have positive sentiment.

#### Testing with another dataset

In [49]:
# it is a balanced and without null values dataset
df_test=pd.read_csv('datasets/Restaurant_Reviews.txt',sep='\t')
df_test

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [50]:
# scoring
X_test_cleaned=list(map(text_clean,df_test.Review))
X_final=cv.transform(X_test_cleaned)
X_final=sfs.transform(X_final)
model.score(X_final,df_test.Liked)

0.744

In [51]:
# confusion matrix
pred_test=model.predict(X_final)
matrics=confusion_matrix(df_test.Liked,pred_test)

In [52]:
matrics

array([[408,  92],
       [164, 336]], dtype=int64)

----

😒 The performance of the model is not much good on unseen data but on seen data, it can be considerable. Performance of this model should be improved.

✔