In [1]:
### Scikit-Learn ML Classifier Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Custom Module: XGBoost
from xgboost import XGBClassifier
## MultiLayer Perceptron Classifier
from sklearn.neural_network import MLPClassifier


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pathlib import Path
import pickle

import pandas as pd
import numpy as np

from review import Review

In [49]:
class Base_Model (Review):
    def __init__ (self, X=None, y=None, debug=False, model = None):
        """ 
        X: the actual text of review
        y: the sentiment score either positive or negative
        model: should be a function object
        """
        super().__init__()
        self.model = model()
        self.model_name = model.__name__
        pickle_path = Path ("Base_Model-data.pickle")
        if not debug and Path.exists (pickle_path):
            self.vectorizer, self.X_train,self.X_test, self.y_train, self.y_test = pickle.load (open(pickle_path, "rb"))
        else:
            assert X is not None and y is not None, "Dataset X and y can't be EMPTY"
            processed_X = list(' '.join (self.pre_process(x)) for x in X)
            self.vectorizer = TfidfVectorizer(vocabulary = list(self.features))
            processed_X = self.vectorizer.fit_transform (processed_X)
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split (processed_X, y, random_state=7, test_size=0.2)
            
            data = (self.vectorizer, self.X_train, self.X_test, self.y_train, self.y_test)
            pickle.dump(data, open (pickle_path, "wb"))
            
    def train (self, model):
        self.model.fit (self.X_train, self.y_train)
        predicted = self.model.predict (self.X_test)
        self.accuracy = accuracy_score (self.y_test, predicted)
        print (f"Accuracy of {self.model_name} Model: {self.accuracy}")
        print (f"Confusion Matrix for {self.model_name} Model: ")
        print (confusion_matrix (self.y_test, predicted))
        
    def predict (self, msg):
        if not isinstance (msg, pd.Series):
            msg = pd.Series ([msg])
        msg = self.vectorizer.transform (msg)
        return self.model.predict (msg)

In [19]:
df = pd.read_csv ("../datasets/Restaurant_Reviews.tsv", sep='\t')
print (df.head())
X = df.loc[:]['Review']
y = df.loc[:]['Liked']

print (X.head())
print (y.head())
bm = Base_Model (X, y)
print ("Length of Feature Vector: ", len(bm.vectorizer.get_feature_names()))
#print (bm.features)

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object
0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64
Length of Feature Vector:  858


In [25]:
# To check if the dataset is actually balanced or not 
print(y.value_counts())

1    500
0    500
Name: Liked, dtype: int64


In [64]:
# MultinomialNB Model
class NaiveBayes_Model (Base_Model):
    def __init__ (self):
        super().__init__(model = MultinomialNB)

In [71]:
# Random Forest Classifier Model
class RandomForestClassifier_Model (Base_Model):
    def __init__ (self):
        super().__init__(model = RandomForestClassifier)

In [72]:
# Decision Tree Classifier Model
class DecisionTreeClassifier_Model (Base_Model):
    def __init__ (self):
        super().__init__(model=DecisionTreeClassifier)

In [73]:
# Support Vector Machine Classifier Model
class SVC_Model (Base_Model):
    def __init__(self):
        super().__init__ (model = SVC)

In [74]:
# K-Nearest Neighbors Classifier Model
class KNeighborsClassifier_Model (Base_Model):
    def __init__ (self):
        super().__init__ (model = KNeighborsClassifier)

In [78]:
# Multi Layer Perceptron Model
class MLPClassifier_Model (Base_Model):
    def __init__ (self):
        super().__init__(model = MLPClassifier)

In [76]:
rfm = RandomForestClassifier_Model()
rfm.train()

Accuracy of RandomForestClassifier Model: 0.75
Confusion Matrix for RandomForestClassifier Model: 
[[93 16]
 [34 57]]




In [83]:
mlpc = MLPClassifier_Model()
mlpc.model.max_iter = 1000
mlpc.train()

Accuracy of MLPClassifier Model: 0.755
Confusion Matrix for MLPClassifier Model: 
[[87 22]
 [27 64]]


In [35]:
nb.predict ("Food was not good")

array([1])

In [37]:
features = set (nb.vectorizer.get_feature_names())
print ('not' in features)

False


In [30]:
##################################################################
### Multinomial NaiveBayes Model for Sentiment Analysis Review ###
##################################################################
class NaiveBayes_Model (Review):
    def __init__(self):
        self.model = MultinomialNB()
        self.accuracy = None
        self.vectorizer = None
        
    def train(self, X=None, y=None, debug=False):
        """
            X: the training data input
            y: the correct label for the training data
        """
        pickle_path = Path ("multinomial_NB.pickle")
        if not debug and Path.exists (pickle_path):
                self.vectorizer, self.model = pickle.load (open(pickle_path, "rb"))
                print ("!!! Multinomial_NB model already trained.....skipping the training")
                print("[help]: call method train() with debug=True argument")
                return;
        
        assert X is not None and y is not None, "No data given, How do i train ? (.>_<.)"
        # self.vectorizer = TfidfVectorizer ()
        # temp = self.vectorizer.fit_transform (X)
        # features = self.vectorizer.get_feature_names()
        
        self.vectorizer = TfidfVectorizer (ngram_range=(1, 1))
        # self.vectorizer = CountVectorizer (ngram_range=(1, 1))
        X = self.vectorizer.fit_transform (X)
        
        # Cause i personally only want Adjectives to be my features
        #### The following code does the same thing  as filter is doing a couple of lines below
        """
        items = []
        for f in features:
           if (self.pos(f)[0] in (84, 100))
               items.append (f)
        features = items
        """
        ####
        
        # features = list(filter(lambda a: 84 in self.pos(a) or 100 in self.pos(a), features))
        
        
        #self.vectorizer = CountVectorizer (vocabulary = features, max_features=100)
        #X = self.vectorizer.fit_transform (X)
        self.feature_names = self.vectorizer.get_feature_names()
        
        X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=7)
        
        self.model.fit (X_train, y_train)
        predicted = self.model.predict (X_test)
        self.accuracy_score = accuracy_score (y_test, predicted)
        print (f"[Accuracy Score]: {self.accuracy_score}")
        
        pickled_data = (self.vectorizer, self.model)
        pickle.dump (pickled_data, open(pickle_path, "wb"))
    
    def predict(self, msg):
        if not isinstance (msg, pd.Series):
            msg = pd.Series ([msg])
        msg = self.vectorizer.transform (msg)
        return self.model.predict (msg)

In [19]:
model = NaiveBayes_Model ()
r = Review()

In [20]:
r.features

set()

In [21]:
df = pd.read_csv ("../datasets/Restaurant_Reviews.tsv", sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [22]:
X = df.iloc[:]['Review']
y = df.iloc[:]['Liked']

In [23]:
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [24]:
y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [25]:
r.pre_process (X[1])

['crust not good']

In [31]:
# r.pre_process (X[2])
processed_X = list(' '.join (r.pre_process(x)) for x in X)

In [32]:
processed_X[:5]

['wow ... loved place',
 'crust not good',
 'not tasty texture nasty',
 'stopped late bank holiday Rick Steve recommendation loved',
 'selection menu great prices']

In [34]:
print (len (r.features))
for f in r.features:
    print (f, end=' # ')

858
considering # Special # check # unbelievable # ignored # Eclectic # crawfish # stood # delicioso # unexperienced # dipping # Bad # help # thrilled # nicest # FREEZING # ask # bad # imagine # unwelcome # relaxed # seasonal # eaten # Nicest # handled # gave # stepped # Horrible # set # GO # Join # stomach # sliced # refried # wasted # traditional # deliver # uninspired # Main # best # Disappointing # live # dealing # affordable # giving # favorite # claimed # eggplant # reading # voted # white # expect # having # prime # count # eat # older # tip # Tasted # reasonable # fillet # review # visit # multiple # choose # thick # sick # Nice # Soggy # handmade # fluffy # sergeant # work # GREAT # loves # likes # fast # fuzzy # larger # rude # find # climbing # customer # Love # offers # frustrated # strange # awful # busy # mid # Interesting # average # looks # inflate # expensive # sample # prompt # informative # sitting # annoying # courteous # waiting # eating # told # flat # serve # ord

In [28]:
len (processed_X)

1000

In [29]:
model.train (processed_X, y, debug=True)

AttributeError: 'NaiveBayes_Model' object has no attribute 'features'

In [76]:
len(model.feature_names)

1565

In [77]:
model.feature_names

['00',
 '10',
 '100',
 '11',
 '12',
 '15',
 '17',
 '1979',
 '20',
 '2007',
 '23',
 '30',
 '30s',
 '35',
 '40',
 '40min',
 '45',
 '4ths',
 '70',
 '85',
 '90',
 '99',
 'absolute',
 'absolutely',
 'absolutley',
 'accident',
 'accommodation',
 'accomodate',
 'accordingly',
 'accountant',
 'ache',
 'acknowledge',
 'actual',
 'actually',
 'add',
 'affordable',
 'afternoon',
 'ago',
 'ahead',
 'airline',
 'airport',
 'ala',
 'albondigas',
 'all',
 'allergy',
 'almond',
 'amazing',
 'ambiance',
 'ambience',
 'ample',
 'andddd',
 'angry',
 'annoying',
 'anticipate',
 'anymore',
 'anytime',
 'anyways',
 'apart',
 'apologize',
 'apology',
 'app',
 'appal',
 'apparently',
 'appealing',
 'appetite',
 'appetizer',
 'apple',
 'approval',
 'area',
 'arepa',
 'aria',
 'array',
 'arrive',
 'article',
 'ask',
 'assure',
 'atmosphere',
 'atrocious',
 'attach',
 'attack',
 'attention',
 'attentive',
 'attitude',
 'auju',
 'authentic',
 'average',
 'avocado',
 'avoid',
 'away',
 'awesome',
 'awful',
 'awkwa

In [53]:
features = set(model.feature_names)

In [24]:
'like' in features

False

In [25]:
model.predict ("I didn't like the buffet.")

array([1])

In [26]:
model.predict ("I kind of not liked the service.")

array([1])

In [27]:
model.predict ("Food was good but service could have been better.")

array([1])

In [78]:
model.predict ("Buffet was ok but service ruined it.")

array([0])

In [79]:
model.predict (" I didn't like the food.")

array([0])

In [21]:
len(model.feature_names)

701

In [22]:
sen = "food was not tasty"
print (model.pos(sen))
model.predict (sen)

{100, 92, 86, 84}


array([1])