In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pathlib import Path
import pickle

import pandas as pd
import numpy as np
from review import Review

In [73]:
##################################################################
### Multinomial NaiveBayes Model for Sentiment Analysis Review ###
##################################################################
class NaiveBayes_Model (Review):
    """Multinomial Naive Bayes sentiment classifier on bag-of-words counts.

    Attributes:
        model: the ``MultinomialNB`` estimator.
        vectorizer: the ``CountVectorizer`` fitted in ``train`` (``None`` until
            the model has been trained or loaded from the cache).
        feature_names: list of vocabulary terms of the fitted vectorizer
            (``None`` until trained or loaded).
        accuracy: hold-out accuracy of the last fit performed in this session
            (``None`` if the model was never fitted here, e.g. loaded from cache).
        accuracy_score: alias of ``accuracy`` kept for existing callers.
    """

    def __init__(self):
        # NOTE(review): Review.__init__ is not invoked here (same as before);
        # confirm that Review needs no per-instance setup.
        self.model = MultinomialNB()
        self.accuracy = None
        self.accuracy_score = None
        self.vectorizer = None
        self.feature_names = None

    def _feature_name_list(self):
        """Return the fitted vectorizer's vocabulary terms as a plain list."""
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            # Older scikit-learn releases only provide get_feature_names().
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def train(self, X=None, y=None, debug=False):
        """Fit the vectorizer and classifier, or load them from the cache.

        Args:
            X: the training data input (iterable of text documents).
            y: the correct label for the training data.
            debug: when True, always retrain even if a cached pickle exists.

        Raises:
            AssertionError: if training is required but X or y is missing.

        Side effects:
            Reads/writes ``multinomial_NB.pickle`` in the working directory and
            prints status / accuracy information.
        """
        pickle_path = Path ("multinomial_NB.pickle")
        if not debug and pickle_path.exists ():
            # NOTE(security): unpickling can execute arbitrary code; only load
            # cache files that were produced by this class.
            with open (pickle_path, "rb") as cache_file:
                self.vectorizer, self.model = pickle.load (cache_file)
            # Keep feature_names available when the model comes from the cache.
            self.feature_names = self._feature_name_list ()
            print ("!!! Multinomial_NB model already trained.....skipping the training")
            print("[help]: call method train() with debug=True argument")
            return

        assert X is not None and y is not None, "No data given, How do i train ? (.>_<.)"

        # Plain unigram counts. An earlier experiment (now disabled) kept only
        # the terms whose self.pos(...) tags include 84 or 100.
        self.vectorizer = CountVectorizer (ngram_range=(1, 1))
        X = self.vectorizer.fit_transform (X)
        self.feature_names = self._feature_name_list ()

        # Fixed seed so the 80/20 split (and the reported accuracy) is repeatable.
        X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=7)

        self.model.fit (X_train, y_train)
        predicted = self.model.predict (X_test)
        score = accuracy_score (y_test, predicted)
        self.accuracy = score
        self.accuracy_score = score  # historical attribute name
        print (f"[Accuracy Score]: {score}")

        with open (pickle_path, "wb") as cache_file:
            pickle.dump ((self.vectorizer, self.model), cache_file)

    def predict(self, msg):
        """Predict the label(s) for one message or a collection of messages.

        Args:
            msg: a single string, or an iterable of strings such as a
                ``pd.Series`` or a list.

        Returns:
            The array returned by ``self.model.predict`` (one label per message).

        Raises:
            RuntimeError: if no vectorizer is available yet (call train() first).
        """
        if self.vectorizer is None:
            raise RuntimeError ("Model is not trained yet; call train() first.")
        # NOTE(review): the notebook trains on text passed through
        # Review.pre_process, while msg is vectorized here as given; callers
        # probably should pre-process msg the same way -- confirm.
        if isinstance (msg, (str, bytes)):
            # A lone message must be wrapped so it is treated as one document.
            msg = pd.Series ([msg])
        msg = self.vectorizer.transform (msg)
        return self.model.predict (msg)

In [74]:
# Instantiate the classifier and a standalone Review helper (used below for
# text pre-processing).
model = NaiveBayes_Model ()
r = Review()

In [70]:
# Exploratory check: evaluates to the bound `pos` method without calling it.
r.pos

<bound method Review.pos of <review.Review object at 0x7f701c67da90>>

In [66]:
# Load the tab-separated restaurant review dataset and preview the first rows.
df = pd.read_csv ("../datasets/Restaurant_Reviews.tsv", sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [34]:
# Split the frame into the review text column and the `Liked` label column.
X = df['Review']
y = df['Liked']

In [35]:
# Preview the first few review texts.
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [36]:
# Preview the first few labels.
y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [37]:
# Spot-check the pre-processing on a single review.
r.pre_process (X[1])

['crust not good']

In [38]:
# Pre-process every review and join the returned pieces into one string each.
processed_X = [' '.join (r.pre_process (review_text)) for review_text in X]

In [39]:
# Preview the first five pre-processed reviews.
processed_X[:5]

['wow ... love place',
 'crust not good',
 'not tasty texture nasty',
 'stop late bank holiday Rick Steve recommendation love',
 'selection menu great price']

In [40]:
# Number of pre-processed reviews.
len (processed_X)

1000

In [75]:
# Fit on the pre-processed reviews; debug=True forces retraining even when a
# cached pickle already exists.
model.train (processed_X, y, debug=True)

[Accuracy Score]: 0.755


In [76]:
# Vocabulary size of the fitted vectorizer.
len(model.feature_names)

1565

In [77]:
# Display the whole vocabulary. The output is long; consider showing a slice
# such as model.feature_names[:20] instead.
model.feature_names

['00',
 '10',
 '100',
 '11',
 '12',
 '15',
 '17',
 '1979',
 '20',
 '2007',
 '23',
 '30',
 '30s',
 '35',
 '40',
 '40min',
 '45',
 '4ths',
 '70',
 '85',
 '90',
 '99',
 'absolute',
 'absolutely',
 'absolutley',
 'accident',
 'accommodation',
 'accomodate',
 'accordingly',
 'accountant',
 'ache',
 'acknowledge',
 'actual',
 'actually',
 'add',
 'affordable',
 'afternoon',
 'ago',
 'ahead',
 'airline',
 'airport',
 'ala',
 'albondigas',
 'all',
 'allergy',
 'almond',
 'amazing',
 'ambiance',
 'ambience',
 'ample',
 'andddd',
 'angry',
 'annoying',
 'anticipate',
 'anymore',
 'anytime',
 'anyways',
 'apart',
 'apologize',
 'apology',
 'app',
 'appal',
 'apparently',
 'appealing',
 'appetite',
 'appetizer',
 'apple',
 'approval',
 'area',
 'arepa',
 'aria',
 'array',
 'arrive',
 'article',
 'ask',
 'assure',
 'atmosphere',
 'atrocious',
 'attach',
 'attack',
 'attention',
 'attentive',
 'attitude',
 'auju',
 'authentic',
 'average',
 'avocado',
 'avoid',
 'away',
 'awesome',
 'awful',
 'awkwa

In [53]:
# Put the vocabulary in a set for quick membership checks.
features = set(model.feature_names)

In [24]:
# Check whether the token 'like' is part of the vocabulary.
'like' in features

False

In [25]:
# Manual check: classify an ad-hoc sentence containing a negation.
model.predict ("I didn't like the buffet.")

array([1])

In [26]:
# Manual check: classify another ad-hoc sentence containing a negation.
model.predict ("I kind of not liked the service.")

array([1])

In [27]:
# Manual check: classify a sentence with mixed praise and criticism.
model.predict ("Food was good but service could have been better.")

array([1])

In [78]:
# Manual check: classify another sentence with mixed praise and criticism.
model.predict ("Buffet was ok but service ruined it.")

array([0])

In [79]:
# Manual check: classify an ad-hoc sentence containing a negation.
model.predict (" I didn't like the food.")

array([0])

In [21]:
# Vocabulary size again. NOTE(review): this cell's execution count is lower
# than the training cell's, so the displayed output may come from an earlier
# model -- re-run top to bottom to confirm.
len(model.feature_names)

701

In [22]:
# Show what `pos` returns for a sample sentence (presumably part-of-speech tag
# ids -- confirm in review.py), then classify the same sentence.
sen = "food was not tasty"
print (model.pos(sen))
model.predict (sen)

{100, 92, 86, 84}


array([1])