In [50]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pathlib import Path
import pickle

import pandas as pd
import numpy as np
from review import Review

In [51]:
##################################################################
### Multinomial NaiveBayes Model for Sentiment Analysis Review ###
##################################################################
class NaiveBayes_Model:
    def __init__(self):
        self.model = MultinomialNB()
        self.accuracy = None
        self.vectorizer = None
        
    def train(self, X=None, y=None, debug=False):
        """
            X: the training data input
            y: the correct label for the training data
        """
        pickle_path = Path ("multinomial_NB.pickle")
        self.vectorizer = TfidfVectorizer ()
        if not debug and Path.exists (pickle_path):
                self.vectorizer, self.model = pickle.load (open(pickle_path, "rb"))
                print ("!!! Multinomial_NB model already trained.....skipping the training")
                print("[help]: call method train() with debug=True argument")
                return;
        
        self.feature_names = self.vectorizer.get_feature_names()
            
        assert X is not None and y is not None, "No data given, How do i train ? (.>_<.)"
        X = self.vectorizer.fit_transform (X)
        X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=7)
        
        self.model.fit (X_train, y_train)
        predicted = self.model.predict (X_test)
        self.accuracy_score = accuracy_score (y_test, predicted)
        print (f"[Accuracy Score]: {self.accuracy_score}")
        
        pickled_data = (self.vectorizer, self.model)
        pickle.dump (pickled_data, open(pickle_path, "wb"))
    
    def predict(self, msg):
        if not isinstance (msg, pd.Series):
            msg = pd.Series ([msg])
        msg = self.vectorizer.transform (msg)
        return self.model.predict (msg)

In [52]:
model = NaiveBayes_Model ()
r = Review()

In [53]:
df = pd.read_csv ("../datasets/Restaurant_Reviews.tsv", sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [54]:
X = df.iloc[:]['Review']
y = df.iloc[:]['Liked']

In [55]:
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [56]:
y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [57]:
processed_X = [' '.join (map(lambda span: span.text, r.pre_process (x))) for x in X]

In [58]:
processed_X[:5]

['Wow... Loved this place',
 'Crust is not good',
 'Not tasty the texture was just nasty',
 'Stopped by during the late May bank holiday off Rick Steve recommendation loved it',
 'The selection on the menu was great so were the prices']

In [59]:
len (processed_X)

1000

In [60]:
model.train (processed_X, y, debug=True)

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [25]:
model.predict ("I didn't like the buffet.")

array([0])

In [28]:
model.predict ("I kind of not liked the service.")

array([0])

In [29]:
model.predict ("Food was good but service could have been better.")

array([1])