In [1]:
import spacy
import pickle

import pandas as pd
import numpy
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
from spacy.lang.en import STOP_WORDS

In [3]:
import string

In [66]:
class PreProcessor:
    """spaCy-backed helper that splits reviews into clauses, lemmatises text
    and pulls entity / descriptor words out of a clause.

    Attributes:
        nlp: The loaded ``en_core_web_sm`` spaCy pipeline.
        stop_words: spaCy's English stop words plus every ASCII punctuation
            character, with ``'not'`` taken out so negations are kept.
        docs, splits: Empty lists created at construction; nothing in this
            class reads or writes them afterwards.
    """

    # Integer part-of-speech IDs as returned by ``token.pos``. The values are
    # the ones this notebook has always compared against; the names follow
    # spaCy's universal POS tag names (see ``spacy.symbols``).
    _POS_ADJ = 84      # adjective
    _POS_ADV = 86      # adverb
    _POS_AUX = 87      # auxiliary verb
    _POS_CCONJ = 89    # coordinating conjunction
    _POS_INTJ = 91     # interjection ("wow", "alas", "hurray")
    _POS_NOUN = 92     # common noun
    _POS_PART = 94     # particle (e.g. "not", mostly seen next to an auxiliary)
    _POS_PROPN = 96    # proper noun
    _POS_PUNCT = 97    # punctuation
    _POS_VERB = 100    # verb

    # Tokens with these tags end the current clause in split_into_sents().
    _SPLIT_POS = frozenset({_POS_CCONJ, _POS_PUNCT})
    # Tokens reported under "entity" by feature_extraction().
    _ENTITY_POS = frozenset({_POS_NOUN, _POS_PROPN})
    # Tokens reported under "features" by feature_extraction().
    _FEATURE_POS = frozenset(
        {_POS_ADJ, _POS_ADV, _POS_VERB, _POS_AUX, _POS_PART, _POS_INTJ}
    )

    def __init__(self):
        """Load the spaCy model and build the stop-word set."""
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(STOP_WORDS)
        # Updating a set with a string adds each character individually.
        self.stop_words.update(string.punctuation)
        # Keep "not" so negation survives lemmitize(); discard() does not
        # raise if the word is ever absent from STOP_WORDS.
        self.stop_words.discard('not')

        self.docs = []
        self.splits = []

    def split_into_sents(self, review):
        """Split a review into clauses at conjunction / punctuation tokens.

        Args:
            review: Either a raw string (lower-cased and parsed here) or an
                already parsed spaCy ``Doc`` (used as-is).

        Returns:
            A list of slices of the parsed sentences. Separator tokens are
            dropped and empty clauses are never emitted.
        """
        if not isinstance(review, spacy.tokens.doc.Doc):
            review = self.nlp(review.lower())

        clauses = []
        for sentence in review.sents:
            start = 0
            for index, token in enumerate(sentence):
                if token.pos in self._SPLIT_POS:
                    # Consecutive separators would give an empty slice; skip it.
                    if index > start:
                        clauses.append(sentence[start:index])
                    start = index + 1
            # Whatever follows the last separator is the final clause.
            end = len(sentence)
            if end > start:
                clauses.append(sentence[start:end])
        return clauses

    def lemmitize(self, sentence):
        """Return the space-joined lemmas of ``sentence`` minus stop words.

        A token is dropped when its lower-cased text is in ``self.stop_words``.
        """
        return ' '.join(
            token.lemma_
            for token in self.nlp(sentence)
            if token.text.lower() not in self.stop_words
        )

    def feature_extraction(self, custom_sent):
        """Collect entity and descriptor lemmas from a clause.

        Args:
            custom_sent: Iterable of tokens exposing ``pos`` (integer POS id)
                and ``lemma_``, e.g. a clause from split_into_sents().

        Returns:
            ``{"entity": ..., "features": ...}`` where ``entity`` is the
            comma-separated lemmas of nouns / proper nouns and ``features``
            is the space-separated lemmas of adjectives, adverbs, verbs,
            auxiliaries, particles and interjections, both in token order.
        """
        entities = []
        descriptors = []
        for token in custom_sent:
            if token.pos in self._ENTITY_POS:
                entities.append(token.lemma_)
            elif token.pos in self._FEATURE_POS:
                descriptors.append(token.lemma_)

        return {
            "entity": ', '.join(entities),
            "features": " ".join(descriptors),
        }
                

In [5]:
class NBModel:
    """TF-IDF + Multinomial Naive Bayes text classifier with a pickle cache.

    Attributes:
        vectorizer: The fitted ``TfidfVectorizer`` (word 1- to 3-grams).
        model: The fitted ``MultinomialNB`` estimator.
    """

    def __init__(self, X=None, Y=None, debug=False, file_path="nb-darkmodel.pkl", force=False):
        """Load a cached model, or train a new one and cache it.

        Args:
            X: Iterable of training texts. Required when a model is trained.
            Y: Labels matching ``X``. Required when a model is trained.
            debug: When true, print accuracy and a classification report
                computed on the held-out 30% split.
            file_path: Pickle file holding ``(vectorizer, model)``.
            force: When true, ignore any existing pickle and retrain.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is ``None``.
        """
        self.vectorizer = None
        self.model = None

        cache_file = Path(file_path)
        if not force and cache_file.exists():
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that were written by this notebook.
            with cache_file.open("rb") as handle:
                self.vectorizer, self.model = pickle.load(handle)
            return

        if X is None or Y is None:
            raise ValueError(
                f"X and Y are required to train a model (no cache loaded from {file_path!r})"
            )

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
        # NOTE(review): the vectorizer is fitted on the whole corpus before
        # the split, so vocabulary / IDF statistics of the held-out rows leak
        # into training and the reported scores may be optimistic. Left as-is
        # to keep existing pickles and recorded metrics reproducible.
        features = self.vectorizer.fit_transform(X)
        self.model = MultinomialNB()
        x_train, x_test, y_train, y_test = train_test_split(
            features, Y, test_size=0.3, random_state=17
        )

        # Training Model
        self.model.fit(x_train, y_train)

        # Held-out evaluation is only needed for the debug report.
        if debug:
            predicted = self.model.predict(x_test)
            print(f"Accuracy: {accuracy_score(y_test, predicted)}")
            print("Classification Report:")
            print(classification_report(y_test, predicted))

        with cache_file.open("wb") as handle:
            pickle.dump((self.vectorizer, self.model), handle)

    def predict(self, test):
        """Predict labels for one text or for a ``pd.Series`` of texts.

        Anything that is not a ``pd.Series`` is treated as a single sample
        and wrapped in a one-element Series before vectorising.

        Returns:
            Whatever ``self.model.predict`` returns for the vectorised input.
        """
        if not isinstance(test, pd.Series):
            test = pd.Series([test])
        return self.model.predict(self.vectorizer.transform(test))

In [6]:
class RandomForestModel:
    """TF-IDF + random-forest text classifier with a pickle cache.

    Attributes:
        vectorizer: The fitted ``TfidfVectorizer`` (word 1- to 3-grams).
        model: The fitted ``RandomForestClassifier`` (``max_depth=25``,
            ``random_state=17``).
    """

    def __init__(self, X=None, Y=None, debug=False, file_path="random-forest-model.pkl", force=False):
        """Load a cached model, or train a new one and cache it.

        Args:
            X: Iterable of training texts. Required when a model is trained.
            Y: Labels matching ``X``. Required when a model is trained.
            debug: When true, print accuracy and a classification report
                computed on the held-out 30% split.
            file_path: Pickle file holding ``(vectorizer, model)``.
            force: When true, ignore any existing pickle and retrain.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is ``None``.
        """
        self.vectorizer = None
        self.model = None

        cache_file = Path(file_path)
        if not force and cache_file.exists():
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that were written by this notebook.
            with cache_file.open("rb") as handle:
                self.vectorizer, self.model = pickle.load(handle)
            return

        if X is None or Y is None:
            raise ValueError(
                f"X and Y are required to train a model (no cache loaded from {file_path!r})"
            )

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
        # NOTE(review): the vectorizer is fitted on the whole corpus before
        # the split, so vocabulary / IDF statistics of the held-out rows leak
        # into training and the reported scores may be optimistic. Left as-is
        # to keep existing pickles and recorded metrics reproducible.
        features = self.vectorizer.fit_transform(X)
        self.model = RandomForestClassifier(max_depth=25, random_state=17)
        x_train, x_test, y_train, y_test = train_test_split(
            features, Y, test_size=0.3, random_state=17
        )

        # Training Model
        self.model.fit(x_train, y_train)

        # Held-out evaluation is only needed for the debug report.
        if debug:
            predicted = self.model.predict(x_test)
            print(f"Accuracy: {accuracy_score(y_test, predicted)}")
            print("Classification Report:")
            print(classification_report(y_test, predicted))

        with cache_file.open("wb") as handle:
            pickle.dump((self.vectorizer, self.model), handle)

    def predict(self, test):
        """Predict labels for one text or for a ``pd.Series`` of texts.

        Anything that is not a ``pd.Series`` is treated as a single sample
        and wrapped in a one-element Series before vectorising.

        Returns:
            Whatever ``self.model.predict`` returns for the vectorised input.
        """
        if not isinstance(test, pd.Series):
            test = pd.Series([test])
        return self.model.predict(self.vectorizer.transform(test))

In [7]:
class SupportVectorModel:
    """TF-IDF + linear SVM text classifier with a pickle cache.

    Attributes:
        vectorizer: The fitted ``TfidfVectorizer`` (word 1- to 3-grams).
        model: The fitted ``svm.LinearSVC`` estimator (default settings).
    """

    def __init__(self, X=None, Y=None, debug=False, file_path="svm-model.pkl", force=False):
        """Load a cached model, or train a new one and cache it.

        Args:
            X: Iterable of training texts. Required when a model is trained.
            Y: Labels matching ``X``. Required when a model is trained.
            debug: When true, print accuracy and a classification report
                computed on the held-out 30% split.
            file_path: Pickle file holding ``(vectorizer, model)``.
            force: When true, ignore any existing pickle and retrain.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is ``None``.
        """
        self.vectorizer = None
        self.model = None

        cache_file = Path(file_path)
        if not force and cache_file.exists():
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that were written by this notebook.
            with cache_file.open("rb") as handle:
                self.vectorizer, self.model = pickle.load(handle)
            return

        if X is None or Y is None:
            raise ValueError(
                f"X and Y are required to train a model (no cache loaded from {file_path!r})"
            )

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
        # NOTE(review): the vectorizer is fitted on the whole corpus before
        # the split, so vocabulary / IDF statistics of the held-out rows leak
        # into training and the reported scores may be optimistic. Left as-is
        # to keep existing pickles and recorded metrics reproducible.
        features = self.vectorizer.fit_transform(X)
        self.model = svm.LinearSVC()
        x_train, x_test, y_train, y_test = train_test_split(
            features, Y, test_size=0.3, random_state=17
        )

        # Training Model
        self.model.fit(x_train, y_train)

        # Held-out evaluation is only needed for the debug report.
        if debug:
            predicted = self.model.predict(x_test)
            print(f"Accuracy: {accuracy_score(y_test, predicted)}")
            print("Classification Report:")
            print(classification_report(y_test, predicted))

        with cache_file.open("wb") as handle:
            pickle.dump((self.vectorizer, self.model), handle)

    def predict(self, test):
        """Predict labels for one text or for a ``pd.Series`` of texts.

        Anything that is not a ``pd.Series`` is treated as a single sample
        and wrapped in a one-element Series before vectorising.

        Returns:
            Whatever ``self.model.predict`` returns for the vectorised input.
        """
        if not isinstance(test, pd.Series):
            test = pd.Series([test])
        return self.model.predict(self.vectorizer.transform(test))

In [8]:
class KNearestNeighborsModel:
    """TF-IDF + k-nearest-neighbours text classifier with a pickle cache.

    Attributes:
        vectorizer: The fitted ``TfidfVectorizer`` (word 1- to 3-grams).
        model: The fitted ``KNeighborsClassifier`` (uniform weights).
    """

    def __init__(self, X=None, Y=None, N=20, debug=False, file_path="knn-model.pkl", force=False):
        """Load a cached model, or train a new one and cache it.

        Args:
            X: Iterable of training texts. Required when a model is trained.
            Y: Labels matching ``X``. Required when a model is trained.
            N: Number of neighbours used by the classifier. Ignored when a
                cached model is loaded.
            debug: When true, print accuracy and a classification report
                computed on the held-out 30% split.
            file_path: Pickle file holding ``(vectorizer, model)``.
            force: When true, ignore any existing pickle and retrain.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is ``None``.
        """
        self.vectorizer = None
        self.model = None

        cache_file = Path(file_path)
        if not force and cache_file.exists():
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that were written by this notebook.
            with cache_file.open("rb") as handle:
                self.vectorizer, self.model = pickle.load(handle)
            return

        if X is None or Y is None:
            raise ValueError(
                f"X and Y are required to train a model (no cache loaded from {file_path!r})"
            )

        self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
        # NOTE(review): the vectorizer is fitted on the whole corpus before
        # the split, so vocabulary / IDF statistics of the held-out rows leak
        # into training and the reported scores may be optimistic. Left as-is
        # to keep existing pickles and recorded metrics reproducible.
        features = self.vectorizer.fit_transform(X)
        self.model = KNeighborsClassifier(n_neighbors=N, weights='uniform')
        x_train, x_test, y_train, y_test = train_test_split(
            features, Y, test_size=0.3, random_state=17
        )

        # Training Model
        self.model.fit(x_train, y_train)

        # Held-out evaluation is only needed for the debug report.
        if debug:
            predicted = self.model.predict(x_test)
            print(f"Accuracy: {accuracy_score(y_test, predicted)}")
            print("Classification Report:")
            print(classification_report(y_test, predicted))

        with cache_file.open("wb") as handle:
            pickle.dump((self.vectorizer, self.model), handle)

    def predict(self, test):
        """Predict labels for one text or for a ``pd.Series`` of texts.

        Anything that is not a ``pd.Series`` is treated as a single sample
        and wrapped in a one-element Series before vectorising.

        Returns:
            Whatever ``self.model.predict`` returns for the vectorised input.
        """
        if not isinstance(test, pd.Series):
            test = pd.Series([test])
        return self.model.predict(self.vectorizer.transform(test))

In [9]:
# Load the restaurant-review corpus: a tab-separated file whose columns
# include the review text ("Review") and a 0/1 label ("Liked").
df = pd.read_csv("../datasets/Restaurant_Reviews.tsv", sep="\t")
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
# Number of rows (reviews) in the corpus.
len(df)

1000

In [11]:
# Class balance of the "Liked" label.
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [12]:
# Inputs (raw review text) and targets (0/1 "Liked" label) for the models.
X, Y = df['Review'], df['Liked']

In [13]:
# Peek at the raw review texts.
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [14]:
# Build the shared pre-processor, then reduce every review to its
# stop-word-filtered lemmas; the models below are trained on this list.
p = PreProcessor()
lemma_X = list(map(p.lemmitize, X))

In [15]:
# Spot-check the first few lemmatised reviews.
lemma_X[0:5]

['wow ... love place',
 'crust not good',
 'not tasty texture nasty',
 'stop late bank holiday Rick Steve recommendation love',
 'selection menu great price']

In [16]:
# Peek at the labels.
Y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [19]:
# force=True skips any cached pickle and retrains (overwriting the cache file);
# debug=True prints accuracy and the classification report for the held-out split.
nb_model = NBModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7966666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.80       143
           1       0.83      0.76      0.80       157

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300



In [20]:
# force=True skips any cached pickle and retrains (overwriting the cache file);
# debug=True prints accuracy and the classification report for the held-out split.
forest_model = RandomForestModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77       143
           1       0.89      0.56      0.69       157

    accuracy                           0.73       300
   macro avg       0.77      0.74      0.73       300
weighted avg       0.78      0.73      0.73       300



In [21]:
# force=True skips any cached pickle and retrains (overwriting the cache file);
# debug=True prints accuracy and the classification report for the held-out split.
sv_model = SupportVectorModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7933333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.88      0.80       143
           1       0.87      0.71      0.78       157

    accuracy                           0.79       300
   macro avg       0.80      0.80      0.79       300
weighted avg       0.81      0.79      0.79       300



In [23]:
# force=True skips any cached pickle and retrains (overwriting the cache file);
# debug=True prints accuracy and the classification report for the held-out split.
# N is left at the class default.
knn_model = KNearestNeighborsModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       143
           1       0.89      0.69      0.77       157

    accuracy                           0.79       300
   macro avg       0.80      0.79      0.79       300
weighted avg       0.81      0.79      0.79       300



In [82]:
def pprint(sentence):
    """Print each classifier's verdict and the extracted features per clause.

    The review is split with the notebook-level ``p.split_into_sents``. For
    every clause, the four notebook-level models (``nb_model``,
    ``forest_model``, ``sv_model``, ``knn_model``) are asked for a prediction
    on the clause text, followed by ``p.feature_extraction`` of the clause.

    Args:
        sentence: Review passed straight to ``p.split_into_sents``.

    Returns:
        None. Everything is written to stdout.
    """
    # Width the model labels are right-justified to.
    RJUST = 40

    def parse_result(resultArr):
        # Label 1 is reported as positive; anything else as negative.
        return "POSITIVE" if resultArr[0] == 1 else "NEGATIVE"

    labelled_models = (
        ("[MultinomialNB]:", nb_model),
        ("[Random Forest Model]:", forest_model),
        ("[Support Vector Model]:", sv_model),
        ("[K Nearest Neighbors Model]:", knn_model),
    )

    for sent in p.split_into_sents(sentence):
        print("\n[Custom Sentence]: ", sent, '\n')
        # The models take plain text, so convert the clause once.
        clause_text = str(sent)
        for label, model in labelled_models:
            print(label.rjust(RJUST), parse_result(model.predict(clause_text)))

        print(f"\n{p.feature_extraction(sent)}")

In [68]:
# Smoke test: run one negative review through every classifier, then show
# how it is split into clauses and what is extracted from each clause.
sent = 'I did not like the food'
print(sent)
print(
    *(clf.predict(sent) for clf in (nb_model, forest_model, sv_model, knn_model)),
    sep="\n",
)

p = PreProcessor()
for _sent in p.split_into_sents(sent):
    print(_sent)
    print(p.feature_extraction(_sent))


I did not like the food
[0]
[0]
[0]
[0]
i did not like the food
{'entity': 'food', 'features': 'do not like'}


In [83]:
# Single-clause negative review.
pprint("I did not like the food.")


[Custom Sentence]:  i did not like the food 

                           MultinomialNB NEGATIVE
                     Random Forest Model NEGATIVE
                  Support Vector Model:  NEGATIVE
             K Nearest Neighbors Model:  NEGATIVE

{'entity': 'food', 'features': 'do not like'}


In [81]:
# Mixed review: each clause produced by split_into_sents is classified on its own.
pprint('I liked the food, but service was awful. Ambience was damn poor.')


[Custom Sentence]:  i liked the food 

       MultinomialNB POSITIVE
 Random Forest Model NEGATIVE
Support Vector Model:  NEGATIVE
K Nearest Neighbors Model:  POSITIVE

{'entity': 'food', 'features': 'like'}

[Custom Sentence]:  service was awful 

       MultinomialNB NEGATIVE
 Random Forest Model NEGATIVE
Support Vector Model:  NEGATIVE
K Nearest Neighbors Model:  NEGATIVE

{'entity': 'service', 'features': 'be awful'}

[Custom Sentence]:  ambience was damn poor 

       MultinomialNB NEGATIVE
 Random Forest Model NEGATIVE
Support Vector Model:  NEGATIVE
K Nearest Neighbors Model:  NEGATIVE

{'entity': 'ambience', 'features': 'be damn poor'}
