In [533]:
import spacy
import pickle

import pandas as pd
import numpy
import pickle
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
from spacy.lang.en import STOP_WORDS

In [3]:
import string

In [368]:
class PreProcessor:
    """Text-preparation helpers built on spaCy's ``en_core_web_sm`` pipeline.

    Offers clause splitting, lemmatization with stop-word removal, and a
    simple part-of-speech based entity/feature extraction.
    """

    # Coarse part-of-speech tags compared against ``token.pos_``. The numeric
    # ``token.pos`` IDs that used to be hard-coded are kept in the trailing
    # comments for reference.
    _CLAUSE_BREAK_POS = frozenset({"CCONJ", "PUNCT"})    # 89, 97
    _ENTITY_POS = frozenset({"NOUN", "PROPN"})           # 92, 96
    _DESCRIPTOR_POS = frozenset({"ADJ", "ADV", "INTJ"})  # 84, 86, 91
    _VERBAL_POS = frozenset({"AUX", "VERB", "PART"})     # 87, 100, 94 (particle, e.g. "not")

    def __init__(self):
        """Load the spaCy pipeline and build the stop-word set."""
        self.nlp = spacy.load('en_core_web_sm')
        # spaCy stop words plus every ASCII punctuation character ...
        self.stop_words = set(STOP_WORDS)
        self.stop_words.update(string.punctuation)
        # ... but keep "not": dropping it would erase negation.
        self.stop_words.discard('not')

        # Not used by any method of this class; kept for callers that may read them.
        self.docs = []
        self.splits = []

    def split_into_sents(self, review):
        """Split a review into clause-level spans.

        Args:
            review: Either a spaCy ``Doc`` (used as-is) or a string. A string
                is split on single spaces, lower-cased, stripped of words in
                ``self.stop_words`` and then parsed with ``self.nlp``.

        Returns:
            list: Slices of each sentence, cut at every coordinating
            conjunction or punctuation token. The delimiter tokens and empty
            slices are not included.
        """
        if not isinstance(review, spacy.tokens.doc.Doc):
            kept_words = [
                word.lower()
                for word in review.split(' ')
                if word.lower() not in self.stop_words
            ]
            review = self.nlp(' '.join(kept_words))

        clauses = []
        for sentence in review.sents:
            start = 0
            for index, token in enumerate(sentence):
                if token.pos_ in self._CLAUSE_BREAK_POS:
                    if index > start:
                        clauses.append(sentence[start:index])
                    # Resume after the delimiter token.
                    start = index + 1
            end = len(sentence)
            if end > start:
                clauses.append(sentence[start:end])
        return clauses

    def lemmitize(self, sentence):
        """Return the space-joined lemmas of ``sentence`` without stop words.

        A token is dropped when its lower-cased text is in ``self.stop_words``.
        """
        return ' '.join(
            token.lemma_
            for token in self.nlp(sentence)
            if token.text.lower() not in self.stop_words
        )

    def feature_extraction(self, custom_sent):
        """Extract entity and opinion words from an iterable of spaCy tokens.

        Args:
            custom_sent: Iterable of tokens exposing ``pos_`` and ``lemma_``
                (for example a span returned by ``split_into_sents``).

        Returns:
            dict: ``"entity"`` holds the noun / proper-noun lemmas joined by
            ``", "``. ``"features"`` holds the adjective / adverb /
            interjection lemmas joined by spaces, or, when there are none,
            the verb / auxiliary / particle lemmas joined by spaces.
        """
        nouns = []
        descriptors = []
        verbals = []

        for token in custom_sent:
            if token.pos_ in self._ENTITY_POS:
                nouns.append(token.lemma_)
            elif token.pos_ in self._DESCRIPTOR_POS:
                descriptors.append(token.lemma_)
            elif token.pos_ in self._VERBAL_POS:
                verbals.append(token.lemma_)

        return {
            "entity": ', '.join(nouns),
            # Descriptive words win; verbs are only a fallback.
            "features": ' '.join(descriptors if descriptors else verbals),
        }
                

In [539]:
class NBModel:
    """TF-IDF + Multinomial Naive Bayes text classifier with an on-disk cache.

    On construction the fitted ``(vectorizer, model)`` pair is either loaded
    from ``file_path`` or trained from ``X``/``Y`` and pickled to ``file_path``.
    """

    def __init__(self, X=None, Y=None, debug=False, file_path="trained-model.pkl", force=False):
        """Load a cached model, or train and cache a new one.

        Args:
            X: Iterable of text documents; required when training.
            Y: Labels aligned with ``X``; required when training.
            debug: When True, print accuracy and a classification report for
                the held-out 30% split.
            file_path: Pickle file used as the model cache.
            force: When True, retrain even if ``file_path`` already exists.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is None.
        """
        self.vectorizer = None
        self.model = None

        if not force and Path(file_path).exists():
            # SECURITY: unpickling executes arbitrary code; only load files you trust.
            with open(file_path, "rb") as cache_file:
                self.vectorizer, self.model = pickle.load(cache_file)
        else:
            if X is None or Y is None:
                raise ValueError(
                    "X and Y are required to train: no cached model was loaded "
                    f"from {file_path!r}"
                )

            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
            # NOTE(review): the vectorizer is fitted on all of X before the
            # split, so held-out documents influence the vocabulary/IDF and
            # the debug metrics may be optimistic.
            features = self.vectorizer.fit_transform(X)
            self.model = MultinomialNB()
            x_train, x_test, y_train, y_test = train_test_split(
                features, Y, test_size=0.3, random_state=17
            )

            # Training Model
            self.model.fit(x_train, y_train)

            # Held-out statistics are only computed when they will be shown.
            if debug:
                predicted = self.model.predict(x_test)
                print(f"Accuracy: {accuracy_score(y_test, predicted)}")
                print("Classification Report:")
                print(classification_report(y_test, predicted))

            with open(file_path, "wb") as cache_file:
                pickle.dump((self.vectorizer, self.model), cache_file)

    def predict(self, test):
        """Predict labels for one document or for an iterable of documents.

        Args:
            test: A single string, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            The output of the fitted model's ``predict`` for the vectorized input.
        """
        if isinstance(test, str):
            test = pd.Series([test])
        elif not isinstance(test, pd.Series):
            # Lists/tuples of documents are classified element by element.
            test = pd.Series(list(test))
        vectorized = self.vectorizer.transform(test)
        return self.model.predict(vectorized)

In [524]:
class RandomForestModel:
    """TF-IDF + random-forest text classifier with an on-disk cache.

    On construction the fitted ``(vectorizer, model)`` pair is either loaded
    from ``file_path`` or trained from ``X``/``Y`` and pickled to ``file_path``.
    """

    def __init__(self, X=None, Y=None, debug=False, file_path="random-forest-model.pkl", force=False):
        """Load a cached model, or train and cache a new one.

        Args:
            X: Iterable of text documents; required when training.
            Y: Labels aligned with ``X``; required when training.
            debug: When True, print accuracy and a classification report for
                the held-out 30% split.
            file_path: Pickle file used as the model cache.
            force: When True, retrain even if ``file_path`` already exists.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is None.
        """
        self.vectorizer = None
        self.model = None

        if not force and Path(file_path).exists():
            # SECURITY: unpickling executes arbitrary code; only load files you trust.
            with open(file_path, "rb") as cache_file:
                self.vectorizer, self.model = pickle.load(cache_file)
        else:
            if X is None or Y is None:
                raise ValueError(
                    "X and Y are required to train: no cached model was loaded "
                    f"from {file_path!r}"
                )

            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
            # NOTE(review): the vectorizer is fitted on all of X before the
            # split, so held-out documents influence the vocabulary/IDF and
            # the debug metrics may be optimistic.
            features = self.vectorizer.fit_transform(X)
            self.model = RandomForestClassifier(max_depth=25, random_state=17)
            x_train, x_test, y_train, y_test = train_test_split(
                features, Y, test_size=0.3, random_state=17
            )

            # Training Model
            self.model.fit(x_train, y_train)

            # Held-out statistics are only computed when they will be shown.
            if debug:
                predicted = self.model.predict(x_test)
                print(f"Accuracy: {accuracy_score(y_test, predicted)}")
                print("Classification Report:")
                print(classification_report(y_test, predicted))

            with open(file_path, "wb") as cache_file:
                pickle.dump((self.vectorizer, self.model), cache_file)

    def predict(self, test):
        """Predict labels for one document or for an iterable of documents.

        Args:
            test: A single string, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            The output of the fitted model's ``predict`` for the vectorized input.
        """
        if isinstance(test, str):
            test = pd.Series([test])
        elif not isinstance(test, pd.Series):
            # Lists/tuples of documents are classified element by element.
            test = pd.Series(list(test))
        vectorized = self.vectorizer.transform(test)
        return self.model.predict(vectorized)

In [530]:
class SupportVectorModel:
    """TF-IDF + linear support-vector text classifier with an on-disk cache.

    On construction the fitted ``(vectorizer, model)`` pair is either loaded
    from ``file_path`` or trained from ``X``/``Y`` and pickled to ``file_path``.
    """

    # The default cache used to be "random-forest-model.pkl", which made this
    # class overwrite (and, without ``force``, load) another model's pickle.
    def __init__(self, X=None, Y=None, debug=False, file_path="support-vector-model.pkl", force=False):
        """Load a cached model, or train and cache a new one.

        Args:
            X: Iterable of text documents; required when training.
            Y: Labels aligned with ``X``; required when training.
            debug: When True, print accuracy and a classification report for
                the held-out 30% split.
            file_path: Pickle file used as the model cache.
            force: When True, retrain even if ``file_path`` already exists.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is None.
        """
        self.vectorizer = None
        self.model = None

        if not force and Path(file_path).exists():
            # SECURITY: unpickling executes arbitrary code; only load files you trust.
            with open(file_path, "rb") as cache_file:
                self.vectorizer, self.model = pickle.load(cache_file)
        else:
            if X is None or Y is None:
                raise ValueError(
                    "X and Y are required to train: no cached model was loaded "
                    f"from {file_path!r}"
                )

            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
            # NOTE(review): the vectorizer is fitted on all of X before the
            # split, so held-out documents influence the vocabulary/IDF and
            # the debug metrics may be optimistic.
            features = self.vectorizer.fit_transform(X)
            self.model = svm.LinearSVC()
            x_train, x_test, y_train, y_test = train_test_split(
                features, Y, test_size=0.3, random_state=17
            )

            # Training Model
            self.model.fit(x_train, y_train)

            # Held-out statistics are only computed when they will be shown.
            if debug:
                predicted = self.model.predict(x_test)
                print(f"Accuracy: {accuracy_score(y_test, predicted)}")
                print("Classification Report:")
                print(classification_report(y_test, predicted))

            with open(file_path, "wb") as cache_file:
                pickle.dump((self.vectorizer, self.model), cache_file)

    def predict(self, test):
        """Predict labels for one document or for an iterable of documents.

        Args:
            test: A single string, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            The output of the fitted model's ``predict`` for the vectorized input.
        """
        if isinstance(test, str):
            test = pd.Series([test])
        elif not isinstance(test, pd.Series):
            # Lists/tuples of documents are classified element by element.
            test = pd.Series(list(test))
        vectorized = self.vectorizer.transform(test)
        return self.model.predict(vectorized)

In [534]:
class KNearestNeighborsModel:
    """TF-IDF + k-nearest-neighbours text classifier with an on-disk cache.

    On construction the fitted ``(vectorizer, model)`` pair is either loaded
    from ``file_path`` or trained from ``X``/``Y`` and pickled to ``file_path``.
    """

    # The default cache used to be "random-forest-model.pkl", which made this
    # class overwrite (and, without ``force``, load) another model's pickle.
    def __init__(self, X=None, Y=None, N=20, debug=False, file_path="knn-model.pkl", force=False):
        """Load a cached model, or train and cache a new one.

        Args:
            X: Iterable of text documents; required when training.
            Y: Labels aligned with ``X``; required when training.
            N: Number of neighbours handed to ``KNeighborsClassifier``.
            debug: When True, print accuracy and a classification report for
                the held-out 30% split.
            file_path: Pickle file used as the model cache.
            force: When True, retrain even if ``file_path`` already exists.

        Raises:
            ValueError: If training is needed but ``X`` or ``Y`` is None.
        """
        self.vectorizer = None
        self.model = None

        if not force and Path(file_path).exists():
            # SECURITY: unpickling executes arbitrary code; only load files you trust.
            with open(file_path, "rb") as cache_file:
                self.vectorizer, self.model = pickle.load(cache_file)
        else:
            if X is None or Y is None:
                raise ValueError(
                    "X and Y are required to train: no cached model was loaded "
                    f"from {file_path!r}"
                )

            self.vectorizer = TfidfVectorizer(ngram_range=(1, 3), strip_accents='unicode')
            # NOTE(review): the vectorizer is fitted on all of X before the
            # split, so held-out documents influence the vocabulary/IDF and
            # the debug metrics may be optimistic.
            features = self.vectorizer.fit_transform(X)
            self.model = KNeighborsClassifier(n_neighbors=N, weights='uniform')
            x_train, x_test, y_train, y_test = train_test_split(
                features, Y, test_size=0.3, random_state=17
            )

            # Training Model
            self.model.fit(x_train, y_train)

            # Held-out statistics are only computed when they will be shown.
            if debug:
                predicted = self.model.predict(x_test)
                print(f"Accuracy: {accuracy_score(y_test, predicted)}")
                print("Classification Report:")
                print(classification_report(y_test, predicted))

            with open(file_path, "wb") as cache_file:
                pickle.dump((self.vectorizer, self.model), cache_file)

    def predict(self, test):
        """Predict labels for one document or for an iterable of documents.

        Args:
            test: A single string, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            The output of the fitted model's ``predict`` for the vectorized input.
        """
        if isinstance(test, str):
            test = pd.Series([test])
        elif not isinstance(test, pd.Series):
            # Lists/tuples of documents are classified element by element.
            test = pd.Series(list(test))
        vectorized = self.vectorizer.transform(test)
        return self.model.predict(vectorized)

In [235]:
# Load the tab-separated restaurant-review dataset and preview the first rows.
df = pd.read_csv("../datasets/Restaurant_Reviews.tsv", sep="\t")
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [169]:
# Number of rows in the loaded dataset.
len(df)

1000

In [171]:
# Class balance of the target column.
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [321]:
# Review text (input) and the Liked flag (target) as separate Series.
X = df['Review']
Y = df['Liked']

In [341]:
# Preview the raw review text.
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [371]:
# Build the training corpus: each review reduced to its lemmas, with stop
# words and punctuation removed ("not" is kept by PreProcessor).
p = PreProcessor()
lemma_X = [p.lemmitize(x) for x in X]

In [366]:
# Preview the first five lemmatized reviews.
lemma_X[0:5]

['wow ... love place',
 'crust not good',
 'not tasty texture nasty',
 'stop late bank holiday Rick Steve recommendation love',
 'selection menu great price']

In [323]:
# Preview the target labels.
Y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [538]:
# Train Naive Bayes on the lemmatized reviews; force=True retrains instead of
# loading a cached pickle, debug=True prints the held-out metrics.
nb_model = NBModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7966666666666666
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.83      0.80       143
           1       0.83      0.76      0.80       157

    accuracy                           0.80       300
   macro avg       0.80      0.80      0.80       300
weighted avg       0.80      0.80      0.80       300



In [528]:
# Train the random forest on the same corpus; force=True retrains instead of
# loading a cached pickle, debug=True prints the held-out metrics.
forest_model = RandomForestModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.92      0.77       143
           1       0.89      0.56      0.69       157

    accuracy                           0.73       300
   macro avg       0.77      0.74      0.73       300
weighted avg       0.78      0.73      0.73       300



In [545]:
# Train the linear SVM on the same corpus; force=True retrains instead of
# loading a cached pickle, debug=True prints the held-out metrics.
sv_model = SupportVectorModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.7933333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.88      0.80       143
           1       0.87      0.71      0.78       157

    accuracy                           0.79       300
   macro avg       0.80      0.80      0.79       300
weighted avg       0.81      0.79      0.79       300



In [546]:
# Train k-nearest-neighbours (default N=20) on the same corpus; force=True
# retrains instead of loading a cached pickle, debug=True prints the metrics.
knn_model = KNearestNeighborsModel(X=lemma_X, Y=Y, debug=True, force=True)

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       143
           1       0.89      0.69      0.77       157

    accuracy                           0.79       300
   macro avg       0.80      0.79      0.79       300
weighted avg       0.81      0.79      0.79       300



In [550]:
# Classify a raw sentence with all four models.
# NOTE(review): the models were trained on lemmatized text (lemma_X), while
# this input is passed without that preprocessing.
sent = 'I did not like the food'
print(sent)
print(nb_model.predict(sent))
print(forest_model.predict(sent))
print(sv_model.predict(sent))
print(knn_model.predict(sent))


I did not like the food
[0]
[0]
[0]
[0]


In [551]:
# Same sentence, lemmatized first so it gets the same preprocessing as the
# training corpus.
sent = p.lemmitize('I did not like the food')
print(sent)
print(nb_model.predict(sent))
print(forest_model.predict(sent))
print(sv_model.predict(sent))
print(knn_model.predict(sent))

not like food
[0]
[0]
[0]
[0]


In [343]:
# Clause-splitting demo: a two-sentence review with a comma and a conjunction.
p = PreProcessor()
p.split_into_sents("I liked the food, but service was awful. Ambience was damn poor.")

[liked food, service awful, ambience damn poor]

In [306]:
# Clause-splitting demo with trailing punctuation (".)").
# NOTE(review): the recorded output contains "Sentence:" / "token:" debug
# lines that split_into_sents does not print; it appears to come from an
# earlier version of the class.
p = PreProcessor()
p.split_into_sents("Honeslty it didn't taste THAT fresh.)")

Sentence:  honeslty didn't taste fresh.)
token:  . 97 PUNCT
token:  ) 97 PUNCT
[honeslty didn't taste fresh] 1


[honeslty didn't taste fresh]

In [318]:
# Entity/feature extraction for every clause of the third review.
# NOTE(review): the recorded 'tasty nastynot' output appears to predate the
# current feature_extraction (adjective and verb lemmas glued together
# without a separator) -- re-run to refresh.
sents = p.split_into_sents(X[2])
for sent in sents:
    print(p.feature_extraction(sent))

{'entity': 'texture', 'features': 'tasty nastynot'}


In [314]:
# Entity/feature extraction over reviews 10..19; the full review text is
# printed once per extracted clause, followed by that clause's result.
p = PreProcessor()
# NOTE(review): `features` is never filled or read in this cell.
features = []
for x in X[10:20]:
    for sent in p.split_into_sents(x):
        print(x)
        print(p.feature_extraction(sent))

Service was very prompt.
{'entity': 'service, prompt', 'features': ''}
Would not go back.
{'entity': '', 'features': 'not back'}
The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.
{'entity': 'cashier, care, wayyy', 'features': 'end overprice'}
I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!
{'entity': 'cape, cod, ravoli', 'features': 'try'}
I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!
{'entity': 'chicken', 'features': ''}
I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!
{'entity': 'cranberry', 'features': ''}
I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!
{'entity': 'mmmm', 'features': ''}
I was disgusted because I was pretty sure that was human hair.
{'entity': 'hair', 'features': 'disgusted pretty sure human'}
I was shocked because no signs indicate cash only.
{'entity': 'sign, cash', 'features': 'shocked indicate only'}
Highly recommended.
{'entity': '', 'features': 'highly recomm

In [259]:
# Entity/feature extraction on a review that mixes a positive clause, a
# negated clause and a separate sentence.
p = PreProcessor()
sents = p.split_into_sents("I liked the food, but didn't like the service. Ambience was damn poor.")
for sent in sents:
    print(p.feature_extraction(sent))

{'entity': 'food', 'features': 'like'}
{'entity': 'service', 'features': 'do not like'}
{'entity': 'Ambience', 'features': 'be damn poor'}


In [13]:
# Check which container type spaCy uses for its stop-word list.
type(STOP_WORDS)

set

In [128]:
# Stand-alone spaCy pipeline used by the exploratory cells that call nlp(...).
nlp = spacy.load('en_core_web_sm')

In [131]:
# Rule-based VADER sentiment analyzer from NLTK.
# NOTE(review): presumably needs the NLTK `vader_lexicon` resource to be
# downloaded beforehand -- confirm.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [148]:
# VADER polarity scores for three short phrases (positive, negated, negative).
print(sia.polarity_scores("like"))
print(sia.polarity_scores("do not like"))
print(sia.polarity_scores('damn poor'))

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3612}
{'neg': 0.513, 'neu': 0.487, 'pos': 0.0, 'compound': -0.2755}
{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.7003}


In [140]:
# Inspect lemma, numeric/string POS and dependency label for a contracted negation.
doc = nlp("I didn't like the food")
print(type(doc))
for token in doc:
    print(token,token.lemma_, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

<class 'spacy.tokens.doc.Doc'>
I -PRON- 95 PRON pronoun nsubj
did do 87 AUX auxiliary aux
n't not 94 PART particle neg
like like 100 VERB verb ROOT
the the 90 DET determiner det
food food 92 NOUN noun dobj


In [139]:
# Same inspection with the negation spelled out ("did not").
doc = nlp("I did not like the food")
for token in doc:
    print(token, token.lemma_, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

I -PRON- 95 PRON pronoun nsubj
did do 87 AUX auxiliary aux
not not 94 PART particle neg
like like 100 VERB verb ROOT
the the 90 DET determiner det
food food 92 NOUN noun dobj


In [17]:
# POS and dependency tags for an adverb + adjective phrase.
doc = nlp("food was pretty bad")
for token in doc:
    print(token, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

food 92 NOUN noun nsubj
was 87 AUX auxiliary ROOT
pretty 86 ADV adverb advmod
bad 84 ADJ adjective acomp


In [120]:
# POS and dependency tags for two clauses joined by a comma and "but".
doc = nlp("food was good, but Ambience was pretty awful.")
for token in doc:
    print(token, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

food 92 NOUN noun nsubj
was 87 AUX auxiliary ROOT
good 84 ADJ adjective acomp
, 97 PUNCT punctuation punct
but 89 CCONJ coordinating conjunction cc
Ambience 96 PROPN proper noun nsubj
was 87 AUX auxiliary conj
pretty 86 ADV adverb advmod
awful 84 ADJ adjective acomp
. 97 PUNCT punctuation punct


In [255]:
# Clause-splitting demo on a single-clause sentence.
p = PreProcessor()
p.split_into_sents("I didn't like the food")

[I didn't like the food]

In [315]:
# POS/dependency tags and VADER polarity for a sentence containing a
# hyphenated term ("Non-Veg").
sent = "Both Veg and Non-Veg Items were great."
doc = nlp(sent)
for token in doc:
    print(token, token.pos, token.pos_, spacy.explain(token.pos_), token.dep_)

sia.polarity_scores(sent)

Both 90 DET determiner det
Veg 96 PROPN proper noun nmod
and 89 CCONJ coordinating conjunction cc
Non 96 PROPN proper noun dep
- 84 ADJ adjective dep
Veg 84 ADJ adjective compound
Items 92 NOUN noun nsubj
were 87 AUX auxiliary ROOT
great 84 ADJ adjective acomp
. 97 PUNCT punctuation punct


{'neg': 0.0, 'neu': 0.594, 'pos': 0.406, 'compound': 0.6249}

In [261]:
# NOTE(review): depends on `p` and `sent` left over from earlier cells; the
# recorded output belongs to a different sentence than the `sent` assigned in
# the preceding cell, so it is stale -- re-run on a fresh kernel.
p.split_into_sents(sent)

[Honeslty, it didn't taste THAT fresh]