In [171]:
import pandas as pd
import numpy as np
import string

import re
import pickle
from pathlib import Path

import spacy
from spacy.lang.en import STOP_WORDS
from spacy.matcher import Matcher

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [73]:
class Review:
    def __init__ (self):

        """
        split_pattern will currently split on 
            word.
            word .
            word .       abc # here it will consume the spaces too
        """
        self.split_pattern = r"([a-z]+)\s*[.,]+(\s*)" #token.text.lower() in self.stop_words:
        
    def sentence_splitter(self, msg):
        msg = msg.strip()
        splits = []
        start = 0
        for match in re.finditer (self.split_pattern, msg):
            # print (match.groups(), match.span())
            splits.append (msg[start: match.span()[0]] + match.groups()[0])
            start = match.span()[1]
        
        if (len(msg) - start) > 1:
            splits.append (msg[start:])
        
        return splits
        
    def filter_review (self, msg):
        # Crux of the Class
        sentences = self.sentence_splitter (msg)

In [255]:
class Model:
    def __init__ (self, X=None, y=None):
        self.vectorizer = None
        if not Path.exists (Path('multinomialNB_model.pkl')):
            # self.vectorizer = TfidfVectorizer(ngram_range = (1, 3))
            # self.vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words='english')
            self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), strip_accents='unicode', sublinear_tf=True)
            X = self.vectorizer.fit_transform (X)
            self.feature_names = self.vectorizer.get_feature_names()
            print (f"Feature Names: {self.vectorizer.get_feature_names()}")
            
            self.model = MultinomialNB ()
            X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.3)
            print ("Type of X_test: ", type(X_test));
            # Training the Model
            self.model.fit (X_train, y_train);
            predicted = self.model.predict (X_test);
            
            print ("Accuracy: ", accuracy_score (y_test, predicted))
            print ("Classification Report: ")
            print (classification_report (y_test, predicted))
    
            pickled_tuple = (self.vectorizer, self.model)
            pickle.dump (pickled_tuple, open('multinomialNB_model.pkl', 'wb'))
        else:
            print ("Model pickle found.... Not training again....")
            self.vectorizer, self.model = pickle.load (open ("multinomialNB_model.pkl", "rb"))
    
    def predict (self, test):
        if isinstance (test, pd.Series):
            pass
        else:
            test = pd.Series ([test])    
        test = self.vectorizer.transform (test)
        print ("Type of predict_review: ", type(test))
            
        return self.model.predict (test)

In [221]:
class SentimentAnalyzer (Review):
    def __init__ (self):
        Review.__init__(self)
        self.nlp = spacy.load ("en_core_web_sm")
        self.stop_words = STOP_WORDS
        self.stop_words.union(string.punctuation)
        
        self.neg_words  = pickle.load( open('neg_words.pkl', 'rb'))
        self.neg_words.add ('not')
        
        # I need to merge negative word tokens, like didn't can't wouldn't shouldn't wasn't 
        verb_patterns = [
            [{"TEXT": {"REGEX": r"(can|did|should|would|was)"}}, {"LOWER": "n't"}],
        ]
        self.matcher = Matcher (self.nlp.vocab)
        self.matcher.add ('neg_verbs', None , *verb_patterns)
        
    def merge_tokens (self, doc):
        matches = self.matcher (doc)
        for _, start, end in matches:
            span = doc[start: end]
            span.merge()
            # print ("Match Found: ",span.text)
        return doc
    
    def split_on_conjunction (self, sentence):
        """
        Do Remember: sentence should be already splitted msg through Review.sentence_splitter()
        Split the given sentences on conjunction like, but, and, although, yet """
        sentence = self.nlp (sentence)
        # print ("Original: ", len (sentence))
        sentence = self.merge_tokens (sentence)
        # print ("After merging tokens: ", len (sentence))
        
        start = 0
        counter = 0
        splits = []
        for token in sentence:
            if token.pos == 89:
                if counter > start:
                    splits.append (sentence[start: counter])
                start = counter + 1
            counter += 1
        
        #print (sentence[start: ])
        if len (sentence[start: counter]) > 0:
            splits.append (sentence[start: counter])
        return splits
        
    def drop_tokens(self, doc):
        """
        Don't drop if token is in neg_words
        
        Drop the tokens, that are
            - adverbs
            - stopwords
        """
        # doc = self.nlp (msg)
        drops = set()
    
        for token in doc:
            #  print (token, token.pos_, token.pos)
            if (token.text.lower() not in self.neg_words):
                # Verbs , Nouns or Adjectives
                if token.pos == 92 or token.pos == 84:
                    continue;
                if (token.pos == 86) or token.text.lower() in self.stop_words:   # aka token is an Adverb or a stopword
                    drops.add (token.i)
        return [x.lemma_ for x in doc if x.i not in drops]
    
    def load (self, pd_series):
        self.series = pd_series
    
    def preprocess (self, items=None, debug=False, debug_nested = False):
        splits = []
        if items is not None:
            self.series = [items];
        for review in self.series:
            data = []
            if debug:
                print ("Review: ", review)
            for sentence in self.sentence_splitter(review):
                if debug and debug_nested:
                    print (f"Sentence: {sentence}")
                for split in self.split_on_conjunction (sentence):
                    if debug and debug_nested:
                        print (f"\tSplit: {split}")
                    for msg in self.drop_tokens (split):
                        if debug and debug_nested:
                            print (f"\t\tMessage: {msg}")
                        data.append (msg)
                        
            if debug:
                print (f"After split-join: {' '.join (map(str, data))}")
            splits.append (" ".join (map(lambda x: str(x).lower(), data)))
    
        series_splits = pd.Series (splits)
        return series_splits
                    

In [212]:
df = pd.read_csv ("Restaurant_Reviews.tsv", sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [213]:
len(df)

1000

In [214]:
df.iloc[:]['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [194]:
X = df.iloc[:]['Review'] 
y = df.iloc[:]['Liked']

In [195]:
y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [196]:
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [167]:
sa = SentimentAnalyzer ()
sa.load (X)

In [168]:
x = sa.preprocess (debug=True)

Review:  Wow... Loved this place.
After split-join: wow love place
Review:  Crust is not good.
After split-join: Crust not good
Review:  Not tasty and the texture was just nasty.
After split-join: not tasty texture nasty
Review:  Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
After split-join: stop late bank holiday Rick Steve recommendation love
Review:  The selection on the menu was great and so were the prices.
After split-join: selection menu great price
Review:  Now I am getting angry and I want my damn pho.
After split-join: get angry want damn pho
Review:  Honeslty it didn't taste THAT fresh.)
After split-join: Honeslty do taste fresh
Review:  The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.
After split-join: potato like rubber tell time keep warmer
Review:  The fries were great too.
After split-join: fry great
Review:  A great touch.
After split-join: great touch
Review:  Ser

In [169]:
len (x)

1000

In [256]:
model = Model(x, y)

Type of X_test:  <class 'scipy.sparse.csr.csr_matrix'>
Accuracy:  0.8066666666666666
Classification Report: 
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       148
           1       0.82      0.80      0.81       152

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300



In [238]:
model.predict ("I liked the food but service was ok.")

Type of predict_review:  <class 'scipy.sparse.csr.csr_matrix'>


array([0])

In [68]:
sa2 = SentimentAnalyzer ()
splits_series = sa2.preprocess ("I liked the food but service was not good.")

In [69]:
model2 = Model (splits_series)

Model pickle found.... Not training again....


In [38]:
model2.predict (splits_series)

Type of predict_review:  <class 'scipy.sparse.csr.csr_matrix'>


array([1])

In [39]:
sa = SentimentAnalyzer()

In [40]:
sa.split_on_conjunction ("I didn't like the food.")

[I didn't like the food.]

In [41]:
doc = sa.nlp ("Wow, it is good!")
for token in doc:
    print (token, token.pos_, token.pos, token.tag_)

Wow INTJ 91 UH
, PUNCT 97 ,
it PRON 95 PRP
is VERB 100 VBZ
good ADJ 84 JJ
! PUNCT 97 .


In [258]:
col = []
for item in model.feature_names:
    if len(item.split()) == 2:
        counter = 0
        for review in df['Review']:
            if item in review:
                counter += 1
        col.append ( (item, counter) )
    
for item in sorted (col, key=lambda x: x[1], reverse=True):
    print (item)

('good food', 7)
('30 min', 5)
('customer service', 5)
('eat place', 5)
('first time', 5)
('not good', 5)
('10 minute', 4)
('feel like', 4)
('great service', 4)
('not impress', 4)
('not impressed', 4)
('not like', 4)
('20 minute', 3)
('30 minute', 3)
('few place', 3)
('friendly staff', 3)
('go wrong', 3)
('good thing', 3)
('good way', 3)
('next time', 3)
('not recommend', 3)
('not worth', 3)
('off night', 3)
('place fo', 3)
('second time', 3)
('sweet potato', 3)
('zero star', 3)
('35 minute', 2)
('40 minute', 2)
('bad food', 2)
('beef sandwich', 2)
('beer selection', 2)
('brick oven', 2)
('double cheeseburger', 2)
('expect more', 2)
('family restaurant', 2)
('fine dining', 2)
('food arrive', 2)
('food suck', 2)
('get go', 2)
('good selection', 2)
('great experience', 2)
('great food', 2)
('great side', 2)
('hot dog', 2)
('ice cold', 2)
('ice cream', 2)
('little slow', 2)
('many other', 2)
('many time', 2)
('mediocre food', 2)
('much go', 2)
('much good', 2)
('multiple time', 2)
('next 