In [4]:
import pandas as pd
import numpy as np
import spacy
import pickle
import string
import re
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [8]:
class Sentiment:
    def __init__ (self):
        self.nlp = spacy.load ("en_core_web_sm")
        self.neg_words = pickle.load (open("../neg_words.pkl", "rb"))
        self.stop_words = spacy.lang.en.STOP_WORDS
    
        # By default, not is a stopword
        if 'not' in self.stop_words:
            self.stop_words.remove ('not')
        
        self.stop_words.update (string.punctuation)
        self.split_pattern = re.compile (r"(\s|-)")
        
    def vectorizer (self, X):
        vect = TfidfVectorizer ()
        return vect.fit_transform (X)
    
    def train_model (self, X, y):
        #gaussian_model = GaussianNB()
        multinomial_model = MultinomialNB()
        
        self.models = [multinomial_model]
        
        X = self.vectorizer (X)
        
        X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=.1, random_state=7)
        
        for model in self.models:
            model.fit (X_train, y_train)
            predicted = model.predict (X_test)
            print (f"\t Confusion Matrix: {model}")
            print (confusion_matrix (y_test, predicted))
            
            print (f"\t Classification Matrix: {model}")
            print (classification_report (y_test, predicted))
            
            print ("------------ Over -----------------")

In [None]:
class 

In [6]:
class SentimentAnalyzer:
    def __init__ (self, reviews):
        self.nlp = spacy.load ('en_core_web_sm')
        self.neg_words = pickle.load (open ("neg_words.pkl", "rb"))
        self.neg_words.add ('not')
        
        self.split_pattern = re.compile (r"(\s|-)")
        self.reviews = [self.nlp(x.lower()) for x in reviews]
        
        self.stop_words = STOP_WORDS
        self.stop_words.update (string.punctuation)
        self.stop_words = set([x for x in self.stop_words if x not in self.neg_words])
        
        self.docs = []
    
    def drop_adverbs (self):
        for index in range(len(self.splits)):
            advbs = set()
            for t in self.splits[index]:
                if t.pos == 86 and t.text not in self.neg_words:
                    advbs.add (t.i)
            self.splits[index] = [x for x in self.splits[index] if x.i not in advbs]
            
    def drop_stopwords(self):
        # Remove all stop words that are present in the neg_words, TILL NOW: only 'not'
        for i in range (len (self.splits)):
            stpwrds = set ()
            for t in self.splits[i]:
                if t.text.lower() in self.stop_words:
                    stpwrds.add (t.i)
            self.splits[i] = [x for x in self.splits[i] if x.i not in stpwrds]

    def sentence_splitter (self):
        self.splits = []
        def splitter (sentence):
            start = 0
            counter = 0
            for token in sentence:
                if token.pos == 89 or token.text.strip() == ',':
                    if counter > start:
                        self.splits.append (sentence[start: counter])
                    start = counter + 1
                counter += 1
            #print (sentence[start: ])
            if len (sentence[start: counter]) > 0:
                self.splits.append (sentence[start: counter])
        
        for doc in self.reviews:
            for sent in doc.sents:
                #print ("Sentence: ", sent)
                splitter (sent)
        #return splits
        
    def feature_extraction (self):
        # To be done when splits have SPAN Objects
        # i.e. before any drop_*** methods()
        self.features = []
        for entry in self.splits:
            # For every entry, we need to pick out a noun and an adjective
            nouns=[]
            adjs=[]
            vbs = []
            for t in entry:
                if t.pos == 92 or t.pos == 96:
                    nouns.append (t)
                elif t.pos == 84:
                    adjs.append (t)
                elif t.pos == 100:
                    vbs.append (t)
            if len (adjs) == 0:
                adjs = vbs
            self.features.append ( 
                                        ( ', '.join(map(str, nouns)), ', '.join(map(str, adjs)) ),
                                  )
        
    def prepare(self):
        self.sentence_splitter()
        self.feature_extraction()
        self.drop_adverbs()
        
    def pprint (self):
        counter = 0
        for split in sa.splits:
            sent = ' '.join (map(str, split))
            print (sent)
            scores = sia.polarity_scores (sent)
            print (scores)
            print (f"ENTITY: {sa.features[counter][0]:<10} \tFEATURES: {sa.features[counter][1]:<10}", end="\t")
            if scores['compound'] > 0:
                print ("POSITIVE")
            elif scores['compound'] == 0:
                print ("NEUTRAL")
            else:
                print ("NEGATIVE");
            print()
            counter += 1
        print ()

In [51]:
df = pd.read_csv ("../Resturant_Reviews/Restaurant_Reviews.tsv", sep='\t')

In [52]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [53]:
X = df.iloc[:]['Review']
y = df.iloc[:]['Liked']http://codeforces.com/problemset/problem/158/B

In [54]:
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object

In [55]:
y.head()

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [56]:
sa = Sentiment()

In [57]:
sa.train_model (X, y)

	 Confusion Matrix: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
[[42 16]
 [10 32]]
	 Classification Matrix: MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
              precision    recall  f1-score   support

           0       0.81      0.72      0.76        58
           1       0.67      0.76      0.71        42

   micro avg       0.74      0.74      0.74       100
   macro avg       0.74      0.74      0.74       100
weighted avg       0.75      0.74      0.74       100

------------ Over -----------------


In [9]:
import spacy

In [10]:
nlp = spacy.load ("en_core_web_sm")

In [13]:
doc = nlp ("Food was amazing here.")
for token in doc:
    print (token, token.pos, token.pos_, token.tag_)

Food 92 NOUN NN
was 100 VERB VBD
amazing 84 ADJ JJ
here 86 ADV RB
. 97 PUNCT .
