### Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

import spacy
from spacy import displacy
import xml.etree.ElementTree as ET

import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
nlp= spacy.load("en_core_web_sm")

# Text preprocessing, tokenizing and filtering of stopwords
# CountVectorizer supports counts of N-grams of words or consecutive characters. Once fitted, the vectorizer has built a 
# dictionary of feature indices
from sklearn.feature_extraction.text import CountVectorizer 
# To transform a count matrix to a normalized tf or tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset

from collections import Counter, defaultdict
import re
import gensim
from gensim import models

import warnings
warnings.filterwarnings('ignore')



### Model Building to Predict Aspects 

In [3]:
tree = ET.parse("Restaurants_Train.xml", ET.XMLParser(encoding= "utf-8"))
root = tree.getroot()

In [6]:
labeled_reviews = []
for sentence in root.findall("sentence"):
    entry = {}
    aterms = []
    aspects = []
    sentiment = []
    if sentence.find("aspectTerms"):
        for aterm in sentence.find("aspectTerms").findall("aspectTerm"):
            aterms.append(aterm.get("term"))
    if sentence.find("aspectCategories"):
        for aspect in sentence.find("aspectCategories"):
            aspects.append(aspect.get("category"))
        for aspect in sentence.find("aspectCategories"):
            sentiment.append(aspect.get("polarity"))
            
    entry["text"], entry["terms"], entry["aspects"], entry["sentiment"] = sentence[0].text, aterms, aspects, sentiment
    labeled_reviews.append(entry)
labeled_df = pd.DataFrame(labeled_reviews)
print("We have", len(labeled_reviews), "labeled reviews.")

We have 3044 labeled reviews.


In [7]:
# Save annotated reviews in a pickle file
labeled_df.to_pickle("annotated_reviews_df.pkl")
labeled_df.head(10)

Unnamed: 0,text,terms,aspects,sentiment
0,But the staff was so horrible to us.,[staff],[service],[negative]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","[positive, negative]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],[positive]
3,Where Gabriela personaly greets you and recomm...,[],[service],[positive]
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous],[positive]
5,"Not only was the food outstanding, but the lit...","[food, perks]","[food, service]","[positive, positive]"
6,It is very overpriced and not very tasty.,[],"[food, price]","[negative, negative]"
7,Our agreed favorite is the orrechiete with sau...,"[orrechiete with sausage and chicken, waiters,...","[food, service]","[positive, positive]"
8,The Bagels have an outstanding taste with a te...,[Bagels],[food],[positive]
9,Nevertheless the food itself is pretty good.,[food],[food],[positive]


In [8]:
# Read annotated reviews df -> labeled dataset for training
annotated_reviews_df = pd.read_pickle("annotated_reviews_df.pkl")
annotated_reviews_df.head()

Unnamed: 0,text,terms,aspects,sentiment
0,But the staff was so horrible to us.,[staff],[service],[negative]
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","[positive, negative]"
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],[positive]
3,Where Gabriela personaly greets you and recomm...,[],[service],[positive]
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous],[positive]


In [11]:
# Convert the multi-labels into arrays
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(annotated_reviews_df.aspects) # aspects
X = annotated_reviews_df["text"] # reviews

# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size= 0.25, random_state= 0)

# save the the fitted binarizer labels
filename = 'mlb.pkl'
pickle.dump(mlb, open(filename, 'wb'))

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2283,)
(761,)
(2283, 5)
(761, 5)


In [17]:
# LabelPowerset allows for multi-label classification
# Build a pipeline for multinomial naive bayes classification
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Calculate accuracy
np.mean(predicted == y_test)

0.8662286465177398

In [19]:
# Test if SVM performs better
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', LabelPowerset(
                             SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=6, random_state=42)))])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

#Calculate accuracy
np.mean(predicted_svm == y_test)

0.8633377135348226

In [20]:
# Train naive bayes on full dataset and save model
text_clf = Pipeline([('vect', CountVectorizer(stop_words = "english",ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),])
text_clf = text_clf.fit(X, y)

# save the model to disk
filename = 'naive_model1.pkl'
pickle.dump(text_clf, open(filename, 'wb'))

In [21]:
#mlb.inverse_transform(predicted)
pred_df = pd.DataFrame(
    {'reviews': X_test,
     'pred_category': mlb.inverse_transform(predicted)
    })
pred_df

### Using Glove Embeddings and SpaCY to Find Sentiment Scores across Aspects

In [4]:
# Loading positive and negative words

neg_file = open("neg_words.txt", encoding = "ISO-8859-1")
pos_file = open("pos_words.txt", encoding = "ISO-8859-1")

neg = [line.strip() for line in neg_file.readlines()] # Readlines returns a list of the lines in the file
pos = [line.strip() for line in pos_file.readlines()]

opinion_words = neg + pos

In [None]:
# Run below code only if running for the first time

# Word2Vec consists of models for generating word embedding. 
# Words that occur in similar context tend to be closer to each other in vector space
 
# glove_input_file = 'glove.6B.100d.txt' # A pre-trained model for sentiment analysis
# glove_vec_file = 'glove.6B.100d.txt.word2vec'

# word2vec = gensim.models.KeyedVectors.load_word2vec_format(glove_vec_file, binary= False)
# KeyedVectors:  a mapping between keys and vectors.

#pickle.dump(word2vec, open("word2vec_glove.pkl", "wb"))

In [None]:
# load above saved word embedding
word2vec = pickle.load(open("./word2vec_google.pkl", "rb"))

# load the multi label binarizer from the aspect model that we've build above
mlb = pickle.load(open("mlb.pkl", "rb"))

# load the fitted Naive Bayes Model
naive_model1 = pickle.load(open("naive_model1.pkl", "rb"))

In [5]:
# Classes in Multi Label Binarizer
mlb.classes_

array(['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service'],
      dtype=object)

In [6]:
def check_similarity(aspects, word):
    # checks for similarity between the aspect and given word and returns the most similar aspect for the given word
    similarity = []
    for aspect in aspects:
        similarity.append(word2vec.n_similarity([aspect], [word]))
        # setting throeshold for maximum similarity
    if max(similarity) > .20:
        return aspects[np.argmax(similarity)] # Returns the indices of the maximum value
    else:
        return None
    
    
def assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred):
    
    # This function takes in a Sentiment dictionary and appends the aspect dictionary 
    # aspect_sent: Total sentiment tally
    # terms_dict: Dictionary with individual aspects and their associated sentiments
    # sent_dict: Counter of the form : Counter(term: sentiment score)
    # returns two types of aspect dictionaries: updated terms_dict and aspect_sent
    
    aspects = ["ambience", "food", "price", "service"]
    
    # checking word2vec
    
    for term in sent_dict:
        try:
            # Conditions for when to use the NB Classifier by default vs word2vec
            if check_similarity(aspects, term.split()[-1]): # use .split() because word2vec can't process Compund Nouns
                terms_dict[check_similarity(aspects, term.split()[-1])][term] += 1
                if sent_dict[term] > 0:
                    aspect_sent[check_similarity(aspects, term.split)]["pos"] += sent_dict[term]
                else:
                    aspect_sent[check_similarity(aspects, term.split)]["neg"] += abs(sent_dict[term])
            elif(pred[0] ==  "anecdotes/miscellaneous"):
                continue
            elif(len(pred) == 1):
                terms_dict[pred[0]][term] += 1
                if sent_dict[term] > 0:
                    aspect_sent[pred[0]]["pos"] += sent_dict[term]
                else:
                    aspect_sent[pred[0]]["neg"] += abs(sent_dict[term])
                    
            # if unable to classify via NB Classifier or word2vec and then put it in miscellaneous bucket
            else:
                terms_dict["misc"][term] += 1
                if sent_dict[term] > 0:
                    aspect_sent["misc"]["pos"] += sent_dict[term]
                else:
                    aspect_sent["misc"]["neg"] += abs(sent_dict[term])
                    
        except:
            continue
    return aspect_sent, terms_dict


def feature_sentiment(sentence):
    # input: dictionary and sentence
    # this function appends dictionary with new features if features didn't exist previously then updates sentiments
    # to each of the new and oexistingld features
    # returns updated dictionary
    
    sent_dict = Counter()
    sentence = nlp(sentence)
    debug = 0
    for token in sentence:
        # check if word is an opinion word then assign a sentiment
        if token.text in opinion_words:
            sentiment = 1 if token.text in pos else -1
            # if target is an adverb modifier (eg: pretty, highly, etc.) but also an opinion word, ignore and pass
            if token.dep_ == "advmod":
                continue
            elif token.dep_ == "amod": # opinion words that are adjectives, verbs, adverbs, etc.
                sent_dict[token.head.text] += sentiment
                
            else:
                for child in token.children:
                    #  It checks for child tokens for each adjective and picks up the adverbs
                    # if there is an adjective modifier (eg: pretty, very, etc.), then add more weight to sentiment
                    if((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # add negative sentiment for negative words
                    if child.dep_ == "neg":
                        sentiment *= -1
                    
                for child in token.children:
                    # if it's a verb then check if it's a direct object
                    # direct object is the noun or noun phrase that's receiving the action of the verb
                    if (token.pos_ == "VERB") & (child.dep_ == "dobj"):
                        sent_dict[child.text] += sentiment
                    
                        # check for conjugates (both a and b) and add them to dictionary
                        subchildren = []
                        conj = 0
                        for subchild in child.children:
                            if subchild.text == "and":
                                conj = 1
                            if (conj == 1) and (subchild.text != "and"):
                                subchildren.append(subchild.text)
                                conj = 0
                        for subchild in subchildren:
                            sent_dict[subchild] += sentiment
                            
                # check for negation
                for child in token.head.children:
                    noun = ""
                    if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                        sentiment *= 1.5
                    # check for negation words and flip the sign of sentiment
                    if (child.dep_ == "neg"): 
                        sentiment *= -1
                        
                # check for nouns
                for child in token.head.children:
                    noun = ""
                    if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                        noun = child.text
                        # Also, check for compound nouns
                        for subchild in child.children:
                            if subchild.dep_ == "compound":
                                noun = subchild.text + " " + noun
                        sent_dict[noun] += sentiment
                    debug += 1
                
    return sent_dict

def classify_and_sent(sentence, aspect_sent, terms_dict):
    # classifies the sentence into a category and assign a sentiment
    # aspect_dict: parent dictionary with all aspects
    # input the sentence and aspect dictionary which is going to be updated
    # output will be the updated aspect dictionary
    
    # classify the sentence using NB Classifier
    predicted = naive_model1.predict([sentence])
    pred = mlb.inverse_transform(predicted)

    # this will take your labels and transform them back to the classes with the encoding.     
    # get aspect names and repective sentiments in dictionary form
    sent_dict = feature_sentiment(sentence)
    
    # categorize the aspect names into given 4 aspects in aspect_dict
    aspect_sent, terms_dict = assign_term_to_aspect(aspect_sent, terms_dict, sent_dict, pred[0])
    return aspect_sent, terms_dict

def split_sentence(text):
    # splits review into list of sentences using spacy's sentence parser
    
    review = nlp(text)
    bag_sentence = []
    start = 0
    for token in review:
        if token.sent_start == 1:
            bag_sentence.append(review[start: (token.i-1)])
            start = token.i # index
            
        if token.i == len(review)-1:
            bag_sentence.append(review[start: (token.i+1)])
    return bag_sentence

# remove special characters using regex
def remove_special_char(sentence):
    return re.sub(r"[^a-zA-Z0-9.',:;?]+", " ", sentence)

def review_pipe(review, aspect_sent, terms_dict= {"ambience": Counter(), "food": Counter(), "price": Counter(), 
                                                  "service": Counter(), "misc": Counter()}):
    sentences = split_sentence(review)
    for sentence in sentences:
        sentence = remove_special_char(str(sentence))
        aspect_sent, terms_dict = classify_and_sent(sentence.lower(), aspect_sent, terms_dict)
    return aspect_sent, terms_dict

#### Test Codes

In [8]:
# test code for feature_sentiment

sentence= "I came here with my friends on a Tuesday night. The sushi here is amazing. Our waiter was very helpful, but the music was terrible."
feature_sentiment(sentence)

Counter({'sushi': 1, 'waiter': 1, 'music': -1})

In [9]:
# test code for review_pipe

terms_dict = {"ambience": Counter(), "food": Counter(), "price": Counter(), "service": Counter(), "misc": Counter()}
aspect_sent = {"ambience": Counter(), "food": Counter(), "price": Counter(), "service": Counter(), "misc": Counter()}
review = "Our waiter was not very helpful, and the music was terrible."
review_pipe(review, aspect_sent, terms_dict)

({'ambience': Counter(),
  'food': Counter(),
  'price': Counter(),
  'service': Counter(),
  'misc': Counter()},
 {'ambience': Counter({'music': 1}),
  'food': Counter({'waiter': 1}),
  'price': Counter(),
  'service': Counter(),
  'misc': Counter()})

In [11]:
## Test code for split_sentence

split_sentence("I came here with my friends on a Tuesday night. The sushi here is amazing. Our waiter was very helpful, but the music was terrible.")

[I came here with my friends on a Tuesday night,
 The sushi here is amazing,
 Our waiter was very helpful, but the music was terrible.]

### Restaurant Ratings

### User Weights

### Combined Dataset