In [18]:
# imports
from __future__ import print_function
from time import time
import numpy as np
import pandas as pd
import pickle 
import random
import csv
import timeit
import os
import spacy
import string 

# Sklearn
from pprint import pprint
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# NLP libraries
import re
import matplotlib.pyplot as plt

# xml
from bs4 import BeautifulSoup

In [2]:
def generate_corpus(filename):
    """Build one text document per ``<review>`` element of an XML file.

    Each document is the concatenation of the ``.text`` of every ``<text>``
    element found inside the review, with a single space placed in front of
    each piece (so a non-empty document starts with a space).

    Parameters
    ----------
    filename : str
        Path of the XML file to parse.

    Returns
    -------
    list of str
        One string per review, in document order.
    """
    # Parse inside a context manager so the file handle is closed as soon as
    # BeautifulSoup has consumed it (the original left the handle open).
    with open(filename, "r") as handle:
        soup = BeautifulSoup(handle, "lxml")

    return [
        "".join(" " + sentence.text for sentence in review.find_all('text'))
        for review in soup.find_all('review')
    ]

In [3]:
def w2v(word, model, dim, oov_model=None):
    """Look up the vector of ``word``, falling back to zeros when unknown.

    Parameters
    ----------
    word : str
        Label to look up.
    model : pandas.DataFrame
        Primary embedding table; rows are addressed with ``.loc[word]``.
    dim : int
        Length of the zero vector returned when the word is found nowhere.
    oov_model : pandas.DataFrame, optional
        Secondary table consulted only when ``model`` has no row for ``word``.

    Returns
    -------
    numpy.ndarray
        ``.values`` of the matching row, or ``np.zeros((dim,))``.
    """
    for table in (model, oov_model):
        if table is None:
            continue
        try:
            return table.loc[word].values
        except (KeyError, TypeError):
            # Unknown (or unusable) label: try the next table. Catching only
            # lookup errors keeps Ctrl-C and genuine bugs visible, which the
            # previous bare ``except:`` silently swallowed.
            continue
    return np.zeros((dim,))

In [4]:
#TODO: replace split with tokenize

def get_target_embeddings(filename, glove_model, domain_model, use_fastText=False):
    """Embed every distinct opinion target found in an annotated XML file.

    For each distinct ``target`` attribute of the ``<opinion>`` elements, two
    vectors are produced with ``w2v``: one from ``glove_model`` (dimension
    300) and one from ``domain_model`` (dimension 100). A target made of
    several whitespace-separated tokens is represented by the element-wise
    maximum of its token vectors. Punctuation is stripped from both ends of
    every token before lookup. Non-zero vectors are scaled to unit L2 norm.

    Parameters
    ----------
    filename : str
        Path of the XML file. Every ``<opinion>`` must carry a ``target``
        attribute and a ``category`` attribute containing ``#``.
    glove_model, domain_model : pandas.DataFrame
        Embedding tables indexed by word.
    use_fastText : bool, default False
        When true, tokens missing from ``domain_model`` are written to
        ``./oov_words.txt``, the fastText binary is run to write their vectors
        to ``./oov.vec``, and that table is used as a fallback for the domain
        lookup.

    Returns
    -------
    words : list of str
        The distinct targets, sorted so the order is reproducible.
    categories : numpy.ndarray
        Label-encoded category prefix (text before ``#``) of each target.
    embeddings : list of numpy.ndarray
        Normalised vectors from ``glove_model``.
    dom_embeddings : list of numpy.ndarray
        Normalised vectors from ``domain_model`` (plus the OOV fallback).
    """
    punctuation_string = string.punctuation

    def tokens_of(target):
        # Whitespace tokens of the target; an empty/blank target is kept as a
        # single (unknown) token so it still yields a zero vector.
        return target.split() or [target]

    def pooled(target, model, dim, fallback=None):
        # Element-wise max over the token vectors of one target.
        vectors = [w2v(tok.strip(punctuation_string), model, dim, fallback)
                   for tok in tokens_of(target)]
        return np.max(vectors, axis=0)

    def unit(vec):
        # Scale to unit length; all-zero vectors are returned unchanged.
        norm = np.linalg.norm(vec)
        return vec / norm if norm != 0 else vec

    # Close the file as soon as it has been parsed.
    with open(filename, "r") as handle:
        soup = BeautifulSoup(handle, "lxml")

    target_to_cat = {}
    for opinion in soup.find_all('opinion'):
        category = opinion.get('category')
        # A target annotated with several categories keeps the last one seen.
        target_to_cat[opinion.get('target')] = category[:category.index("#")]

    # Sorted instead of raw set order so repeated runs give the same ordering.
    targets = sorted(target_to_cat)

    # Let us handle OOV words.
    oov_model = None
    if use_fastText:
        oov_words = set()
        for target in targets:
            for tok in tokens_of(target):
                # Store the *stripped* form: that is the key w2v looks up
                # later, so the fallback table must be indexed by it.
                word = tok.strip(punctuation_string)
                if word and word not in domain_model.index:
                    oov_words.add(word)

        # With nothing to look up fastText would write an empty file and
        # read_csv would fail, so the fallback is only built when needed.
        if oov_words:
            with open("./oov_words.txt", "w") as file:
                for word in sorted(oov_words):
                    file.write(word + "\n")

            start_time = timeit.default_timer()

            bashCommand = "./fastText-0.2.0/fasttext print-word-vectors ./data/embedding/restaurant_emb.vec.bin < ./oov_words.txt > ./oov.vec"
            os.system(bashCommand)

            elapsed = timeit.default_timer() - start_time
            print("Word vectors written, it took " + str(elapsed) + " seconds")
            # NOTE(review): skiprows=1 discards the first line of oov.vec. If
            # print-word-vectors emits no header line, that drops the first
            # OOV word - confirm against the actual file.
            oov_model = pd.read_csv("./oov.vec", sep=" ", header=None, index_col=0, skiprows=1)
            # presumably column 101 is an empty trailing column - TODO confirm
            oov_model = oov_model.drop(labels=101, axis=1)

    words = []
    categories = []
    embeddings = []
    dom_embeddings = []
    for i, target in enumerate(targets, start=1):
        emb = pooled(target, glove_model, 300)
        dom_emb = pooled(target, domain_model, 100, oov_model)
        embeddings.append(unit(emb))
        dom_embeddings.append(unit(dom_emb))
        words.append(target)
        categories.append(target_to_cat[target])
        print(str(i) + str(emb.shape) + " " + str(dom_emb.shape))

    le = LabelEncoder()
    return words, le.fit_transform(categories), embeddings, dom_embeddings

In [5]:
# Path of the annotated XML file that generate_corpus and
# get_target_embeddings are run on in the cells below.
filename = "./data/official_data/ABSA16_Restaurants_Train_SB1_v2.xml"

In [6]:
# Load the general embedding table: space-separated, no header row, first
# column used as the index. QUOTE_NONE makes quote characters in the file be
# read literally instead of as field delimiters.
# presumably 300 values per row (w2v is called with dim=300 for this table) - TODO confirm
glove_model = pd.read_csv("./data/embedding/gen.vec", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [7]:
# Load the domain embedding table: space-separated, first line skipped, first
# column used as the index.
# presumably the skipped line is a "<count> <dim>" header - TODO confirm
domain_model = pd.read_csv("./data/embedding/restaurant_emb.vec", sep=" ", header=None, index_col=0, skiprows=1)
# Drop the column labelled 101.
# presumably an all-NaN column produced by a trailing separator - TODO confirm
domain_model = domain_model.drop(labels=101, axis=1)

In [6]:
# One text document per review of `filename`.
# NOTE(review): doc_set is not referenced again in the visible cells.
doc_set = generate_corpus(filename)

In [11]:
# Embed every distinct opinion target of `filename`; with use_fastText=True
# this also writes ./oov_words.txt and ./oov.vec and shells out to fastText.
words, categories, embeddings, dom_embeddings = get_target_embeddings(filename, glove_model, domain_model, use_fastText=True)

Word vectors written, it took 8.619101969525218 seconds
1(300,) (100,)
2(300,) (100,)
3(300,) (100,)
4(300,) (100,)
5(300,) (100,)
6(300,) (100,)
7(300,) (100,)
8(300,) (100,)
9(300,) (100,)
10(300,) (100,)
11(300,) (100,)
12(300,) (100,)
13(300,) (100,)
14(300,) (100,)
15(300,) (100,)
16(300,) (100,)
17(300,) (100,)
18(300,) (100,)
19(300,) (100,)
20(300,) (100,)
21(300,) (100,)
22(300,) (100,)
23(300,) (100,)
24(300,) (100,)
25(300,) (100,)
26(300,) (100,)
27(300,) (100,)
28(300,) (100,)
29(300,) (100,)
30(300,) (100,)
31(300,) (100,)
32(300,) (100,)
33(300,) (100,)
34(300,) (100,)
35(300,) (100,)
36(300,) (100,)
37(300,) (100,)
38(300,) (100,)
39(300,) (100,)
40(300,) (100,)
41(300,) (100,)
42(300,) (100,)
43(300,) (100,)
44(300,) (100,)
45(300,) (100,)
46(300,) (100,)
47(300,) (100,)
48(300,) (100,)
49(300,) (100,)
50(300,) (100,)
51(300,) (100,)
52(300,) (100,)
53(300,) (100,)
54(300,) (100,)
55(300,) (100,)
56(300,) (100,)
57(300,) (100,)
58(300,) (100,)
59(300,) (100,)
60(300,) 

487(300,) (100,)
488(300,) (100,)
489(300,) (100,)
490(300,) (100,)
491(300,) (100,)
492(300,) (100,)
493(300,) (100,)
494(300,) (100,)
495(300,) (100,)
496(300,) (100,)
497(300,) (100,)
498(300,) (100,)
499(300,) (100,)
500(300,) (100,)
501(300,) (100,)
502(300,) (100,)
503(300,) (100,)
504(300,) (100,)
505(300,) (100,)
506(300,) (100,)
507(300,) (100,)
508(300,) (100,)
509(300,) (100,)
510(300,) (100,)
511(300,) (100,)
512(300,) (100,)
513(300,) (100,)
514(300,) (100,)
515(300,) (100,)
516(300,) (100,)
517(300,) (100,)
518(300,) (100,)
519(300,) (100,)
520(300,) (100,)
521(300,) (100,)
522(300,) (100,)
523(300,) (100,)
524(300,) (100,)
525(300,) (100,)
526(300,) (100,)
527(300,) (100,)
528(300,) (100,)
529(300,) (100,)
530(300,) (100,)
531(300,) (100,)
532(300,) (100,)
533(300,) (100,)
534(300,) (100,)
535(300,) (100,)
536(300,) (100,)
537(300,) (100,)
538(300,) (100,)
539(300,) (100,)
540(300,) (100,)
541(300,) (100,)
542(300,) (100,)
543(300,) (100,)
544(300,) (100,)
545(300,) (100

In [14]:
import pickle
# Persist the four results as one tuple so later sessions can skip the
# embedding step.
# NOTE(review): the handle is opened for writing although it is named pickle_in.
with open("words_cats_emb_dom_emb.pkl", "wb") as pickle_in:
    pickle.dump((words, categories, embeddings, dom_embeddings), pickle_in)

In [None]:
# Join each target's general vector and domain vector into a single vector
# (general part first), keeping the order of `words`.
concat_embeddings = [np.hstack((ge,de)) for ge,de in zip(embeddings, dom_embeddings)]

In [None]:
# Seed centroids for k-means: one per expected aspect cluster, each the mean
# of the concatenated embeddings of a few hand-picked target words.


def _seed_centroid(seed_words):
    """Average the rows of ``concat_embeddings`` belonging to ``seed_words``.

    Raises ``ValueError`` (from ``list.index``) if a seed word is not one of
    the extracted targets in ``words``.
    """
    return np.mean([concat_embeddings[words.index(w)] for w in seed_words], axis=0)


# food ('bread' was previously listed twice, which double-weighted it)
food_seed = _seed_centroid(['bread', 'burgers', 'noodles', 'rice', 'steak'])

# drinks ('noodles' was previously averaged in here, pulling the seed towards food)
drinks_seed = _seed_centroid(['wine', 'beer', 'coffee', 'drinks'])

# service
service_seed = _seed_centroid(['service', 'staff', 'crew', 'hostess', 'waitstaff'])

# ambience
ambience_seed = _seed_centroid(['ambience', 'decor', 'seating', 'atmosphere', 'furnishings'])

# everything else
misc_seed = _seed_centroid(['place', 'spot'])

In [None]:
from sklearn.cluster import KMeans, DBSCAN

# Cluster the concatenated embeddings, starting k-means from the hand-built
# seed centroids (one row per seed).
cluster_centers = np.asarray([food_seed, drinks_seed, service_seed, ambience_seed, misc_seed])
# With explicit initial centres there is only one possible initialisation, so
# n_init is pinned to 1 instead of relying on the library default.
kmeans = KMeans(
    n_clusters=len(cluster_centers),
    init=cluster_centers,
    n_init=1,
    random_state=0,
).fit(concat_embeddings)

In [None]:
from sklearn.manifold import TSNE

# Project the concatenated embeddings to 2-D for plotting. t-SNE is
# stochastic, so the seed is fixed to make the layout reproducible.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter -
# confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000, random_state=0)
tsne_results = tsne.fit_transform(np.asarray(concat_embeddings))

In [None]:
# Scatter plot of the t-SNE projection, coloured by k-means cluster.
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(1, 1, 1, title='tSNE' )
scatter = ax.scatter(
    x=tsne_results[:, 0],
    y=tsne_results[:, 1],
    c=kmeans.labels_,
    # Colormap passed by name; plt.cm.get_cmap is deprecated in recent
    # matplotlib releases.
    cmap='Paired')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
# A bare ax.legend() finds no labelled artists here; build one legend entry
# per distinct cluster label from the scatter itself.
ax.legend(*scatter.legend_elements(), title='cluster');

In [None]:
# Group the target words by the k-means cluster they were assigned to:
# cluster label -> list of words, in the order of `words`.
dic = {}
for word, label in zip(words, kmeans.labels_):
    dic.setdefault(label, []).append(word)
dic

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Compare the k-means assignment with the label-encoded annotated categories
# returned by get_target_embeddings.
adjusted_rand_score(categories, kmeans.labels_)

In [14]:
def parse_input(filename):
    """Parse an annotated review XML file into one DataFrame row per review.

    Parameters
    ----------
    filename : str
        Path of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        Columns:

        * ``restaurant_name`` - the part of the review's ``rid`` attribute
          captured by ``.*_(.*)_.*``; when ``rid`` does not match that
          pattern, the raw ``rid`` is used instead.
        * ``text`` - the stripped text of every ``<sentence>``, each prefixed
          with a single space and concatenated.
        * ``aspect_words`` - list of the ``target`` attribute of every
          ``<opinion>`` in the review.
        * ``aspect_categories`` - list of the ``category`` attributes, cut at
          the first ``#``.
        * ``polarities`` - list of the ``polarity`` attributes.
    """
    # Close the file as soon as it has been parsed.
    with open(filename, "r") as handle:
        soup = BeautifulSoup(handle, "lxml")

    rows = []
    for review in soup.find_all('review'):
        rid = review.get('rid') or ""
        # Fall back to the raw rid rather than crashing on a None match when
        # the identifier does not contain two underscores.
        match = re.match(r".*_(.*)_.*", rid)
        restaurant_name = match.group(1) if match else rid

        doc = ""
        aspect_words = []
        aspect_categories = []
        polarities = []
        for sentence in review.find_all('sentence'):
            doc += " " + sentence.text.strip()
            for opinion in sentence.find_all('opinion'):
                category = opinion.get('category')
                aspect_words.append(opinion.get('target'))
                aspect_categories.append(category[:category.index('#')])
                polarities.append(opinion.get('polarity'))
        rows.append((restaurant_name, doc, aspect_words, aspect_categories, polarities))

    return pd.DataFrame(rows, columns=['restaurant_name', 'text', 'aspect_words', 'aspect_categories', 'polarities'])

In [15]:
# One row per review of the gold-annotated test file.
df = parse_input("./data/official_data/EN_REST_SB1_TEST.xml.gold")

In [16]:
# Display the parsed reviews.
# NOTE(review): this renders the whole frame; df.head() would keep the output short.
df

Unnamed: 0,restaurant_name,text,aspect_words,aspect_categories,polarities
0,BlueRibbonSushi,Yum! Serves really good sushi. Not the bigges...,"[NULL, sushi, portions, Green Tea creme brulee...","[FOOD, FOOD, FOOD, FOOD, FOOD]","[positive, positive, neutral, positive, positive]"
1,BlueRibbonSushi,No Comparison – I can't say enough about this...,"[NULL, place, sushi, service, staff, restaurant]","[RESTAURANT, RESTAURANT, FOOD, SERVICE, SERVIC...","[positive, positive, positive, positive, posit..."
2,SchoonerOrLater,Snotty Attitude – We were treated very rudely...,"[NULL, NULL, owner]","[SERVICE, SERVICE, SERVICE]","[negative, negative, negative]"
3,SchoonerOrLater,Good food! – We love breakfast food. This is ...,"[food, meal, NULL, staff, onion rings, NULL]","[FOOD, FOOD, SERVICE, SERVICE, FOOD, FOOD]","[positive, positive, positive, positive, posit..."
4,PagodaRestaurant,Overrated – I was highly disappointed in the ...,"[NULL, food, lemon chicken, honey walnut prawn...","[RESTAURANT, FOOD, FOOD, FOOD, FOOD, AMBIENCE,...","[negative, negative, negative, negative, negat..."
5,ParkChaletGardenRestaurant,Worst Service I Ever Had – A group of 5 of us...,"[Service, service, waiter, manager, NULL]","[SERVICE, SERVICE, SERVICE, SERVICE, RESTAURANT]","[positive, negative, negative, negative, negat..."
6,MiopostoCaffe,Fabulous Italian Food! – I highly recommend M...,"[Italian Food, Mioposto, Italian restaurant, w...","[FOOD, RESTAURANT, RESTAURANT, DRINKS, FOOD]","[positive, positive, positive, positive, posit..."
7,Murphy's,I love this restaurant – I will never forget ...,"[restaurant, meal, service, ambiance, NULL, wi...","[RESTAURANT, FOOD, SERVICE, AMBIENCE, RESTAURA...","[positive, positive, positive, positive, posit..."
8,OpenSesame,Mmm... good! – Went there last night with a f...,"[NULL, food, food, food, Kafta plate, NULL]","[RESTAURANT, FOOD, FOOD, FOOD, FOOD, RESTAURANT]","[positive, positive, positive, positive, posit..."
9,Sage,Finally a meal that you will remember for a l...,"[meal, food, service, place, food, food]","[FOOD, FOOD, SERVICE, AMBIENCE, FOOD, FOOD]","[positive, positive, positive, positive, posit..."


In [9]:
#TODO: replace split with tokenize

def get_embeddings(words, glove_model, domain_model, use_fastText=False):
    """Return one concatenated (general + domain) vector per entry of ``words``.

    Each entry is looked up with ``w2v`` in ``glove_model`` (dimension 300)
    and in ``domain_model`` (dimension 100). An entry made of several
    whitespace-separated tokens is represented by the element-wise maximum of
    its token vectors, matching ``get_target_embeddings``. Punctuation is
    stripped from both ends of every token before lookup. Each non-zero
    vector is scaled to unit L2 norm before the two parts are joined.

    Parameters
    ----------
    words : iterable of str
        Words or phrases to embed. The iterable is not modified.
    glove_model, domain_model : pandas.DataFrame
        Embedding tables indexed by word.
    use_fastText : bool, default False
        When true, tokens missing from ``domain_model`` are written to
        ``./oov_words.txt``, the fastText binary is run to write their vectors
        to ``./oov.vec``, and that table is used as a fallback for the domain
        lookup.

    Returns
    -------
    list of numpy.ndarray
        One vector per input entry (general part first), in iteration order.
    """
    punctuation_string = string.punctuation
    # Materialise once so generators and sets are iterated consistently.
    words = list(words)

    def tokens_of(phrase):
        # Whitespace tokens; a blank phrase is kept as one (unknown) token.
        return phrase.split() or [phrase]

    def pooled(phrase, model, dim, fallback=None):
        # Element-wise max over the token vectors of one phrase.
        vectors = [w2v(tok.strip(punctuation_string), model, dim, fallback)
                   for tok in tokens_of(phrase)]
        return np.max(vectors, axis=0)

    def unit(vec):
        # Scale to unit length; all-zero vectors are returned unchanged.
        norm = np.linalg.norm(vec)
        return vec / norm if norm != 0 else vec

    # Let us handle OOV words.
    oov_model = None
    if use_fastText:
        oov_words = set()
        for phrase in words:
            for tok in tokens_of(phrase):
                # Store the stripped form - the key w2v looks up later.
                word = tok.strip(punctuation_string)
                if word and word not in domain_model.index:
                    oov_words.add(word)

        # Skip fastText entirely when nothing is missing; an empty output
        # file would make read_csv fail.
        if oov_words:
            with open("./oov_words.txt", "w") as file:
                for word in sorted(oov_words):
                    file.write(word + "\n")

            start_time = timeit.default_timer()

            bashCommand = "./fastText-0.2.0/fasttext print-word-vectors ./data/embedding/restaurant_emb.vec.bin < ./oov_words.txt > ./oov.vec"
            os.system(bashCommand)

            elapsed = timeit.default_timer() - start_time
            print("Word vectors written, it took " + str(elapsed) + " seconds")
            # NOTE(review): skiprows=1 discards the first line of oov.vec. If
            # print-word-vectors emits no header line, that drops the first
            # OOV word - confirm against the actual file.
            oov_model = pd.read_csv("./oov.vec", sep=" ", header=None, index_col=0, skiprows=1)
            # presumably column 101 is an empty trailing column - TODO confirm
            oov_model = oov_model.drop(labels=101, axis=1)

    combined = []
    for phrase in words:
        emb = unit(pooled(phrase, glove_model, 300))
        dom_emb = unit(pooled(phrase, domain_model, 100, oov_model))
        combined.append(np.hstack((emb, dom_emb)))
    return combined

In [32]:
# NOTE(review): `interact` is not imported in the visible import cell;
# presumably it is ipywidgets.interact, given the Dropdown widget in the
# recorded output - confirm and add the import.
@interact
def visualize_restaurant(x=df.restaurant_name.unique()):
    """Return the rows of ``df`` whose ``restaurant_name`` equals ``x``.

    The default value of ``x`` is the array of distinct restaurant names in
    ``df``. Only the first statement runs; everything after the ``return`` is
    an unreachable sketch of a planned clustering/plotting pipeline.
    """
    return df.loc[df['restaurant_name'] == x]
    
    # NOTE(review): unreachable from here on because of the return above.
    # Planned: replace the gold aspect words with model predictions.
    # NOTE(review): parse_input stores a list per row in 'aspect_words', so
    # set() over those lists would raise TypeError (unhashable list).
    aspect_words = set(df.loc[df['restaurant_name'] == x]['aspect_words'].to_list())
    
    # Planned: get the dual (general + domain) embeddings.
    # NOTE(review): get_embeddings also requires glove_model and domain_model.
    embeddings = get_embeddings(aspect_words)
    
    # Planned: cluster the embeddings.
    # NOTE(review): this fits on the notebook-level concat_embeddings, not on
    # the `embeddings` computed just above.
    kmeans = KMeans(n_clusters=5, random_state=0).fit(concat_embeddings)
    
    # Planned: find the sentiment of each cluster and plot it (not written yet).
    
    
    # Planned: t-SNE plot of the embeddings (same concat_embeddings caveat).
    tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000)
    tsne_results = tsne.fit_transform(concat_embeddings)       
    


interactive(children=(Dropdown(description='x', options=('BlueRibbonSushi', 'SchoonerOrLater', 'PagodaRestaura…

In [19]:
# Load the spaCy pipeline registered under the name 'en'.
# NOTE(review): short model names like 'en' are not accepted by newer spaCy
# releases - confirm the installed version or use the full package name.
nlp = spacy.load('en')

In [20]:
# Print every token of the first review next to its dependency label.
# The dangling `if token.dep_ == ""` line (no colon, no body) was a
# SyntaxError and has been removed; the recorded output lists all tokens,
# including the leading whitespace token whose label is empty.
doc = nlp(df.iloc[0].text)
for token in doc:
    print(str(token) + " | " + str(token.dep_))

  | 
Yum | ROOT
! | punct
Serves | nmod
really | advmod
good | amod
sushi | ROOT
. | punct
Not | neg
the | det
biggest | amod
portions | ROOT
but | cc
adequate | conj
. | punct
Green | compound
Tea | compound
creme | nsubj
brulee | appos
is | ROOT
a | det
must | attr
! | punct
Do | aux
n't | neg
leave | ROOT
the | det
restaurant | dobj
without | prep
it | pobj
. | punct
