In [1]:
import os
import sys
import spacy
import pandas as pd

from tqdm import tqdm

In [2]:
file_path = "datasets/Restaurant_reviews/Restaurants_Train_v2.csv"
# raw_data = pd.read_table(file_path, error_bad_lines=False, nrows=100)
raw_data = pd.read_csv(file_path)
print(raw_data.head())
reviews = raw_data[["id", "Sentence", "Aspect Term"]]
reviews.head()

     id                                           Sentence Aspect Term  \
0  3121               But the staff was so horrible to us.       staff   
1  2777  To be completely fair, the only redeeming fact...        food   
2  1634  The food is uniformly exceptional, with a very...        food   
3  1634  The food is uniformly exceptional, with a very...     kitchen   
4  1634  The food is uniformly exceptional, with a very...        menu   

   polarity  from   to  
0  negative     8   13  
1  positive    57   61  
2  positive     4    8  
3  positive    55   62  
4   neutral   141  145  


Unnamed: 0,id,Sentence,Aspect Term
0,3121,But the staff was so horrible to us.,staff
1,2777,"To be completely fair, the only redeeming fact...",food
2,1634,"The food is uniformly exceptional, with a very...",food
3,1634,"The food is uniformly exceptional, with a very...",kitchen
4,1634,"The food is uniformly exceptional, with a very...",menu


In [3]:
reviews.rename(columns={"id": "id", "Sentence": "text", "Aspect Term": "original_aspects"}, inplace=True)
reviews.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews.rename(columns={"id": "id", "Sentence": "text", "Aspect Term": "original_aspects"}, inplace=True)


Index(['id', 'text', 'original_aspects'], dtype='object')

In [4]:
# list all the aspects of a sentence in one column
results = {}
for row in reviews.itertuples():
    if row.text in results:
        results[row.text].append(row.original_aspects)
    else:
        results[row.text] = [row.original_aspects]

In [5]:
# proper formatting
all_reviews = []
for key, val in results.items():
    all_reviews.append({"review": key, "actual_aspects": list(set(val))}) # getting unique actual aspects

In [6]:
all_reviews[0]

{'review': 'But the staff was so horrible to us.', 'actual_aspects': ['staff']}

In [7]:
# !python3 -m spacy download en_core_web_lg

In [8]:
nlp = spacy.load("en_core_web_lg")

In [9]:
def apply_extraction(review_body, nlp):
    # review_body = row["Sentence"]
    # review_id = row["id"]
    
    doc = nlp(review_body)
    
    # rule1
    r1_pairs = []
    for token in doc:
        if token.dep_ == "amod":
            r1_pairs.append((token.head.text, token.text))
            
    # rule2
    r2_pairs = []
    for token in doc:
        A = ""
        M = ""
        for child in token.children:
            if child.dep_ == "nsubj":
                A = child.text
                
            if not M and child.dep_ == "dobj":
                M = child.text
            
            if not M and child.dep_ == "acomp":
                M = child.text
        if A and M:
            r2_pairs.append((A, M))
            
    # rule3
    r3_pairs = []
    prev_token_pos = ""
    for token in doc:
        # print(token, [i for i in token.children])
        A = ""
        M = ""
        for child in token.children:
            if token.pos_ == "NOUN" and child.dep_ == "prep":
                A = token.text
            # if prev_token_pos != "NOUN" and token.dep_ =="prep" and token.pos_ == "ADP" and child.dep_ == "pobj":
            #     A = child.text
        if A and not M:
            r3_pairs.append((A, M))
        prev_token_pos = token.pos_

            
    # rule4
    r4_pairs = []
    for token in doc:
        children = token.children
        A = ""
        M = ""
        for child in children:
            if child.dep_ == "nsubjpass":
                A = child.text
                
            if child.dep_ == "advmod":
                M = child.text
        
        if A and M:
            r4_pairs.append((A, M))
            
    # rule5
    r5_pairs = []
    for token in doc:
        children = token.children
        A = ""
        M = ""
        for child in children:
            if child.dep_ == "nsubj":
                A = child.text
                
            if child.dep_ == "cop":
                M = child.text
        
        if A and M:
            r5_pairs.append((A, token.text))
            
    # aspects = []
    aspects_pairs = r1_pairs + r2_pairs + r3_pairs + r4_pairs + r5_pairs
    # aspect_dict = {"review_id": review_id, "review_body": review_body, "aspect_pairs": aspects}
    return aspects_pairs

In [10]:
all_reviews[0], len(all_reviews)

({'review': 'But the staff was so horrible to us.',
  'actual_aspects': ['staff']},
 2019)

In [11]:
def extract_aspects(pairs):
    aspects = list(set([i[0] for i in pairs]))
    return aspects
# extract_aspects(all_reviews[0]["extracted_aspects"])

In [12]:
import math
from collections import Counter

def find_similarity(l1, l2):
    c1 = Counter(l1)
    c2 = Counter(l2)
    
    # cosine similarity
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [13]:
for data in tqdm(all_reviews):
    aspect_pairs = apply_extraction(data["review"], nlp)
    data["extracted_aspects"] = extract_aspects(aspect_pairs)
    try:
        data["similarity"] = find_similarity(data["actual_aspects"], data["extracted_aspects"])
    except ZeroDivisionError:
        data["similarity"] = 0
    if data["similarity"] >=0.9:
        data["check"] = True
    else:
        data["check"] = False

100%|████████████████████████████████| 2019/2019 [00:19<00:00, 101.56it/s]


In [14]:
all_reviews[0]

{'review': 'But the staff was so horrible to us.',
 'actual_aspects': ['staff'],
 'extracted_aspects': ['staff'],
 'similarity': 1.0,
 'check': True}

Check how many of them are correct if we use the rules given

In [16]:
# import os

# os.makedirs("output/")

In [17]:
pd.DataFrame(all_reviews).to_csv("output/extracted_aspects_v1.csv")

In [22]:
reviews.head()

Unnamed: 0,id,text,original_aspects
0,3121,But the staff was so horrible to us.,staff
1,2777,"To be completely fair, the only redeeming fact...",food
2,1634,"The food is uniformly exceptional, with a very...",food
3,1634,"The food is uniformly exceptional, with a very...",kitchen
4,1634,"The food is uniformly exceptional, with a very...",menu


In [9]:
from sklearn import cluster
from collections import defaultdict

In [10]:
NUM_CLUSTERS = 4
aspects = []
for review in review_decomp:
    aspect_pairs = review["aspect_pairs"]
    for noun, adj in aspect_pairs:
        # print(noun, adj)
        aspects.append(noun)
        
unique_aspects = list(set(aspects))

In [11]:
len(unique_aspects)

64

In [12]:
unique_aspects

['deficiencies',
 'design',
 'you',
 'stomach',
 'spot',
 'mussels',
 'factor',
 'temperatures',
 'experience',
 'Bagels',
 'people',
 'bagels',
 'price',
 'menu',
 'decor',
 'choices',
 'perks',
 'seats',
 'all',
 'atmosphere',
 'money',
 'rice',
 'size',
 'dressing',
 'food',
 'favorite',
 'selection',
 'variety',
 'decoration',
 'staff',
 'list',
 'dinner',
 'It',
 'I',
 'kitchen',
 'me',
 'waiters',
 'pizza',
 'desert',
 'concept',
 'taste',
 'Faan',
 'quantity',
 'lovers',
 'above',
 'dishes',
 'broth',
 'vibe',
 'restaurants',
 'They',
 'setting',
 'meal',
 'texture',
 'wine',
 'questions',
 'benches',
 'owner',
 'we',
 'which',
 'prices',
 'He',
 'service',
 'floor',
 'place']

In [20]:
aspects_map = defaultdict(int)
for asp in aspects:
    aspects_map[asp] += 1

In [24]:
asp_vectors = []
for aspect in unique_aspects:
    token = nlp(aspect)
    asp_vectors.append(token.vector)
    
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(asp_vectors)
labels = kmeans.labels_
asp_to_cluster_map = dict(zip(unique_aspects, labels))
print(asp_to_cluster_map)

{'benches': 2, 'food': 0, 'questions': 2, 'taste': 0, 'prices': 2, 'place': 2, 'concept': 2, 'setting': 2, 'spot': 0, 'rice': 0, 'above': 2, 'dinner': 0, 'selection': 2, 'dishes': 0, 'experience': 2, 'you': 3, 'decoration': 2, 'Faan': 0, 'I': 3, 'He': 1, 'variety': 2, 'mussels': 0, 'staff': 2, 'which': 2, 'lovers': 0, 'bagels': 0, 'decor': 0, 'choices': 2, 'desert': 0, 'we': 3, 'vibe': 0, 'money': 2, 'size': 2, 'quantity': 2, 'menu': 0, 'perks': 0, 'wine': 0, 'floor': 2, 'pizza': 0, 'broth': 0, 'deficiencies': 2, 'factor': 2, 'people': 2, 'list': 2, 'favorite': 0, 'restaurants': 0, 'price': 2, 'me': 3, 'owner': 0, 'all': 2, 'Bagels': 0, 'They': 2, 'service': 2, 'design': 2, 'meal': 0, 'stomach': 0, 'waiters': 0, 'seats': 2, 'kitchen': 0, 'atmosphere': 2, 'dressing': 2, 'temperatures': 2, 'It': 2, 'texture': 2}


In [28]:
cluster_map = defaultdict()
cluster_to_asp_map = defaultdict()
for i in range(NUM_CLUSTERS):
    cluster_nouns = [k for k, v in asp_to_cluster_map.items() if v==i]
    freq_map = {k:v for k, v in aspects_map.items() if k in cluster_nouns}
    freq_map = sorted(freq_map.items(), key=lambda x: x[1], reverse=True)
    cluster_map[i] = freq_map[0][0]
    cluster_to_asp_map[i] = cluster_nouns

In [30]:
cluster_to_asp_map

defaultdict(None,
            {0: ['food',
              'taste',
              'spot',
              'rice',
              'dinner',
              'dishes',
              'Faan',
              'mussels',
              'lovers',
              'bagels',
              'decor',
              'desert',
              'vibe',
              'menu',
              'perks',
              'wine',
              'pizza',
              'broth',
              'favorite',
              'restaurants',
              'owner',
              'Bagels',
              'meal',
              'stomach',
              'waiters',
              'kitchen'],
             1: ['He'],
             2: ['benches',
              'questions',
              'prices',
              'place',
              'concept',
              'setting',
              'above',
              'selection',
              'experience',
              'decoration',
              'variety',
              'staff',
              'which',
            