# Development of a Noun-Adjective Pair Summarizer

In [None]:
import nltk
import spacy
import pandas as pd
import ast
import tqdm as tqdm
import heapq
from collections import Counter

In [None]:
# Load the review dataset; later cells read its 'business_id' and 'text' columns.
df = pd.read_csv("reviews.csv")

In [None]:
def debug_sentence(sentence, nlp=None):
    """Print the dependency-parse neighbourhood of every token in *sentence*.

    For each token this prints its text / coarse POS / fine-grained tag /
    dependency label, the same four fields for its head, and then one line
    per child and per ancestor.

    Args:
        sentence: Text to parse.
        nlp: Optional callable that turns text into a parsed document
            (e.g. an already loaded spaCy pipeline). When omitted, the
            "en_core_web_sm" pipeline is loaded on first use and cached on
            this function, so repeated calls do not reload the model.

    Returns:
        None. All output goes to stdout.
    """
    if nlp is None:
        # Loading a spaCy model is expensive; do it once and reuse it.
        nlp = getattr(debug_sentence, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            debug_sentence._nlp = nlp

    doc = nlp(sentence)
    for k, token in enumerate(doc):
        print(f"TOKEN {k}: {token.text, token.pos_, token.tag_, token.dep_}")
        print(f"HEAD: {token.head.text, token.head.pos_, token.head.tag_, token.head.dep_}")
        print("CHILDREN:")
        for child in token.children:
            print(child.text, child.pos_, child.tag_, child.dep_)
        print("ANCESTORS:")
        for ancestor in token.ancestors:
            print(ancestor.text, ancestor.pos_, ancestor.tag_, ancestor.dep_)
        print("\n")

In [None]:
# Note token has attribute 'i' that denotes its index in the document

# Fine-grained tags (``token.tag_``) treated as nouns and adjectives.
_NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
_ADJ_TAGS = ('JJ', 'JJR', 'JJS')
# Surface forms of the negation particle. spaCy splits contractions such as
# "isn't" into "is" + "n't", so the clitic must be matched explicitly.
_NEGATION_WORDS = ('not', "n't", 'n’t')


def _toggle_negation(noun_adj_pair):
    """Flip the 'not ' prefix on the adjective text of one pair, in place."""
    adj_text = noun_adj_pair[1][1]
    if adj_text.startswith('not '):
        noun_adj_pair[1][1] = adj_text[4:]
    else:
        noun_adj_pair[1][1] = 'not ' + adj_text


def _first_noun_adj_sibling_pair(children):
    """Return [noun_index, adj_index] for the first noun/adjective pair of siblings.

    Siblings are scanned in the same (i, j) order used to build the pairs;
    ``None`` is returned when no sibling pair mixes a noun and an adjective.
    """
    for pos, first in enumerate(children):
        for second in children[pos + 1:]:
            if first.tag_ in _ADJ_TAGS and second.tag_ in _NOUN_TAGS:
                return [second.i, first.i]
            if first.tag_ in _NOUN_TAGS and second.tag_ in _ADJ_TAGS:
                return [first.i, second.i]
    return None


def get_noun_adjective_pairs_3(sentence, nlp=None):
    """Extract (noun, adjective) pairs from *sentence* using its dependency tree.

    Steps:
      1. Pair every noun with its adjective children, and pair nouns with
         adjectives that share the same head (siblings).
      2. Propagate pairs to adjectives whose head is an already paired
         adjective (e.g. conjuncts joined by 'and'), repeating until no new
         pair is found.
      3. Apply negation: a 'not' / "n't" token attached to an adjective
         toggles a 'not ' prefix on every pair using that adjective;
         otherwise it toggles the first noun/adjective sibling pair found
         under its head.

    Args:
        sentence: Text of a single sentence.
        nlp: Optional callable that turns text into a parsed document
            (e.g. an already loaded spaCy pipeline). When omitted, the
            "en_core_web_sm" pipeline is loaded on first use and cached on
            this function, so repeated calls do not reload the model.

    Returns:
        List of ``(noun_text, adjective_text)`` tuples, lower-cased, where a
        negated adjective is rendered as ``'not <adjective>'``.
    """
    if nlp is None:
        # Loading a spaCy model is expensive; do it once and reuse it.
        nlp = getattr(get_noun_adjective_pairs_3, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            get_noun_adjective_pairs_3._nlp = nlp

    doc = nlp(sentence)
    noun_adj_pairs = []  # [[[noun_index, noun_text], [adj_index, adj_text]], ...]

    # Intuition: look for noun tokens and adjectives in local subtrees.
    for token in doc:
        children = list(token.children)
        if token.tag_ in _NOUN_TAGS:
            for child in children:
                if child.tag_ in _ADJ_TAGS:
                    noun_adj_pairs.append([[token.i, token.text.lower()],
                                           [child.i, child.text.lower()]])
        for pos, first in enumerate(children):
            for second in children[pos + 1:]:
                if first.tag_ in _NOUN_TAGS and second.tag_ in _ADJ_TAGS:
                    noun, adj = first, second
                elif first.tag_ in _ADJ_TAGS and second.tag_ in _NOUN_TAGS:
                    noun, adj = second, first
                else:
                    continue
                noun_adj_pairs.append([[noun.i, noun.text.lower()],
                                       [adj.i, adj.text.lower()]])

    # Accounting for 'and': an unpaired adjective whose head is a paired
    # adjective inherits the noun of the first pair using that head.
    noun_of_adj = {}  # adjective index -> noun index of its first pair
    for noun_part, adj_part in noun_adj_pairs:
        noun_of_adj.setdefault(adj_part[0], noun_part[0])
    changed = True
    while changed:
        # Repeat until a full pass adds nothing: a head adjective may only
        # become paired later in the same pass.
        changed = False
        for token in doc:
            if (token.tag_ in _ADJ_TAGS
                    and token.i not in noun_of_adj
                    and token.head.i in noun_of_adj):
                noun_index = noun_of_adj[token.head.i]
                noun_of_adj[token.i] = noun_index
                noun_adj_pairs.append([[noun_index, doc[noun_index].text.lower()],
                                       [token.i, token.text.lower()]])
                changed = True

    # Accounting for 'not'.
    for token in doc:
        if token.text.lower() not in _NEGATION_WORDS:
            continue
        if token.head.tag_ in _ADJ_TAGS:
            # Case one: the negation hangs directly off an adjective.
            for noun_adj_pair in noun_adj_pairs:
                if noun_adj_pair[1][0] == token.head.i:
                    _toggle_negation(noun_adj_pair)
            continue
        # Case two: the negation hangs off something else (typically a verb);
        # negate only the first noun-adjective pair found under that head.
        to_negate = _first_noun_adj_sibling_pair(list(token.head.children))
        if to_negate is None:
            continue
        for noun_adj_pair in noun_adj_pairs:
            if noun_adj_pair[0][0] == to_negate[0] and noun_adj_pair[1][0] == to_negate[1]:
                _toggle_negation(noun_adj_pair)

    return [(noun_part[1], adj_part[1]) for noun_part, adj_part in noun_adj_pairs]

In [None]:
def get_noun_adjective_pairs_2(sentence, nlp=None):
    """Extract (noun, adjective) pairs from *sentence* using coarse POS tags.

    Intuition: look for noun tokens and adjectives in local subtrees.
    A NOUN token is paired with each of its ADJ children. Any other token
    that has both a NOUN child and an ADJ child contributes one pair made of
    its first NOUN child and its first ADJ child.

    Args:
        sentence: Text of a single sentence.
        nlp: Optional callable that turns text into a parsed document
            (e.g. an already loaded spaCy pipeline). When omitted, the
            "en_core_web_sm" pipeline is loaded on first use and cached on
            this function, so repeated calls do not reload the model.

    Returns:
        List of ``(noun_text, adjective_text)`` tuples, lower-cased.
    """
    if nlp is None:
        # Loading a spaCy model is expensive; do it once and reuse it.
        nlp = getattr(get_noun_adjective_pairs_2, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            get_noun_adjective_pairs_2._nlp = nlp

    doc = nlp(sentence)
    noun_adj_pairs = []

    for token in doc:
        # Materialise once: the children are inspected several times below.
        children = list(token.children)
        if token.pos_ == 'NOUN':
            noun_adj_pairs.extend(
                (token.text.lower(), child.text.lower())
                for child in children
                if child.pos_ == 'ADJ'
            )
            continue
        first_noun = next((child for child in children if child.pos_ == 'NOUN'), None)
        first_adj = next((child for child in children if child.pos_ == 'ADJ'), None)
        if first_noun is not None and first_adj is not None:
            noun_adj_pairs.append((first_noun.text.lower(), first_adj.text.lower()))

    return noun_adj_pairs

In [None]:
def _nearest_adjective(doc, noun_index):
    """Return the 'JJ' token closest to position *noun_index*, or None.

    Positions are probed at growing distance, right side before left side.
    A side that has run past the document edge is skipped rather than
    ending the search, so an adjective on the other side is still found.
    """
    doc_len = len(doc)
    for distance in range(1, doc_len):
        for candidate in (noun_index + distance, noun_index - distance):
            if 0 <= candidate < doc_len and doc[candidate].tag_ == 'JJ':
                return doc[candidate]
    return None


def get_noun_adjective_pairs(sentence, nlp=None):
    """Pair each singular noun ('NN') in *sentence* with its nearest adjective ('JJ').

    Proximity is measured in token positions only (no dependency parse is
    consulted); on a tie the adjective to the right of the noun wins.
    Nouns with no adjective anywhere in the sentence produce no pair.

    Args:
        sentence: Text of a single sentence.
        nlp: Optional callable that turns text into an indexable sequence of
            tagged tokens (e.g. an already loaded spaCy pipeline). When
            omitted, the "en_core_web_sm" pipeline is loaded on first use
            and cached on this function, so repeated calls do not reload
            the model.

    Returns:
        List of ``(noun_text, adjective_text)`` tuples, lower-cased.
    """
    if nlp is None:
        # Loading a spaCy model is expensive; do it once and reuse it.
        nlp = getattr(get_noun_adjective_pairs, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            get_noun_adjective_pairs._nlp = nlp

    doc = nlp(sentence)
    noun_adj_pairs = []
    for i, token in enumerate(doc):
        if token.tag_ != 'NN':
            continue
        adjective = _nearest_adjective(doc, i)
        if adjective is not None:
            noun_adj_pairs.append((token.text.lower(), adjective.text.lower()))
    return noun_adj_pairs

In [None]:
# Count noun-adjective pairs per business over a random sample of businesses.
# business_ids = ['R4R7ttLXfKKWM0VEMoaW4w', '0kPm1zEpeXFRg8D2phqgCQ', '6RbCJLiwNYwS6ab9vzD_zg',\
#                 'mF2EW3twSrFPmT_RVV1-Qg', 'caq9CTtWB-8K0tdFUhTfAQ']
# Sample from the distinct ids: sampling review rows directly can draw the
# same business more than once and so yield fewer than five businesses.
unique_business_ids = df['business_id'].drop_duplicates()
business_ids = unique_business_ids.sample(min(5, len(unique_business_ids))).tolist()
id_pair_counts = {}  # business_id -> {(noun, adjective): count}
df2 = df[df["business_id"].isin(business_ids)]
df2 = df2.reset_index(drop=True)

total_reviews = len(df2)
for i, row in df2.iterrows():
    print(f"{i}/{total_reviews}")
    # df2 only holds sampled businesses, so no membership re-check is needed.
    business_counts = id_pair_counts.setdefault(row['business_id'], {})
    review = row['text']
    if not isinstance(review, str):
        # Presumably a missing review (pandas reads empty cells as NaN);
        # sent_tokenize only accepts strings.
        continue
    noun_adj_pairs = []
    for sentence in nltk.tokenize.sent_tokenize(review):
        noun_adj_pairs += get_noun_adjective_pairs_3(sentence)
    row_pair_counts = dict(Counter(noun_adj_pairs))
    print(row_pair_counts)
    for pair, count in row_pair_counts.items():
        business_counts[pair] = business_counts.get(pair, 0) + count

In [None]:
# For every business keep its five most frequent (pair, count) entries,
# ordered from most to least frequent.
id_top_5 = {}
for business_id, pair_counts in id_pair_counts.items():
    id_top_5[business_id] = heapq.nlargest(
        5,
        pair_counts.items(),
        key=lambda entry: entry[1],  # rank by count
    )

In [None]:
# Write the top pairs of every business to a plain-text report.
# The pairs come from user-written review text, so pin the encoding instead
# of relying on the platform default (which may not cover every character).
with open("noun_adj.txt", 'w', encoding="utf-8") as f:
    for business_id, top_pairs in id_top_5.items():
        f.write(f"business_id : {business_id}\n")
        for (noun, adjective), count in top_pairs:
            f.write(f"pair : {noun}-{adjective} count: {count}\n")
        # Blank line separates one business from the next.
        f.write('\n')