# Clarity Scoring Function


In [200]:
import sys
import re
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from numpy import log
from scipy.special import rel_entr
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [201]:
# Column names for the pipe-separated annotation file, in file order.
header_columns = (
    'Author_ID|Author_name|Comment_number|Sentence_number|'
    'Domain_Relevance|Sentiment|Entity|Attribute|'
    'Sentence|Source_file|Annotator|Aspect'
).split('|')

# Attribute codes of the labelling scheme; these are used as the aspects below.
attribute_classes = [
    'g', 'p', 't', 'q', 's', 'h', 'c',
    'll', 'or', 'l', 'av', 'e', 'a', 'pp',
]

# Other label sets of the annotation scheme, kept for reference (disabled):
#   domain relevance: '0', '9'
#   entity:           'g', 'p', 'f', 'c', 'cg', 'cp', 'cf', 'cc', 'gg'
#   sentiment:        'p', 'n', '0'

In [202]:
%%time
# Load the annotated sentences; the file is pipe-separated and the column
# names are supplied explicitly from `header_columns`.
# NOTE(review): `names=` is passed without `header=`, so pandas treats every
# line of the file as data. If the file carries its own header row, that row
# ends up as a data row -- confirm against the file.
df = pd.read_csv(
    '../dataframe.csv',
    sep='|',
    names=header_columns,
)
print(len(df))
df.sample(5)

8824
Wall time: 81 ms


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
3066,Andrew-Wolff-2,Andrew Wolff,648,2,9,n,p,g,"At least, I do not have a strong preference fo...",quora.json,sarthak,p-g
6734,A-S-Ramprasad,A.S Ramprasad,53,12,0,,,,In fact they would not come from ripened fruit.,quora.json,hannah,nan-nan
3573,Ron-DePaepe,Ron DePaepe,36,10,0,,,,":) Gluten affects your body in good ways, as t...",quora.json,hannah,nan-nan
7356,Adeela-Masood,Adeela Masood,1306,5,9,0,p,g,And I tell them that at least there is a chanc...,quora.json,florian,p-g
5776,Tejomaye-Upmanvaya,Tejomaye Upmanvaya,1265,5,9,n,g,p,Buying organic food daily can be heavy on the ...,quora.json,florian,g-p


In [203]:
# Keep only the rows marked as relevant, i.e. whose Domain_Relevance equals
# the string '9' (the comparison is string-based, not numeric).
relevant_mask = df['Domain_Relevance'] == '9'
df_relevant = df.loc[relevant_mask]
print(len(df_relevant))
df_relevant.sample(5)

4687


Unnamed: 0,Author_ID,Author_name,Comment_number,Sentence_number,Domain_Relevance,Sentiment,Entity,Attribute,Sentence,Source_file,Annotator,Aspect
564,Raja-Sekhar-Bandaru-1,Raja Sekhar Bandaru,1756,8,9,n,c,l,None of the natural farmers around the city su...,quora.json,felix,c-l
7844,Matt-Ackeret,Matt Ackeret,745,2,9,n,p,h,"Also, there is no proof that organic food is a...",quora.json,sarthak,p-h
4228,Janice-Boelk,Janice Boelk,85,11,9,n,cp,s,Other ingredients would probably be toxic chem...,quora.json,hannah,cp-s
5936,Tejashri-Khade-1,Tejashri Khade,670,2,9,0,f,c,Organic cultivation excludes the use of artifi...,quora.json,sarthak,f-c
3143,Addison-Manning,Addison Manning,47,4,9,p,f,g,"Full disclosure: I am an organic farmer, I kno...",quora.json,hannah,f-g


In [213]:
# Text-normalisation switches: lemmatize tokens and/or drop English stop words.
lemmatize = True
remove_stopwords = True

if lemmatize:
    # WordNetLemmatizer reads the 'wordnet' corpus the first time lemmatize()
    # is called. Only 'stopwords' is downloaded in the import cell, so fetch
    # 'wordnet' here; otherwise a fresh environment fails with LookupError.
    # The call is a quick up-to-date check when the corpus is already present.
    nltk.download('wordnet', quiet=True)
    lemmatizer = WordNetLemmatizer()
else:
    # None tells the preprocessing loop to skip lemmatization.
    lemmatizer = None

# NLTK's English stop-word list, or an empty set so that the
# `word not in stop_words` filter keeps every token.
stop_words = set(stopwords.words('english')) if remove_stopwords else set()

In [214]:
# Tokens are runs of two or more word characters delimited by word
# boundaries, so single-character tokens are dropped.
_TOKEN_REGEX = r'(?u)\b\w\w+\b'
token_pattern = re.compile(_TOKEN_REGEX)

In [215]:
# The aspects are exactly the attribute codes (same list object).
aspects = attribute_classes
print(aspects)

# aspect code -> list of preprocessed segments labelled with that aspect
aspect_segments = {aspect: [] for aspect in aspects}
print(aspect_segments)

# every preprocessed segment, regardless of aspect
all_segs = list()

['g', 'p', 't', 'q', 's', 'h', 'c', 'll', 'or', 'l', 'av', 'e', 'a', 'pp']
{'g': [], 'p': [], 't': [], 'q': [], 's': [], 'h': [], 'c': [], 'll': [], 'or': [], 'l': [], 'av': [], 'e': [], 'a': [], 'pp': []}


In [216]:
# Labelling scheme of the annotated dataset (round 3), read from the
# 'Labeling Scheme' sheet of the workshop workbook.
df_asp = pd.read_excel('../Labeling Workshop_updated_18-10-19.xlsx', sheet_name='Labeling Scheme')

# Map each attribute code to its human-readable label.
# Rows are addressed by position with .iloc: first column = kind of label,
# second = code, third = meaning. Plain `row[0]` on a label-indexed Series
# relies on a positional fallback that pandas has deprecated.
asp_meaning = {}
for _, row in df_asp.iterrows():
    if row.iloc[0] == 'attribute':
        asp_meaning[row.iloc[1]] = row.iloc[2]
pprint(asp_meaning, sort_dicts=False)

{'g': 'general',
 'p': 'price',
 't': 'taste',
 'q': 'nutritional quality/freshness/appearance',
 's': 'safety',
 'h': 'healthiness',
 'c': 'chemicals/pesticides',
 'll': 'label',
 'or': 'origin, source',
 'l': 'local',
 'av': 'availability',
 'e': 'environment',
 'a': 'animal welfare',
 'pp': 'productivity'}


In [217]:
%%time
# Preprocess every relevant sentence and file it under its attribute (aspect).
#
# The containers are rebuilt here so that re-running this cell starts from a
# clean slate instead of appending every segment a second time.
aspect_segments = {aspect: [] for aspect in aspects}
all_segs = []

for _, row in df_relevant.iterrows():
    seg_aspect = row['Attribute']
    seg_body = row['Sentence']

    # Missing values arrive from read_csv as float NaN, which never equals the
    # string 'NaN'; test with pd.isna (the literal check is kept as well).
    # Rows without an attribute or without sentence text are skipped.
    if pd.isna(seg_aspect) or seg_aspect == 'NaN' or pd.isna(seg_body):
        continue

    # lower-case, tokenize, drop stop words
    seg_words = [word for word in token_pattern.findall(seg_body.lower())
                 if word not in stop_words]

    if lemmatizer is not None:
        seg_words = [lemmatizer.lemmatize(word) for word in seg_words]

    # prepared segment: normalised tokens joined back into one string
    seg_prep = ' '.join(seg_words)
    # add to aspect-segment dictionary; an attribute code that is not in
    # `aspects` still raises KeyError, so unexpected labels are not hidden
    aspect_segments[seg_aspect].append(seg_prep)
    # add to all segments list
    all_segs.append(seg_prep)
print(len(all_segs))

4687
Wall time: 2.37 s


In [218]:
# Fit an L1-normalised TF-IDF model on the individual segments, then score
# the whole corpus as a single document to obtain the global term weights.
sklearn_stop_words = 'english' if remove_stopwords else None
vectorizer = TfidfVectorizer(
    stop_words=sklearn_stop_words,
    norm='l1',
    use_idf=True,
)
vectorizer.fit(all_segs)

corpus_as_one_doc = ' '.join(all_segs)
gl_freq = vectorizer.transform([corpus_as_one_doc]).toarray()[0]
print(len(gl_freq))

5828


In [219]:
# term -> global weight, looked up through the vectorizer's vocabulary index
gl_scores = {
    term: gl_freq[column]
    for term, column in vectorizer.vocabulary_.items()
}
print(len(gl_scores))


5828


In [220]:
%%time
# Per-aspect containers:
#   asp_scores        aspect code  -> {term: clarity score} for every vocabulary term
#   sorted_asp_scores aspect code  -> {term: score} for the positive scores among the top 10
#   score_seed_words  aspect label -> list of those top terms, best first
asp_scores = {aspect: {} for aspect in aspect_segments}
sorted_asp_scores = {aspect: {} for aspect in aspect_segments}
score_seed_words = {label: [] for label in asp_meaning.values()}

vocabulary = vectorizer.vocabulary_
for aspect, segments in aspect_segments.items():
    # weights of this aspect's segments, scored as one document
    aspect_doc = ' '.join(segments)
    asp_freq = vectorizer.transform([aspect_doc]).toarray()[0]

    # element-wise relative entropy of aspect vs. global weights, converted
    # from nats to bits; each term's contribution is its clarity score
    entropies = rel_entr(asp_freq, gl_freq) / log(2)
    asp_scores[aspect].update(
        (term, entropies[column]) for term, column in vocabulary.items()
    )

    # rank terms by descending score; keep the positive ones among the first ten
    ranked = sorted(asp_scores[aspect].items(), key=lambda item: item[1], reverse=True)
    for term, clarity in ranked[:10]:
        if clarity > 0:
            sorted_asp_scores[aspect][term] = clarity
            score_seed_words[asp_meaning[aspect]].append(term)

pprint(sorted_asp_scores, sort_dicts=False)
pprint(score_seed_words, sort_dicts=False)

# one row per aspect label, one column per rank; the file name records
# whether stop words were removed
df_out = pd.DataFrame.from_dict(data=score_seed_words, orient='index')
out_path = '../score_seed_words_remove_stop.xlsx' if remove_stopwords else '../score_seed_words.xlsx'
df_out.to_excel(out_path)

{'g': {'gmo': 0.003483305824339285,
       'farming': 0.003318456952304896,
       'store': 0.0027445582194592874,
       'organic': 0.002513442297520986,
       'india': 0.0022430018742386993,
       'definition': 0.002232780185675268,
       'online': 0.0019182282518315594,
       'farm': 0.001891181489807445,
       'seed': 0.0018605691653413587,
       'start': 0.0016999618119770872},
 'p': {'price': 0.07027722837749699,
       'expensive': 0.06539105340635938,
       'cost': 0.057431297271537375,
       'pay': 0.023146861379406955,
       'money': 0.02123499553408327,
       'demand': 0.020216749973225374,
       'higher': 0.018490050547252494,
       'extra': 0.018220274952386796,
       'charge': 0.014430447027442842,
       'afford': 0.014028622130207051},
 't': {'taste': 0.2687317959602853,
       'better': 0.07084940625304122,
       'flavor': 0.050404405110374,
       'difference': 0.03837601208118237,
       'tasty': 0.02259570188815016,
       'fruit': 0.02038661851720556,

Wall time: 949 ms
