NOTE: This notebook copied and adapted from [https://github.com/maria-antoniak/power-frames/blob/main/example_notebook.ipynb]

In [3]:
from collections import defaultdict
import os
import pickle
import random

import numpy as np
import pandas as pd

import sys
sys.path.append('power-frames/')
#sys.path.append('power-frames-ORIGINAL/')
import power_frames as pf

In [4]:
import json

### Load dataset

In [5]:
#dataset_dir_path = 'narratives.csv' 
dataset_dir_path = '../narrative_detection/narrative_posts_by_trained_classification.csv'

In [6]:
df = pd.read_csv(dataset_dir_path)
stories = df['selftext'].tolist()
len(stories)

7676

In [7]:
story_ids = [str(i) for i in range(len(stories))]
len(story_ids)

7676

### Define Personas

In [8]:
persona_pattern_dict = {'narrator': r'I|me',
                        'reader': r'you',
                        'SO_male': r'BF|bf|boyfriend|husband',
                        'SO_female': r'GF|gf|girlfriend|wife',
                        'SO_neutral': r'partner|spouse',
                        'SO_any': r'partner|spouse|bf|boyfriend|husband|GF|gf|girlfriend|wife',
                        'ex_SO': r'ex|ex bf|ex gf|ex boyfriend|ex girlfriend|ex husband|ex wife|ex spouse',
                        'friend': r'friend|friends|bff|best friend|bestfriend',
                        'family': r'family|mom|mum|dad|mother|father|sibling|brother|sister|son|daughter|grandmother|grandfather|grandma|grandpa|grandson|gran',
                        'ED': r'ED|eating disorder|anorexia|bulimia|binge-eating|binge eating|binge|disorder',
                        'nurse': r'nurse|nurses|np',
                        'doctor': r'doctor|doctors|dr',
                        'therapist': r'therapist|therapists',
                        'medical_prof': r'nurse|nurses|np|doctor|doctors|dr|therapist|therapists|psychologist|psychiatrist',
                        'food': r'food|drink|snack|meal|breakfast|lunch|dinner|dessert|coffee|tea|alcohol|wine|beer|cocktail|soda|juice|water|seltzer'}

### Measure power across dataset

In [9]:
lexicon_path = 'FramesAgencyPower/agency_power.csv'
verb_power_dict = pf.get_verb_power_dict(lexicon_path)
len(verb_power_dict)

2142

In [10]:
verb_agency_dict = pf.get_verb_agency_dict(lexicon_path)
len(verb_agency_dict)

2142

In [11]:
id_persona_power_dict, \
    id_persona_agency_dict, \
    id_persona_total_dict, \
    id_nsubj_verb_count_dict, \
    id_dobj_verb_count_dict = pf.measure_power(verb_power_dict,
                                               verb_agency_dict,
                                               persona_pattern_dict, 
                                               stories, 
                                               story_ids)

2023-05-08 17:39:41 Processed 0 out of 7676
2023-05-08 17:40:16 Processed 100 out of 7676
2023-05-08 17:40:56 Processed 200 out of 7676
2023-05-08 17:41:39 Processed 300 out of 7676
2023-05-08 17:42:36 Processed 400 out of 7676
2023-05-08 17:43:30 Processed 500 out of 7676
2023-05-08 17:44:34 Processed 600 out of 7676
2023-05-08 17:45:11 Processed 700 out of 7676
2023-05-08 17:45:46 Processed 800 out of 7676
2023-05-08 17:46:21 Processed 900 out of 7676
2023-05-08 17:46:58 Processed 1000 out of 7676
2023-05-08 17:47:34 Processed 1100 out of 7676
2023-05-08 17:48:10 Processed 1200 out of 7676
2023-05-08 17:49:01 Processed 1300 out of 7676
2023-05-08 17:49:56 Processed 1400 out of 7676
2023-05-08 17:50:32 Processed 1500 out of 7676
2023-05-08 17:51:17 Processed 1600 out of 7676
2023-05-08 17:51:59 Processed 1700 out of 7676
2023-05-08 17:52:38 Processed 1800 out of 7676
2023-05-08 17:53:19 Processed 1900 out of 7676
2023-05-08 17:54:04 Processed 2000 out of 7676
2023-05-08 17:54:40 Proce

In [12]:
# save all outputs
folder = 'output/coref_only_outputs/'
if not os.path.exists(folder):
    os.makedirs(folder)

with open(folder + 'id_persona_power_dict.json', 'w') as f:
    json.dump(id_persona_power_dict, f, indent=4)
with open(folder + 'id_persona_agency_dict.json', 'w') as f:
    json.dump(id_persona_agency_dict, f, indent=4)
with open(folder + 'id_persona_total_dict.json', 'w') as f:
    json.dump(id_persona_total_dict, f, indent=4)
with open(folder + 'id_nsubj_verb_count_dict.pkl', 'wb') as f:
    pickle.dump(id_nsubj_verb_count_dict, f)
with open(folder + 'id_dobj_verb_count_dict.pkl', 'wb') as f:
    pickle.dump(id_dobj_verb_count_dict, f)


In [13]:
stories[-2]

"Sometimes I come to reddit to read about people's purging and restricting experiences bc I'm so tempted to fall back on old habits. I started going to a nutritionist last November and when I started going I was eating max 2 meals a day and purging routinely. But throughout this process it been so frustrating to find articles and stuff about people who restrict eating and purge. I feel like I am all alone. I hear about people who restrict and people who binge eat because that's what gets all the attention when talk about eating disorders. Given I can't even say disorder out loud, but it'd make me feel less alone if I could put myself into some category that wasn't as vague as 'disordered eating' or 'purging disorder' because I feel like the description of purging disorder j talks about purging and not restricting. Also when you read stuff about eating disorders online and talk to people about it nobody ever talks about how FLUID it is. Like I used to purge a lot a lot and then I tried 

### Examine scores

In [14]:
id_persona_score_dict = defaultdict(lambda: defaultdict(float))
for _id, _persona_power_dict in id_persona_power_dict.items():
    for _persona, _polarity_score_dict in _persona_power_dict.items():
        _score = _polarity_score_dict['positive'] - _polarity_score_dict['negative']
        _score /= id_persona_total_dict[_id][_persona]
        id_persona_score_dict[_id][_persona] = _score

In [15]:
id_persona_agency_score_dict = defaultdict(lambda: defaultdict(float))
for _id, _persona_agency_dict in id_persona_agency_dict.items():
    for _persona, _polarity_score_dict in _persona_agency_dict.items():
        _score = _polarity_score_dict['positive'] - _polarity_score_dict['negative']
        _score /= id_persona_total_dict[_id][_persona]
        id_persona_agency_score_dict[_id][_persona] = _score

In [16]:
# save all outputs
with open(folder + 'id_persona_score_dict.json', 'w') as f:
    json.dump(id_persona_score_dict, f, indent=4)
with open(folder + 'id_persona_agency_score_dict.json', 'w') as f:
    json.dump(id_persona_agency_score_dict, f, indent=4)

In [17]:
id_persona_agency_dict['9']

defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
            {'SO_male': defaultdict(int,
                         {'equal': 2, 'positive': 5, 'negative': 5}),
             'SO_any': defaultdict(int,
                         {'equal': 2, 'positive': 5, 'negative': 5}),
             'friend': defaultdict(int,
                         {'equal': 2, 'positive': 5, 'negative': 5}),
             'narrator': defaultdict(int, {'negative': 1, 'positive': 0})})

In [18]:
id_persona_power_dict['9']

defaultdict(<function power_frames.measure_power_per_document.<locals>.<lambda>()>,
            {'SO_male': defaultdict(int, {'negative': 5, 'positive': 4}),
             'SO_any': defaultdict(int, {'negative': 5, 'positive': 4}),
             'friend': defaultdict(int, {'negative': 5, 'positive': 4}),
             'narrator': defaultdict(int, {'negative': 3, 'positive': 1})})

In [19]:
id_persona_total_dict['9']

defaultdict(int, {'SO_male': 15, 'SO_any': 15, 'friend': 15, 'narrator': 7})

In [20]:
id_persona_power_dict

{'0': defaultdict(<function power_frames.measure_power_per_document.<locals>.<lambda>()>,
             {'food': defaultdict(int, {'positive': 1, 'negative': 0}),
              'therapist': defaultdict(int, {'positive': 1, 'negative': 0}),
              'medical_prof': defaultdict(int, {'positive': 1, 'negative': 0}),
              'narrator': defaultdict(int, {'negative': 2, 'positive': 0})}),
 '1': defaultdict(<function power_frames.measure_power_per_document.<locals>.<lambda>()>,
             {'reader': defaultdict(int, {'positive': 1, 'negative': 0}),
              'narrator': defaultdict(int, {'positive': 3, 'negative': 0})}),
 '2': defaultdict(<function power_frames.measure_power_per_document.<locals>.<lambda>()>,
             {'food': defaultdict(int, {'negative': 1, 'positive': 0}),
              'narrator': defaultdict(int, {'negative': 1, 'positive': 0})}),
 '3': defaultdict(<function power_frames.measure_power_per_document.<locals>.<lambda>()>,
             {}),
 '4': default

In [21]:
id_persona_agency_dict

{'0': defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
             {'food': defaultdict(int, {'negative': 1, 'positive': 0}),
              'therapist': defaultdict(int, {'positive': 1, 'negative': 0}),
              'medical_prof': defaultdict(int,
                          {'positive': 1, 'negative': 0})}),
 '1': defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
             {'reader': defaultdict(int, {'negative': 1, 'positive': 0}),
              'narrator': defaultdict(int, {'positive': 1, 'negative': 0})}),
 '2': defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
             {}),
 '3': defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
             {}),
 '4': defaultdict(<function power_frames.measure_agency_per_document.<locals>.<lambda>()>,
             {'narrator': defaultdict(int,
                          {'positive': 1, 'equal': 1, 'ne

In [22]:
persona_counts_power = defaultdict(int)
for _id, _persona_power_dict in id_persona_power_dict.items():
    for _persona in _persona_power_dict.keys():
        persona_counts_power[_persona] += _persona_power_dict[_persona]['positive'] + _persona_power_dict[_persona]['negative']

persona_counts_agency = defaultdict(int)
for _id, _persona_agency_dict in id_persona_agency_dict.items():
    for _persona in _persona_agency_dict.keys():
        persona_counts_agency[_persona] += _persona_agency_dict[_persona]['positive'] + _persona_agency_dict[_persona]['negative']

In [23]:
# save persona counts
with open(folder + 'persona_counts_power.json', 'w') as f:
    json.dump(persona_counts_power, f, indent=4)
with open(folder + 'persona_counts_agency.json', 'w') as f:
    json.dump(persona_counts_agency, f, indent=4)

In [24]:
persona_scores_dict = defaultdict(list)
for _id, _persona_score_dict in id_persona_score_dict.items():
    for _persona, _score in _persona_score_dict.items():
        persona_scores_dict[_persona].append(_score)

persona_score_dict = {_persona: np.mean(_scores) for _persona, _scores in persona_scores_dict.items()}

In [25]:
persona_agency_scores_dict = defaultdict(list)
for _id, _persona_agency_score_dict in id_persona_agency_score_dict.items():
    for _persona, _score in _persona_agency_score_dict.items():
        persona_agency_scores_dict[_persona].append(_score)

persona_agency_score_dict = {_persona: np.mean(_scores) for _persona, _scores in persona_agency_scores_dict.items()}

In [26]:
print('Power Scores:')
for _persona, _score in persona_score_dict.items():
    print(_persona, round(_score, 2), '(n=' + str(persona_counts_power[_persona]) + ')')

Power Scores:
food -0.62 (n=5298)
therapist -0.11 (n=324)
medical_prof -0.1 (n=1263)
narrator -0.32 (n=14008)
reader 0.42 (n=4053)
ex_SO -0.54 (n=1513)
family 0.08 (n=3527)
SO_female 0.14 (n=403)
SO_any 0.14 (n=1491)
friend -0.0 (n=2015)
SO_male 0.2 (n=780)
ED -0.58 (n=2109)
doctor -0.13 (n=778)
SO_neutral -0.03 (n=308)
nurse -0.04 (n=95)


In [27]:
print('Agency Scores:')
for _persona, _score in persona_agency_score_dict.items():
    print(_persona, round(_score, 2), '(n=' + str(persona_counts_agency[_persona]) + ')')

Agency Scores:
food 0.29 (n=652)
therapist 0.36 (n=371)
medical_prof 0.41 (n=1065)
reader 0.13 (n=3458)
narrator 0.15 (n=3906)
ex_SO 0.41 (n=500)
family 0.35 (n=3619)
SO_female 0.11 (n=418)
SO_any 0.28 (n=1692)
friend 0.3 (n=2029)
SO_male 0.32 (n=946)
ED 0.54 (n=574)
doctor 0.43 (n=559)
SO_neutral 0.3 (n=328)
nurse 0.51 (n=52)


In [28]:
# save score dicts
with open(folder + 'persona_power_score_dict.json', 'w') as f:
    json.dump(persona_score_dict, f, indent=4)
with open(folder + 'persona_agency_score_dict.json', 'w') as f:
    json.dump(persona_agency_score_dict, f, indent=4)

<br><br><br><br>

# Examine the verb coverage

Print the verbs that were most frequently matched to the lexicon

In [29]:
id_nsubj_verb_count_dict

{'0': defaultdict(int,
             {('food', 'have'): 1,
              ('therapist', 'recommend'): 1,
              ('medical_prof', 'recommend'): 1}),
 '1': defaultdict(int,
             {('narrator', 'be'): 1,
              ('reader', 'know'): 1,
              ('narrator', 'give'): 1,
              ('SO_male', 'fell'): 1,
              ('SO_any', 'fell'): 1,
              ('friend', 'fell'): 1}),
 '2': defaultdict(int, {}),
 '3': defaultdict(int, {('ED', 'be'): 1, ('narrator', 'be'): 1}),
 '4': defaultdict(int,
             {('narrator', 'hurt'): 1,
              ('narrator', 'feel'): 1,
              ('reader', 'regret'): 1,
              ('reader', 'have'): 1,
              ('reader', 'want'): 1,
              ('narrator', 'be'): 1}),
 '5': defaultdict(int, {('ex_SO', 'do'): 1, ('ex_SO', 'scream'): 1}),
 '6': defaultdict(int, {}),
 '7': defaultdict(int,
             {('family', 'be'): 3,
              ('family', 'decide'): 1,
              ('family', 'talks'): 1,
              ('f

In [30]:
verb_count_dict = pf.evaluate_verb_coverage(id_nsubj_verb_count_dict)

for _verb, _count in sorted(verb_count_dict.items(), key=lambda x: x[1], reverse=True)[:20]:
    print(_count, '\t', _verb) # + ' (' + str(_count) + ')')

3581 	 be
1224 	 have
1055 	 feel
1014 	 say
813 	 eat
785 	 get
780 	 know
775 	 want
763 	 tell
683 	 go
673 	 do
599 	 make
541 	 think
327 	 start
321 	 see
294 	 look
279 	 come
275 	 try
268 	 ‚Äôs
255 	 take


Find verbs missing from lexicon

In [31]:
missing_verbs = set()
all_verbs = set()

for _id in id_nsubj_verb_count_dict.keys():
    for nsubj, _verb in id_nsubj_verb_count_dict[_id].keys():
        if _verb not in verb_power_dict.keys():
            missing_verbs.add(_verb)
        all_verbs.add(_verb)

for _id in id_dobj_verb_count_dict.keys():
    for dobj, _verb in id_dobj_verb_count_dict[_id].keys():
        if _verb not in verb_power_dict.keys():
            missing_verbs.add(_verb)
        all_verbs.add(_verb)

In [32]:
len(all_verbs)

2141

In [33]:
len(missing_verbs)

1073

In [34]:
79/309

0.255663430420712

In [35]:
missing_verbs

{'sleepin',
 'happy',
 'harbor',
 'homeschool',
 '‚Ä¶‚Ä¶',
 'diet',
 'transition',
 'input',
 'snack',
 'leading',
 'fatten',
 'is',
 'fluffy',
 'gift',
 'year',
 'hard',
 'gaslighitng',
 'crummy',
 'clue',
 'slamming',
 'excercises',
 'flaw',
 'dang',
 'coddle',
 'me',
 'trivialize',
 'butt',
 'got',
 'forcefeed',
 'salad',
 'disrespect',
 'alternate',
 'cripple',
 'havr',
 'approaching',
 'callled',
 'attenting',
 'receiving',
 'malnourish',
 'miserable',
 'wisks',
 'realllyyyy',
 'myeating',
 'commend',
 'deduce',
 'are',
 'incase',
 'crave',
 'funny',
 'derail',
 'brag',
 'tick',
 'transcend',
 'combat',
 'flatter',
 'untrusting',
 'sweet',
 'summarise',
 'uneasy',
 'teenage',
 'restricted',
 'consider&gt;!35',
 'ogle',
 'uncomfortable',
 'ca',
 'apologise',
 'dissappear',
 'perpetuate',
 "wan't",
 'single',
 'overthink',
 'eats',
 'portion',
 'feeling',
 'consist',
 'harder',
 'lightheaded',
 'ate',
 '‚Äôre',
 'ofc',
 'nervous',
 'bigger',
 'inundate',
 'playing',
 'coming',
 'sor

<br><br><br><br>

# Examine the persona coverage

For each persona, print how often the persona was used with a verb that was matched or not matched to the lexicon

In [36]:
persona_found_dict, persona_missed_dict, persona_total_dict = pf.evaluate_persona_coverage(id_persona_total_dict, id_persona_power_dict)

In [37]:
persona_found_dict

defaultdict(int,
            {'food': 5298,
             'therapist': 324,
             'medical_prof': 1263,
             'narrator': 14008,
             'reader': 4053,
             'ex_SO': 1513,
             'family': 3527,
             'SO_female': 403,
             'SO_any': 1491,
             'friend': 2015,
             'SO_male': 780,
             'ED': 2109,
             'doctor': 778,
             'SO_neutral': 308,
             'nurse': 95})

In [38]:
agency_found_dict, agency_missed_dict, agency_total_dict = pf.evaluate_persona_coverage(id_persona_total_dict, id_persona_agency_dict)

In [39]:
agency_found_dict

defaultdict(int,
            {'food': 652,
             'therapist': 371,
             'medical_prof': 1065,
             'reader': 3458,
             'narrator': 3906,
             'ex_SO': 500,
             'family': 3619,
             'SO_female': 418,
             'SO_any': 1692,
             'friend': 2029,
             'SO_male': 946,
             'ED': 574,
             'doctor': 559,
             'SO_neutral': 328,
             'nurse': 52})

In [40]:
for _persona, _total in persona_total_dict.items():
    print(_persona)
    print('Matched:', persona_found_dict[_persona], '(' + str(round((persona_found_dict[_persona] / _total) * 100, 1)) + '%)')
    print('Not Matched:', persona_missed_dict[_persona], '(' + str(round((persona_missed_dict[_persona] / _total) * 100, 1)) + '%)')
    print()

food
Matched: 5298 (80.5%)
Not Matched: 1284 (19.5%)

therapist
Matched: 324 (59.8%)
Not Matched: 218 (40.2%)

medical_prof
Matched: 1263 (66.2%)
Not Matched: 646 (33.8%)

narrator
Matched: 14008 (63.0%)
Not Matched: 8210 (37.0%)

reader
Matched: 4053 (65.2%)
Not Matched: 2168 (34.8%)

ex_SO
Matched: 1513 (77.7%)
Not Matched: 433 (22.3%)

family
Matched: 3527 (56.1%)
Not Matched: 2755 (43.9%)

SO_female
Matched: 403 (49.7%)
Not Matched: 408 (50.3%)

SO_any
Matched: 1491 (52.3%)
Not Matched: 1362 (47.7%)

friend
Matched: 2015 (53.8%)
Not Matched: 1729 (46.2%)

SO_male
Matched: 780 (52.7%)
Not Matched: 699 (47.3%)

ED
Matched: 2109 (84.6%)
Not Matched: 385 (15.4%)

doctor
Matched: 778 (73.3%)
Not Matched: 283 (26.7%)

SO_neutral
Matched: 308 (55.6%)
Not Matched: 246 (44.4%)

nurse
Matched: 95 (74.8%)
Not Matched: 32 (25.2%)

