In [1]:
from collections import defaultdict
import os
import pickle
import random

import numpy as np
import pandas as pd

import power_frames as pf

<br><br><br><br>

# Load an example dataset

We'll use the annotated subset of a Reddit personal stories corpus curated by Jessica Ouyang.

You can learn about this dataset and download it here: http://www.cs.columbia.edu/~ouyangj/reddit-data/

Direct download link: http://www.cs.columbia.edu/~ouyangj/reddit-data/reddit-data-annotated.tar.gz

In [2]:
# Directory holding one plain-text file per story.
# Set the REDDIT_STORIES_DIR environment variable to point at your own copy of the
# dataset; the fallback is the path the notebook was originally run with.
dataset_directory_path = os.environ.get(
    'REDDIT_STORIES_DIR',
    '/Users/maria/Documents/data/reddit-stories-ouyang/story',
)

In [3]:
# Read every file in the dataset directory into one string per story.
stories = []

# os.listdir() returns entries in arbitrary order; sort so that the story order
# (and the index-based ids derived from it) is the same on every run and machine.
for _file_name in sorted(os.listdir(dataset_directory_path)):
    _file_path = os.path.join(dataset_directory_path, _file_name)
    # Skip sub-directories, which cannot be opened as text files.
    if not os.path.isfile(_file_path):
        continue
    # The context manager closes each file handle instead of leaking it.
    with open(_file_path, 'r', encoding='utf-8') as _story_file:
        # Collapse the story onto a single line: strip each line, join with spaces.
        stories.append(' '.join(_line.strip() for _line in _story_file))

len(stories)

476

In [4]:
# Each story is identified by its position in `stories`, rendered as a string.
story_ids = list(map(str, range(len(stories))))
len(story_ids)

476

<br><br><br><br>

# Define the personas

In [5]:
# Persona label -> regular expression used to recognise mentions of that persona.
# NOTE(review): the patterns carry no anchors or word boundaries; whether 'he' can
# also match inside 'she' / 'they' depends on how power_frames applies them -- confirm.
persona_pattern_dict = dict([
    ('masculine pronouns', r'he|him'),
    ('feminine pronouns', r'she|her'),
    ('neutral pronouns', r'they|them'),
])

<br><br><br><br>

# Load the power dictionary

In [6]:
# CSV file containing the agency/power verb lexicon.
# Set the POWER_LEXICON_PATH environment variable to point at your own copy;
# the fallback is the path the notebook was originally run with.
lexicon_path = os.environ.get(
    'POWER_LEXICON_PATH',
    '/Users/maria/Documents/data/FramesAgencyPower/agency_power.csv',
)

In [7]:
# Build the verb -> power-label lookup from the lexicon file at `lexicon_path`.
verb_power_dict = pf.get_verb_power_dict(lexicon_path)
# Trailing expression: the notebook displays how many entries the lookup holds.
len(verb_power_dict)

2115

In [8]:
# Spot-check the lexicon by printing a random handful of (label, verb) pairs.
# random.sample() requires a sequence: passing a dict view was deprecated in
# Python 3.9 and raises TypeError from 3.11 on, so materialise the items first.
_verb_power_items = list(verb_power_dict.items())
# Cap the sample size so a lexicon with fewer than 20 entries does not raise ValueError.
for _verb, _power in random.sample(_verb_power_items, min(20, len(_verb_power_items))):
    print(_power, '\t', _verb)

power_equal 	 guess
power_agent 	 free
power_agent 	 insure
power_agent 	 pour
power_theme 	 detect
power_agent 	 warm
power_equal 	 endanger
nan 	 rocket
nan 	 curve
power_agent 	 compos
power_agent 	 spread
power_agent 	 classify
power_agent 	 reverse
power_theme 	 object
power_agent 	 invest
power_agent 	 presume
power_agent 	 show
power_equal 	 pet
nan 	 screech
nan 	 swoop


<br><br><br><br>

# Measure power across the dataset

In [16]:
# Run the power measurement over every story; the four results are unpacked in
# the order measure_power returns them and are all keyed by story id downstream.
(id_persona_power_dict,
 id_persona_total_dict,
 id_nsubj_verb_count_dict,
 id_dobj_verb_count_dict) = pf.measure_power(
    verb_power_dict,
    persona_pattern_dict,
    stories,
    story_ids,
)

2022-10-22 13:11:48 Processed 0 out of 476
2022-10-22 13:11:52 Processed 100 out of 476
2022-10-22 13:11:57 Processed 200 out of 476
2022-10-22 13:12:01 Processed 300 out of 476
2022-10-22 13:12:05 Processed 400 out of 476


<br><br><br><br>

# Examine the power scores

Print the mean scores for each persona across the whole dataset.

You'll need to decide how to combine the positive and negative power scores for each persona. Here, for each document, we subtract the persona's negative score from its positive score and divide by the total number of mentions of that persona in the document.

In [10]:
# Per-document, per-persona net power score:
#   (positive - negative) / total recorded for that persona in that document.
# NOTE(review): a total of 0 would raise ZeroDivisionError here -- confirm that
# measure_power never reports a persona in the power dict with a zero total.
id_persona_score_dict = defaultdict(lambda: defaultdict(float))
for _doc_id, _power_by_persona in id_persona_power_dict.items():
    for _persona_name, _polarity_scores in _power_by_persona.items():
        id_persona_score_dict[_doc_id][_persona_name] = (
            (_polarity_scores['positive'] - _polarity_scores['negative'])
            / id_persona_total_dict[_doc_id][_persona_name]
        )

In [11]:
# Gather every document-level score under its persona ...
persona_scores_dict = defaultdict(list)
for _scores_by_persona in id_persona_score_dict.values():
    for _persona_name, _doc_score in _scores_by_persona.items():
        persona_scores_dict[_persona_name].append(_doc_score)

# ... then average them, giving one mean score per persona across the documents
# in which that persona was scored.
person_score_dict = {}
for _persona_name, _doc_scores in persona_scores_dict.items():
    person_score_dict[_persona_name] = np.mean(_doc_scores)

In [12]:
# Show each persona's mean score, rounded to two decimal places.
for _persona_name in person_score_dict:
    print(_persona_name, round(person_score_dict[_persona_name], 2))

masculine pronouns 0.0
feminine pronouns 0.08
neutral pronouns 0.28


<br><br><br><br>

# Examine the verb coverage

Print the 20 verbs that were most frequently matched to the lexicon.

In [13]:
verb_count_dict = pf.evaluate_verb_coverage(id_nsubj_verb_count_dict)

# Rank verbs by count, highest first, and print the top 20 as "<count> <tab> <verb>".
_verbs_by_count = sorted(verb_count_dict.items(), key=lambda _pair: _pair[1], reverse=True)
for _verb, _count in _verbs_by_count[:20]:
    print(_count, '\t', _verb)

457 	 be
230 	 have
167 	 say
163 	 go
154 	 get
122 	 tell
119 	 come
105 	 do
100 	 start
78 	 take
76 	 know
72 	 look
69 	 call
64 	 want
61 	 ask
59 	 see
55 	 try
52 	 think
48 	 give
44 	 leave


<br><br><br><br>

# Examine the persona coverage

For each persona, print how often it was used with a verb that was matched to the lexicon and how often with a verb that was not.

In [14]:
# Three per-persona tallies, in the order returned: matched, not matched, total.
(persona_found_dict,
 persona_missed_dict,
 persona_total_dict) = pf.evaluate_persona_coverage(id_persona_total_dict,
                                                    id_persona_power_dict)

In [15]:
# For every persona, report the matched / not-matched counts and their share of
# that persona's total as a percentage with one decimal place.
for _persona in persona_total_dict:
    _total = persona_total_dict[_persona]
    print(_persona)
    for _label, _tally in (('Matched:', persona_found_dict),
                           ('Not Matched:', persona_missed_dict)):
        _n = _tally[_persona]
        print(_label, _n, f'({round((_n / _total) * 100, 1)}%)')
    print()

masculine pronouns
Matched: 3492 (54.7%)
Not Matched: 2888 (45.3%)

feminine pronouns
Matched: 1059 (53.2%)
Not Matched: 931 (46.8%)

neutral pronouns
Matched: 377 (60.2%)
Not Matched: 249 (39.8%)

