In [1]:
import os
import re
import csv

from typing import List

import numpy as np
from nltk.stem import WordNetLemmatizer
import pandas as pd

In [2]:
# Root directory of the pipeline outputs; story files are looked up under
# <pipeline_dir>/<story_id>/ by the cells below.
# Set the FAIRYTALE_PIPELINE_DIR environment variable to point at another
# location; the default is the path originally hard-coded here.
# os.path.join(..., '') guarantees a trailing separator, which the string
# concatenation used to build file paths relies on.
pipeline_dir = os.path.join(
    os.environ.get('FAIRYTALE_PIPELINE_DIR', '/home/ptoroisaza/fair-fairytale-nlp/data/pipeline/'),
    '',
)

In [3]:
# Identifier of the story to analyse; it is used both as the sub-directory name
# and as the file-name prefix when the story's CSV files are located.
story_id = 'bamboo-cutter-moon-child'

In [4]:
# Common prefix of this story's pipeline files: <pipeline_dir>/<story_id>/<story_id>.
# Each file is addressed by appending its own suffix to this string.
# os.path.join keeps the prefix valid whether or not pipeline_dir ends with a
# path separator (plain concatenation silently required the trailing slash).
story_dir_with_prefix = os.path.join(pipeline_dir, story_id, story_id)
# Event table with character information attached; see the preview in the next cells.
characters_temporal_df = pd.read_csv(story_dir_with_prefix + '.characters_temporal_events.csv')

In [5]:
# Load the per-character attribute table and rename its key column from
# 'coref_idx' to 'coref_id', the name used by the events table.
characters_df = (
    pd.read_csv(story_dir_with_prefix + '.character_attributes.csv')
    .rename(columns={'coref_idx': 'coref_id'})
)

In [6]:
# Preview the table read from the '.characters_temporal_events.csv' file.
characters_temporal_df

Unnamed: 0,sentence_id,event_id,event,verb_start_byte,verb_end_byte,verb_start_byte_text,verb_end_byte_text,coref_id,arg_start_byte_sentence,arg_end_byte_sentence,...,supersense_category,temporal_rank,argument,name,name_mentions,pronoun_mentions,total_mentions,gender,gender_certainty,importance
0,1,2,sent,55,59,110,114,10,48.0,54.0,...,verb.communication,0,subject,Heaven,1.0,0.0,1.0,unknown,0.0,tertiary
1,1,3,cheer,63,68,118,123,10,48.0,54.0,...,verb.emotion,1,subject,Heaven,1.0,0.0,1.0,unknown,0.0,tertiary
2,1,5,died,143,147,198,202,17,140.0,142.0,...,verb.change,2,subject,an old bamboo wood - cutter,1.0,15.0,16.0,male,1.0,tertiary
3,1,7,laid,156,160,211,215,17,140.0,142.0,...,verb.contact,3,subject,an old bamboo wood - cutter,1.0,15.0,16.0,male,1.0,tertiary
4,3,1,made,12,16,368,372,17,5.0,7.0,...,verb.social,4,subject,an old bamboo wood - cutter,1.0,15.0,16.0,male,1.0,tertiary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1077,236,1,giving,65,71,29469,29475,29,72.0,76.0,...,verb.possession,584,direct_object,the old man,24.0,81.0,105.0,male,1.0,primary
1078,236,1,giving,65,71,29469,29475,62,72.0,76.0,...,verb.possession,584,direct_object,the Princess,36.0,118.0,154.0,female,1.0,primary
1079,236,2,asked,97,102,29501,29506,231,103.0,106.0,...,verb.communication,585,direct_object,The Emperor,13.0,39.0,52.0,male,1.0,secondary
1080,236,2,asked,97,102,29501,29506,29,103.0,106.0,...,verb.communication,585,direct_object,the old man,24.0,81.0,105.0,male,1.0,primary


## By Character

### Event Counts

In [7]:
# Number of event rows per character (coref_id), most frequent first.
# The index and the count column are named explicitly so the result does not
# depend on the default names value_counts() assigns, which changed in
# pandas 2.0 (with the new defaults the former rename of 'index'/'coref_id'
# loses the 'coref_id' key column that the later merges need).
char_event_counts = (
    characters_temporal_df['coref_id']
    .value_counts()
    .rename_axis('coref_id')
    .reset_index(name='event_n')
)

In [8]:
# Inspect the per-character event counts (columns: coref_id, event_n).
char_event_counts

Unnamed: 0,coref_id,event_n
0,62,190
1,29,127
2,11,124
3,137,92
4,231,69
...,...,...
124,187,1
125,262,1
126,403,1
127,103,1


### Argument Counts

In [9]:
# Long-format counts of how often each character appears in each argument
# role: one row per (coref_id, argument) pair with the count in 'arg_n'.
# reset_index(name=...) names the count column directly instead of renaming
# whatever default name value_counts() produced; that default differs between
# pandas versions, and the pivot in a later cell requires the name 'arg_n'.
char_arg_counts = (
    characters_temporal_df
    .groupby('coref_id')['argument']
    .value_counts()
    .reset_index(name='arg_n')
)

In [10]:
# Inspect the long-format argument counts (columns: coref_id, argument, arg_n).
char_arg_counts

Unnamed: 0,coref_id,argument,arg_n
0,10,subject,2
1,11,direct_object,65
2,11,subject,59
3,12,direct_object,2
4,12,subject,2
...,...,...,...
176,389,subject,1
177,402,subject,2
178,403,subject,1
179,404,subject,1


In [11]:
# Reshape long -> wide: one row per coref_id and one ('arg_n', <argument>)
# column per argument role; combinations that never occur become NaN.
char_arg_counts = char_arg_counts.pivot(
    index=['coref_id'],
    columns=['argument'],
    values=['arg_n'],
)

In [12]:
# Replace the two-level pivot column index with a flat index of tuples such as
# ('arg_n', 'subject'), so the columns can be renamed with a plain mapping.
char_arg_counts.columns = char_arg_counts.columns.to_flat_index()

In [13]:
# Give the tuple-labelled columns readable names, turn coref_id back into a
# regular column, and record missing (never observed) roles as 0.
char_arg_counts = (
    char_arg_counts
    .rename(columns={
        ('arg_n', 'subject'): 'subject_n',
        ('arg_n', 'direct_object'): 'direct_object_n',
    })
    .reset_index()
    .fillna(0)
)

In [14]:
# Inspect the wide argument-count table (columns: coref_id, direct_object_n, subject_n).
char_arg_counts

Unnamed: 0,coref_id,direct_object_n,subject_n
0,10,0.0,2.0
1,11,65.0,59.0
2,12,2.0,2.0
3,13,4.0,0.0
4,14,0.0,1.0
...,...,...,...
124,389,1.0,1.0
125,402,0.0,2.0
126,403,0.0,1.0
127,404,0.0,1.0


### Supersense Counts

In [15]:
# Long-format counts of events per character and supersense category: one row
# per (coref_id, supersense_category) pair with the count in 'supersense_total_n'.
# The count column is named via reset_index(name=...) rather than by renaming
# the default name value_counts() assigns, because that default differs
# between pandas versions and the later pivot needs 'supersense_total_n'.
char_supersense_counts = (
    characters_temporal_df
    .groupby('coref_id')['supersense_category']
    .value_counts()
    .reset_index(name='supersense_total_n')
)

In [16]:
# Inspect the long-format supersense counts
# (columns: coref_id, supersense_category, supersense_total_n).
char_supersense_counts

Unnamed: 0,coref_id,supersense_category,supersense_total_n
0,10,verb.communication,1
1,10,verb.emotion,1
2,11,verb.communication,38
3,11,verb.motion,17
4,11,verb.social,15
...,...,...,...
345,402,verb.communication,1
346,402,verb.social,1
347,403,verb.communication,1
348,404,verb.communication,1


In [17]:
# Reshape long -> wide: one row per coref_id and one
# ('supersense_total_n', <category>) column per supersense category.
char_supersense_counts = char_supersense_counts.pivot(
    index=['coref_id'],
    columns=['supersense_category'],
    values=['supersense_total_n'],
)

In [18]:
# Replace the two-level pivot column index with a flat index of tuples such as
# ('supersense_total_n', 'verb.body'), ready for renaming with a plain mapping.
char_supersense_counts.columns = char_supersense_counts.columns.to_flat_index()

In [19]:
# Mapping from the flattened pivot labels ('supersense_total_n', 'verb.<category>')
# to flat column names of the form 'supersense_<category>_total_n'.
# The mapping is generated from a category list instead of being written out by
# hand. The list holds all 15 WordNet verb lexicographer classes; the previous
# literal lacked 'stative' and 'weather', so if the pipeline ever emits those
# categories their columns would have kept tuple labels. Keys that do not match
# a column are ignored by DataFrame.rename, so the extra entries are harmless.
supersense_rename = {
    ('supersense_total_n', 'verb.' + category): 'supersense_' + category + '_total_n'
    for category in (
        'body',
        'change',
        'cognition',
        'communication',
        'competition',
        'consumption',
        'contact',
        'creation',
        'emotion',
        'motion',
        'perception',
        'possession',
        'social',
        'stative',
        'weather',
    )
}

In [20]:
# Apply the readable column names, turn coref_id back into a regular column,
# and record categories a character never appears with as 0.
char_supersense_counts = (
    char_supersense_counts
    .rename(columns=supersense_rename)
    .reset_index()
    .fillna(0)
)

In [21]:
# Inspect the wide supersense-count table (one supersense_<category>_total_n column per category).
char_supersense_counts

Unnamed: 0,coref_id,supersense_body_total_n,supersense_change_total_n,supersense_cognition_total_n,supersense_communication_total_n,supersense_competition_total_n,supersense_consumption_total_n,supersense_contact_total_n,supersense_creation_total_n,supersense_emotion_total_n,supersense_motion_total_n,supersense_perception_total_n,supersense_possession_total_n,supersense_social_total_n
0,10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,11,0.0,7.0,14.0,38.0,0.0,0.0,9.0,0.0,4.0,17.0,13.0,7.0,15.0
2,12,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,13,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,389,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
125,402,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
126,403,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,404,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Supersense Argument Counts

In [22]:
# Long-format counts per character, supersense category and argument role:
# one row per (coref_id, supersense_category, argument) with the count in 'arg_n'.
# The count column is named via reset_index(name=...); the former rename keyed
# on the column label 0 only works on pandas versions where the grouped
# value_counts() result is unnamed, and its 'supersense_category' entry never
# matched a column at that point in the chain.
char_supersense_arg_counts = (
    characters_temporal_df
    .groupby('coref_id')[['supersense_category', 'argument']]
    .value_counts()
    .reset_index(name='arg_n')
)

In [23]:
# Inspect the long-format counts
# (columns: coref_id, supersense_category, argument, arg_n).
char_supersense_arg_counts

Unnamed: 0,coref_id,supersense_category,argument,arg_n
0,10,verb.communication,subject,1
1,10,verb.emotion,subject,1
2,11,verb.communication,direct_object,23
3,11,verb.communication,subject,15
4,11,verb.motion,direct_object,10
...,...,...,...,...
456,402,verb.communication,subject,1
457,402,verb.social,subject,1
458,403,verb.communication,subject,1
459,404,verb.communication,subject,1


In [24]:
# Reshape long -> wide: one row per coref_id and one
# ('arg_n', <argument>, <category>) column per role/category combination.
char_supersense_arg_counts = char_supersense_arg_counts.pivot(
    index=['coref_id'],
    columns=['argument', 'supersense_category'],
    values=['arg_n'],
)

In [25]:
# Replace the three-level pivot column index with a flat index of tuples such
# as ('arg_n', 'subject', 'verb.communication'), ready for renaming.
char_supersense_arg_counts.columns = char_supersense_arg_counts.columns.to_flat_index()

In [26]:
# Show the flattened tuple labels that the rename mapping has to cover.
char_supersense_arg_counts.columns

Index([      ('arg_n', 'subject', 'verb.communication'),
                   ('arg_n', 'subject', 'verb.emotion'),
       ('arg_n', 'direct_object', 'verb.communication'),
              ('arg_n', 'direct_object', 'verb.motion'),
                 ('arg_n', 'subject', 'verb.cognition'),
                    ('arg_n', 'subject', 'verb.social'),
          ('arg_n', 'direct_object', 'verb.perception'),
                    ('arg_n', 'subject', 'verb.motion'),
                   ('arg_n', 'subject', 'verb.contact'),
              ('arg_n', 'direct_object', 'verb.social'),
          ('arg_n', 'direct_object', 'verb.possession'),
                ('arg_n', 'subject', 'verb.perception'),
           ('arg_n', 'direct_object', 'verb.cognition'),
              ('arg_n', 'direct_object', 'verb.change'),
                    ('arg_n', 'subject', 'verb.change'),
             ('arg_n', 'direct_object', 'verb.contact'),
             ('arg_n', 'direct_object', 'verb.emotion'),
                ('arg_n', 'subj

In [27]:
# Mapping from the flattened pivot labels ('arg_n', <argument>, 'verb.<category>')
# to flat column names 'supersense_<category>_<subj|dobj>_n'.
# The mapping is generated for both argument roles over all 15 WordNet verb
# lexicographer classes rather than listed by hand; the previous literal had
# no entries for 'stative' or 'weather', so if the pipeline ever emits those
# categories their columns would have kept tuple labels. Keys that do not match
# a column are ignored by DataFrame.rename, so the extra entries are harmless.
supersense_arg_rename = {
    ('arg_n', argument, 'verb.' + category): 'supersense_' + category + '_' + suffix + '_n'
    for argument, suffix in (('subject', 'subj'), ('direct_object', 'dobj'))
    for category in (
        'body',
        'change',
        'cognition',
        'communication',
        'competition',
        'consumption',
        'contact',
        'creation',
        'emotion',
        'motion',
        'perception',
        'possession',
        'social',
        'stative',
        'weather',
    )
}

In [28]:
# Apply the readable column names, turn coref_id back into a regular column,
# and record role/category combinations that never occur as 0.
char_supersense_arg_counts = (
    char_supersense_arg_counts
    .rename(columns=supersense_arg_rename)
    .reset_index()
    .fillna(0)
)

In [29]:
# Inspect the wide per-role supersense-count table.
char_supersense_arg_counts

Unnamed: 0,coref_id,supersense_communication_subj_n,supersense_emotion_subj_n,supersense_communication_dobj_n,supersense_motion_dobj_n,supersense_cognition_subj_n,supersense_social_subj_n,supersense_perception_dobj_n,supersense_motion_subj_n,supersense_contact_subj_n,...,supersense_emotion_dobj_n,supersense_possession_subj_n,supersense_creation_subj_n,supersense_body_dobj_n,supersense_body_subj_n,supersense_competition_subj_n,supersense_competition_dobj_n,supersense_consumption_subj_n,supersense_consumption_dobj_n,supersense_creation_dobj_n
0,10,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11,15.0,2.0,23.0,10.0,10.0,9.0,8.0,7.0,6.0,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,402,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,403,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127,404,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Display the character attribute table that the count tables are merged onto below.
characters_df

Unnamed: 0,clustered_names,coref_id,name_mentions,pronoun_mentions,total,easy_name,gender,gender_certainty,importance
0,the Princess/The Princess,62,36,118,154,the Princess,female,1.0,primary
1,the old man/The old man,29,24,81,105,the old man,male,1.0,primary
2,the Princess Moonlight/Princess Moonlight,11,20,79,99,the Princess Moonlight,female,1.0,secondary
3,The Knight/the Knight,137,10,54,64,The Knight,male,1.0,secondary
4,The Emperor/the Emperor,231,13,39,52,The Emperor,male,1.0,secondary
...,...,...,...,...,...,...,...,...,...
159,an old man,375,1,0,1,an old man,male,1.0,tertiary
160,the Princess Moonlight/Princess Moonlight--her...,103,1,0,1,the Princess Moonlight's own father,female,0.5,tertiary
161,the old man/The old man--my own child,374,1,0,1,the old man's own child,unknown,0.0,tertiary
162,a heavenly being,373,1,0,1,a heavenly being,unknown,0.0,tertiary


In [31]:
# Trial left merge of the event counts onto the character table; the result is
# only displayed here, not stored. Characters without events show NaN in event_n.
characters_df.merge(char_event_counts, on = 'coref_id', how = 'left')

Unnamed: 0,clustered_names,coref_id,name_mentions,pronoun_mentions,total,easy_name,gender,gender_certainty,importance,event_n
0,the Princess/The Princess,62,36,118,154,the Princess,female,1.0,primary,190.0
1,the old man/The old man,29,24,81,105,the old man,male,1.0,primary,127.0
2,the Princess Moonlight/Princess Moonlight,11,20,79,99,the Princess Moonlight,female,1.0,secondary,124.0
3,The Knight/the Knight,137,10,54,64,The Knight,male,1.0,secondary,92.0
4,The Emperor/the Emperor,231,13,39,52,The Emperor,male,1.0,secondary,69.0
...,...,...,...,...,...,...,...,...,...,...
159,an old man,375,1,0,1,an old man,male,1.0,tertiary,
160,the Princess Moonlight/Princess Moonlight--her...,103,1,0,1,the Princess Moonlight's own father,female,0.5,tertiary,1.0
161,the old man/The old man--my own child,374,1,0,1,the old man's own child,unknown,0.0,tertiary,2.0
162,a heavenly being,373,1,0,1,a heavenly being,unknown,0.0,tertiary,2.0


In [32]:
# The per-character count tables, in the order they are merged onto the
# character attributes; each one carries a 'coref_id' column as the join key.
count_dfs = [
    char_event_counts,
    char_arg_counts,
    char_supersense_counts,
    char_supersense_arg_counts,
]

In [33]:
# Left-join every count table onto the character attributes so that each
# character keeps its row even when it has no events (those cells become NaN).
# Seeding the accumulator with characters_df removes the index-based special
# case for the first table and keeps the result defined for an empty list.
characters_stats_df = characters_df
for counts_df in count_dfs:
    characters_stats_df = characters_stats_df.merge(counts_df, on='coref_id', how='left')

0 129
164
1 129
164
2 129
164
3 129
164


In [34]:
# The left merges leave NaN for characters that are absent from a count table;
# store those as 0. Note that this fills NaN in every column, not only counts.
characters_stats_df = characters_stats_df.fillna(value=0)

In [35]:
# Nested dict keyed by coref_id: {coref_id: {column name: value, ...}, ...}.
character_stats_json = (
    characters_stats_df
    .set_index('coref_id')
    .to_dict(orient='index')
)

### Individual Event Counts

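Do we want to lemmatize events? Probably — otherwise inflected forms of the same verb (e.g. "sent" and "send") are counted as separate events.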

In [46]:
from nltk.stem import WordNetLemmatizer

In [49]:
# NLTK WordNet lemmatizer, used below to reduce each event verb to its lemma.
# NOTE(review): presumably requires the NLTK 'wordnet' corpus to be installed — confirm in the environment.
lemmatizer = WordNetLemmatizer()

In [52]:
# Add an 'event_lemma' column: each event word lemmatized as a verb (pos 'v').
characters_temporal_df['event_lemma'] = characters_temporal_df['event'].apply(
    lemmatizer.lemmatize,
    pos='v',
)

In [53]:
# For every character, attach lemma frequency dicts to its stats entry:
# over all of its events, over events where it is the subject, and over events
# where it is the direct object. Characters without events get empty dicts.
for coref_id, char_stats in character_stats_json.items():
    char_events_df = characters_temporal_df.loc[characters_temporal_df['coref_id'] == coref_id]
    lemmas = char_events_df['event_lemma']
    role = char_events_df['argument']
    char_stats.update({
        'event_counts_total': lemmas.value_counts().to_dict(),
        'event_counts_subj': lemmas[role == 'subject'].value_counts().to_dict(),
        'event_counts_dobj': lemmas[role == 'direct_object'].value_counts().to_dict(),
    })

## Aggregate