# Create character feature dataset for STM
Based on fanfiction_gender_roles_data.ipynb

## Postag, parse character assertions

In [1]:
# Load assertions for characters
import json
from tqdm.notebook import tqdm
import os

fandoms = [
    'homestuck',
#     'startrek',
#     'dragonage',
#     'buffy',
#     'jojo',
#     'naruto',
#     'teenwolf',
#     'song_ice_fire',
#     'shadowhunter',
#     'walking_dead'
]
dataset_name = 'complete_en_1k-5k'
for fandom in fandoms:
    pipeline_output_path = f'/data/fanfiction_ao3/{fandom}/{dataset_name}/output/'
    char_assertions = {}
    assertions_dirpath = os.path.join(pipeline_output_path, 'assertion_extraction')

    # for fic, chars in tqdm(fic_chars.items()):
    for fname in tqdm(os.listdir(assertions_dirpath)):
        fic_assertions_path = os.path.join(assertions_dirpath, fname)
    #     if not os.path.exists(fic_assertions_path):
    #         continue

        with open(fic_assertions_path) as f:
            char_assertions[fname[:-5]] = json.load(f)

    len(char_assertions)

  0%|          | 0/22412 [00:00<?, ?it/s]

In [3]:
fic = list(char_assertions.keys())[0]
print(fic)
char_assertions[fic]

229832


{'Nepeta': [{'position': [0, 90],
   'text': "Nepeta was well - attuned to romance - she has to be , to keep a lookout for any possible ships among her friends . Naturally , she was good at spotting quadrant opportunities for herself . After all , had n't she been the first to know that she and Equius were going to be the greatest of moirails , when he was still denying that he held any affection for the troll who kept bothering and bothering and bothering him ? Look how that had turned out !"},
  {'position': [90, 187],
   'text': 'She never had much interest in getting a kismesis or an auspitice , so that was fine for now . A matesprit , though ... Around her were trolls growing flushed feelings for each other , something she watched with amazement . She and her friends were too young to mate , of course , but the electric connection was still there , the intense feelings that drew you to a person and made you want to listen to them for hours and kiss them even when you also wanted t

In [4]:
# Load coref info to replace pronouns with names
pipeline_output_path = f'/data/fanfiction_ao3/{fandom}/{dataset_name}/output/char_coref'
fpath = os.path.join(pipeline_output_path, fic + '.json')
with open(fpath) as f:
    coref = json.load(f)

In [32]:
import pdb

def name_from_char(charname):
    name = ' '.join([part for part in charname.split('_') if part[0].isupper()])
    if name.endswith(')'):
        name = name[:-1]
    return name

def normalize_names(text):
    text_split = text.split()
    
    for i, word in enumerate(text_split):
        if word.startswith('($_'):
            name = name_from_char(word)
            text_split[i-1] = name
            text_split[i] = ''

    return ' '.join(text_split)

# Process assertions
import spacy
from spacy.tokenizer import Tokenizer
import re
nlp = spacy.load('en', disable=['ner'])
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(f'\S+').match)

from tqdm.notebook import tqdm
import json

pipeline_output_path = f'/data/fanfiction_ao3/{fandom}/{dataset_name}/output/'

output_dirpath = os.path.join(pipeline_output_path, 'char_features')
if not os.path.exists(output_dirpath):
    os.mkdir(output_dirpath)
    
stops = ['was', 'were', 'to', 'for', 'in', 'on', 'by', 'has', 'had', 'been', 'be', "'re", "'s"]

# for fic in list(char_assertions.keys())[:1]:
for fic in tqdm(char_assertions):
    char_features = {}
    
    # Load coref info to replace pronouns with names
    coref_fpath = os.path.join(pipeline_output_path, 'char_coref', fic + '.json')
    with open(coref_fpath) as f:
        coref = json.load(f)
        
    for char in char_assertions[fic]:
#         name = name_from_char(char)
#         assertions = ' '.join(char_assertions[fic][char])
        for assertion in char_assertions[fic][char]:
            text = assertion['text']
            
            # postag and parse
            annotated = nlp(text)
        
            cluster_matches = [clus for clus in coref['clusters'] if clus.get('name', '') == char]
            if len(cluster_matches) == 0: continue
            cluster = cluster_matches[0]
            mention_inds = [list(range(m['position'][0], m['position'][1])) for m in cluster['mentions']] # token IDs of that character in the text
            mention_inds = [i for el in mention_inds for i in el]
            
            # Verbs where character was the subject
            offset = assertion['position'][0]
            verbs_subj = [tok.head.text.lower() for tok in annotated if tok.i + offset in mention_inds and \
                          (tok.dep_=='nsubj' or tok.dep_=='agent')]

            # Verbs where character was the object
            verbs_obj = [tok.head.text.lower() for tok in annotated if tok.i + offset in mention_inds and \
                         (tok.dep_=='dobj' or tok.dep_=='nsubjpass' or tok.dep_=='dative' or tok.dep_=='pobj')]
#             if len(verbs_obj) > 0:
#                 print(verbs_obj)

            # Adjectives that describe the character
            adjs = [tok.text.lower() for tok in annotated if tok.head.i + offset in mention_inds and \
                    (tok.dep_=='amod' or tok.dep_=='appos' or tok.dep_=='nsubj' or tok.dep_=='nmod')] \
                + [tok.text.lower() for tok in annotated if tok.dep_=='attr' and (tok.head.text=='is' or tok.head.text=='was') and \
                   any([c.i + offset in mention_inds for c in tok.head.children])]

#             if not fic in char_features:
#                 char_features[fic] = {}
            if not char in char_features:
                char_features[char] = []
            
            final_list = [w for w in verbs_subj + verbs_obj + adjs if not w in stops]
            char_features[char].extend(final_list)

    # Save out
    with open(os.path.join(output_dirpath, f'{fic}.json'), 'w') as f:
        json.dump(char_features, f)
        
#     print(char_features)

  0%|          | 0/22412 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [95]:
# Test loading processed JSON files
os.listdir(output_dirpath)

['183523.json',
 '918622.json',
 '309535.json',
 '726061.json',
 '16391837.json',
 '11032020.json',
 '5480774.json',
 '4465862.json',
 '4012087.json',
 '7514581.json',
 '4459220.json',
 '8531686.json',
 '54247.json']

In [98]:
with open(os.path.join(output_dirpath, os.listdir(output_dirpath)[0])) as f:
    test_dict = json.load(f)
    
test_dict

{'Valentines Day': [],
 'Sirius': ['shakes', 'says', 'move'],
 'Remus': ['does',
  'makes',
  'hate',
  'sitting',
  'inserts',
  'remarks',
  'wriggles',
  'continues',
  'is',
  'with'],
 'James': ['explains',
  'concludes',
  'glowers',
  'finishes',
  'buried',
  'asks',
  'jumps',
  'endearing',
  'pines',
  'raced',
  'fights',
  'says',
  'stop',
  'behind',
  'see',
  'of',
  'after'],
 'Evans': ['had', 'into'],
 'God Moony': [],
 'Wormtail': ['got'],
 'Lily': ['been', 'see', 'get', "'re", 'set', 'on', 'hate']}

## Annotate character gender

In [1]:
# Annotate characters for gender
# Load all character names

import json
import os
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from IPython.core.debugger import set_trace

fandoms = [
    'homestuck',
#     'startrek',
#     'dragonage',
#     'buffy',
#     'jojo',
#     'naruto',
#     'teenwolf',
#     'song_ice_fire',
#     'shadowhunter',
#     'walking_dead'
]
for fandom in fandoms:
    pipeline_output_path = f'/data/fanfiction_ao3/{fandom}/complete_en_1k-5k/output'
    output_dirpath = os.path.join(pipeline_output_path, 'char_features')
    character_names = []
    for fname in tqdm(sorted(os.listdir(output_dirpath))):
        with open(os.path.join(output_dirpath, fname)) as f:
            char_features = json.load(f)

        character_names.extend(list(char_features.keys()))

    len(character_names)

HBox(children=(IntProgress(value=0, max=71943), HTML(value='')))




944371

In [3]:
with open('/data/fanfiction_gender_roles/tmp/hp_character_names.json', 'w') as f:
    json.dump(character_names, f)

In [9]:
character_names_ctr = Counter(character_names)
print(len(character_names_ctr))

# Refine characters
# characters_ctr_filtered = [el for el in character_names_ctr.most_common() if el[0] != '' and el[1] > 1]
characters_ctr_filtered = Counter({name: count for name,count in character_names_ctr.items() if name != '' and count > 1})
len(characters_ctr_filtered)

# Load canonical list of characters
char_list_fpath = '/projects/fanfiction_gender_roles/scripts/harrypotter_characters.txt'
with open(char_list_fpath) as f:
    canonical_characters = f.read().splitlines()
    
extra = ['Dobby']
remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
         ]
canonical_characters += extra
canonical_characters = [c for c in canonical_characters if not c in remove]

len(canonical_characters)

116251


245

In [70]:
canonical_character_name_parts = set([part for name in canonical_characters for part in name.split()])
exclude = set(['The'])
canonical_character_name_parts -= exclude
# print(len(canonical_character_name_parts))
# list(canonical_character_name_parts)[:10]

remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
          'Weasley', # ambiguous
          'Weasleys',
          'Hufflepuff',
         ]

name_transform = { # Might then miss when stories use these terms, though (same with canonicalize): might want to spread gender info afterward
    'Potter': 'Harry Potter',
    'Mr. Potter': 'Harry Potter',
    'Mr Potter': 'Harry Potter',
    'The Harry': 'Harry Potter',
    'Mr Weasley': 'Ron Weasley',
    'Tom': 'Tom Riddle',
    'Black': 'Sirius Black',
    'The Draco': 'Draco Malfoy',
    'The Dark Lord': 'Voldemort',
    'The Dark Lord Voldemort': 'Voldemort',
    'Dark Lord': 'Voldemort',
    'Lord': 'Voldemort',
    'Malfoy': 'Draco Malfoy',
    'James': 'James Potter I',
    'James Potter': 'James Potter I',
    'Lily': 'Lily J. Potter',
    'Albus': 'Albus Dumbledore',
    'Teddy': 'Teddy Lupin',
    'Newt': 'Newton Scamander',
    'Rose': 'Rose Granger-Weasley',
    'Lily Potter': 'Lily J. Potter',
    'Regulus': 'Regulus Black I',
    'Mrs Weasley': 'Molly Weasley',
}
    

def canonicalize(name):
    
    name_parts = name.split()
    new_name_parts = []
    
    for name_part in name_parts:
        if name_part in canonical_character_name_parts:
            new_name_parts.append(name_part)
            
    new_name = ' '.join(new_name_parts)
    
    if new_name in remove:
        new_name = ''
        
    if new_name in name_transform:
        new_name = name_transform[new_name]
            
    return new_name

In [10]:
characters_ctr_filtered.most_common(200)

[('Harry', 24784),
 ('Hermione', 17673),
 ('Ron', 15328),
 ('Draco', 13679),
 ('Harry Potter', 13553),
 ('Hogwarts', 13501),
 ('Sirius', 12153),
 ('Remus', 10555),
 ('Ginny', 10156),
 ('James', 9511),
 ('Draco Malfoy', 8696),
 ('Voldemort', 8579),
 ('Severus', 7984),
 ('Potter', 7404),
 ('Dumbledore', 7304),
 ('Lily', 7167),
 ('Merlin', 6246),
 ('Malfoy', 5929),
 ('Snape', 5706),
 ('Severus Snape', 5541),
 ('Hermione Granger', 4785),
 ('Ministry', 4512),
 ('Neville', 4395),
 ('Albus', 4149),
 ('Peter', 4137),
 ('George', 4134),
 ('Slytherin', 4054),
 ('Lucius', 3868),
 ('Pansy', 3849),
 ('Luna', 3845),
 ('James Potter', 3567),
 ('Gryffindor', 3499),
 ('Fred', 3382),
 ('Great Hall', 3287),
 ('Lucius Malfoy', 3258),
 ('Dark Lord', 3242),
 ('Narcissa', 3238),
 ('Weasley', 3214),
 ('Christmas', 2945),
 ('Molly', 2919),
 ('Albus Dumbledore', 2902),
 ('Moony', 2894),
 ('Sirius Black', 2836),
 ('Professor Snape', 2787),
 ('Remus Lupin', 2781),
 ('Mum', 2735),
 ('Granger', 2691),
 ('Scorpius',

In [65]:
# Remove characters who don't have matches with canonical characters

# Filter, canonicalize names
characters_ctr_matches = Counter()
for name, count in characters_ctr_filtered.items():
    
    new_name = canonicalize(name)
    
    # Remove names
    if len(new_name) <= 1 or new_name in remove:
        continue
        
    # Transform names
    if new_name in name_transform:
        new_name = name_transform[new_name]
        
    if new_name in characters_ctr_matches:
        characters_ctr_matches[new_name] += count
        
    else:
        characters_ctr_matches[new_name] = count
    
print(len(characters_ctr_matches))
characters_ctr_matches.most_common()

1971


[('Harry', 31115),
 ('Harry Potter', 28133),
 ('Hermione', 20159),
 ('Draco Malfoy', 19813),
 ('Draco', 16983),
 ('Ron', 16967),
 ('Voldemort', 14113),
 ('James Potter I', 14046),
 ('Sirius', 13106),
 ('Remus', 11668),
 ('Ginny', 10927),
 ('Snape', 9919),
 ('Dumbledore', 9835),
 ('Severus', 9109),
 ('Lily J. Potter', 8743),
 ('Albus Dumbledore', 7499),
 ('Severus Snape', 5776),
 ('McGonagall', 5314),
 ('Sirius Black', 5183),
 ('Granger', 5145),
 ('Hermione Granger', 5039),
 ('Neville', 4791),
 ('George', 4509),
 ('Peter', 4389),
 ('Lucius', 4171),
 ('Luna', 4149),
 ('Pansy', 4109),
 ('Tom Riddle', 3723),
 ('Fred', 3583),
 ('Narcissa', 3463),
 ('Lucius Malfoy', 3304),
 ('Molly', 3170),
 ('Remus Lupin', 2886),
 ('Teddy Lupin', 2868),
 ('Scorpius', 2739),
 ('Blaise', 2665),
 ('Percy', 2646),
 ('Lupin', 2580),
 ('Hagrid', 2500),
 ('Charlie', 2468),
 ('Bill', 2444),
 ('Lily Evans', 2411),
 ('Bellatrix', 2344),
 ('Molly Weasley', 2325),
 ('Ron Weasley', 2320),
 ('Minerva', 2149),
 ('Seamus',

In [None]:
character_gender = {} # Keys are character names from character features
characters_no_genderbox = []
characters_http_error = []

In [38]:
character_gender

{'Harry': 'male',
 'Hermione': 'female',
 'Draco': 'male',
 'Ron': 'male',
 'Harry Potter': 'male',
 'Voldemort': 'male',
 'Sirius': 'male',
 'Remus': 'male'}

In [66]:
# Get gender from wikia page
import urllib.request
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from IPython.core.debugger import set_trace
import time
import urllib.error

for character in tqdm([name for name, count in characters_ctr_matches.most_common()]):
    
    if character in character_gender: continue
    if character in characters_no_genderbox: continue
    if character in characters_http_error: continue
    
    url_base = 'https://harrypotter.fandom.com/wiki/{}'
    try:
        html_str = urllib.request.urlopen(url_base.format(character.replace(' ', '_'))).read()
    except urllib.error.HTTPError as e:
        characters_http_error.append(character)
        print(f'Character {character} HTTP error')
        
    soup = BeautifulSoup(html_str, 'html.parser')
    
    if soup is None:
        print(html_str)
        break

    # div = list(soup.find('div', {'data-source':"gender"}).children)
    # soup.find('div', {'data-source':"gender"}).text
    genderbox = soup.find('div', {'data-source':"gender"})
    if genderbox is None:
        print(f'Character {character} has no gender box')
        characters_no_genderbox.append(character)
        continue
        
    gender = genderbox.find('div').text.lower()
    time.sleep(.5)
    
    character_gender[character] = gender
    
character_gender

HBox(children=(IntProgress(value=0, max=1971), HTML(value='')))

{'Harry': 'male',
 'Hermione': 'female',
 'Draco': 'male',
 'Ron': 'male',
 'Harry Potter': 'male',
 'Voldemort': 'male',
 'Sirius': 'male',
 'Remus': 'male',
 'Draco Malfoy': 'male',
 'Ginny': 'female',
 'James Potter I': 'male',
 'Snape': 'male',
 'Dumbledore': 'male',
 'Severus': 'male',
 'Lily J. Potter': 'female',
 'Severus Snape': 'male',
 'McGonagall': 'female',
 'Sirius Black': 'male',
 'Granger': 'male',
 'Hermione Granger': 'female',
 'Neville': 'male',
 'George': 'male',
 'Albus Dumbledore': 'male',
 'Peter': 'male',
 'Lucius': 'male',
 'Luna': 'female',
 'Pansy': 'female',
 'Tom Riddle': 'male',
 'Fred': 'male',
 'Narcissa': 'female',
 'Lucius Malfoy': 'male',
 'Molly': 'female',
 'Remus Lupin': 'male',
 'Scorpius': 'male',
 'Blaise': 'male',
 'Percy': 'male',
 'Lupin': 'male',
 'Teddy Lupin': 'male',
 'Hagrid': 'male',
 'Charlie': 'male',
 'Bill': 'male',
 'Lily Evans': 'female',
 'Bellatrix': 'female',
 'Ron Weasley': 'male',
 'Minerva': 'female',
 'Seamus': 'male',
 'Gin

In [67]:
# See how many characters, uses have annotated gender

print(len(character_gender))
# print(len(characters_ctr_matches))
print(len(character_gender)/len(characters_ctr_matches))

1681
0.8528665651953323


In [68]:
labeled_uses = sum([characters_ctr_matches[name] for name in character_gender])
print(labeled_uses)

# unlabeled_uses = sum([characters_ctr_matches[name] for name in characters_no_genderbox + characters_http_error])
# print(unlabeled_uses)
total_uses = sum(characters_ctr_matches.values())

print(labeled_uses/total_uses)

480383
0.955316872558924


In [69]:
# Save character gender json
with open('/data/fanfiction_gender_roles/harrypotter_ao3/hp_character_genders.json', 'w') as f:
    json.dump(character_gender, f)

# From HP pilot ***everything below this***

## Check character gender assignments

In [1]:
import json
fandom = 'supernatural'
gender_assignment_fpath = f'/data/fanfiction_ao3/{fandom}/character_genders.json'
with open(gender_assignment_fpath, 'r') as f:
    character_gender = json.load(f)
len(character_gender)

192

In [2]:
character_gender.values()

dict_values(['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'female', 'male', 'female', 'female', 'female', 'male', 'male', 'female', 'male', 'male', 'male', 'female', 'neutral', 'male', 'female', 'male', 'female', 'male', 'male', 'male', 'male', 'female', 'male', 'male', 'male', 'male', 'female', 'female', 'male', 'male', 'male', 'male', 'male', 'female', 'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male', 'female', 'female', 'neutral', 'male', 'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female', 'male', 'male', 'male', 'male', 'male', 'male', 'neutral', 'female', 'female', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 

In [4]:
neutral_chars = [char for char, gender in character_gender.items() if gender=='neutral']
neutral_chars

['Jody',
 'Gwen',
 'Demon',
 'Ghostfacers',
 'WINCHESTER',
 'MEG',
 'SAm',
 'Trenton',
 'Bass',
 'Hurt',
 'IMPALA',
 'Godstiel',
 'Davies',
 'JOHN',
 'MIchael',
 'JO',
 'Moseley',
 'Hanscum',
 'Tran',
 'BENNY',
 'Kline',
 'JIMMY',
 'REBECCA',
 'Talley',
 'Sinclair',
 'BEN',
 'JACK',
 'Zeddmore',
 'LUCIFER',
 'Faeries',
 'Vessel',
 'Robinson',
 'Cuthbert',
 'JImmy',
 'MICHAEL',
 'CLAIRE',
 'Hellhound',
 'Tyson',
 'Gallagher',
 'HENRIKSEN',
 'LISA',
 'Chambers',
 'Bevell',
 'KEVIN',
 'EILEEN',
 'Antonia',
 'PAMELA',
 'DICK',
 'Rosen',
 'ROWENA',
 'Richardson',
 'JESSICA',
 'ANNA',
 'Sands',
 'BECKY',
 'RUFUS',
 'AMELIA',
 'KAIA',
 'Devereaux',
 'MISSOURI',
 'Spangler',
 'BELA',
 'Mullen',
 'Bell']

In [None]:
# Find prevalence in actual character features

## Check character feature output

In [18]:
# Load new output
import pandas as pd

path = '/data/fanfiction_ao3/harrypotter/complete_en_1k-50k/output/character_gender_features.csv'
feats = pd.read_csv(path)
print(feats.columns)
print(feats.shape)

Index(['fic_id', 'character_original', 'character_canonical',
       'character_gender', 'character_features', 'title', 'author',
       'author_key', 'rating', 'category', 'fandom', 'relationship',
       'character', 'additional tags', 'language', 'published', 'status',
       'status date', 'words', 'comments', 'kudos', 'bookmarks', 'hits',
       'chapter_count', 'series', 'seriespart', 'seriesid', 'summary',
       'preface_notes', 'afterword_notes', 'character_in_relationship',
       'character_in_relationship_type'],
      dtype='object')
(279167, 32)


In [20]:
feats['character_in_relationship'].value_counts()

False    186089
True      93078
Name: character_in_relationship, dtype: int64

In [21]:
feats['relationship'].iloc[:10]

0                         ["Sirius Black/Remus Lupin"]
1    ["James Potter/Lily Evans Potter-mention", "Se...
2    ["James Potter/Lily Evans Potter-mention", "Se...
3                         ["Draco Malfoy/Ron Weasley"]
4        ["Hermione Granger/Draco Malfoy", "dramione"]
5        ["Hermione Granger/Draco Malfoy", "dramione"]
6        ["Hermione Granger/Draco Malfoy", "dramione"]
7        ["Hermione Granger/Draco Malfoy", "dramione"]
8        ["Hermione Granger/Draco Malfoy", "dramione"]
9                         ["Sirius Black/Remus Lupin"]
Name: relationship, dtype: object

In [23]:
feats['character_in_relationship_type'].sample(10)

119167    character_not_in_relationship
246241                         straight
153150    character_not_in_relationship
177422    character_not_in_relationship
50294                          straight
192721                            queer
99790     character_not_in_relationship
68525                             queer
30645                          straight
221917                            queer
Name: character_in_relationship_type, dtype: object

In [19]:
# Load old output
old_path = '/data/fanfiction_ao3/harrypotter/complete_en_1k-50k/output/character_gender_features_orig.csv'
old_feats = pd.read_csv(old_path)
print(old_feats.columns)
print(old_feats.shape)

Index(['fic_id', 'character_original', 'character_canonical',
       'character_gender', 'character_features'],
      dtype='object')
(279166, 5)


## Fix gender footnote issue

### Just for the json

In [2]:
# Load character gender dictionary

import json

with open('/data/fanfiction_ao3/harrypotter/character_genders.json', 'r') as f:
    character_gender = json.load(f)

In [3]:
set(character_gender.values())

{'female',
 'female[1]',
 'female[2]',
 'female[3]',
 'female[4]',
 'female[5]',
 'male',
 'male[1]',
 'male[2]',
 'male[3]',
 'male[4]',
 'male[5]',
 'male[6]'}

In [12]:
import pdb
import re

new_character_gender = {}
for char in character_gender:
    new_character_gender[char] = re.match(r'\w+', character_gender[char]).group()

In [13]:
print(len(new_character_gender))
print(len(character_gender))

1681
1681


In [14]:
set(new_character_gender.values())

{'female', 'male'}

In [16]:
# Save character gender dictionary
with open('/data/fanfiction_ao3/harrypotter/character_genders.json', 'w') as f:
    character_gender = json.dump(new_character_gender, f)

### For the CSV

In [82]:
path = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/character_relationship_features.csv'
feats = pd.read_csv(path)
feats.columns

Index(['fic_id', 'character_original', 'character_canonical',
       'character_gender', 'character_features', 'title', 'author',
       'author_key', 'rating', 'category', 'fandom', 'relationship',
       'character', 'additional tags', 'language', 'published', 'status',
       'status date', 'words', 'comments', 'kudos', 'bookmarks', 'hits',
       'chapter_count', 'series', 'seriespart', 'seriesid', 'summary',
       'preface_notes', 'afterword_notes', 'character_in_relationship',
       'character_in_relationship_type'],
      dtype='object')

In [75]:
feats['character_gender'].value_counts()

male         195246
female       71526 
male[5]      3539  
male[2]      2745  
male[1]      2430  
male[4]      1749  
male[3]      595   
female[5]    375   
female[3]    313   
female[2]    274   
female[4]    227   
male[6]      147   
Name: character_gender, dtype: int64

In [83]:
feats['character_gender'] = feats['character_gender'].map(lambda x: re.match(r'\w+', x).group())
feats['character_gender'].value_counts()

male      206451
female    72715 
Name: character_gender, dtype: int64

In [84]:
# Save out annotated data
outpath = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/character_relationship_features.csv'
feats.to_csv(outpath, index=False)

In [85]:
feats['character_in_relationship_type'].value_counts()

character_not_in_relationship    186089
queer                            56580 
straight                         32448 
unknown                          4049  
Name: character_in_relationship_type, dtype: int64

## Annotate fic relationship type

In [1]:
# Load current character/fic features
import pandas as pd

dataset_name = 'complete_en_1k-50k'
path = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/character_gender_features.csv'
char_fic_feats = pd.read_csv(path)
char_fic_feats

Unnamed: 0,fic_id,character_original,character_canonical,character_gender,character_features
0,5888698,Remus,Remus,male,carries cherished carried carried was been wan...
1,5888698,Padfoot,Padfoot,male,by
2,10199,Draco,Draco,male,responded hinted is walked knew question sat c...
3,5355224,Hermione,Hermione,female,was felt growled let doing read gotten going r...
4,5355224,Draco,Draco,male,find like pouted gripped smirked joked drawled...
5,5355224,Ginny,Ginny,female,said snapped said said snickered getting think...
6,5355224,Luna,Luna,female,'s said seemed imagine said said smiled asked ...
7,5355224,Salazar,Salazar,male[5],damned
8,10838292,Remus,Remus,male,considered thought asked doing pulled going op...
9,10838292,Marlene,Marlene,male[5],liked tell trying mean mind mind touch said kn...


In [2]:
# Load fic metadata

metadata_fpath = '/data/fanfiction_gender_roles/harrypotter_ao3/story_metadata.csv'
metadata = pd.read_csv(metadata_fpath)
metadata

Unnamed: 0,fic_id,title,author,author_key,rating,category,fandom,relationship,character,additional tags,...,kudos,bookmarks,hits,chapter_count,series,seriespart,seriesid,summary,preface_notes,afterword_notes
0,100,A Strong Defense,ellen_fremedon,ellen_fremedon,"[""Explicit""]","[""M/M""]","[""Harry Potter - Rowling""]","[""Remus Lupin/Severus Snape""]","[""Remus Lupin"", ""Severus Snape""]","[""pre-HBP""]",...,109.0,10.0,2773.0,1,,,,"After the battle, the survivors regroup.\nWrit...",Thanks to Louise Lux for her thorough and thou...,
1,1000039,Laocoon's Children: The End Of The Story,copperbadge,copperbadge,"[""Teen And Up Audiences""]","[""Gen""]","[""Harry Potter - J. K. Rowling""]",[],[],"[""Alternate Universe"", ""Alternate Universe - C...",...,1773.0,61.0,27154.0,1,Stealing Harryverse,12.0,58157.0,My notes and fic excerpts for the end of Laoco...,,"And that's the end. Thanks for reading, everyo..."
2,1000044,"Sing, For The Sea Belongs To Me",leontina,Leontina,"[""Teen And Up Audiences""]","[""M/M""]","[""Harry Potter - J. K. Rowling""]","[""Draco Malfoy/Harry Potter""]","[""Draco Malfoy"", ""Harry Potter"", ""Teddy Lupin""]","[""Suicide Attempt""]",...,321.0,57.0,4880.0,1,,,,An unfortunate circumstance means that Harry g...,This was written for the Dracotops_Harry Fest ...,
3,10000460,Grim's child,Willofhounds,Willofhounds,"[""Teen And Up Audiences""]","[""Gen""]","[""RWBY"", ""Harry Potter - J. K. Rowling""]","[""Ozpin/Harry Potter"", ""Harry Potter/Voldemort""]","[""Harry Potter"", ""Voldemort"", ""Ozpin"", ""Roman ...","[""Harry Potter is Voldemort's Child"", ""Harry i...",...,139.0,27.0,6690.0,21,Children of Grim,1.0,810846.0,Voldemort and Bellatrix had a set of triplets ...,,
4,10000490,(the Girl Standing) In the Blush of Dawn,writing_as_tracey,writing_as_tracey,"[""Teen And Up Audiences""]","[""F/M"", ""Gen""]","[""Harry Potter - J. K. Rowling""]","[""Hermione Granger/Cormac McLaggen"", ""Sirius B...","[""Hermione Granger"", ""King Perseus - OC"", ""Sir...","[""Yona of the Dawn retelling"", ""High Fantasy"",...",...,8.0,2.0,279.0,1,,,,"On the eve of her 17th birthday, she confesses...","This HP AU follows a similar, but still diverg...",
5,1000053,The World Burned,Lillielle,Lillielle,"[""Teen And Up Audiences""]","[""M/M""]","[""Harry Potter - J. K. Rowling""]","[""Albus Dumbledore/Gellert Grindelwald""]","[""Albus Dumbledore"", ""Gellert Grindelwald"", ""A...","[""Slash"", ""Justification"", ""Unhealthy Relation...",...,21.0,1.0,816.0,1,"A Breath of Romance, A Twist of Despair",1.0,59048.0,"Disclaimer: I own nothing.\nAgainst the world,...",,
6,1000059,Take Half Of Me,leontina,Leontina,"[""Mature""]","[""M/M""]","[""Harry Potter - J. K. Rowling""]","[""Draco Malfoy/Harry Potter"", ""Astoria Greengr...","[""Harry Potter"", ""Draco Malfoy"", ""Luna Lovegoo...","[""Infidelity"", ""Background Het"", ""Anal Sex""]",...,98.0,17.0,2517.0,1,,,,When Harry joins Luna and Rolf on an expeditio...,"This was written for the HD-Tropes fest, and t...",
7,1000060,An Interlude with Potions,Lillielle,Lillielle,"[""Mature""]","[""F/M""]","[""Harry Potter - J. K. Rowling""]","[""Hermione Granger/Severus Snape""]","[""Hermione Granger"", ""Severus Snape""]","[""Potions"", ""Potions Accident"", ""unapologetic ...",...,21.0,,1323.0,1,"A Breath of Romance, A Twist of Despair",2.0,59048.0,"Disclaimer: I own nothing.\n""Use your mind,"" h...",,
8,1000061,Everybody Knows (Your Sister Does),Neon_Opal,Neon_Opal,"[""Mature""]","[""Multi""]","[""Harry Potter - J. K. Rowling""]","[""Harry Potter/Ginny Weasley"", ""and whoever Ha...","[""Harry Potter"", ""Ron Weasley"", ""mentions of.....","[""Best Friends"", ""Implied Sexual Content"", ""ar...",...,1.0,1.0,272.0,1,,,,Ron is listening to a muggle rock band on an o...,,"The song is real called ""Your Sister Does"" fro..."
9,10001,Remorse,Mairi%20Nathaira,Tara,"[""Teen And Up Audiences""]","[""M/M""]","[""Harry Potter - Rowling""]","[""Sirius Black/Remus Lupin""]","[""Sirius Black"", ""Remus Lupin"", ""James Potter""]","[""Ficlet"", ""Angst"", ""Drama""]",...,25.0,6.0,731.0,1,,,,What would've happened if James didn't rescue ...,"Another one of those ""What if"" fics . . . anot...",


In [7]:
# Check fic overlap
print(len(set(char_fic_feats['fic_id']).intersection(set(metadata['fic_id']))))
print(len(char_fic_feats['fic_id'].unique()))

66061
66061


In [3]:
# Merge
merged = pd.merge(char_fic_feats, metadata, on=['fic_id'])
print(len(char_fic_feats))
print(len(merged))

279166
279166


In [16]:
import re

re.split(r'[\/ ]', 'Remus Lupin/Sirius Black')

['Remus', 'Lupin', 'Sirius', 'Black']

In [33]:
# Load canonical list of characters
char_list_fpath = '/projects/fanfiction_gender_roles/scripts/harrypotter_characters.txt'

with open(char_list_fpath) as f:
    canonical_characters = f.read().splitlines()

extra = ['Dobby']
remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
         ]
canonical_characters += extra
canonical_characters = [c for c in canonical_characters if not c in remove]

canonical_character_name_parts = set([part for name in canonical_characters for part in name.split()])
exclude = set(['The'])
canonical_character_name_parts -= exclude
# print(len(canonical_character_name_parts))
# list(canonical_character_name_parts)[:10]

remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
          'Weasley', # ambiguous
          'Weasleys',
          'Hufflepuff',
         ]

name_transform = { # Might then miss when stories use these terms, though (same with canonicalize): might want to spread gender info afterward
    'Potter': 'Harry Potter',
    'Mr. Potter': 'Harry Potter',
    'Mr Potter': 'Harry Potter',
    'The Harry': 'Harry Potter',
    'Mr Weasley': 'Ron Weasley',
    'Tom': 'Tom Riddle',
    'Black': 'Sirius Black',
    'The Draco': 'Draco Malfoy',
    'The Dark Lord': 'Voldemort',
    'The Dark Lord Voldemort': 'Voldemort',
    'Dark Lord': 'Voldemort',
    'Lord': 'Voldemort',
    'Malfoy': 'Draco Malfoy',
    'James': 'James Potter I',
    'James Potter': 'James Potter I',
    'Lily': 'Lily J. Potter',
    'Albus': 'Albus Dumbledore',
    'Teddy': 'Teddy Lupin',
    'Newt': 'Newton Scamander',
    'Rose': 'Rose Granger-Weasley',
    'Lily Potter': 'Lily J. Potter',
    'Regulus': 'Regulus Black I',
    'Mrs Weasley': 'Molly Weasley',
}
    

def canonicalize(name):
    
    name_parts = name.split()
    new_name_parts = []
    
    for name_part in name_parts:
        if name_part in canonical_character_name_parts:
            new_name_parts.append(name_part)
            
    new_name = ' '.join(new_name_parts)
    
    if new_name in remove:
        new_name = ''
        
    if new_name in name_transform:
        new_name = name_transform[new_name]
            
    return new_name

In [34]:
# Load character gender dictionary

import json

with open('/data/fanfiction_gender_roles/harrypotter_ao3/hp_character_genders.json', 'r') as f:
    character_gender = json.load(f)

In [60]:
from IPython.core.debugger import set_trace
import re

# TODO: consider relationships with more than 2 characters differently
def char_in_relationship(char, rel):
    
    match = False
    rel_type = 'character_not_in_relationship'
    
    char_parts = set([c.lower() for c in char.split()])
    
    # Is the character in a relationship?
    for relationship in eval(rel):
        if not '/' in relationship: continue # isn't romantic
        rel_parts = set([c.lower() for c in re.split(r'[\/ ]', relationship)])
#         set_trace()
        if len(char_parts.intersection(rel_parts)) > 0:
            match = True
            
            # Determine other character in relationship and their gender
            rel_chars = relationship.split('/')
            other_char = ''
            for rel_char in rel_chars:
                if not any([char_part in rel_char.lower() for char_part in char_parts]):
                    other_char = canonicalize(rel_char)
                    
            # Determine gender of char, other char in relationship
            char_gender = character_gender[canonicalize(char)]
            if not other_char in character_gender: # other char gender unknown
                rel_type = 'unknown'
            else:
                other_char_gender = character_gender[other_char]
            
                if char_gender != other_char_gender:
                    rel_type = 'straight'

                else:
                    rel_type = 'queer'
    
            break
    
    return (match, rel_type)

In [67]:
print(char_in_relationship('Sirius', '["Remus Lupin/Sirius Black"]'))
print(char_in_relationship('Hermione', '["Hermione Granger/Draco Malfoy", "Fleur Delacour/Bill Weasley", "Arthur Weasley/Molly Weasley", "Lucius Malfoy/Narcissa Black Malfoy", "Katie Bell/Marcus Flint"]'))
print(char_in_relationship('Snape', '["Severus Snape/Original Female Character"]'))

(True, 'queer')
(True, 'straight')
(True, 'unknown')


In [61]:
merged['character_in_relationship'], merged['character_in_relationship_type'] = list(zip(*[char_in_relationship(tup[0], tup[1]) for tup in zip(merged['character_original'], merged['relationship'])]))
print(merged['character_in_relationship'].sum())
merged['character_in_relationship_type'].value_counts()

93077


character_not_in_relationship    186089
queer                            56580 
straight                         32448 
unknown                          4049  
Name: character_in_relationship_type, dtype: int64

In [57]:
merged.drop(columns=['char_in_relationship'], inplace=True)
merged.columns

Index(['fic_id', 'character_original', 'character_canonical',
       'character_gender', 'character_features', 'title', 'author',
       'author_key', 'rating', 'category', 'fandom', 'relationship',
       'character', 'additional tags', 'language', 'published', 'status',
       'status date', 'words', 'comments', 'kudos', 'bookmarks', 'hits',
       'chapter_count', 'series', 'seriespart', 'seriesid', 'summary',
       'preface_notes', 'afterword_notes', 'character_in_relationship',
       'character_in_relationship_type'],
      dtype='object')

In [56]:
# Check char in relationship labeling
pd.set_option('display.max_colwidth', -1)
merged[merged['character_in_relationship']==False].sample(10).loc[:, ['fic_id', 'character_original', 'relationship']]

Unnamed: 0,fic_id,character_original,relationship
171347,8209547,Harry,"[""Teddy Lupin/Victoire Weasley"", ""Teddy Lupin & Victoire Weasley""]"
218808,12084768,Harry,[]
60664,712827,Hermione,[]
41445,12962706,Harry,"[""Hermione Granger/Ginny Weasley""]"
99129,635844,Neville,"[""Harry Potter/Severus Snape""]"
36039,3374774,Hermione,"[""Draco Malfoy/Harry Potter""]"
113080,3330665,Albus,[]
52758,5295572,Minerva,"[""Sirius Black/Remus Lupin"", ""Draco Malfoy/Harry Potter""]"
102685,233961,Alicia,"[""Katie Bell/Marcus Flint""]"
83053,1525373,Diggory,"[""Roger Davies/Fleur Delacour""]"


In [55]:
# Check char in relationship labeling
pd.set_option('display.max_colwidth', -1)
merged[merged['character_in_relationship']==True].sample(10).loc[:, ['fic_id', 'character_original', 'relationship']]

Unnamed: 0,fic_id,character_original,relationship
149409,1651082,Harry,"[""Harry Potter/Severus Snape""]"
169998,14869499,Tonks,"[""Remus Lupin/Nymphadora Tonks""]"
200983,6267799,Ron,"[""Harry Potter/Ron Weasley""]"
58186,2595152,Malfoy,"[""Draco Malfoy/Harry Potter""]"
105711,248997,Sybill,"[""Severus Snape/Sybill Trelawney""]"
139083,14923379,Bill,"[""Draco Malfoy/Harry Potter"", ""Luna Lovegood/Ginny Weasley"", ""Hermione Granger/Ron Weasley"", ""Angelina Johnson/George Weasley"", ""Fleur Delacour/Bill Weasley"", ""Arthur Weasley/Molly Weasley""]"
209325,3325496,Harry,"[""Hermione Granger & Harry Potter & Ron Weasley"", ""Hermione Granger & Harry Potter"", ""Hermione Granger & Ron Weasley"", ""Harry Potter & Ron Weasley"", ""Fred Weasley & George Weasley & Lee Jordan"", ""lightly implied Harry Potter/Draco Malfoy flirtatious rivalry"", ""weasley family - Relationship"", ""General Friendship""]"
79070,573057,Hermione,"[""Hermione Granger/Draco Malfoy""]"
116194,891078,Remus,"[""Remus Lupin/Severus Snape""]"
239538,374732,Severus,"[""Remus Lupin/Severus Snape""]"


In [69]:
# Check relationship type labeling
from IPython.display import display

for rel_type in ['unknown', 'queer', 'straight']:
    display(merged[merged['character_in_relationship_type'] == rel_type].sample(10).loc[:, ['character_in_relationship_type', 'fic_id', 'character_original', 'relationship', 'category']])

Unnamed: 0,character_in_relationship_type,fic_id,character_original,relationship,category
217559,unknown,4001998,Draco,"[""Draco Malfoy/Alex Russo""]","[""F/M""]"
142184,unknown,16201475,Potter,"[""James Potter/Lily Evans Potter""]","[""F/M""]"
160804,unknown,875827,Harry,"[""Harry Potter/Half-Blood Prince""]","[""M/M""]"
168816,unknown,14263503,Druella,"[""Druella Rosier Black & Narcissa Black Malfoy"", ""Cygnus Black/Druella Rosier Black"", ""Lucius Malfoy/Narcissa Black Malfoy"", ""Narcissa Black Malfoy & Andromeda Black Tonks""]","[""Gen""]"
139436,unknown,7866745,Draco,"[""Draco Malfoy/Other(s)"", ""Voldemort/Other(s)""]",[]
45865,unknown,14615643,Charlie,"[""Charlie Weasley/Original Female Character(s)""]","[""F/M""]"
196113,unknown,422526,Harry,"[""Harry/Teddy - Relationship""]","[""M/M""]"
14214,unknown,653867,Bill,"[""Nymphadora Tonks/Charlie Weasley"", ""Bill Weasley/Original Female Character""]","[""F/M""]"
197042,unknown,2590781,Harry,"[""Spock/Harry Potter"", ""James T. Kirk/Leonard McCoy""]","[""M/M""]"
230475,unknown,4351493,Severus,"[""Severus Snape/OFC""]","[""F/M""]"


Unnamed: 0,character_in_relationship_type,fic_id,character_original,relationship,category
80194,queer,580094,Sirius,"[""Sirius Black/Remus Lupin""]","[""M/M""]"
40388,queer,622356,Draco,"[""Draco Malfoy/Harry Potter""]","[""M/M""]"
181895,queer,10052486,Severus,"[""Harry Potter/Severus Snape""]","[""M/M"", ""Gen""]"
40676,queer,55315,Draco,"[""Harry Potter/Draco Malfoy""]","[""M/M""]"
94827,queer,10060838,Tom,"[""Fred Weasley/George Weasley/Other(s)"", ""Harry Potter/Tom Riddle"", ""Hermione Granger/Ron Weasley""]","[""M/M"", ""Multi"", ""F/M""]"
95008,queer,10190321,Harry,"[""Draco Malfoy/Harry Potter"", ""Lucius Malfoy/Harry Potter""]","[""M/M""]"
95305,queer,81277,Draco,"[""Harry/Draco""]","[""M/M""]"
277426,queer,14342685,Harry,"[""Draco Malfoy/Harry Potter""]","[""M/M""]"
3671,queer,292770,Hagrid,"[""Rubeus Hagrid/Severus Snape""]","[""M/M""]"
218241,queer,14221179,Bill,"[""Original Percival Graves/Newt Scamander"", ""Rolf Scamander/Bill Weasley"", ""Tina Goldstein/Newt Scamander""]","[""Gen"", ""M/M"", ""F/M""]"


Unnamed: 0,character_in_relationship_type,fic_id,character_original,relationship,category
197683,straight,10764942,Harry,"[""Harry Potter/Nymphadora Tonks""]","[""F/M""]"
24843,straight,8725459,James,"[""James Potter/Lily Evans"", ""James Potter/Lily Evans Potter""]","[""F/M""]"
206290,straight,6333055,Harry,"[""Hermione Granger/Harry Potter""]","[""F/M""]"
68748,straight,6059362,Fleur,"[""Fleur Delacour/Bill Weasley"", ""Hermione Granger/Ron Weasley""]","[""F/M"", ""Gen""]"
114085,straight,9780308,Hermione,"[""Hermione Granger/Ron Weasley"", ""Ron Weasley & Hugo Weasley""]","[""Gen""]"
169491,straight,8387506,Hermione,"[""Draco Malfoy/Harry Potter"", ""Hermione Granger/Ron Weasley"", ""Pansy Parkinson/Blaise Zabini""]","[""F/M"", ""M/M""]"
171642,straight,15831033,Draco,"[""Hermione Granger/Draco Malfoy""]","[""F/M""]"
30434,straight,15759117,Ginny,"[""Harry Potter/Ginny Weasley""]","[""F/M""]"
56260,straight,703770,Harry,"[""Harry Potter/Ginny Weasley"", ""Harry Potter/Charlie Weasley"", ""Hermione Granger/Harry Potter""]","[""F/M"", ""M/M""]"
58718,straight,1748846,Astoria,"[""Harry/Draco"", ""Astoria Greengrass/Draco Malfoy""]","[""M/M""]"


In [71]:
# Save out annotated data
outpath = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/character_relationship_features.csv'
merged.to_csv(outpath, index=False)

## Create spreadsheet with character features, gender, etc

In [20]:
# Load character gender dictionary
import json

with open('/data/fanfiction_gender_roles/harrypotter_ao3/hp_character_genders.json', 'r') as f:
    character_gender = json.load(f)

# Load canonical character names to remove from character features
char_list_fpath = '/projects/fanfiction_gender_roles/scripts/harrypotter_characters.txt'
with open(char_list_fpath) as f:
    canonical_characters = f.read().splitlines()
    
extra = ['Dobby']
remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
         ]
canonical_characters += extra
canonical_characters = [c for c in canonical_characters if not c in remove]

print(len(canonical_characters))

canonical_character_name_parts = set([part for name in canonical_characters for part in name.split()])
exclude = set(['the'])
canonical_character_name_parts -= exclude
list(canonical_character_name_parts)[:10]

remove = ['Hogwarts', 'Slytherin', 'Gryffindor', 'Order', 'The',
         'The Slytherin',
          'The Headmaster',
          'Weasley', # ambiguous
          'Weasleys',
          'Hufflepuff',
         ]

name_transform = { # Might then miss when stories use these terms, though (same with canonicalize): might want to spread gender info afterward
    'Potter': 'Harry Potter',
    'Mr. Potter': 'Harry Potter',
    'Mr Potter': 'Harry Potter',
    'The Harry': 'Harry Potter',
    'Mr Weasley': 'Ron Weasley',
    'Tom': 'Tom Riddle',
    'Black': 'Sirius Black',
    'The Draco': 'Draco Malfoy',
    'The Dark Lord': 'Voldemort',
    'The Dark Lord Voldemort': 'Voldemort',
    'Dark Lord': 'Voldemort',
    'Lord': 'Voldemort',
    'Malfoy': 'Draco Malfoy',
    'James': 'James Potter I',
    'James Potter': 'James Potter I',
    'Lily': 'Lily J. Potter',
    'Albus': 'Albus Dumbledore',
    'Teddy': 'Teddy Lupin',
    'Newt': 'Newton Scamander',
    'Rose': 'Rose Granger-Weasley',
    'Lily Potter': 'Lily J. Potter',
    'Regulus': 'Regulus Black I',
    'Mrs Weasley': 'Molly Weasley',
}
    

def canonicalize(name):
    
    name_parts = name.split()
    new_name_parts = []
    
    for name_part in name_parts:
        if name_part in canonical_character_name_parts:
            new_name_parts.append(name_part)
            
    new_name = ' '.join(new_name_parts)
    
    if new_name in remove:
        new_name = ''
        
    if new_name in name_transform:
        new_name = name_transform[new_name]
            
    return new_name

245


In [23]:
# Load character features, merge with gender, etc
import os
import json
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from IPython.core.debugger import set_trace

dataset_name = 'complete_en_1k-50k'
output_path = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/char_features'

header = ['fic_id', 'character_original', 'character_canonical', 'character_gender', 'character_features']
# outlines = [header]
outlines = []

name_parts_lower = [c.lower() for c in canonical_character_name_parts]

for fname in tqdm(os.listdir(output_path)):
    fic_id = fname.split('.')[0]
    
    with open(os.path.join(output_path, fname)) as f:
        char_features = json.load(f)
              
    for char in char_features:
        canonical_char = canonicalize(char)
        if not canonical_char in character_gender: continue
        gender = character_gender[canonical_char]
        
        feats = ' '.join([feat for feat in char_features[char] if not feat in name_parts_lower])
        if len(feats) < 1: continue
        outlines.append([fic_id, char, canonical_char, gender, feats])
            
out_df = pd.DataFrame(outlines[1:], columns=header)
out_df

HBox(children=(IntProgress(value=0, max=71943), HTML(value='')))

Unnamed: 0,fic_id,character_original,character_canonical,character_gender,character_features
0,5888698,Remus,Remus,male,carries cherished carried carried was been wan...
1,5888698,Padfoot,Padfoot,male,by
2,10199,Draco,Draco,male,responded hinted is walked knew question sat c...
3,5355224,Hermione,Hermione,female,was felt growled let doing read gotten going r...
4,5355224,Draco,Draco,male,find like pouted gripped smirked joked drawled...
5,5355224,Ginny,Ginny,female,said snapped said said snickered getting think...
6,5355224,Luna,Luna,female,'s said seemed imagine said said smiled asked ...
7,5355224,Salazar,Salazar,male[5],damned
8,10838292,Remus,Remus,male,considered thought asked doing pulled going op...
9,10838292,Marlene,Marlene,male[5],liked tell trying mean mind mind touch said kn...


In [24]:
outpath = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/output/character_gender_features.csv'
out_df.to_csv(outpath, index=False)

In [25]:
# Print #stories

len(out_df['fic_id'].unique())

66061

# Turn processed txt files into csv

In [9]:
# Copy existing csv files over
for fname in tqdm(os.listdir(coref_output_dirpath)):
    
    if not fname.endswith('.txt'):
        outpath = os.path.join(os.path.dirname(coref_output_dirpath), 'char_coref_stories_csv', fname)
        shutil.copy(os.path.join(coref_output_dirpath, fname), outpath)

HBox(children=(IntProgress(value=0, max=23780), HTML(value='')))

In [None]:
import os
from tqdm import tqdm_notebook as tqdm
import csv
import sys
import shutil
from IPython.core.debugger import set_trace

csv.field_size_limit(sys.maxsize)
coref_output_dirpath = '/data/fanfiction_gender_roles/harrypotter_ao3/complete_en_1k-50k/output/char_coref_stories'
original_dirpath = '/data/fanfiction_gender_roles/harrypotter_ao3/complete_en_1k-50k/fics'

for fname in tqdm(os.listdir(coref_output_dirpath)):
    
    if not fname.endswith('.txt'):
        continue
        
    fic_id = fname[:-10]
        
    # Read in header from original CSV
    with open(os.path.join(original_dirpath, f'{fic_id}.csv')) as fin:
        reader = csv.reader(fin)
        header = next(reader)
        
        # Read in coreffed txt
        with open(os.path.join(coref_output_dirpath, fname)) as f:
            lines = f.readlines()

        # Output
        with open(os.path.join(coref_output_dirpath, f'{fic_id}.coref.csv'), 'w') as fout:
            writer = csv.writer(fout)
            writer.writerow(header)

            # Write content
            for row,text in zip(reader, lines):
                if not len(row) - 1 > 0:
                    continue
                row[len(row)-1]= text # fill in text in final position
                writer.writerow(row)

HBox(children=(IntProgress(value=0, max=94861), HTML(value='')))

In [None]:
# Original
f = open(output_dir+"/"+filename[:-4]+".coref.txt")
fin = open(test_csv_dir+filename)
reader = csv.reader(fin)                                                 │
 50             header = next(reader)                                                    │  1  [||       8.9%]   9  [||||    23.1%]   17 [|||||||100.0%]   25 [|||||   32.7%]
 51             #fout = open(output_dir+"/"+filename[:-4]+".coref.csv","w", encoding='cp1│  2  [|||     13.0%]   10 [||||    25.6%]   18 [|||||||100.0%]   26 [|||     12.9%]
    250')                                                                                │  3  [||||||||57.1%]   11 [||||||| 45.3%]   19 [|||||   30.7%]   27 [||||    21.1%]
 52             fout = open(output_dir+"/"+filename[:-4]+".coref.csv","w", encoding='utf8│  4  [|||     10.5%]   12 [|||||||100.0%]   20 [|||||||100.0%]   28 [|||     14.0%]
    ')                                                                                   │  5  [||||||||97.5%]   13 [|||||||100.0%]   21 [|||     12.3%]   29 [||       7.5%]
 53             writer = csv.writer(fout)                                                │  6  [||||||||96.9%]   14 [|||||||100.0%]   22 [|||     11.7%]   30 [|||      9.2%]
 54             lines = f.readlines()                                                    │  7  [||||||||87.7%]   15 [|||||||100.0%]   23 [|||     12.9%]   31 [|||     14.8%]
 55             writer.writerow(header)                                                  │  8  [|||||||100.0%]   16 [|||     14.2%]   24 [|||      9.3%]   32 [|||||||100.0%]
 56                                                                                      │  Mem[|||||||||||||||||||||||||193G/252G]   Tasks: 236, 1474 thr; 39 running
 57             for row,text in zip(reader, lines):                                      │  Swp[|||||||||||||||||||||||||128G/128G]   Load average: 20.66 22.19 22.91 
 58             #row["paragraph"] = text                                                 │                                            Uptime: 32 days, 23:14:02
 59             #print (row["paragraph"])                                                │
 60             #print (text)                                                            │  PID USER      PRI  NI  VIRT   RES   SHR S CPU% MEM%   TIME+  Command                   
 61                 row[len(row)-1]= text                                                │31768 mamille2   20   0  9.5G 5801M 16612 S 355.  2.2 16h16:59 java -Xmx5g -cp stanford-c
 62                 writer.writerow(row)                                                 │20318 mamille2   20   0 9651M 5735M 16656 S 135.  2.2 13h32:48 java -Xmx5g -cp stanford-c
 63             #print (row['paragraph'])                                                │32698 xinruyan   20   0 9643M 3483M 16600 S 123.  1.4  2:46.96 java -cp /usr0/home/xinruy
 64                                                                                      │18935 mamille2   20   0 9715M 5795M  9660 S 119.  2.2 76h10:31 java -Xmx5g -cp stanford-c
 65             fin.close()                                                              │32760 xinruyan   20   0 9643M 3339M 16868 R 104.  1.3  0:27.91 java -cp /usr0/home/xinruy
 66             f.close()                                                                │32759 xinruyan   20   0 9643M 3339M 16868 S 103.  1.3  2:07.80 java -cp /usr0/home/xinruy
 67             fout.close()          

# Split data into smaller directories

In [6]:
import os
from collections import defaultdict
import numpy as np

dataset_name = 'complete_en_1k-50k'
# fic_dirpath = f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/fics'
fic_dirpath = f'/projects/fanfiction-nlp/{dataset_name}_fic_text'

fnames = os.listdir(fic_dirpath)

# Mock split
split100 = defaultdict(list)
for fname in fnames:
    split100[fname[:2]].append(fname)
    
# Figures on splits
split100_lengths = [len(subdir) for name, subdir in split100.items()]
print(sorted(split100_lengths))
print(np.mean(split100_lengths))
print(len(split100))

[304, 378, 395, 401, 408, 421, 443, 445, 449, 457, 459, 459, 467, 470, 473, 477, 482, 484, 488, 488, 490, 491, 497, 499, 504, 508, 509, 511, 518, 521, 526, 528, 535, 538, 541, 541, 553, 562, 566, 570, 572, 576, 584, 585, 586, 588, 589, 590, 591, 591, 592, 592, 605, 608, 608, 611, 619, 624, 628, 629, 630, 630, 639, 640, 644, 650, 654, 655, 658, 674, 675, 677, 680, 699, 699, 701, 710, 728, 756, 840, 883, 1100, 1433, 2559, 4031, 4240, 4643, 5603, 5972, 8723]
935.0
90


In [7]:
# Make split
from tqdm import tqdm_notebook as tqdm
import shutil

fic_split_dirpath = fic_dirpath+'_split'
os.mkdir(fic_split_dirpath)

for split, split_fnames in tqdm(split100.items()):
    split_dirpath = os.path.join(fic_split_dirpath, split)
    os.mkdir(split_dirpath)
    for fname in split_fnames:
        shutil.copy(os.path.join(fic_dirpath, fname), os.path.join(split_dirpath, fname))

HBox(children=(IntProgress(value=0, max=90), HTML(value='')))




# Sample HP data

In [1]:
# Load HP metadata
import pandas as pd

metadata_fpath = '/data/fanfiction_gender_roles/harrypotter_ao3/story_metadata.csv'

metadata = pd.read_csv(metadata_fpath)
metadata.columns

Index(['fic_id', 'title', 'author', 'author_key', 'rating', 'category',
       'fandom', 'relationship', 'character', 'additional tags', 'language',
       'published', 'status', 'status date', 'words', 'comments', 'kudos',
       'bookmarks', 'hits', 'chapter_count', 'series', 'seriespart',
       'seriesid', 'summary', 'preface_notes', 'afterword_notes'],
      dtype='object')

In [3]:
# Filter to complete fics in English between 1k-50k words

filtered_metadata = metadata[(metadata['status']=='Completed') & \
                            (metadata['language'] == 'English') & \
                            (metadata['words']>=1000) & \
                            (metadata['words']<=50000)]

print(len(metadata))
print(len(filtered_metadata))

179407
84150


In [8]:
# Load HP files
import os

# fic_dirpath = '/usr2/mamille2/fanfiction-project/data/ao3/harrypotter/fics_paras/'
fic_dirpath = '/data/fanfiction_gender_roles/harrypotter_ao3/preprocessed_paras/

fic_ids = []
for fname in os.listdir(fic_dirpath):
    if not fname.endswith('_tokenized_paras.txt'):
        fic_ids.append(int(fname[:-4]))
        
len(fic_ids)

179407

In [4]:
selected_fic_ids = filtered_metadata['fic_id'].to_list()
len(selected_fic_ids)

84150

In [None]:
# Copy fics over 
import shutil
from tqdm import tqdm_notebook as tqdm

for fic_id in tqdm(selected_fic_ids):
    shutil.copy(os.path.join(fic_dirpath, f'{fic_id}.txt'), 
                os.path.join(f'/data/fanfiction_gender_roles/harrypotter_ao3/selected/{fic_id}.txt')
               )

HBox(children=(IntProgress(value=0, max=86303), HTML(value='')))

In [5]:
# Construct dict of fics -> chapter names
import os
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

fic_dirpath = '/usr2/scratch/fanfic/ao3_harrypotter_text/stories/'
chap_names = os.listdir(fic_dirpath)
fic2chapter = defaultdict(list)

for chapter in tqdm(chap_names):
    fic,_ = chapter.split('_')
    fic2chapter[fic].append(chapter)
    
len(fic2chapter)

179407

In [7]:
# Copy raw fic CSVs over
import shutil
from tqdm import tqdm_notebook as tqdm

problem_chars = [
    "\u2028",
    '\u0092',
    '\u0093',
    '\u0094',
]

dataset_name = 'complete_en_1k-50k'

for fic_id in tqdm(selected_fic_ids):
    # Combine chapters into fics
    fic_chapters = []

    for chapter in fic2chapter[str(fic_id)]:
        with open(os.path.join(fic_dirpath, chapter)) as f:
            data = f.read()
            for char in problem_chars:
                data = data.replace(char, ' ')
                    
#             fic_chapters.append('\n'.join(f.read().replace(, ' ').splitlines()[1:])) # Add all but first header line
            fic_chapters.append('\n'.join(data.splitlines()[1:])) # Add all but first header line
        
    with open(os.path.join(f'/data/fanfiction_gender_roles/harrypotter_ao3/{dataset_name}/fics/{fic_id}.csv'), 'w') as f:
        f.write('fic_id,chapter_id,para_id,text\n')
        f.write('\n'.join(fic_chapters))

HBox(children=(IntProgress(value=0, max=84150), HTML(value='')))


