# Evaluate a heuristic that filters characters to just those with quotes and assertions

## Load pipeline output

In [3]:
import os

# Root of the unfiltered pipeline output; the character lists live in the
# 'char_coref_chars' subdirectory, one file per story.
output_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/pipeline_output_orig'
char_dirpath = os.path.join(output_dirpath, 'char_coref_chars')

# story id (file name up to the first '.') -> list of character names, one per line in the file
pipeline_chars = {}
for fname in os.listdir(char_dirpath):
    story = fname.split('.')[0]
    # Decode explicitly as UTF-8: character names contain non-ASCII characters
    # (curly quotes, accented letters), and open()'s default encoding depends on the locale.
    with open(os.path.join(char_dirpath, fname), encoding='utf-8') as f:
        pipeline_chars[story] = f.read().splitlines()
pipeline_chars

{'drwho_8333365': ['the_Doctor', 'no_day', 'Clara', 'Miss'],
 'dragonage_4305894': ['the_Tevinter',
  'Lavellan',
  'Sera',
  'Solas',
  '”_Dorian',
  'Leliana',
  'The_Dread_Wolf',
  'The_Well_of_Sorrows'],
 'tolkien_2185500': ['female_Fíli',
  'both_Bilbos',
  'the_Thorin',
  'Female_Thorin',
  'Male_Bilbo',
  'Male_Thorin',
  'Female_Bilbo',
  'The_female_Gandalf',
  '”_female_Kíli',
  '”_female_Bilbo'],
 'starwars_6082176': ['the_Force',
  'Good_night',
  'Tahari',
  'Jaina',
  'Mara',
  'their_infant_son_Ben',
  'Luke',
  'Booster',
  'the_Yuuzhan_Vong',
  'my_fellow_Jedis',
  'this_night',
  'the_deadly_Yuuzhan_Vong',
  'Jedi_Goshyn_Maul',
  'including_Anakin',
  'Anakin_Solo',
  'Han',
  'Master_Skywalker',
  'Young_Ben',
  '”_Leia',
  'A_Celebration'],
 'teenwolf_3150806': ['Scott_McCall', '”_Stiles', 'Dude', '”_Derek', 'Derek'],
 'sherlock_1296961': ['Sherlock',
  'her_Sarah',
  'the_night',
  'Jeanette',
  'John_Watson',
  'Lestrade',
  'Alison'],
 'supernatural_1813147': ['U

## Retrieve quotes and assertions

In [9]:
import json
import pdb

# Keep only the pipeline characters that appear both as a quote speaker and
# as a key in the assertion-extraction output of the same story.
filtered_chars = {}  # story: char_list
for story, candidate_chars in pipeline_chars.items():
    # Load quotes; each entry is read for its 'speaker' field.
    quote_path = os.path.join(output_dirpath, 'quote_attribution', f'{story}.quote.json')
    with open(quote_path, encoding='utf-8') as f:
        quotes = json.load(f)
    # A set gives constant-time membership tests and drops repeated speakers.
    # NOTE(review): assumes every 'speaker' value is hashable (a string or null) - confirm.
    quotes_chars = {entry['speaker'] for entry in quotes}

    # Load assertions; the top-level keys of the JSON object are character names.
    assertion_path = os.path.join(output_dirpath, 'assertion_extraction', f'{story}.json')
    with open(assertion_path, encoding='utf-8') as f:
        assertions = json.load(f)
    assertions_chars = assertions.keys()

    # Preserve the order of the pipeline's character list.
    filtered_chars[story] = [
        char for char in candidate_chars
        if char in quotes_chars and char in assertions_chars
    ]
filtered_chars

{'drwho_8333365': ['Clara'],
 'dragonage_4305894': ['Lavellan', 'Sera', '”_Dorian', 'Leliana'],
 'tolkien_2185500': ['female_Fíli',
  'both_Bilbos',
  'the_Thorin',
  'Female_Thorin',
  'Male_Bilbo',
  'Male_Thorin',
  'Female_Bilbo',
  'The_female_Gandalf',
  '”_female_Kíli',
  '”_female_Bilbo'],
 'starwars_6082176': ['Jaina',
  'Mara',
  'their_infant_son_Ben',
  'Luke',
  'Booster',
  'Jedi_Goshyn_Maul',
  '”_Leia',
  'A_Celebration'],
 'teenwolf_3150806': ['Scott_McCall', '”_Stiles', '”_Derek', 'Derek'],
 'sherlock_1296961': ['Sherlock', 'John_Watson'],
 'supernatural_1813147': [],
 'harrypotter_2287736': ['It_s_Harry',
  'our_Mr_Potter-',
  'Harry',
  'Cedric',
  'The_Cup',
  'The_Skrewt'],
 'allmarvel_606106': ['Clint',
  'Thor',
  'The_spirit_Bruce',
  'Jarvis',
  'Anthony_Edward_Stark',
  'Tony_Stark'],
 'dcu_16369049': ['”_Cisco',
  'Caitlin',
  'the_Star_Labs_Van',
  '”_Barry',
  'Barry_for_the_mouse',
  '”_Iris']}

In [11]:
# Write out filtered character list: one '<story>.chars' file per story,
# one character per line, in sorted order.
filtered_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/output_filtered'
filtered_char_dirpath = os.path.join(filtered_dirpath, 'char_coref_chars')
# Create the target directory first so a fresh output tree does not make open() fail.
os.makedirs(filtered_char_dirpath, exist_ok=True)
for story, chars in filtered_chars.items():
    # Write explicitly as UTF-8: character names contain non-ASCII characters.
    with open(os.path.join(filtered_char_dirpath, f'{story}.chars'), 'w', encoding='utf-8') as f:
        f.writelines(f'{char}\n' for char in sorted(chars))

# Check/compare pipeline output

In [6]:
# Check coref
import os
import pandas as pd

# Show full cell contents so the tokenized story text is not truncated.
pd.set_option('display.max_colwidth', None)

output_dirpath = '/home/mamille2/storyq/student_scifi/output/'
coref_dirpath = os.path.join(output_dirpath, 'char_coref_stories')

# Filter for CSV files *before* sampling, so that non-CSV entries at the front of
# the listing cannot leave `data` unassigned (or silently stale from another cell).
csv_fnames = [fname for fname in os.listdir(coref_dirpath) if fname.endswith('.csv')]
if not csv_fnames:
    raise FileNotFoundError(f'No .csv files found in {coref_dirpath}')

# Only one story is inspected. Taking the last of the first two CSVs mirrors the
# earlier behaviour (loop over the first two entries, keep the last one read)
# without reading a file whose contents are immediately discarded.
data = pd.read_csv(os.path.join(coref_dirpath, csv_fnames[:2][-1]))
data['text_tokenized']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [36]:
# Check quote attribution
import json

output_dirpath = '/home/mamille2/storyq/student_scifi/output/'
filtered_output_dirpath = '/home/mamille2/storyq/student_scifi/output_filtered/'

quote_dirpath = os.path.join(output_dirpath, 'quote_attribution')
filtered_quote_dirpath = os.path.join(filtered_output_dirpath, 'quote_attribution')

# Pick one story from the unfiltered output to inspect.
quote_fnames = os.listdir(quote_dirpath)[:1]
if not quote_fnames:
    raise FileNotFoundError(f'No quote attribution files found in {quote_dirpath}')
fname = quote_fnames[0]

with open(os.path.join(quote_dirpath, fname)) as f:
    data = json.load(f)

# Load the *same* story from the filtered run. os.listdir() order is arbitrary,
# so taking the first entry of each directory independently could compare two
# different stories.
# NOTE(review): assumes the filtered run writes files under the same names - confirm.
with open(os.path.join(filtered_quote_dirpath, fname)) as f:
    filtered_data = json.load(f)

# Number of attributed quotes before and after filtering the character list.
print(len(data))
print(len(filtered_data))

17
15


In [38]:
# Speaker attributed to each quote, in the order the quotes appear in `data`.
data_speakers = [
    quote['speaker']
    for quote in data
]
data_speakers

['Car_Kaysia',
 'Car_Kaysia',
 'Car_Kaysia',
 'Car_Kaysia',
 'Car_Kaysia',
 'Maby',
 'Car_Kaysia',
 'the_Bison',
 'Jayson',
 'Car_Kaysia',
 'the_Bison',
 'Bye_Kaysia',
 'Bye_Kaysia',
 'the_skunks',
 'the_skunks',
 'Bye_Kaysia',
 'Bye_Kaysia']

In [12]:
# Check assertion extraction: print the name of the first listed file and load it.
assertion_dirpath = os.path.join(output_dirpath, 'assertion_extraction')
for fname in os.listdir(assertion_dirpath)[:1]:
    print(fname)
    assertion_path = os.path.join(assertion_dirpath, fname)
    with open(assertion_path) as f:
        data = json.load(f)

# Display the loaded JSON object as the cell output.
data

story6.json


{'COYOTES': [' OH MY GOD, WHY DO YOU HAVE COYOTES AND SKUNKS IN A NET?!? WAIT WHY DO YOU HAVE COYOTES AND SKUNKS AT ALL, AND WHAT IS THAT STENCH she screams at me./ I say / "Wait you caught him?'],
 'Saturday': [' /  I said./  said Kaysia. /We call the police as an anonymous and leave. /  said Kaysia./  I say.  I ask Kaysia./  asks./  I say. /We stop walking and she looks at me. /  she says./  I say./ She nods and says,, and walk away./ I super speed home and celebrate my huge victory./']}

In [13]:
# List the top-level keys of the JSON object currently held in `data`.
data.keys()

dict_keys(['COYOTES', 'Phyco', 'Kysia', 'Saturday'])

In [14]:
# Show the value stored under the 'Kysia' key of `data`.
data['Kysia']

