# Create BookNLP tokens file for quote attribution
Has gold tokenization and quote extraction with a broader set of quote characters

In [34]:
import pdb

def in_quotation(tokens):
    
    start_quote_chars = ['``', '"', '«']
    end_quote_chars = ["''", '"', '»']
    
    result = []
    state = 'O' # Beginning, inside, end, outside
    transform_state = {
        'B': 'B',
        'I': 'I',
        'E': 'I',
        'O': 'O'
    }
    
    for tok in tokens:
        if state == 'O' or state == 'E' and tok in start_quote_chars:
            state = 'B'
            
        elif state == 'B' and not tok in end_quote_chars:
            state = 'I'
            
        elif state == 'B' or state == 'I' and tok in end_quote_chars:
            state = 'E'
            
        elif state == 'E' and not tok in start_quote_chars:
            state = 'O'
            
            
        result.append(transform_state[state])
            
    return result

In [22]:
import re

def verify_bio(sequence):
    
    seq_str = ''.join(sequence)
    assert not re.search(r'[EO]I', seq_str)
    assert not re.search(r'[O]E', seq_str)

In [35]:
# Load gold tokenization

import os
import pandas as pd

dataset_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev'
csv_dirpath = os.path.join(dataset_dirpath, 'fics')
fnames = os.listdir(csv_dirpath)

for fname in fnames[:1]:
#     outlines = []
    header = [
        'paragraph_id', 
        'sentence_id', 
        'token_id', 
        'begin_offset', 
        'end_offset', 
        'whitespace_after', 
        'head_token_id', 
        'original_word', 
        'normalized_word', 
        'lemma', 
        'pos', 
        'ner', 
        'deprel', 
        'in_quotation', 
        'character_id', 
        'supersense'
    ]
    
    data = pd.read_csv(os.path.join(csv_dirpath, fname)).drop(columns=['text'])
#     tokens = data['text_tokenized'].map(lambda x: x.split())
    s = data['text_tokenized'].str.split().apply(pd.Series, 1).stack()
    s.index = s.index.droplevel(-1)
    s.name = 'original_word'
    del data['text_tokenized']
    token_data = data.join(s)
    
    # Add quotation column
    token_data['in_quotation'] = in_quotation(token_data['original_word'])
    verify_bio(token_data['in_quotation'].tolist())
    
    # Add NaN columns
    token_data = token_data.reindex(columns=header)

In [36]:
token_data.columns

Index(['paragraph_id', 'sentence_id', 'token_id', 'begin_offset', 'end_offset',
       'whitespace_after', 'head_token_id', 'original_word', 'normalized_word',
       'lemma', 'pos', 'ner', 'deprel', 'in_quotation', 'character_id',
       'supersense'],
      dtype='object')

In [37]:
print(len(data))
print(len(token_data))

52
2791


In [39]:
token_data.loc[:, ['original_word', 'in_quotation']]

Unnamed: 0,original_word,in_quotation
0,Two,B
0,weeks,I
0,.,I
1,Two,I
1,weeks,I
1,of,I
1,chasing,I
1,a,I
1,HYDRA,I
1,weapons,I


# Check quote attribution output

In [3]:
import os
import json

predicted_quotes_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/pipeline_output/quote_attribution/'
predicted_quotes = {}

for fname in os.listdir(predicted_quotes_dirpath):
    print(fname)
    fic_id = int(fname.split('.')[0].split('_')[-1])
    
    with open(os.path.join(predicted_quotes_dirpath, fname)) as f:
        predicted_quotes[fic_id] = json.load(f)
        
predicted_quotes

teenwolf_1145590.quote.json
harrypotter_459070.quote.json
sherlock_12828381.quote.json
tolkien_5581141.quote.json
allmarvel_1621415.quote.json


{1145590: [],
 459070: [{'speaker': 'uncle_harry',
   'quotes': [{'start': 4541, 'end': 4545, 'quote': "`` m sorry , ''"}],
   'paragraph': 93,
   'type': 'Explicit',
   'start': 4539,
   'end': 4555,
   'replyto': -1}],
 12828381: [{'speaker': 'sherlock_glanced_up_then',
   'quotes': [{'start': 277,
     'end': 283,
     'quote': "`` You have gorgeous eyes . ''"}],
   'paragraph': 3,
   'type': 'Explicit',
   'start': 277,
   'end': 292,
   'replyto': -1},
  {'speaker': 'sherlock_glanced_up_then',
   'quotes': [{'start': 318,
     'end': 332,
     'quote': "`` ccc_Sherlock_glanced_up_then_ccc do n't look like the type to frequent places like this . ''"}],
   'paragraph': 6,
   'type': 'Implicit',
   'start': 318,
   'end': 333,
   'replyto': -1},
  {'speaker': 'sherlock_glanced_up_then',
   'quotes': [{'start': 351,
     'end': 365,
     'quote': "`` That 's an interestin ' accent you got . Where you from ? ''"}],
   'paragraph': 8,
   'type': 'Explicit',
   'start': 351,
   'end': 38

In [8]:
for fic_id, quotes in predicted_quotes.items():
    print(f'{fic_id}: {len(quotes)}')

1145590: 0
459070: 1
12828381: 25
5581141: 24
1621415: 0


In [4]:
predicted_quotes[459070]

[{'speaker': 'uncle_harry',
  'quotes': [{'start': 4541, 'end': 4545, 'quote': "`` m sorry , ''"}],
  'paragraph': 93,
  'type': 'Explicit',
  'start': 4539,
  'end': 4555,
  'replyto': -1}]

In [6]:
gold_entities[459070]

{'Hermione Granger': {(1, 2, 30, 38)},
 'Hugo': set(),
 'Ron Weasley': set(),
 'Rose': {(1, 14, 21, 25),
  (1, 35, 1, 4),
  (1, 35, 8, 23),
  (1, 37, 1, 4),
  (1, 37, 9, 14),
  (1, 39, 1, 9),
  (1, 41, 1, 15),
  (1, 47, 6, 12),
  (1, 50, 1, 14),
  (1, 52, 1, 10),
  (1, 54, 25, 62),
  (1, 56, 1, 4),
  (1, 64, 8, 8),
  (1, 79, 1, 7),
  (1, 81, 13, 19),
  (1, 83, 1, 15),
  (1, 85, 1, 37),
  (1, 87, 1, 18),
  (1, 89, 6, 15),
  (1, 91, 1, 72),
  (1, 93, 1, 5),
  (1, 97, 1, 17),
  (1, 99, 1, 4),
  (1, 101, 1, 26),
  (1, 101, 32, 50),
  (1, 103, 1, 10),
  (1, 105, 1, 7)},
 'Dilys Derwent': set(),
 'Neville Longbotttom': set(),
 'Scorpius Malfoy': set(),
 'Albus': {(1, 34, 1, 6),
  (1, 36, 1, 27),
  (1, 38, 1, 5),
  (1, 38, 9, 32),
  (1, 40, 1, 7),
  (1, 42, 1, 16),
  (1, 46, 9, 18),
  (1, 48, 1, 6),
  (1, 48, 15, 39),
  (1, 51, 1, 17),
  (1, 53, 1, 11),
  (1, 55, 29, 43)},
 'Hera Gamp': set(),
 'Joni Harris': set(),
 'a house-elf': set(),
 'Madam Limnira': {(1, 45, 18, 41), (1, 49, 1, 4)},
 '

In [None]:
# Search for 

In [5]:
# Load ground-truth annotated quotes
import os
import pandas as pd

annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/quote_attribution/'
# annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(annotations_dirpath)):
    
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

allmarvel_1621415_quote_attribution.csv
harrypotter_459070_quote_attribution.csv
sherlock_12828381_quote_attribution.csv
teenwolf_1145590_quote_attribution.csv
tolkien_5581141_quote_attribution.csv


5

# Investigate pipeline output issues

In [47]:
def remove_character_tags(text):
    # Remove character parentheses
    modified_para = re.sub(r'\(\$_.*?\)\ ', '', text)

    # Split up character underscore mentions
    modified_para = modified_para.replace('_', ' ')
    
    return modified_para

In [53]:
import pandas as pd
import os

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 999)

# fname = 'allmarvel_1621415.coref.csv'
# fname = 'harrypotter_459070.coref.csv'
fname = 'sherlock_12828381.coref.csv'

predictions_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/pipeline_output/char_coref_stories'
pipeline_output = pd.read_csv(os.path.join(predictions_dirpath, fname))

csv_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/{fname.split(".")[0]}.csv'
fic_csv = pd.read_csv(csv_dirpath)

# Load and compare para breaks in fic and pipeline output
# pipeline_output.loc[:, ['text_tokenized']]
print(len(pipeline_output))
# pipeline_output.loc[:, ['para_id', 'text_tokenized']]
list(zip(pipeline_output['para_id'], pipeline_output['text_tokenized']))

45


[(1,
  "The deafening blare of the music through the speakers pulsed in Sherlock ($_Sherlock_glanced_up_then) 's ears , threatening to turn his ($_Sherlock_glanced_up_then) brain into mush . Why did the suspect he ($_Sherlock_glanced_up_then) was tracking have to frequent clubs like this ? Sherlock ($_Sherlock_glanced_up_then) hated clubs . Surely he ($_Sherlock_glanced_up_then) was to go mad before the person in question walked through the door . Stupid Mycroft sending Sherlock ($_Sherlock_glanced_up_then) all the way to America tracking some MI6 deserter . What did it matter if he ($_Sherlock_glanced_up_then) had stolen ` state secrets . ' Sherlock ($_Sherlock_glanced_up_then) could find the information from any one of the hundreds of homeless network he ($_Sherlock_glanced_up_then) has , and they sleep in crack dens on a nightly basis . He ($_Sherlock_glanced_up_then) supposed it was better than the alternative , going with his ($_Sherlock_glanced_up_then) parents to see a musical t

In [54]:
print(len(fic_csv))
fic_csv['text_tokenized']

45


0     The deafening blare of the music through the speakers pulsed in Sherlock ’s ears , threatening to turn his brain into mush . Why did the suspect he was tracking have to frequent clubs like this ? Sherlock hated clubs . Surely he was to go mad before the person in question walked through the door . Stupid Mycroft sending Sherlock all the way to America tracking some MI6 deserter . What did it matter if he had stolen ‘ state secrets.’ Sherlock could find the information from any one of the hundreds of homeless network he has , and they sleep in crack dens on a nightly basis . He supposed it was better than the alternative , going with his parents to see a musical they had tickets to . Sherlock shuddered at the thought . No this was much better .                                                                                                                                                                                                                                                 

In [72]:
import re

# test_text = 'the_harry'
# test_text = '_ _ _'
# test_text = 'albus_,_the'
test_text = 'whaddup _'

new_text = re.sub(r'([^ ])_([^ ])', r'\1 \2', test_text)
re.sub(r'([^ ])_([^ ])', r'\1 \2', new_text)

'whaddup _'

In [40]:
import re
import pdb

def count_pipeline_output_tokens(output_para): 
    modified_para = remove_character_tags(output_para)
    return len(modified_para.split())        

In [50]:
test_para = pipeline_output.loc[pipeline_output['para_id']==59, 'text_tokenized'].tolist()[0]
print(test_para)
print()
print(remove_character_tags(test_para))

count_pipeline_output_tokens(test_para)

It was n't fair , I sulked , skimming Rita Skeeter 's Harry Potter : Chosen or Chump ? -LRB- Some of it looked right , but the part about the tattoo ? Had to be false ... did n't it ? -RRB- Most of the rest of the school was purebloods or halfbloods , they 'd gotten to grow up with all the fun stories about Harry_Potter ($_worship_Harry_Potter) . -LRB- I flipped open at random to hear a disgruntled Ministry ex - employee claiming something about smashing time machines . Yes , Mum ($_Mum) had said she ($_Mum) 'd used a Time - Turner in her ($_Mum) third year , but that did n't seem to make sense . -RRB- By the time they went to Hogwarts , probably , that would all be old hat . But no , my parents had to '' try and give us a normal childhood '' and downplay all the exciting parts . -LRB- Okay , so maybe I did n't care about his Quidditch records either , but still . -RRB- Until now , when I was fourteen and ... 

It was n't fair , I sulked , skimming Rita Skeeter 's Harry Potter : Chosen

186

In [51]:
fic_csv[fic_csv['para_id']==59]['text_tokenized'].tolist()[0]

'It was n\'t fair , I sulked , skimming Rita Skeeter \'s Harry Potter : Chosen or Chump ? ( Some of it looked right , but the part about the tattoo ? Had to be false ... didn\'t it ? ) Most of the rest of the school was purebloods or halfbloods , they \'d gotten to grow up with all the fun stories about Harry Potter . ( I flipped open at random to hear a disgruntled Ministry ex - employee claiming something about smashing time machines . Yes , Mum had said she \'d used a Time - Turner in her third year , but that did n\'t seem to make sense . ) By the time they went to Hogwarts , probably , that would all be old hat . But no , my parents had to " try and give us a normal childhood " and downplay all the exciting parts . ( Okay , so maybe I did n\'t care about his Quidditch records either , but still . ) Until now , when I was fourteen and ...'

# Error analysis for pipeline

In [16]:
# Load entity cluster predictions
import os
import pickle

predicted_entities_dirpath = '/projects/fanfiction-nlp/tmp/predicted_entities/'
predicted_entities = {}

for fname in os.listdir(predicted_entities_dirpath):
    
    print(fname)
    fic_id = int(os.path.splitext(fname)[0].split('_')[-1])
    
    with open(os.path.join(predicted_entities_dirpath, fname), 'rb') as f:
        predicted_entities.update(pickle.load(f))
                
len(predicted_entities)

pipeline_clusters_459070.pkl
pipeline_clusters_5581141.pkl
pipeline_clusters_16369049.pkl
pipeline_clusters_2287736.pkl
pipeline_clusters_1145590.pkl
pipeline_clusters_12828381.pkl
pipeline_clusters_606106.pkl
pipeline_clusters_1621415.pkl
pipeline_clusters_4305894.pkl
pipeline_clusters_2185500.pkl
pipeline_clusters_1296961.pkl
pipeline_clusters_1813147.pkl
pipeline_clusters_6082176.pkl
pipeline_clusters_8333365.pkl
pipeline_clusters_3150806.pkl


15

In [2]:
# Load ground-truth annotated entity mentions
import os
import pandas as pd

# annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/entity_clusters'
annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(annotations_dirpath)):
    
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

allmarvel_606106_entity_clusters.csv
dcu_16369049_entity_clusters.csv
dragonage_4305894_entity_clusters.csv
drwho_8333365_entity_clusters.csv
harrypotter_2287736_entity_clusters.csv
sherlock_1296961_entity_clusters.csv
starwars_6082176_entity_clusters.csv
supernatural_1813147_entity_clusters.csv
teenwolf_3150806_entity_clusters.csv
tolkien_2185500_entity_clusters.csv


10

In [17]:
# Types of errors

false_negatives = {}
false_positives = {}
wrong_ending_mentions = {} # predicted mention doesn't have the correct ending, but does have the right start
wrong_start_mentions = {} # predicted mention doesn't have the correct start, but does have the right ending
wrong_start_ending_mentions = {} # predicted mention doesn't have the correct start or ending

for fic_id in gold_entities:
    print(fic_id)
    wrong_ending_mentions[fic_id] = set() # predicted mention doesn't have the correct ending, but does have the right start
    wrong_start_mentions[fic_id] = set() # predicted mention doesn't have the correct start, but does have the right ending
    wrong_start_ending_mentions[fic_id] = set() # predicted mention doesn't have the correct start or ending
    
    predicted_clusters = predicted_entities[fic_id]
    gold_clusters = gold_entities[fic_id]
    
    # Check if catching/not catching mentions
    predicted_mentions = set([mention for cluster in predicted_clusters for mention in predicted_clusters[cluster]])
    gold_mentions = set([mention for cluster in gold_clusters for mention in gold_clusters[cluster]])
    print(f'Predicted {len(predicted_mentions)} total mentions compared with {len(gold_mentions)} gold mentions')
    
    caught_mentions = predicted_mentions.intersection(gold_mentions)
    false_negatives[fic_id] = gold_mentions - caught_mentions
    false_positives[fic_id] = predicted_mentions - caught_mentions
    print(f'Number of false negatives: {len(false_negatives[fic_id])}')
    print(f'Number of false positives: {len(false_positives[fic_id])}')
    
    # Starts/beginnings
    for mention in predicted_mentions:
        chapter_id = mention[0]
        para_id = mention[1]
        token_start = mention[2]
        token_end = mention[3]
        
        found_partial_match = False
        
        # Search for match in gold mentions
        if mention in caught_mentions:
            continue
            
        # Search for same start, diff end
        for gold_mention in gold_mentions:
            if chapter_id == gold_mention[0] and para_id == gold_mention[1]:
                if token_start == gold_mention[2] and token_end != gold_mention[3]:
                    wrong_ending_mentions[fic_id].add((mention, gold_mention))
                    found_partial_match = True
                elif token_start != gold_mention[2] and token_end == gold_mention[3]:
                    wrong_start_mentions[fic_id].add((mention, gold_mention))
                    found_partial_match = True
                    
        if not found_partial_match:
            wrong_start_ending_mentions[fic_id].add(mention)
                        
    print(f'\tNumber of wrong starts: {len(wrong_start_mentions[fic_id])}')
    print(f'\tNumber of wrong endings: {len(wrong_ending_mentions[fic_id])}')
    print(f'\tNumber of wrong both: {len(wrong_start_ending_mentions[fic_id])}')
    print()

606106
Predicted 244 total mentions compared with 445 gold mentions
Number of false negatives: 222
Number of false positives: 21
	Number of wrong starts: 3
	Number of wrong endings: 1
	Number of wrong both: 17

16369049
Predicted 366 total mentions compared with 590 gold mentions
Number of false negatives: 280
Number of false positives: 56
	Number of wrong starts: 1
	Number of wrong endings: 2
	Number of wrong both: 54

4305894
Predicted 155 total mentions compared with 259 gold mentions
Number of false negatives: 112
Number of false positives: 8
	Number of wrong starts: 2
	Number of wrong endings: 0
	Number of wrong both: 6

8333365
Predicted 95 total mentions compared with 173 gold mentions
Number of false negatives: 84
Number of false positives: 6
	Number of wrong starts: 0
	Number of wrong endings: 0
	Number of wrong both: 6

2287736
Predicted 120 total mentions compared with 156 gold mentions
Number of false negatives: 72
Number of false positives: 36
	Number of wrong starts: 1
	N

In [5]:
# Check number of predicted character clusters

for fic_id in sorted(predicted_entities):
    print(fic_id)
    
    predicted_clusters = predicted_entities[fic_id]
    gold_clusters = gold_entities[fic_id]
    
    print(f'Number of predicted clusters: {len(predicted_clusters)}')
    print(f'Number of gold clusters: {len(gold_clusters)}')
    print()

459070
Number of predicted clusters: 22
Number of gold clusters: 39

1145590
Number of predicted clusters: 7
Number of gold clusters: 12

1621415
Number of predicted clusters: 6
Number of gold clusters: 5

5581141
Number of predicted clusters: 6
Number of gold clusters: 7

12828381
Number of predicted clusters: 4
Number of gold clusters: 9



## Specific errors

In [18]:
import random

# dev
# fic_id = 1621415 # allmarvel
# fic_id = 459070 # harrypotter
# fic_id = 1145590 # teenwolf
# fic_id = 5581141 # tolkien

# test
fic_id = 1296961

n_samples = 10

print(f'Example false negatives: {sorted(random.sample(false_negatives[fic_id], n_samples))}')
print()
print(f'Example false positives (wrong start and end): {sorted(random.sample(wrong_start_ending_mentions[fic_id], min(n_samples, len(wrong_start_ending_mentions[fic_id]))))}')

Example false negatives: [(1, 21, 1, 1), (1, 23, 2, 2), (1, 24, 32, 32), (1, 35, 7, 7), (1, 36, 10, 10), (1, 36, 37, 37), (1, 49, 10, 10), (1, 50, 58, 58), (1, 53, 19, 19), (1, 61, 20, 20)]

Example false positives (wrong start and end): [(1, 33, 44, 44), (1, 36, 20, 21), (1, 39, 59, 60)]


# Error analysis for BookNLP

In [13]:
# Load entity cluster predictions
import os
import pickle

predictions_path = '/projects/book-nlp/tmp/predicted_entities/'
predicted_entities = {}

for fname in os.listdir(predictions_path):
    with open(os.path.join(predictions_path, fname), 'rb') as f:
        predicted_entities.update(pickle.load(f))
                
len(predicted_entities)

15

In [5]:
# Load ground-truth annotated entity mentions
import os
import pandas as pd

# annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/entity_clusters'
annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(annotations_dirpath)):
    
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

allmarvel_606106_entity_clusters.csv
dcu_16369049_entity_clusters.csv
dragonage_4305894_entity_clusters.csv
drwho_8333365_entity_clusters.csv
harrypotter_2287736_entity_clusters.csv
sherlock_1296961_entity_clusters.csv
starwars_6082176_entity_clusters.csv
supernatural_1813147_entity_clusters.csv
teenwolf_3150806_entity_clusters.csv
tolkien_2185500_entity_clusters.csv


10

In [14]:
# Types of errors

false_negatives = {}
false_positives = {}
wrong_ending_mentions = {} # predicted mention doesn't have the correct ending, but does have the right start
wrong_start_mentions = {} # predicted mention doesn't have the correct start, but does have the right ending
wrong_start_ending_mentions = {} # predicted mention doesn't have the correct start or ending

for fic_id in gold_entities:
    print(fic_id)
    wrong_ending_mentions[fic_id] = set() # predicted mention doesn't have the correct ending, but does have the right start
    wrong_start_mentions[fic_id] = set() # predicted mention doesn't have the correct start, but does have the right ending
    wrong_start_ending_mentions[fic_id] = set() # predicted mention doesn't have the correct start or ending
    
    predicted_clusters = predicted_entities[fic_id]
    gold_clusters = gold_entities[fic_id]
    
    # Check if catching/not catching mentions
    predicted_mentions = set([mention for cluster in predicted_clusters for mention in predicted_clusters[cluster]])
    gold_mentions = set([mention for cluster in gold_clusters for mention in gold_clusters[cluster]])
    print(f'Predicted {len(predicted_mentions)} total mentions compared with {len(gold_mentions)} gold mentions')
    
    caught_mentions = predicted_mentions.intersection(gold_mentions)
    false_negatives[fic_id] = gold_mentions - caught_mentions
    false_positives[fic_id] = predicted_mentions - caught_mentions
    print(f'Number of false negatives: {len(false_negatives[fic_id])}')
    print(f'Number of false positives: {len(false_positives[fic_id])}')
    
    # Starts/beginnings
    for mention in predicted_mentions:
        chapter_id = mention[0]
        para_id = mention[1]
        token_start = mention[2]
        token_end = mention[3]
        
        found_partial_match = False
        
        # Search for match in gold mentions
        if mention in caught_mentions:
            continue
            
        # Search for same start, diff end
        for gold_mention in gold_mentions:
            if chapter_id == gold_mention[0] and para_id == gold_mention[1]:
                if token_start == gold_mention[2] and token_end != gold_mention[3]:
                    wrong_ending_mentions[fic_id].add((mention, gold_mention))
                    found_partial_match = True
                elif token_start != gold_mention[2] and token_end == gold_mention[3]:
                    wrong_start_mentions[fic_id].add((mention, gold_mention))
                    found_partial_match = True
                    
        if not found_partial_match:
            wrong_start_ending_mentions[fic_id].add(mention)
                        
    print(f'\tNumber of wrong starts: {len(wrong_start_mentions[fic_id])}')
    print(f'\tNumber of wrong endings: {len(wrong_ending_mentions[fic_id])}')
    print(f'\tNumber of wrong both: {len(wrong_start_ending_mentions[fic_id])}')
    print()

606106
Predicted 279 total mentions compared with 445 gold mentions
Number of false negatives: 177
Number of false positives: 11
	Number of wrong starts: 4
	Number of wrong endings: 1
	Number of wrong both: 7

16369049
Predicted 357 total mentions compared with 590 gold mentions
Number of false negatives: 253
Number of false positives: 20
	Number of wrong starts: 1
	Number of wrong endings: 2
	Number of wrong both: 18

4305894
Predicted 114 total mentions compared with 259 gold mentions
Number of false negatives: 149
Number of false positives: 4
	Number of wrong starts: 3
	Number of wrong endings: 0
	Number of wrong both: 1

8333365
Predicted 1 total mentions compared with 173 gold mentions
Number of false negatives: 173
Number of false positives: 1
	Number of wrong starts: 0
	Number of wrong endings: 0
	Number of wrong both: 1

2287736
Predicted 104 total mentions compared with 156 gold mentions
Number of false negatives: 60
Number of false positives: 8
	Number of wrong starts: 1
	Num

In [27]:
# Check number of predicted character clusters

for fic_id in predicted_entities:
    print(fic_id)
    
    predicted_clusters = predicted_entities[fic_id]
    gold_clusters = gold_entities[fic_id]
    
    print(f'Number of predicted clusters: {len(predicted_clusters)}')
    print(f'Number of gold clusters: {len(gold_clusters)}')
    print()

5581141
Number of predicted clusters: 5
Number of gold clusters: 7

1621415
Number of predicted clusters: 3
Number of gold clusters: 5

1145590
Number of predicted clusters: 6
Number of gold clusters: 12

459070
Number of predicted clusters: 10
Number of gold clusters: 39

12828381
Number of predicted clusters: 2
Number of gold clusters: 9



## Specific errors

In [15]:
import random

# dev
# fic_id = 1621415 # allmarvel
# fic_id = 459070 # harrypotter
# fic_id = 12828381 # sherlock
# fic_id = 1145590 # teenwolf
# fic_id = 5581141 # tolkien

# test
fic_id = 1296961 # sherlock
n_samples = 10

print(f'Example false negatives: {sorted(random.sample(false_negatives[fic_id], n_samples))}')
print()
print(f'Example false positives (wrong start and end): {sorted(random.sample(wrong_start_ending_mentions[fic_id], min(n_samples, len(wrong_start_ending_mentions[fic_id]))))}')

Example false negatives: [(1, 14, 2, 2), (1, 17, 36, 36), (1, 32, 7, 7), (1, 32, 39, 39), (1, 35, 20, 21), (1, 36, 37, 37), (1, 49, 18, 18), (1, 50, 33, 33), (1, 54, 13, 13), (1, 59, 10, 10)]

Example false positives (wrong start and end): [(1, 33, 44, 44)]


In [41]:
print(wrong_ending_mentions[fic_id])
print(wrong_start_mentions[fic_id])

set()
set()


# Post-process BookNLP output to get entity mention clusters
Basis for script at /projects/fanfiction-nlp/baselines/evaluate_booknlp_coref.py

In [2]:
# Load entity cluster predictions
import os
import pandas as pd
import csv
import pdb


def modify_paragraph_id(para_id):
   
    if para_id == 34: # trouble line
        new_para_id = 36
        
    if para_id >= 35:
        new_para_id = para_id
    
    else:
        new_para_id = para_id + 1
        
    return new_para_id


predictions_dirpath = '/projects/book-nlp/data/tokens/annotated_10fandom_dev/'

predicted_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

fname = 'allmarvel_1621415.txt.tokens'
booknlp_output = pd.read_csv(os.path.join(predictions_dirpath, fname), sep='\t', quoting=csv.QUOTE_NONE)

# Load CSV file of fic
csv_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/{fname.split(".")[0]}.csv'
fic_csv = pd.read_csv(csv_dirpath)
fic_csv.columns

Index(['fic_id', 'chapter_id', 'para_id', 'text', 'text_tokenized'], dtype='object')

In [3]:
# Paragraph breaks from BookNLP
pd.set_option('display.max_colwidth', -1)
booknlp_output.groupby('paragraphId').agg({'originalWord': lambda x: ' '.join(x.tolist())})

Unnamed: 0_level_0,originalWord
paragraphId,Unnamed: 1_level_1
0,Two weeks .
1,Two weeks of chasing a HYDRA weapons shipment around the world .
2,"Two weeks of no talking , no kissing ."
3,"( The meatloaf team dinner went really well , though . So there 's that . )"
4,"( Skye is n't disappointed . Initial excitement aside , she 's learned to not get her hopes up when it comes to things she wants . )"
5,"There 's a far - off storm and a sunset competing for her attention . The Bus has been parked on this airstrip for half a day now , and Skye 's been sitting here , on the lowered cargo door ramp , just enjoying not having to think about anything at all . It 's quiet ( Jemma , Fitz , and Trip left an hour ago to find dinner , some privacy maybe ) , light breeze bringing in the evening is nice ."
6,"His footsteps behind her ( deliberate on his part , she thinks ) are n't surprising , but she ca n't stop the startled gasp when something cold and wet drips onto her neck ."
7,""" Ah , sorry , "" Coulson says sheepishly , and when she looks up there 's a beer in his outstretched hand . "" I think I used to be better at this sort of thing . """
8,"A peace offering , and Skye rolls her eyes a little but takes the beer , nods at him to sit . This is what she wanted , but Skye suddenly does n't want to hear his potential rejection , would rather just slip quietly back into their working partnership ."
9,"Coulson folds down , a bit of space between them , and takes a long pull from his own beer . She takes a moment to look at him ( no jacket , shirt sleeves rolled up , tie loose ) and thinks he looks tired . He looks tired , and Skye feels horrible when she catches herself beginning to lean ever so slightly towards him , the dark rich soil and pepper of his scent curling around her senses like a blanket . God , could she be anymore unprofessional ?"


In [4]:
fic_csv.loc[:, ['para_id', 'text_tokenized']]

Unnamed: 0,para_id,text_tokenized
0,1,Two weeks .
1,2,Two weeks of chasing a HYDRA weapons shipment around the world .
2,3,"Two weeks of no talking , no kissing ."
3,4,"( The meatloaf team dinner went really well , though . So there 's that . )"
4,5,"( Skye is n't disappointed . Initial excitement aside , she 's learned to not get her hopes up when it comes to things she wants . )"
5,6,"There 's a far - off storm and a sunset competing for her attention . The Bus has been parked on this airstrip for half a day now , and Skye 's been sitting here , on the lowered cargo door ramp , just enjoying not having to think about anything at all . It 's quiet ( Jemma , Fitz , and Trip left an hour ago to find dinner , some privacy maybe ) , light breeze bringing in the evening is nice ."
6,7,"His footsteps behind her ( deliberate on his part , she thinks ) are n't surprising , but she ca n't stop the startled gasp when something cold and wet drips onto her neck ."
7,8,""" Ah , sorry , "" Coulson says sheepishly , and when she looks up there 's a beer in his outstretched hand . "" I think I used to be better at this sort of thing . """
8,9,"A peace offering , and Skye rolls her eyes a little but takes the beer , nods at him to sit . This is what she wanted , but Skye suddenly does n't want to hear his potential rejection , would rather just slip quietly back into their working partnership ."
9,10,"Coulson folds down , a bit of space between them , and takes a long pull from his own beer . She takes a moment to look at him ( no jacket , shirt sleeves rolled up , tie loose ) and thinks he looks tired . He looks tired , and Skye feels horrible when she catches herself beginning to lean ever so slightly towards him , the dark rich soil and pepper of his scent curling around her senses like a blanket . God , could she be anymore unprofessional ?"


In [2]:
    ## Check paragraph breaks
    # Compare number of paragraphs
    if max(fic_csv['para_id']) != len(booknlp_output['paragraphId'].unique()):

        # Fix paragraph break issues
        booknlp_output['modified_paragraphId'] = booknlp_output['paragraphId'].map(modify_paragraph_id)

        # Paragraph breaks from BookNLP
#         pd.set_option('display.max_colwidth', -1)
#         booknlp_output.groupby('modified_paragraphId').agg({'originalWord': lambda x: ' '.join(x.tolist())})
        
        # Confirm that fixed it
        if max(fic_csv['para_id']) != len(booknlp_output['modified_paragraphId'].unique()):
            pdb.set_trace()

    ## Make sure token IDs align
    # Get token counts for paragraphs from BookNLP, make sure they match the original fic token counts
    fic_csv['booknlp_para_length'] = booknlp_output.groupby('modified_paragraphId').size().tolist()
    fic_csv['token_count'] = fic_csv['text_tokenized'].map(lambda x: len(x.split()))
    misaligned_rows = fic_csv.loc[fic_csv['token_count'] != fic_csv['booknlp_para_length'], ['para_id', 'token_count', 'booknlp_para_length']]
    print(misaligned_rows)

    # Fix token misalignment issues
    if len(misaligned_rows > 0):
        modified_booknlp = booknlp_output.copy()

        for selected_paragraph in misaligned_rows['para_id']:

            selected_chapter = 1
            gold_tokens = fic_csv.loc[fic_csv['para_id']==selected_paragraph, 'text_tokenized'].tolist()[0].split()
            booknlp_tokens = booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_paragraph, 'originalWord'].tolist() # careful with chapters

            total_offset = 0
            trouble_offsets = {} # line_number: offset
            first_tokenId = booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_paragraph, 'tokenId'].tolist()[0]

            for i, gold_tok in enumerate(gold_tokens):
                if not gold_tok == booknlp_tokens[i + total_offset]:

                    # Try adding tokens
                    added = booknlp_tokens[i]
                    for offset in range(1, 4):
                        added += booknlp_tokens[i+offset]
                        if added == gold_tok:
                            total_offset += offset
                            trouble_offsets[first_tokenId + i] = offset
                            break

                    else:
                        print(gold_tok)
                        print(booknlp_tokens[i])
                        pdb.set_trace()

    #     print(total_offset)
    #     print(trouble_offsets)

        # Modify BookNLP output
        for line, offset in trouble_offsets.items():
            row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId'].isin(range(line, line+offset+1)))

            # Modify offset word
            new_word = ''.join(modified_booknlp.loc[row_filter, 'originalWord'].tolist())
            modified_row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId']==line)
            modified_booknlp.loc[modified_row_filter, 'originalWord'] = new_word

            # Delete offset words
            delete_row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId'].isin(range(line+1, line+offset+1)))
            delete_index = modified_booknlp.loc[delete_row_filter].index
            modified_booknlp.drop(index=delete_index, inplace=True)

        # Check token length match again
        fic_csv['booknlp_para_length'] = modified_booknlp.groupby('modified_paragraphId').size().tolist()
        fic_csv['token_count'] = fic_csv['text_tokenized'].map(lambda x: len(x.split()))
        misaligned_rows = fic_csv.loc[fic_csv['token_count'] != fic_csv['booknlp_para_length'], ['para_id', 'token_count', 'booknlp_para_length']]
        if len(misaligned_rows > 0):
            pdb.set_trace()

    ## Renumber BookNLP token IDs
    para_token_lengths = modified_booknlp.groupby('modified_paragraphId').size().tolist()
    new_tokenIds = sum([list(range(1, para_length+1)) for para_length in para_token_lengths], [])

    modified_booknlp['modified_tokenId'] = new_tokenIds

    ## Extract entity mention tuples, clusters
    selected_cols = ['modified_paragraphId', 'modified_tokenId', 'characterId', 'originalWord']
    mentions = modified_booknlp[modified_booknlp['characterId']>-1].loc[:, selected_cols]

    # Calculate end tokens for any entity mentions
    mentions['next_entity_tokenId'] = mentions['modified_tokenId'].tolist()[1:] + [0]
    mentions['next_entity_paragraphId'] = mentions['modified_paragraphId'].tolist()[1:] + [0]
    mentions['next_entity_characterId'] = mentions['characterId'].tolist()[1:] + [0]
    mentions['sequential'] = [(next_entity_tokenId == modified_tokenId + 1) and \
                              (next_entity_paragraphId == modified_paragraphId) and \
                              (next_entity_characterId == characterId) 
                            for next_entity_tokenId, modified_tokenId, next_entity_paragraphId, modified_paragraphId, next_entity_characterId, characterId in \
                              zip(mentions['next_entity_tokenId'], mentions['modified_tokenId'], mentions['next_entity_paragraphId'], \
                                  mentions['modified_paragraphId'], mentions['next_entity_characterId'], mentions['characterId'])
                                 ]

    predicted_entities = {}

    prev_was_sequential = False
    prev_token_id_start = 0

    fic_id = int(fname.split('.')[0].split('_')[1])

    for row in list(mentions.itertuples()):
        chapter_id = 1
        para_id = row.modified_paragraphId
        character_id = row.characterId
        token_id_start = row.modified_tokenId

        if row.sequential: # Store last token ID
            if prev_was_sequential: # in the middle of an entity mention
                continue
            else:
                prev_was_sequential = True
                prev_token_id_start = token_id_start
                continue

        # Save entity mention
        if not fic_id in predicted_entities:
            predicted_entities[fic_id] = {}

        if not character_id in predicted_entities[fic_id]:
            predicted_entities[fic_id][character_id] = set()

        if prev_was_sequential:
            token_id_start = prev_token_id_start

        token_id_end = row.modified_tokenId

        predicted_entities[fic_id][character_id].add((chapter_id, para_id, token_id_start, token_id_end))

        prev_was_sequential = row.sequential

    len(predicted_entities)

sherlock_12828381.txt.tokens
   para_id  token_count  booknlp_para_length
0        1          144                  146
8        9           39                   40


NameError: name 'pdb' is not defined

In [130]:
predicted_entities = {}

prev_was_sequential = False
prev_token_id_start = 0

# fic_id = int(fname.split('.')[0].split('_')[1])
fic_id = 12828381

for row in list(mentions.itertuples()):
    chapter_id = 1
    para_id = row.modified_paragraphId
    character_id = row.characterId
    token_id_start = row.modified_tokenId
    
    if row.sequential: # Store last token ID
        if prev_was_sequential: # in the middle of an entity mention
            continue
        else:
            prev_was_sequential = True
            prev_token_id_start = token_id_start
            continue

    # Save entity mention
    if not fic_id in predicted_entities:
        predicted_entities[fic_id] = {}

    if not character_id in predicted_entities[fic_id]:
        predicted_entities[fic_id][character_id] = set()

    if prev_was_sequential:
        token_id_start = prev_token_id_start
        
    token_id_end = row.modified_tokenId
        
    predicted_entities[fic_id][character_id].add((chapter_id, para_id, token_id_start, token_id_end))
    
    prev_was_sequential = row.sequential
                
len(predicted_entities)

1

## Find any chapter IDs other than 1

In [98]:
for fname in sorted(os.listdir(predictions_dirpath)):
    csv_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/{fname.split(".")[0]}.csv'
    fic_csv = pd.read_csv(csv_dirpath)
    print(f'{fname}: {max(fic_csv["chapter_id"])}')

allmarvel_1621415.txt.tokens: 1
harrypotter_459070.txt.tokens: 1
sherlock_12828381.txt.tokens: 1
teenwolf_1145590.txt.tokens: 1
tolkien_5581141.txt.tokens: 1


In [131]:
predicted_entities[12828381]

{1: {(1, 1, 12, 12),
  (1, 1, 19, 19),
  (1, 1, 38, 38),
  (1, 1, 43, 43),
  (1, 1, 61, 61),
  (1, 1, 77, 77),
  (1, 1, 83, 83),
  (1, 1, 97, 97),
  (1, 1, 111, 111),
  (1, 1, 122, 122),
  (1, 1, 133, 133),
  (1, 2, 1, 1),
  (1, 2, 16, 16),
  (1, 2, 18, 18),
  (1, 2, 26, 26),
  (1, 2, 47, 47),
  (1, 2, 54, 54),
  (1, 2, 56, 56),
  (1, 2, 66, 66),
  (1, 3, 1, 1),
  (1, 3, 12, 12),
  (1, 3, 27, 27),
  (1, 3, 43, 43),
  (1, 3, 45, 45),
  (1, 3, 50, 50),
  (1, 3, 54, 54),
  (1, 3, 60, 60),
  (1, 4, 8, 8),
  (1, 6, 1, 1),
  (1, 6, 15, 15),
  (1, 9, 17, 17),
  (1, 10, 7, 7),
  (1, 11, 21, 21),
  (1, 11, 34, 34),
  (1, 12, 9, 9),
  (1, 16, 100, 100),
  (1, 17, 1, 1),
  (1, 17, 17, 17),
  (1, 18, 11, 11),
  (1, 18, 14, 14),
  (1, 18, 21, 21),
  (1, 18, 26, 26),
  (1, 19, 10, 10),
  (1, 19, 21, 21),
  (1, 20, 15, 15),
  (1, 22, 7, 7),
  (1, 22, 11, 11),
  (1, 22, 43, 43),
  (1, 22, 55, 55),
  (1, 22, 70, 70),
  (1, 22, 74, 74),
  (1, 22, 77, 77),
  (1, 22, 80, 80),
  (1, 22, 88, 88),
  (1, 22, 

# 1-time

In [59]:
# Examine misaligned paragraphs
pd.set_option('display.max_rows', 999)
print(fic_csv.loc[fic_csv['para_id']==selected_para, 'text_tokenized'])
booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_para, ['modified_paragraphId', 'tokenId', 'originalWord']]

34    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n It had been nearly 20 minutes and the Bartender had n’t checked on him again . Though there was n’t much of a need . Sherlock ’s drink was still mostly full and he could n’t be possibly bothered to come back to chat when he had other paying customers .
Name: text_tokenized, dtype: object


Unnamed: 0,modified_paragraphId,tokenId,originalWord
1697,35,1697,_
1698,35,1698,_
1699,35,1699,_
1700,35,1700,_
1701,35,1701,_
1702,35,1702,_
1703,35,1703,_
1704,35,1704,_
1705,35,1705,_
1706,35,1706,_


# Run BookNLP multiple files at once

In [47]:
import os
from subprocess import call

input_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/fics_text_tokenized/'
booknlp_log_dirpath = 'data/output/annotated_10fandom_test/'
booknlp_output_dirpath = 'data/tokens/annotated_10fandom_test/'

# cmd = './runjava novels/BookNLP -doc /data/fanfiction_ao3/annotated_10fandom/dev/fics_text/allmarvel_1621415 -p data/output/annotated_10fandom_dev/allmarvel_1621415 -tok data/tokens/allmarvel_1621415.tokens -f'

booknlp_dirpath = '/projects/book-nlp/'
os.chdir(booknlp_dirpath)

for fname in os.listdir(input_dirpath):
    cmd = ['./runjava', 'novels/BookNLP', '-doc', 
                os.path.join(input_dirpath, fname),
               '-p', os.path.join(booknlp_log_dirpath, fname),
               '-tok', f'{os.path.join(booknlp_output_dirpath, os.path.splitext(fname)[0])}.tokens', '-f']

    call(cmd)

# Prepare text of fics for baselines

## Actual fic text

In [45]:
import os
from tqdm import tqdm_notebook as tqdm
import re

input_text_dirpath = '/projects/fanfiction-nlp/tmp/annotated_10fandom_test_text_data'
output_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/fics_text'
if not os.path.exists(output_text_dirpath):
    os.mkdir(output_text_dirpath)

for fname in tqdm(os.listdir(input_text_dirpath)):
    
    # Read
    with open(os.path.join(input_text_dirpath, fname)) as fin:
        text = fin.read()
        new_text = text.replace(' # . ', '\n\n')
        new_text = re.sub(r'\s?\-+\s?', ' - ', new_text)
        
    # Write
    with open(os.path.join(output_text_dirpath, f'{fname}.txt'), 'w') as fout:
        fout.write(new_text)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## Already tokenized

In [46]:
import os
from tqdm import tqdm_notebook as tqdm
import pandas as pd

input_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/fics'
output_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/test/fics_text_tokenized'
if not os.path.exists(output_text_dirpath):
    os.mkdir(output_text_dirpath)

for fname in tqdm(os.listdir(input_text_dirpath)):
    
    # Read
    fic = pd.read_csv(os.path.join(input_text_dirpath, fname))
    tokenized = fic['text_tokenized'].tolist()
    
    # Write
    with open(os.path.join(output_text_dirpath, f'{os.path.splitext(fname)[0]}.txt'), 'w') as fout:
        fout.write('\n\n'.join(tokenized))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




# Calculate LEA coreference evaluation

In [125]:
# Load ground-truth annotated entity mentions
import os
import pandas as pd

annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(annotations_dirpath))[2:3]:
    
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

sherlock_12828381_entity_clusters.csv


1

In [3]:
# Number of characters
for fname in gold_entities:
    print(f'{fname}: {len(gold_entities[fname])}')

459070: 39
1145590: 12
5581141: 7
1621415: 5
12828381: 9


In [8]:
# Length of fics

fic_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics/'

for fname in os.listdir(fic_dirpath):
    fic = pd.read_csv(os.path.join(fic_dirpath, fname))
    n_words = sum(fic['text_tokenized'].map(lambda x: len(x.split())))
    print(f'{fname}: {n_words}')

allmarvel_1621415.csv: 2791
harrypotter_459070.csv: 5288
tolkien_5581141.csv: 2939
sherlock_12828381.csv: 2330
teenwolf_1145590.csv: 3493


In [4]:
def extract_entity_mentions(text):
    """ Return token start and endpoints of entity mentions embedded in text. """
    
    token_count = 1
    entities = {} # cluster_name: {(token_id_start, token_id_end), ...}
    
    tokens = text.split(' ')
    for i, token in enumerate(tokens):
        if token.startswith('($_'): # entity cluster name
            if not token in entities:
                entities[token] = set()
                
            mention = tokens[i-1]
            mention_len = len(mention.split('_'))
            token_id_start = token_count - 1
            token_id_end = (token_count - 1) + (mention_len - 1)
            
            token_count += mention_len - 1 # for the underscore-connected mentions
                
            entities[token].add((token_id_start, token_id_end))
            
        else:
            # Advance token count
            token_count += 1
            
    return entities

In [10]:
# Load entity cluster predictions
import os
import pandas as pd

predictions_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/pipeline_output/char_coref_stories'

predicted_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

csv_output = [fname for fname in sorted(os.listdir(predictions_dirpath)) if fname.endswith('.csv')]

for fname in csv_output:
    
    print(fname)
    df = pd.read_csv(os.path.join(predictions_dirpath, fname))
    for row in list(df.itertuples()):
        fic_id = row.fic_id
        chapter_id = row.chapter_id
        para_id = row.para_id
        entities = extract_entity_mentions(row.text_tokenized)
#         print(entities)
#         print(row.text_tokenized)
        
        if not fic_id in predicted_entities:
            predicted_entities[fic_id] = {}
        
        for cluster_name in entities:
            if not cluster_name in predicted_entities[fic_id]:
                predicted_entities[fic_id][cluster_name] = set()
            
            for mention in entities[cluster_name]:
                token_id_start = mention[0]
                token_id_end = mention[1]
                predicted_entities[fic_id][cluster_name].add((chapter_id, para_id, token_id_start, token_id_end))
                
len(predicted_entities)

allmarvel_1621415.coref.csv
harrypotter_459070.coref.csv
sherlock_12828381.coref.csv
teenwolf_1145590.coref.csv
tolkien_5581141.coref.csv


5

In [124]:
import itertools

def links(entity_mentions):
    """ Returns all the links in an entity between mentions """
    
    if len(entity_mentions) == 1: # self-link
        links = {list(entity_mentions)[0], list(entity_mentions)[0]}

    else:
        links = set(itertools.combinations(entity_mentions, 2))
        
    return links

import numpy as np
from IPython.core.debugger import set_trace

def lea_recall(predicted_entities, gold_entities):
    
    fic_recalls = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for gold_cluster, gold_mentions in gold_entities[fic_id].items():
            gold_links = links(gold_mentions)
            
            cluster_resolution = 0
            
            for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
                predicted_links = links(predicted_mentions)
                
                cluster_resolution += len(predicted_links.intersection(gold_links))
                
            cluster_resolution = cluster_resolution/len(gold_links)
            cluster_resolutions[gold_cluster] = cluster_resolution
            cluster_sizes[gold_cluster] = len(gold_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_recalls[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in gold_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total recall as mean across fics
#     print(fic_recalls)
    total_recall = np.mean(list(fic_recalls.values()))
    return total_recall, fic_recalls

import numpy as np
from IPython.core.debugger import set_trace

def lea_precision(predicted_entities, gold_entities):
    
    fic_precisions = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
            predicted_links = links(predicted_mentions)
            
            cluster_resolution = 0
            
            for gold_cluster, gold_mentions in gold_entities[fic_id].items():
                gold_links = links(gold_mentions)
                cluster_resolution += len(predicted_links.intersection(gold_links))
            
            cluster_resolution = cluster_resolution/len(predicted_links)
            cluster_resolutions[predicted_cluster] = cluster_resolution
            cluster_sizes[predicted_cluster] = len(predicted_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_precisions[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in predicted_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total precision as mean across fics
#     print(fic_precisions)
    total_precision = np.mean(list(fic_precisions.values()))
    return total_precision, fic_precisions

In [126]:
def f_score(precision, recall):
    return 2 * (precision * recall)/(precision + recall)

In [132]:
print(gold_entities.keys())
print(predicted_entities.keys())

dict_keys([12828381])
dict_keys([12828381])


In [133]:
recall, fic_recalls = lea_recall(predicted_entities, gold_entities)
precision, fic_precisions = lea_precision(predicted_entities, gold_entities)
f1 = f_score(precision, recall)

print(f"Precision: {precision: .2%}")
print(f"Recall: {recall: .2%}")
print(f"F-score: {f1: .2%}")

Precision:  58.86%
Recall:  27.24%
F-score:  37.25%


In [134]:
len(gold_entities[12828381])

9

In [138]:
len(predicted_entities[12828381][1])

141

In [21]:
fic_precisions

{459070: 0.20664608688802236,
 1145590: 0.44628475692709196,
 5581141: 0.5345563442755866,
 1621415: 0.08021746118261987,
 12828381: 0.1519804183355585}

In [22]:
fic_recalls

{459070: 0.02770425922695492,
 1145590: 0.24624357045549267,
 5581141: 0.269897197854156,
 1621415: 0.024993739043325823,
 12828381: 0.10310111184268897}

In [79]:
# Test calculation with toy examples

import itertools

# set(itertools.combinations({(1,3), (1,4), (2,2), (3,5)}, 2))
test_gold = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3)},
                'B': {(1,1,4,4), (1,1,5,5), (1,1,6,6)}
                }}

test_predicted = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3), (1,1,6,6)},
                'B': {(1,1,4,4), (1,1,5,5)}
                }}

print(lea_recall(test_predicted, test_gold))
print(lea_precision(test_predicted, test_gold))

0.6666666666666666
0.6666666666666666


# Create personal coref annotation interface (token id subscripts)

In [19]:
def add_token_subscript(text):
    numbered_tokens = [el for el in enumerate(text.split())]
    subscripted = [f'{tok}<sub>{tok_num+1}</sub>' for tok_num, tok in numbered_tokens]
    return ' '.join(subscripted)

In [20]:
import os
import pandas as pd

pd.set_option('display.max_colwidth', -1)
annotation_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev'
csv_dirpath = os.path.join(annotation_dirpath, 'fics')
subscripts_dirpath = os.path.join(annotation_dirpath, 'subscripted')
fnames = os.listdir(csv_dirpath)

fandoms = [
    'allmarvel',
#     'harrypotter',
#     'sherlock',
#     'teenwolf',
#     'tolkien'
]

for fandom in fandoms:
    for fname in fnames:
        if fname.endswith('.csv') and fname.startswith(fandom):
            data = pd.read_csv(os.path.join(csv_dirpath, fname))
            data['annotation_text'] = data['text_tokenized'].map(add_token_subscript)
            data.loc[:, ['chapter_id', 'para_id', 'annotation_text']].to_html(os.path.join(subscripts_dirpath, f'{fname[:-4]}_subscripts.html'), escape=False, index=False)

# Load data for preliminary annotation dataset

In [2]:
import random

all_fandoms = [
#     'allmarvel',
    'supernatural',
    'harrypotter',
    'dcu',
    'sherlock',
    'teenwolf',
    'starwars',
    'drwho',
    'tolkien',
    'dragonage',
]

random.sample(all_fandoms, 4)

['teenwolf', 'harrypotter', 'sherlock', 'tolkien']

In [8]:
import os, shutil
import random

old_seeds = [9, 12, 1234, 99, 120]
current_seed = 120
random.seed(current_seed)

dataset = 'complete_en_1k-50k'
fandoms = [
#     'allmarvel',
#     'supernatural',
#     'harrypotter',
#     'dcu',
    'sherlock',
    'teenwolf',
#     'starwars',
#     'drwho',
#     'tolkien',
#     'dragonage',
]

for fandom in fandoms:

    fic_dirpath = f'/data/fanfiction_ao3/{fandom}/{dataset}/fics'
    annotation_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/'
    fnames = os.listdir(fic_dirpath)
    selected = random.sample(fnames, 1)[0]
    print(f'{fandom}: {selected}')
    shutil.copy(os.path.join(fic_dirpath, selected), os.path.join(annotation_dirpath, f'{fandom}_{selected}'))

sherlock: 12828381.csv
teenwolf: 1145590.csv
