# Calculate LEA coreference evaluation

In [3]:
# Load ground-truth annotated entity mentions
import os
import pandas as pd

annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in os.listdir(annotations_dirpath):
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

harrypotter_459070_entity_clusters.csv
teenwolf_1145590_entity_clusters.csv
tolkien_5581141_entity_clusters.csv
allmarvel_1621415_entity_clusters.csv
sherlock_12828381_entity_clusters.csv


5

In [4]:
def extract_entity_mentions(text):
    """ Return token start and endpoints of entity mentions embedded in text. """
    
    token_count = 1
    entities = {} # cluster_name: {(token_id_start, token_id_end), ...}
    
    tokens = text.split(' ')
    for i, token in enumerate(tokens):
        if token.startswith('($_'): # entity cluster name
            if not token in entities:
                entities[token] = set()
                
            mention = tokens[i-1]
            mention_len = len(mention.split('_'))
            token_id_start = token_count - 1
            token_id_end = (token_count - 1) + (mention_len - 1)
            
            token_count += mention_len - 1 # for the underscore-connected mentions
                
            entities[token].add((token_id_start, token_id_end))
            
        else:
            # Advance token count
            token_count += 1
            
    return entities

In [10]:
# Load entity cluster predictions
import os
import pandas as pd

predictions_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/pipeline_output/char_coref_stories'

predicted_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

csv_output = [fname for fname in sorted(os.listdir(predictions_dirpath)) if fname.endswith('.csv')]

for fname in csv_output:
    
    print(fname)
    df = pd.read_csv(os.path.join(predictions_dirpath, fname))
    for row in list(df.itertuples()):
        fic_id = row.fic_id
        chapter_id = row.chapter_id
        para_id = row.para_id
        entities = extract_entity_mentions(row.text_tokenized)
#         print(entities)
#         print(row.text_tokenized)
        
        if not fic_id in predicted_entities:
            predicted_entities[fic_id] = {}
        
        for cluster_name in entities:
            if not cluster_name in predicted_entities[fic_id]:
                predicted_entities[fic_id][cluster_name] = set()
            
            for mention in entities[cluster_name]:
                token_id_start = mention[0]
                token_id_end = mention[1]
                predicted_entities[fic_id][cluster_name].add((chapter_id, para_id, token_id_start, token_id_end))
                
len(predicted_entities)

allmarvel_1621415.coref.csv
harrypotter_459070.coref.csv
sherlock_12828381.coref.csv
teenwolf_1145590.coref.csv
tolkien_5581141.coref.csv


5

In [19]:
import itertools

def links(entity_mentions):
    """ Returns all the links in an entity between mentions """
    
    if len(entity_mentions) == 1: # self-link
        links = {list(entity_mentions)[0], list(entity_mentions)[0]}

    else:
        links = set(itertools.combinations(entity_mentions, 2))
        
    return links

import numpy as np
from IPython.core.debugger import set_trace

def lea_recall(predicted_entities, gold_entities):
    
    fic_recalls = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for gold_cluster, gold_mentions in gold_entities[fic_id].items():
            gold_links = links(gold_mentions)
            
            cluster_resolution = 0
            
            for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
                predicted_links = links(predicted_mentions)
                
                cluster_resolution += len(predicted_links.intersection(gold_links))
                
            cluster_resolution = cluster_resolution/len(gold_links)
            cluster_resolutions[gold_cluster] = cluster_resolution
            cluster_sizes[gold_cluster] = len(gold_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_recalls[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in gold_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total recall as mean across fics
#     print(fic_recalls)
    total_recall = np.mean(list(fic_recalls.values()))
    return total_recall, fic_recalls

import numpy as np
from IPython.core.debugger import set_trace

def lea_precision(predicted_entities, gold_entities):
    
    fic_precisions = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
            predicted_links = links(predicted_mentions)
            
            cluster_resolution = 0
            
            for gold_cluster, gold_mentions in gold_entities[fic_id].items():
                gold_links = links(gold_mentions)
                cluster_resolution += len(predicted_links.intersection(gold_links))
            
            cluster_resolution = cluster_resolution/len(predicted_links)
            cluster_resolutions[predicted_cluster] = cluster_resolution
            cluster_sizes[predicted_cluster] = len(predicted_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_precisions[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in predicted_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total precision as mean across fics
#     print(fic_precisions)
    total_precision = np.mean(list(fic_precisions.values()))
    return total_precision, fic_precisions

In [14]:
def f_score(precision, recall):
    return 2 * (precision * recall)/(precision + recall)

In [20]:
recall, fic_recalls = lea_recall(predicted_entities, gold_entities)
precision, fic_precisions = lea_precision(predicted_entities, gold_entities)
f1 = f_score(precision, recall)

print(f"Precision: {precision: .2%}")
print(f"Recall: {recall: .2%}")
print(f"F-score: {f1: .2%}")

Precision:  28.39%
Recall:  13.44%
F-score:  18.24%


In [21]:
fic_precisions

{459070: 0.20664608688802236,
 1145590: 0.44628475692709196,
 5581141: 0.5345563442755866,
 1621415: 0.08021746118261987,
 12828381: 0.1519804183355585}

In [22]:
fic_recalls

{459070: 0.02770425922695492,
 1145590: 0.24624357045549267,
 5581141: 0.269897197854156,
 1621415: 0.024993739043325823,
 12828381: 0.10310111184268897}

In [79]:
# Test calculation with toy examples

import itertools

# set(itertools.combinations({(1,3), (1,4), (2,2), (3,5)}, 2))
test_gold = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3)},
                'B': {(1,1,4,4), (1,1,5,5), (1,1,6,6)}
                }}

test_predicted = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3), (1,1,6,6)},
                'B': {(1,1,4,4), (1,1,5,5)}
                }}

print(lea_recall(test_predicted, test_gold))
print(lea_precision(test_predicted, test_gold))

0.6666666666666666
0.6666666666666666


# Create personal coref annotation interface (token id subscripts)

In [1]:
def add_token_subscript(text):
    numbered_tokens = [el for el in enumerate(text.split())]
    subscripted = [f'{tok}<sub>{tok_num+1}</sub>' for tok_num, tok in numbered_tokens]
    return ' '.join(subscripted)

In [4]:
import os
import pandas as pd

pd.set_option('display.max_colwidth', -1)
annotation_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev'
csv_dirpath = os.path.join(annotation_dirpath, 'fics')
subscripts_dirpath = os.path.join(annotation_dirpath, 'subscripted')
fnames = os.listdir(csv_dirpath)

fandoms = [
#     'allmarvel',
#     'harrypotter',
#     'sherlock',
#     'teenwolf',
    'tolkien'
]

for fandom in fandoms:
    for fname in fnames:
        if fname.endswith('.csv') and fname.startswith(fandom):
            data = pd.read_csv(os.path.join(csv_dirpath, fname))
            data['annotation_text'] = data['text_tokenized'].map(add_token_subscript)
            data.loc[:, ['chapter_id', 'para_id', 'annotation_text']].to_html(os.path.join(subscripts_dirpath, f'{fname[:-4]}_subscripts.html'), escape=False, index=False)

# Load data for preliminary annotation dataset

In [2]:
import random

all_fandoms = [
#     'allmarvel',
    'supernatural',
    'harrypotter',
    'dcu',
    'sherlock',
    'teenwolf',
    'starwars',
    'drwho',
    'tolkien',
    'dragonage',
]

random.sample(all_fandoms, 4)

['teenwolf', 'harrypotter', 'sherlock', 'tolkien']

In [8]:
import os, shutil
import random

old_seeds = [9, 12, 1234, 99, 120]
current_seed = 120
random.seed(current_seed)

dataset = 'complete_en_1k-50k'
fandoms = [
#     'allmarvel',
#     'supernatural',
#     'harrypotter',
#     'dcu',
    'sherlock',
    'teenwolf',
#     'starwars',
#     'drwho',
#     'tolkien',
#     'dragonage',
]

for fandom in fandoms:

    fic_dirpath = f'/data/fanfiction_ao3/{fandom}/{dataset}/fics'
    annotation_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/'
    fnames = os.listdir(fic_dirpath)
    selected = random.sample(fnames, 1)[0]
    print(f'{fandom}: {selected}')
    shutil.copy(os.path.join(fic_dirpath, selected), os.path.join(annotation_dirpath, f'{fandom}_{selected}'))

sherlock: 12828381.csv
teenwolf: 1145590.csv
