# Try to figure out LEA discrepancies

In [28]:
# Toy gold clustering: two entities covering mentions 'a' through 'g'.
gold_clusters = [
    ['a', 'b', 'c'],
    ['d', 'e', 'f', 'g']
]
# Toy system clustering: 'e' is never predicted, and 'h' / 'i' occur in no
# gold cluster.
sys_clusters = [
    ['a', 'b'],
    ['c', 'd'],
    ['f', 'g', 'h', 'i']
]
# Mention -> index into gold_clusters, for the system mentions that also
# appear in a gold cluster ('h' and 'i' are therefore absent).
sys_mention_gold_cluster = {
    'a': 0,
    'b': 0,
    'c': 0,
    'd': 1,
    'f': 1,
    'g': 1,
}
# Mention -> index into sys_clusters, for the gold mentions that also appear
# in a system cluster ('e' is therefore absent).
gold_mention_sys_cluster = {
    'a': 0,
    'b': 0,
    'c': 1,
    'd': 1,
    'f': 2,
    'g': 2,
}

In [29]:
import pdb

def lea_internal(input_clusters, output_clusters, mention_to_gold):
    """Accumulate the link-based (LEA) score of ``input_clusters``.

    Every cluster contributes its size multiplied by the fraction of its
    links that are preserved on the other side.  Two mentions share a
    preserved link when both are keys of ``mention_to_gold`` and map to the
    same value.  A singleton cluster has exactly one (self) link, which is
    preserved only when its mention maps to a cluster of length one in
    ``output_clusters``.

    Returns ``(numerator, denominator)``; the denominator is the total number
    of mentions over all ``input_clusters``.
    """
    weighted_total = 0
    mention_total = 0

    for cluster in input_clusters:
        size = len(cluster)
        if size == 1:
            only = cluster[0]
            possible = 1
            resolved = 0
            if only in mention_to_gold:
                if len(output_clusters[mention_to_gold[only]]) == 1:
                    resolved = 1
        else:
            possible = size * (size - 1) / 2.0
            # Count each unordered pair once: pair a mention only with the
            # mentions that follow it in the cluster.
            resolved = sum(
                1
                for pos, first in enumerate(cluster)
                if first in mention_to_gold
                for second in cluster[pos + 1:]
                if second in mention_to_gold
                and mention_to_gold[first] == mention_to_gold[second]
            )

        weighted_total += size * resolved / float(possible)
        mention_total += size

    return weighted_total, mention_total

def lea_actual(sys_clusters, gold_clusters, sys_mention_gold_cluster, gold_mention_sys_cluster):
    """Print LEA precision and recall computed with ``lea_internal``.

    Precision scores the system clusters through the system-mention map;
    recall scores the gold clusters through the gold-mention map.  A zero
    numerator is reported as 0 without performing the division.  Nothing is
    returned.
    """
    p_num, p_den = lea_internal(sys_clusters, gold_clusters, sys_mention_gold_cluster)
    r_num, r_den = lea_internal(gold_clusters, sys_clusters, gold_mention_sys_cluster)

    if p_num == 0:
        precision = 0
    else:
        precision = p_num / p_den

    if r_num == 0:
        recall = 0
    else:
        recall = r_num / r_den

    print(f'Precision {precision}')
    print(f'Recall {recall}')

In [30]:
# Pipeline implementation
import pdb
import itertools

def links(mention_cluster):
    """Return the set of coreference links inside one cluster of mentions.

    A link is a 2-tuple of mentions.  A singleton cluster yields its single
    self-link ``(m, m)``; a larger cluster yields every pair produced by
    ``itertools.combinations(mention_cluster, 2)``; an empty cluster yields
    an empty set.  Mentions must be hashable.
    """
    if len(mention_cluster) == 1:
        only = mention_cluster[0]
        # The self-link has to be a pair like every other link.  The former
        # set literal ``{m, m}`` collapsed to the bare mention ``{m}``.
        return {(only, only)}

    pair_links = set(itertools.combinations(mention_cluster, 2))

    n_mentions = len(mention_cluster)
    # combinations() emits exactly n*(n-1)/2 pairs, so a smaller set means the
    # cluster holds repeated mentions; stop in the debugger to inspect it.
    if len(pair_links) != (n_mentions * (n_mentions - 1)) / 2:
        pdb.set_trace()
    return pair_links


def lea(clusters, ref_clusters):
    """Generic LEA calculation of ``clusters`` against ``ref_clusters``.

    Both arguments are iterated directly and every element is handed to
    ``links()`` as one cluster's list of mentions.  (A dict would be iterated
    by key, so pass the mention lists themselves.)

    For each reference cluster the resolution is the number of its links that
    also occur in some cluster of ``clusters``, divided by its number of
    links.  Returns ``(num, den)`` where ``num`` is the sum of
    size * resolution over the reference clusters and ``den`` is the total
    number of reference mentions; callers divide the two to obtain recall or
    precision.
    """
    # The links of the compared clusters do not depend on the reference
    # cluster, so build them once instead of once per reference cluster.
    candidate_links = [links(mentions) for mentions in clusters]

    num = 0
    den = 0
    for ref_mentions in ref_clusters:
        ref_links = links(ref_mentions)
        ref_size = len(ref_mentions)
        den += ref_size
        if not ref_links:
            # An empty reference cluster has no links and weight 0; skip it
            # rather than dividing by zero.
            continue
        shared = sum(len(found.intersection(ref_links)) for found in candidate_links)
        num += ref_size * (shared / len(ref_links))

    return num, den


def lea_pipeline_recall(predicted_clusters, gold_clusters):
    """Calculate LEA recall of predicted mention clusters against gold ones.

    Both arguments are passed straight to ``lea()``, with the gold clusters
    as the reference side.  Returns 0 when there are no predicted clusters or
    when the gold side contributes no mentions; otherwise returns the ratio
    of the numerator and denominator produced by ``lea()``.
    """
    if len(predicted_clusters) == 0:
        return 0

    num, den = lea(predicted_clusters, gold_clusters)
    # den is the total size of the gold clusters; with nothing to recall,
    # report 0 instead of raising ZeroDivisionError.
    if den == 0:
        return 0
    return num / den


def lea_pipeline_precision(predicted_clusters, gold_clusters):
    """Calculate LEA precision of predicted mention clusters against gold ones.

    Both arguments are passed straight to ``lea()``, with the predicted
    clusters as the reference side.  Returns 0 when there are no predicted
    clusters; otherwise the ratio of the numerator and denominator produced
    by ``lea()``.
    """
    # Guard clause: nothing predicted means nothing can be precise.
    if len(predicted_clusters) == 0:
        return 0

    numerator, denominator = lea(gold_clusters, predicted_clusters)
    return numerator / denominator

In [31]:
# Score the toy clusters with lea_actual, which prints precision and recall.
print('Actual:')
lea_actual(sys_clusters, gold_clusters, sys_mention_gold_cluster, gold_mention_sys_cluster)
print()

Actual:
Precision 0.3333333333333333
Recall 0.23809523809523808



In [32]:
# Score the same toy clusters with the link-set based pipeline functions so
# the numbers can be compared with the "Actual" output.
pipeline_r = lea_pipeline_recall(sys_clusters, gold_clusters)
pipeline_p = lea_pipeline_precision(sys_clusters, gold_clusters)
print('Pipeline:')
print(f'Precision: {pipeline_p}')
print(f'Recall: {pipeline_r}')

Pipeline:
Precision: 0.3333333333333333
Recall: 0.23809523809523808


# Load SpanBERT coref
For placement in fanfiction_nlp_evaluation/annotation.py

In [1]:
import json
import os

# Parsed JSON content of every file in dirpath, keyed by the file name with
# its last five characters removed.
clusters = {}
# dirpath = '/projects/presidio_analysis/Patient_Doctor_Conversations/COREF_RESOLUTION/fanfiction_coref/data/output_spl_lit'
dirpath = '/projects/presidio_analysis/Patient_Doctor_Conversations/COREF_RESOLUTION/fanfiction_coref/data/output_spb_lit'
for fname in sorted(os.listdir(dirpath)):
    fpath = os.path.join(dirpath, fname)
    with open(fpath) as f:
        # fname[:-5] presumably strips a '.json' extension -- confirm every
        # entry in dirpath is a .json file.
        clusters[fname[:-5]] = json.load(f)

In [2]:
clusters.keys()

dict_keys(['allmarvel_606106', 'dcu_16369049', 'dragonage_4305894', 'drwho_8333365', 'harrypotter_2287736', 'sherlock_1296961', 'starwars_6082176', 'supernatural_1813147', 'teenwolf_3150806', 'tolkien_2185500'])

In [9]:
clusters['allmarvel_606106'].keys()

dict_keys(['document', 'doc_tokens', 'subtoken_map', 'tokenized_doc', 'clusters'])

In [12]:
len(clusters['allmarvel_606106']['document'].split())

4176

In [13]:
len(clusters['allmarvel_606106']['doc_tokens'])

4176

In [14]:
clusters['allmarvel_606106']['clusters']

[[[1, 3],
  [20, 20],
  [23, 23],
  [32, 32],
  [40, 40],
  [45, 45],
  [50, 50],
  [56, 56],
  [88, 88],
  [93, 93],
  [114, 114],
  [116, 116],
  [135, 135],
  [145, 145],
  [155, 155],
  [158, 158],
  [184, 184],
  [212, 213],
  [215, 215],
  [220, 220],
  [224, 224],
  [236, 236],
  [258, 258],
  [261, 261],
  [277, 277],
  [285, 285],
  [297, 297],
  [312, 312],
  [324, 324],
  [358, 358],
  [398, 398],
  [405, 405],
  [433, 433],
  [452, 452],
  [462, 462],
  [468, 468],
  [505, 505],
  [529, 529],
  [537, 537],
  [585, 586],
  [591, 591],
  [608, 608],
  [616, 616],
  [625, 625],
  [641, 641],
  [648, 648],
  [669, 669],
  [690, 690],
  [697, 697],
  [716, 716],
  [718, 718],
  [732, 732],
  [759, 759],
  [762, 762],
  [775, 775],
  [788, 788],
  [793, 793],
  [813, 813],
  [840, 840],
  [890, 890],
  [916, 916],
  [922, 922],
  [938, 938],
  [953, 953],
  [969, 969],
  [984, 984],
  [1006, 1006],
  [1056, 1056],
  [1062, 1062],
  [1103, 1103],
  [1120, 1120],
  [1135, 1135],
  

In [7]:
clusters['allmarvel_606106']['clusters']

[[[1, 3],
  [20, 20],
  [23, 23],
  [32, 32],
  [40, 40],
  [45, 45],
  [50, 50],
  [56, 56],
  [88, 88],
  [93, 93],
  [114, 114],
  [116, 116],
  [135, 135],
  [145, 145],
  [155, 155],
  [158, 158],
  [184, 184],
  [212, 213],
  [215, 215],
  [220, 220],
  [224, 224],
  [236, 236],
  [258, 258],
  [261, 261],
  [277, 277],
  [285, 285],
  [297, 297],
  [312, 312],
  [324, 324],
  [358, 358],
  [398, 398],
  [405, 405],
  [433, 433],
  [452, 452],
  [462, 462],
  [468, 468],
  [505, 505],
  [529, 529],
  [537, 537],
  [585, 586],
  [591, 591],
  [608, 608],
  [616, 616],
  [625, 625],
  [641, 641],
  [648, 648],
  [669, 669],
  [690, 690],
  [697, 697],
  [716, 716],
  [718, 718],
  [732, 732],
  [759, 759],
  [762, 762],
  [775, 775],
  [788, 788],
  [793, 793],
  [813, 813],
  [840, 840],
  [890, 890],
  [916, 916],
  [922, 922],
  [938, 938],
  [953, 953],
  [969, 969],
  [984, 984],
  [1006, 1006],
  [1056, 1056],
  [1062, 1062],
  [1103, 1103],
  [1120, 1120],
  [1135, 1135],
  

In [88]:
len([tag for taglist in clusters['allmarvel_606106']['clusters'] for tag in taglist])

574

In [142]:
# Convert Bert-tokenized spans to chapter_id, para_id, token_ids
import pdb
from tqdm.notebook import tqdm

# One row per converted mention, kept for comparing BERT text with gold text.
matches = []
# fandom_fname -> list of clusters, each a set of (gold_start_tok, gold_end_tok).
gold_tok_clusters = {}
for fandom_fname in tqdm(clusters):
    print(fandom_fname)
    gold_toks = load_gold_toks(fandom_fname)
    bert_toks = load_bert_toks(fandom_fname)
    gold_tok_clusters[fandom_fname] = []
    for cluster in clusters[fandom_fname]['clusters']:
        cluster_gold_toks = set()
        for entry in cluster:
            # assumes each entry is [[beg_tok, end_tok], text] -- TODO confirm
            # against the JSON files that are actually loaded
            bert_beg_tok = entry[0][0]
            bert_end_tok = entry[0][1]
            bert_text = entry[1]
            # bert2gold treats end_tok as exclusive, so equal bounds are an
            # empty span
            if bert_beg_tok == bert_end_tok:
                continue
            gold_start_tok, gold_end_tok, gold_text = bert2gold(bert_beg_tok, bert_end_tok, bert_toks, gold_toks)
            cluster_gold_toks.add((gold_start_tok, gold_end_tok))
            matches.append([fandom_fname, bert_text, gold_text, bert_beg_tok, bert_end_tok, gold_start_tok, gold_end_tok])
        # Record the converted cluster once, after all of its mentions have
        # been processed.  Appending inside the mention loop stored the same
        # set object once per mention.  Clusters with no converted mention
        # are still left out, as before.
        if cluster_gold_toks:
            gold_tok_clusters[fandom_fname].append(cluster_gold_toks)

matches = pd.DataFrame(matches, columns=['fandom_fname', 'bert_text', 'gold_text', 'bert_beg_tok', 'bert_end_tok', 
                                        'gold_start_tok', 'gold_end_tok'])
pd.set_option('display.max_colwidth', None)
# Mentions whose BERT-side text differs from the recovered gold text
mismatches = matches.loc[matches['bert_text']!= matches['gold_text']]
mismatches

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

allmarvel_606106
dcu_16369049
dragonage_4305894
drwho_8333365
harrypotter_2287736
sherlock_1296961
starwars_6082176
supernatural_1813147
teenwolf_3150806
tolkien_2185500



Unnamed: 0,fandom_fname,bert_text,gold_text,bert_beg_tok,bert_end_tok,gold_start_tok,gold_end_tok
17,allmarvel_606106,you,/you/,1192,1193,"(1, 10, 46)","(1, 10, 46)"
367,allmarvel_606106,the family he ' s chosen,the family he 's chosen,2752,2758,"(1, 31, 86)","(1, 31, 90)"
453,allmarvel_606106,I,I-,2544,2545,"(1, 30, 2)","(1, 30, 2)"
579,dcu_16369049,a species of mobster fish people stealing humans who all seem to have sea - diving skills,a species of mobster fish people stealing humans who all seem to have sea - diving,10,27,"(1, 1, 9)","(1, 1, 24)"
600,dcu_16369049,Merpeople,—Merpeople,31,32,"(1, 2, 2)","(1, 2, 2)"
624,dcu_16369049,the Merpeople,the Merpeople—,1354,1356,"(1, 47, 7)","(1, 47, 8)"
626,dcu_16369049,merpeople,merpeople—,1449,1450,"(1, 51, 22)","(1, 51, 22)"
769,dcu_16369049,The . . . Bretans,The ... Bretans,129,134,"(1, 6, 22)","(1, 6, 24)"
954,dcu_16369049,I,I’m,3035,3036,"(1, 114, 12)","(1, 114, 12)"
955,dcu_16369049,I,I’m,3039,3040,"(1, 114, 14)","(1, 114, 14)"


In [141]:
# Load bert and gold tokenizations
import pandas as pd
import pdb

def add_nospace_boundaries(df, colname):
    """Annotate each token row with its character span in the space-free text.

    Adds three columns to ``df`` in place and returns the same frame:
    ``token_len`` (length of the string in ``colname``), and
    ``beg_nospace`` / ``end_nospace``, the half-open character offsets of the
    token when all tokens are concatenated in row order without separators.
    """
    df['token_len'] = df[colname].map(len)
    # The running total of lengths gives every token's end offset; each start
    # offset is the previous token's end (0 for the first token).
    ends = df['token_len'].cumsum().tolist()
    starts = [0] + ends[:-1]
    spans = pd.DataFrame(list(zip(starts, ends)), index=df.index)
    df[['beg_nospace', 'end_nospace']] = spans
    return df

# Build gold df
def load_gold_toks(fandom_fname):
    """Load the gold CSV for one fic as a one-row-per-token DataFrame.

    Reads ``{fandom_fname}.csv`` from the annotated test directory, splits
    each row's ``text_tokenized`` on whitespace and returns a frame with the
    columns ``token``, ``chapter_id``, ``para_id``, ``token_id`` plus the
    ``token_len`` / ``beg_nospace`` / ``end_nospace`` columns added by
    ``add_nospace_boundaries``.  ``token_id`` is the 1-based position of the
    token within its source row.
    """
    fpath = f'/data/fanfiction_ao3/annotated_10fandom/test/fics/{fandom_fname}.csv'
    gold = pd.read_csv(fpath)
    # Expand every row's token list into columns, then stack into one token
    # per row indexed by (source row, position in row).
    # NOTE(review): the positional 1 binds to Series.apply's second parameter
    # (convert_dtype in pandas 1.x) -- confirm this is intended.
    s = gold['text_tokenized'].str.split().apply(pd.Series, 1).stack()
    gt = pd.DataFrame(s, columns=['token'])
    # Position level of the stacked index, shifted to start at 1
    gt['token_id'] = gt.index.get_level_values(1) + 1
    # Drop the position level so the index is the source row again, which is
    # what the index-on-index merge below joins on
    gt.index = gt.index.droplevel(-1)
    gold_toks = gt.merge(gold, left_index=True, right_index=True)[['token', 'chapter_id', 'para_id', 'token_id']]
    # Renumber rows 0..n-1 in token order
    gold_toks = gold_toks.reset_index(drop=True)
    gold_toks = add_nospace_boundaries(gold_toks, 'token')
    return gold_toks

# Build bert df
def load_bert_toks(fandom_fname):
    """Return the whitespace tokens of one loaded document as a DataFrame.

    Splits ``clusters[fandom_fname]['document']`` on whitespace into a
    ``token`` column and adds the space-free character boundaries via
    ``add_nospace_boundaries``.
    """
    document_tokens = clusters[fandom_fname]['document'].split()
    token_frame = pd.DataFrame(document_tokens, columns=['token'])
    return add_nospace_boundaries(token_frame, 'token')

def bert2gold(beg_tok, end_tok, bert_toks, gold_toks):
    """Map a BERT token span onto the gold tokens covering the same characters.

    ``beg_tok`` is the first BERT token of the span and ``end_tok`` is one
    past the last.  The span is converted to space-free character offsets and
    the gold tokens lying entirely inside those offsets are selected.  When
    nothing is selected, or the selection is just the word 'the', the window
    is widened one character per side at a time (ignoring tokens without any
    word character) until something else is found.

    Returns ``(start, end, text)`` where ``start`` and ``end`` are
    ``(chapter_id, para_id, token_id)`` tuples of the first and last selected
    gold token and ``text`` is the selected tokens joined by single spaces.
    """
    span_beg = bert_toks.loc[beg_tok, 'beg_nospace']
    span_end = bert_toks.loc[end_tok - 1, 'end_nospace']

    def gold_window(margin):
        # Gold tokens fully inside the span widened by `margin` characters
        # on each side.
        inside = (gold_toks['beg_nospace'] >= span_beg - margin) & (
            gold_toks['end_nospace'] <= span_end + margin)
        return gold_toks.loc[inside]

    selected = gold_window(0)
    spread = 1
    while len(selected) == 0 or ' '.join(selected['token'].tolist()) == 'the':
        widened = gold_window(spread)
        # Remove any rows with only punctuation
        selected = widened.loc[widened['token'].str.contains(r'\w')]
        spread += 1
        if spread == 10:
            # Still nothing usable after widening this far: stop to inspect.
            pdb.set_trace()

    # Look up in gold_toks
    id_columns = ['chapter_id', 'para_id', 'token_id']
    first_row = selected.iloc[0]
    last_row = selected.iloc[-1]
    joined_text = ' '.join(selected.token.tolist())
    return (tuple(first_row[id_columns]), tuple(last_row[id_columns]), joined_text)

# Post-process CoreNLP server output

In [7]:
# Load JSON output
import os
import json

# Location of one fic's processed output.  Despite the .csv name, every line
# of the file is parsed as a JSON object below.
server_output_dirpath = '/data/fanfiction_ao3/allmarvel/complete_en_1k-50k/fics_proc'
fname = '2114373.csv'
fpath = os.path.join(server_output_dirpath, fname)

with open(fpath) as f:
    # One JSON document per line
    data = [json.loads(line) for line in f.read().splitlines()]
    
data

[{'fic_id': '2114373',
  'chapter_id': '1',
  'para_id': '1',
  'text': 'Their relationship, right from the start, was unconventional, to say the least. They both worked for Daily Bugle, or at least that’s what Wade had told him. He was handsome and flirty, if a little bit odd. They’ve talked a few times and Peter liked him just fine.',
  'text_tokenized': 'Their relationship , right from the start , was unconventional , to say the least . They both worked for Daily Bugle , or at least that ’s what Wade had told him . He was handsome and flirty , if a little bit odd . They ’ve talked a few times and Peter liked him just fine .',
  'corenlp': {'sentences': [{'index': 0,
     'parse': '(ROOT\n  (S\n    (NP\n      (NP (PRP$ Their) (NN relationship))\n      (, ,)\n      (ADVP (RB right)\n        (PP (IN from)\n          (NP (DT the) (NN start))))\n      (, ,))\n    (VP (VBD was)\n      (ADJP (JJ unconventional))\n      (, ,)\n      (S\n        (VP (TO to)\n          (VP (VB say)\n         