# Post-process BookNLP output to get entity mention clusters

In [10]:
# Load entity cluster predictions
import os
import pandas as pd
import csv


def modify_paragraph_id(para_id):
   
    if para_id == 34: # trouble line
        new_para_id = 36
        
    if para_id >= 35:
        new_para_id = para_id
    
    else:
        new_para_id = para_id + 1
        
    return new_para_id


predictions_dirpath = '/projects/book-nlp/data/tokens/annotated_10fandom_dev/'

predicted_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(predictions_dirpath))[2:3]:
    
    print(fname)
    booknlp_output = pd.read_csv(os.path.join(predictions_dirpath, fname), sep='\t', quoting=csv.QUOTE_NONE)
    print(df.columns)

    # Load CSV file of fic
    csv_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/{fname.split(".")[0]}.csv'
    fic_csv = pd.read_csv(csv_dirpath)
    fic_csv.columns

    ## Check paragraph breaks
    # Compare number of paragraphs
    if max(fic_csv['para_id']) != len(booknlp_output['paragraphId'].unique()):

        # Fix paragraph break issues
        booknlp_output['modified_paragraphId'] = booknlp_output['paragraphId'].map(modify_paragraph_id)

        # Confirm that fixed it
        if max(fic_csv['para_id']) != len(booknlp_output['modified_paragraphId'].unique()):
            pdb.set_trace()

# Paragraph breaks from BookNLP
pd.set_option('display.max_colwidth', -1)
booknlp_output.groupby('modified_paragraphId').agg({'originalWord': lambda x: ' '.join(x.tolist())})

sherlock_12828381.txt.tokens
Index(['paragraphId', 'sentenceID', 'tokenId', 'beginOffset', 'endOffset',
       'whitespaceAfter', 'headTokenId', 'originalWord', 'normalizedWord',
       'lemma', 'pos', 'ner', 'deprel', 'inQuotation', 'characterId',
       'supersense'],
      dtype='object')


Index(['fic_id', 'chapter_id', 'para_id', 'text', 'text_tokenized'], dtype='object')

## Make sure token IDs align

In [71]:
# Get token counts for paragraphs from BookNLP, make sure they match the original fic token counts

fic_csv['booknlp_para_length'] = booknlp_output.groupby('modified_paragraphId').size().tolist()
fic_csv['token_count'] = fic_csv['text_tokenized'].map(lambda x: len(x.split()))
misaligned_rows = fic_csv.loc[fic_csv['token_count'] != fic_csv['booknlp_para_length'], ['para_id', 'token_count', 'booknlp_para_length']]
misaligned_rows

Unnamed: 0,para_id,token_count,booknlp_para_length
0,1,144,146
8,9,39,40


In [81]:
# Fix token misalignment issues
modified_booknlp = booknlp_output.copy()
    
for selected_paragraph in misaligned_rows['para_id']:

    selected_chapter = 1
    gold_tokens = fic_csv.loc[fic_csv['para_id']==selected_paragraph, 'text_tokenized'].tolist()[0].split()
    booknlp_tokens = booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_paragraph, 'originalWord'].tolist() # careful with chapters

    total_offset = 0
    trouble_offsets = {} # line_number: offset
    first_tokenId = booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_paragraph, 'tokenId'].tolist()[0]

    for i, gold_tok in enumerate(gold_tokens):
        if not gold_tok == booknlp_tokens[i + total_offset]:

            # Try adding tokens
            added = booknlp_tokens[i]
            for offset in range(1, 4):
                added += booknlp_tokens[i+offset]
                if added == gold_tok:
                    total_offset += offset
                    trouble_offsets[first_tokenId + i] = offset
                    break

            else:
                print(gold_tok)
                print(booknlp_tokens[i])
                pdb.set_trace()

#     print(total_offset)
#     print(trouble_offsets)

    # Modify BookNLP output
    for line, offset in trouble_offsets.items():
        row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId'].isin(range(line, line+offset+1)))

        # Modify offset word
        new_word = ''.join(modified_booknlp.loc[row_filter, 'originalWord'].tolist())
        modified_row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId']==line)
        modified_booknlp.loc[modified_row_filter, 'originalWord'] = new_word

        # Delete offset words
        delete_row_filter = (modified_booknlp['modified_paragraphId']==selected_paragraph) & (modified_booknlp['tokenId'].isin(range(line+1, line+offset+1)))
        delete_index = modified_booknlp.loc[delete_row_filter].index
        modified_booknlp.drop(index=delete_index, inplace=True)

# Check token length match again
fic_csv['booknlp_para_length'] = modified_booknlp.groupby('modified_paragraphId').size().tolist()
fic_csv['token_count'] = fic_csv['text_tokenized'].map(lambda x: len(x.split()))
misaligned_rows = fic_csv.loc[fic_csv['token_count'] != fic_csv['booknlp_para_length'], ['para_id', 'token_count', 'booknlp_para_length']]
misaligned_rows

{81: 2}
{366: 1}


Unnamed: 0,para_id,token_count,booknlp_para_length


## Renumber BookNLP token IDs

In [82]:
modified_booknlp.columns

Index(['paragraphId', 'sentenceID', 'tokenId', 'beginOffset', 'endOffset',
       'whitespaceAfter', 'headTokenId', 'originalWord', 'normalizedWord',
       'lemma', 'pos', 'ner', 'deprel', 'inQuotation', 'characterId',
       'supersense', 'modified_paragraphId'],
      dtype='object')

In [91]:
para_token_lengths = modified_booknlp.groupby('modified_paragraphId').size().tolist()
para_token_lengths

[144,
 67,
 70,
 17,
 2,
 25,
 17,
 18,
 39,
 18,
 49,
 147,
 12,
 4,
 14,
 107,
 21,
 36,
 49,
 21,
 27,
 113,
 44,
 12,
 43,
 13,
 27,
 14,
 4,
 35,
 63,
 310,
 9,
 103,
 197,
 2,
 116,
 145,
 36,
 4,
 37,
 19,
 59,
 8,
 13]

In [92]:
new_tokenIds = sum([list(range(1, para_length+1)) for para_length in para_token_lengths], [])
new_tokenIds

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51

In [93]:
modified_booknlp['modified_tokenId'] = new_tokenIds
modified_booknlp.loc[:, ['modified_paragraphId', 'modified_tokenId', 'tokenId']]

Unnamed: 0,modified_paragraphId,modified_tokenId,tokenId
0,1,1,0
1,1,2,1
2,1,3,2
3,1,4,3
4,1,5,4
5,1,6,5
6,1,7,6
7,1,8,7
8,1,9,8
9,1,10,9


## Find any chapter IDs other than 1

In [98]:
for fname in sorted(os.listdir(predictions_dirpath)):
    csv_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/{fname.split(".")[0]}.csv'
    fic_csv = pd.read_csv(csv_dirpath)
    print(f'{fname}: {max(fic_csv["chapter_id"])}')

allmarvel_1621415.txt.tokens: 1
harrypotter_459070.txt.tokens: 1
sherlock_12828381.txt.tokens: 1
teenwolf_1145590.txt.tokens: 1
tolkien_5581141.txt.tokens: 1


## Extract entity mention tuples, clusters

In [99]:
modified_booknlp.columns

Index(['paragraphId', 'sentenceID', 'tokenId', 'beginOffset', 'endOffset',
       'whitespaceAfter', 'headTokenId', 'originalWord', 'normalizedWord',
       'lemma', 'pos', 'ner', 'deprel', 'inQuotation', 'characterId',
       'supersense', 'modified_paragraphId', 'modified_tokenId'],
      dtype='object')

In [105]:
selected_cols = ['modified_paragraphId', 'modified_tokenId', 'characterId', 'originalWord']
mentions = modified_booknlp[modified_booknlp['characterId']>-1].loc[:, selected_cols]
mentions

Unnamed: 0,modified_paragraphId,modified_tokenId,characterId,originalWord
11,1,12,1,Sherlock
18,1,19,1,his
37,1,38,1,Sherlock
42,1,43,1,he
60,1,61,1,Sherlock
76,1,77,1,he
84,1,83,1,Sherlock
98,1,97,1,he
112,1,111,1,He
123,1,122,1,his


In [106]:
fname

'tolkien_5581141.txt.tokens'

In [115]:
# Calculate end tokens for any entity mentions

mentions['next_entity_tokenId'] = mentions['modified_tokenId'].tolist()[1:] + [0]
mentions['next_entity_paragraphId'] = mentions['modified_paragraphId'].tolist()[1:] + [0]
mentions['next_entity_characterId'] = mentions['characterId'].tolist()[1:] + [0]
mentions['sequential'] = [(next_entity_tokenId == modified_tokenId + 1) and \
                          (next_entity_paragraphId == modified_paragraphId) and \
                          (next_entity_characterId == characterId) 
                        for next_entity_tokenId, modified_tokenId, next_entity_paragraphId, modified_paragraphId, next_entity_characterId, characterId in \
                          zip(mentions['next_entity_tokenId'], mentions['modified_tokenId'], mentions['next_entity_paragraphId'], \
                              mentions['modified_paragraphId'], mentions['next_entity_characterId'], mentions['characterId'])
                             ]
mentions

Unnamed: 0,modified_paragraphId,modified_tokenId,characterId,originalWord,next_entity_tokenId,next_entity_paragraphId,sequential,next_entity_characterId
11,1,12,1,Sherlock,19,1,False,1
18,1,19,1,his,38,1,False,1
37,1,38,1,Sherlock,43,1,False,1
42,1,43,1,he,61,1,False,1
60,1,61,1,Sherlock,77,1,False,1
76,1,77,1,he,83,1,False,1
84,1,83,1,Sherlock,97,1,False,1
98,1,97,1,he,111,1,False,1
112,1,111,1,He,122,1,False,1
123,1,122,1,his,133,1,False,1


In [116]:
sum(mentions['sequential'])

1

In [112]:
mentions[mentions['sequential']==True]

Unnamed: 0,modified_paragraphId,modified_tokenId,characterId,originalWord,next_entity_tokenId,next_entity_paragraphId,sequential
1676,34,83,1,him,84,34,True


In [114]:
mentions[mentions['modified_tokenId'].isin([83,84])]

Unnamed: 0,modified_paragraphId,modified_tokenId,characterId,originalWord,next_entity_tokenId,next_entity_paragraphId,sequential
84,1,83,1,Sherlock,97,1,False
1676,34,83,1,him,84,34,True
1677,34,84,1,he,88,34,False
1979,37,84,1,his,96,37,False


In [130]:
predicted_entities = {}

prev_was_sequential = False
prev_token_id_start = 0

# fic_id = int(fname.split('.')[0].split('_')[1])
fic_id = 12828381

for row in list(mentions.itertuples()):
    chapter_id = 1
    para_id = row.modified_paragraphId
    character_id = row.characterId
    token_id_start = row.modified_tokenId
    
    if row.sequential: # Store last token ID
        if prev_was_sequential: # in the middle of an entity mention
            continue
        else:
            prev_was_sequential = True
            prev_token_id_start = token_id_start
            continue

    # Save entity mention
    if not fic_id in predicted_entities:
        predicted_entities[fic_id] = {}

    if not character_id in predicted_entities[fic_id]:
        predicted_entities[fic_id][character_id] = set()

    if prev_was_sequential:
        token_id_start = prev_token_id_start
        
    token_id_end = row.modified_tokenId
        
    predicted_entities[fic_id][character_id].add((chapter_id, para_id, token_id_start, token_id_end))
    
    prev_was_sequential = row.sequential
                
len(predicted_entities)

1

In [131]:
predicted_entities[12828381]

{1: {(1, 1, 12, 12),
  (1, 1, 19, 19),
  (1, 1, 38, 38),
  (1, 1, 43, 43),
  (1, 1, 61, 61),
  (1, 1, 77, 77),
  (1, 1, 83, 83),
  (1, 1, 97, 97),
  (1, 1, 111, 111),
  (1, 1, 122, 122),
  (1, 1, 133, 133),
  (1, 2, 1, 1),
  (1, 2, 16, 16),
  (1, 2, 18, 18),
  (1, 2, 26, 26),
  (1, 2, 47, 47),
  (1, 2, 54, 54),
  (1, 2, 56, 56),
  (1, 2, 66, 66),
  (1, 3, 1, 1),
  (1, 3, 12, 12),
  (1, 3, 27, 27),
  (1, 3, 43, 43),
  (1, 3, 45, 45),
  (1, 3, 50, 50),
  (1, 3, 54, 54),
  (1, 3, 60, 60),
  (1, 4, 8, 8),
  (1, 6, 1, 1),
  (1, 6, 15, 15),
  (1, 9, 17, 17),
  (1, 10, 7, 7),
  (1, 11, 21, 21),
  (1, 11, 34, 34),
  (1, 12, 9, 9),
  (1, 16, 100, 100),
  (1, 17, 1, 1),
  (1, 17, 17, 17),
  (1, 18, 11, 11),
  (1, 18, 14, 14),
  (1, 18, 21, 21),
  (1, 18, 26, 26),
  (1, 19, 10, 10),
  (1, 19, 21, 21),
  (1, 20, 15, 15),
  (1, 22, 7, 7),
  (1, 22, 11, 11),
  (1, 22, 43, 43),
  (1, 22, 55, 55),
  (1, 22, 70, 70),
  (1, 22, 74, 74),
  (1, 22, 77, 77),
  (1, 22, 80, 80),
  (1, 22, 88, 88),
  (1, 22, 

# 1-time

In [59]:
# Examine misaligned paragraphs
pd.set_option('display.max_rows', 999)
print(fic_csv.loc[fic_csv['para_id']==selected_para, 'text_tokenized'])
booknlp_output.loc[booknlp_output['modified_paragraphId']==selected_para, ['modified_paragraphId', 'tokenId', 'originalWord']]

34    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\n It had been nearly 20 minutes and the Bartender had n’t checked on him again . Though there was n’t much of a need . Sherlock ’s drink was still mostly full and he could n’t be possibly bothered to come back to chat when he had other paying customers .
Name: text_tokenized, dtype: object


Unnamed: 0,modified_paragraphId,tokenId,originalWord
1697,35,1697,_
1698,35,1698,_
1699,35,1699,_
1700,35,1700,_
1701,35,1701,_
1702,35,1702,_
1703,35,1703,_
1704,35,1704,_
1705,35,1705,_
1706,35,1706,_


In [None]:
        # Paragraph breaks from BookNLP
        pd.set_option('display.max_colwidth', -1)
        booknlp_output.groupby('paragraphId').agg({'originalWord': lambda x: ' '.join(x.tolist())})

# Run BookNLP multiple files at once

In [3]:
import os
from subprocess import call

input_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics_text_tokenized/'
booknlp_log_dirpath = 'data/output/annotated_10fandom_dev/'
booknlp_output_dirpath = 'data/tokens/annotated_10fandom_dev/'

# cmd = './runjava novels/BookNLP -doc /data/fanfiction_ao3/annotated_10fandom/dev/fics_text/allmarvel_1621415 -p data/output/annotated_10fandom_dev/allmarvel_1621415 -tok data/tokens/allmarvel_1621415.tokens -f'

booknlp_dirpath = '/projects/book-nlp/'
os.chdir(booknlp_dirpath)

for fname in os.listdir(input_dirpath):
    cmd = ['./runjava', 'novels/BookNLP', '-doc', 
                os.path.join(input_dirpath, fname),
               '-p', os.path.join(booknlp_log_dirpath, fname),
               '-tok', f'{os.path.join(booknlp_output_dirpath, os.path.splitext(fname)[0])}.tokens', '-f']
    
    call(cmd)

# Prepare text of fics for baselines

## Actual fic text

In [54]:
import os
from tqdm import tqdm_notebook as tqdm
import re

input_text_dirpath = '/projects/fanfiction-nlp/tmp/annotated_10fandom_dev_text_data'
output_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics_text'
if not os.path.exists(output_text_dirpath):
    os.mkdir(output_text_dirpath)

for fname in tqdm(os.listdir(input_text_dirpath)):
    
    # Read
    with open(os.path.join(input_text_dirpath, fname)) as fin:
        text = fin.read()
        new_text = text.replace(' # . ', '\n\n')
        new_text = re.sub(r'\s?\-+\s?', ' - ', new_text)
        
    # Write
    with open(os.path.join(output_text_dirpath, f'{fname}.txt'), 'w') as fout:
        fout.write(new_text)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




## Already tokenized

In [2]:
import os
from tqdm import tqdm_notebook as tqdm
import pandas as pd

input_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics'
output_text_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics_text_tokenized'
if not os.path.exists(output_text_dirpath):
    os.mkdir(output_text_dirpath)

for fname in tqdm(os.listdir(input_text_dirpath)):
    
    # Read
    fic = pd.read_csv(os.path.join(input_text_dirpath, fname))
    tokenized = fic['text_tokenized'].tolist()
    
    # Write
    with open(os.path.join(output_text_dirpath, f'{os.path.splitext(fname)[0]}.txt'), 'w') as fout:
        fout.write('\n\n'.join(tokenized))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




# Calculate LEA coreference evaluation

In [125]:
# Load ground-truth annotated entity mentions
import os
import pandas as pd

annotations_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/entity_clusters'

gold_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

for fname in sorted(os.listdir(annotations_dirpath))[2:3]:
    
    print(fname)
    
    fic_id = int(fname.split('_')[1])
    gold_entities[fic_id] = {}
    
    df = pd.read_csv(os.path.join(annotations_dirpath, fname))
    for colname in df.columns:
        gold_entities[fic_id][colname] = set()
        for mention in df[colname].dropna():
            parts = mention.split('.')
            chapter_id = int(parts[0])
            paragraph_id = int(parts[1])
            if '-' in parts[2]:
                token_id_start = int(parts[2].split('-')[0])
                token_id_end = int(parts[2].split('-')[-1])
            else:
                token_id_start = int(parts[2])
                token_id_end = int(parts[2])
                
            gold_entities[fic_id][colname].add((chapter_id, paragraph_id, token_id_start, token_id_end))

len(gold_entities)

sherlock_12828381_entity_clusters.csv


1

In [3]:
# Number of characters
for fname in gold_entities:
    print(f'{fname}: {len(gold_entities[fname])}')

459070: 39
1145590: 12
5581141: 7
1621415: 5
12828381: 9


In [8]:
# Length of fics

fic_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/fics/'

for fname in os.listdir(fic_dirpath):
    fic = pd.read_csv(os.path.join(fic_dirpath, fname))
    n_words = sum(fic['text_tokenized'].map(lambda x: len(x.split())))
    print(f'{fname}: {n_words}')

allmarvel_1621415.csv: 2791
harrypotter_459070.csv: 5288
tolkien_5581141.csv: 2939
sherlock_12828381.csv: 2330
teenwolf_1145590.csv: 3493


In [4]:
def extract_entity_mentions(text):
    """ Return token start and endpoints of entity mentions embedded in text. """
    
    token_count = 1
    entities = {} # cluster_name: {(token_id_start, token_id_end), ...}
    
    tokens = text.split(' ')
    for i, token in enumerate(tokens):
        if token.startswith('($_'): # entity cluster name
            if not token in entities:
                entities[token] = set()
                
            mention = tokens[i-1]
            mention_len = len(mention.split('_'))
            token_id_start = token_count - 1
            token_id_end = (token_count - 1) + (mention_len - 1)
            
            token_count += mention_len - 1 # for the underscore-connected mentions
                
            entities[token].add((token_id_start, token_id_end))
            
        else:
            # Advance token count
            token_count += 1
            
    return entities

In [10]:
# Load entity cluster predictions
import os
import pandas as pd

predictions_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev/pipeline_output/char_coref_stories'

predicted_entities = {} # fic_id: {cluster_name: {(chapter_id, paragraph_id, token_id_start, token_id_end), ...}}

csv_output = [fname for fname in sorted(os.listdir(predictions_dirpath)) if fname.endswith('.csv')]

for fname in csv_output:
    
    print(fname)
    df = pd.read_csv(os.path.join(predictions_dirpath, fname))
    for row in list(df.itertuples()):
        fic_id = row.fic_id
        chapter_id = row.chapter_id
        para_id = row.para_id
        entities = extract_entity_mentions(row.text_tokenized)
#         print(entities)
#         print(row.text_tokenized)
        
        if not fic_id in predicted_entities:
            predicted_entities[fic_id] = {}
        
        for cluster_name in entities:
            if not cluster_name in predicted_entities[fic_id]:
                predicted_entities[fic_id][cluster_name] = set()
            
            for mention in entities[cluster_name]:
                token_id_start = mention[0]
                token_id_end = mention[1]
                predicted_entities[fic_id][cluster_name].add((chapter_id, para_id, token_id_start, token_id_end))
                
len(predicted_entities)

allmarvel_1621415.coref.csv
harrypotter_459070.coref.csv
sherlock_12828381.coref.csv
teenwolf_1145590.coref.csv
tolkien_5581141.coref.csv


5

In [124]:
import itertools

def links(entity_mentions):
    """ Returns all the links in an entity between mentions """
    
    if len(entity_mentions) == 1: # self-link
        links = {list(entity_mentions)[0], list(entity_mentions)[0]}

    else:
        links = set(itertools.combinations(entity_mentions, 2))
        
    return links

import numpy as np
from IPython.core.debugger import set_trace

def lea_recall(predicted_entities, gold_entities):
    
    fic_recalls = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for gold_cluster, gold_mentions in gold_entities[fic_id].items():
            gold_links = links(gold_mentions)
            
            cluster_resolution = 0
            
            for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
                predicted_links = links(predicted_mentions)
                
                cluster_resolution += len(predicted_links.intersection(gold_links))
                
            cluster_resolution = cluster_resolution/len(gold_links)
            cluster_resolutions[gold_cluster] = cluster_resolution
            cluster_sizes[gold_cluster] = len(gold_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_recalls[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in gold_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total recall as mean across fics
#     print(fic_recalls)
    total_recall = np.mean(list(fic_recalls.values()))
    return total_recall, fic_recalls

import numpy as np
from IPython.core.debugger import set_trace

def lea_precision(predicted_entities, gold_entities):
    
    fic_precisions = {}
    
    for fic_id in gold_entities:
        
        cluster_resolutions = {}
        cluster_sizes = {}
        
        for predicted_cluster, predicted_mentions in predicted_entities[fic_id].items():
            predicted_links = links(predicted_mentions)
            
            cluster_resolution = 0
            
            for gold_cluster, gold_mentions in gold_entities[fic_id].items():
                gold_links = links(gold_mentions)
                cluster_resolution += len(predicted_links.intersection(gold_links))
            
            cluster_resolution = cluster_resolution/len(predicted_links)
            cluster_resolutions[predicted_cluster] = cluster_resolution
            cluster_sizes[predicted_cluster] = len(predicted_mentions)
            
        # take importance (size) of clusters into account
#         print(cluster_resolutions)
        fic_precisions[fic_id] = sum([cluster_sizes[c] * cluster_resolutions[c] for c in predicted_entities[fic_id]])/sum(cluster_sizes.values())
        
    # Total precision as mean across fics
#     print(fic_precisions)
    total_precision = np.mean(list(fic_precisions.values()))
    return total_precision, fic_precisions

In [126]:
def f_score(precision, recall):
    return 2 * (precision * recall)/(precision + recall)

In [132]:
print(gold_entities.keys())
print(predicted_entities.keys())

dict_keys([12828381])
dict_keys([12828381])


In [133]:
recall, fic_recalls = lea_recall(predicted_entities, gold_entities)
precision, fic_precisions = lea_precision(predicted_entities, gold_entities)
f1 = f_score(precision, recall)

print(f"Precision: {precision: .2%}")
print(f"Recall: {recall: .2%}")
print(f"F-score: {f1: .2%}")

Precision:  58.86%
Recall:  27.24%
F-score:  37.25%


In [134]:
len(gold_entities[12828381])

9

In [138]:
len(predicted_entities[12828381][1])

141

In [21]:
fic_precisions

{459070: 0.20664608688802236,
 1145590: 0.44628475692709196,
 5581141: 0.5345563442755866,
 1621415: 0.08021746118261987,
 12828381: 0.1519804183355585}

In [22]:
fic_recalls

{459070: 0.02770425922695492,
 1145590: 0.24624357045549267,
 5581141: 0.269897197854156,
 1621415: 0.024993739043325823,
 12828381: 0.10310111184268897}

In [79]:
# Test calculation with toy examples

import itertools

# set(itertools.combinations({(1,3), (1,4), (2,2), (3,5)}, 2))
test_gold = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3)},
                'B': {(1,1,4,4), (1,1,5,5), (1,1,6,6)}
                }}

test_predicted = {1: {'A': {(1,1,1,1), (1,1,2,2), (1,1,3,3), (1,1,6,6)},
                'B': {(1,1,4,4), (1,1,5,5)}
                }}

print(lea_recall(test_predicted, test_gold))
print(lea_precision(test_predicted, test_gold))

0.6666666666666666
0.6666666666666666


# Create personal coref annotation interface (token id subscripts)

In [1]:
def add_token_subscript(text):
    numbered_tokens = [el for el in enumerate(text.split())]
    subscripted = [f'{tok}<sub>{tok_num+1}</sub>' for tok_num, tok in numbered_tokens]
    return ' '.join(subscripted)

In [4]:
import os
import pandas as pd

pd.set_option('display.max_colwidth', -1)
annotation_dirpath = '/data/fanfiction_ao3/annotated_10fandom/dev'
csv_dirpath = os.path.join(annotation_dirpath, 'fics')
subscripts_dirpath = os.path.join(annotation_dirpath, 'subscripted')
fnames = os.listdir(csv_dirpath)

fandoms = [
#     'allmarvel',
#     'harrypotter',
#     'sherlock',
#     'teenwolf',
    'tolkien'
]

for fandom in fandoms:
    for fname in fnames:
        if fname.endswith('.csv') and fname.startswith(fandom):
            data = pd.read_csv(os.path.join(csv_dirpath, fname))
            data['annotation_text'] = data['text_tokenized'].map(add_token_subscript)
            data.loc[:, ['chapter_id', 'para_id', 'annotation_text']].to_html(os.path.join(subscripts_dirpath, f'{fname[:-4]}_subscripts.html'), escape=False, index=False)

# Load data for preliminary annotation dataset

In [2]:
import random

all_fandoms = [
#     'allmarvel',
    'supernatural',
    'harrypotter',
    'dcu',
    'sherlock',
    'teenwolf',
    'starwars',
    'drwho',
    'tolkien',
    'dragonage',
]

random.sample(all_fandoms, 4)

['teenwolf', 'harrypotter', 'sherlock', 'tolkien']

In [8]:
import os, shutil
import random

old_seeds = [9, 12, 1234, 99, 120]
current_seed = 120
random.seed(current_seed)

dataset = 'complete_en_1k-50k'
fandoms = [
#     'allmarvel',
#     'supernatural',
#     'harrypotter',
#     'dcu',
    'sherlock',
    'teenwolf',
#     'starwars',
#     'drwho',
#     'tolkien',
#     'dragonage',
]

for fandom in fandoms:

    fic_dirpath = f'/data/fanfiction_ao3/{fandom}/{dataset}/fics'
    annotation_dirpath = f'/data/fanfiction_ao3/annotated_10fandom/dev/fics/'
    fnames = os.listdir(fic_dirpath)
    selected = random.sample(fnames, 1)[0]
    print(f'{fandom}: {selected}')
    shutil.copy(os.path.join(fic_dirpath, selected), os.path.join(annotation_dirpath, f'{fandom}_{selected}'))

sherlock: 12828381.csv
teenwolf: 1145590.csv
