In [11]:
import torch
import numpy as np
import json

In [12]:
from tqdm import tqdm

In [13]:
# Table of entity encodings; get_blink_enc() indexes it by label id and calls
# .tolist() on every entry it picks.
# NOTE(review): torch.load relies on pickle, so only load this file from a
# trusted source. Presumably map_location='cpu' would be enough here because
# the entries are only converted to Python lists — confirm before changing.
all_entity_encs = torch.load('../models/all_entities_large.t7')

In [14]:
def get_blink_enc(fpath, split, entity_encs=None):
    """Collect the entity encodings of every sample's gold labels for one split.

    Reads ``fpath + split + '.jsonl'`` (one JSON object per line) and, for each
    sample, looks up one encoding per entry of ``sample['label_id']``.

    Args:
        fpath: Directory prefix of the split files. It is concatenated with the
            file name as-is, so it must already end with a path separator.
        split: Split name without extension, e.g. ``'train'``.
        entity_encs: Optional lookup table indexed by label id whose entries
            provide ``.tolist()``. Defaults to the notebook-level
            ``all_entity_encs`` so existing two-argument calls keep working.

    Returns:
        dict mapping ``sample['id']`` to a list that holds, for each label id
        in order, ``entity_encs[label_id].tolist()``. If an id occurs more than
        once in the file, the last occurrence wins.

    Raises:
        OSError: the split file cannot be opened.
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a sample lacks ``'id'`` or ``'label_id'``.
    """
    if entity_encs is None:
        # Resolved at call time so the notebook global can be (re)loaded later.
        entity_encs = all_entity_encs

    fname = fpath + split + '.jsonl'
    # Stream the file line by line with an explicit encoding; blank lines
    # (e.g. a trailing newline at EOF) are skipped instead of crashing json.loads.
    with open(fname, encoding='utf-8') as f:
        samples = [json.loads(line) for line in f if line.strip()]

    samples_encs = {}
    for sample in tqdm(samples):
        sample_id = sample['id']
        label_ids = sample['label_id']
        # One encoding per label id, so the result always has len(label_ids) entries.
        samples_encs[sample_id] = [
            entity_encs[label_id].tolist() for label_id in label_ids
        ]

    return samples_encs

## Candidate encodings based on the BERT tokenizer

In [29]:
# For every split, look up the gold-candidate encodings and store them as
# '<split>_enc.json' under the same path prefix as the source .jsonl file.
fpath = 'AIDA-YAGO2-wiki_content-NEW/tokenized/'
splits = ['train', 'dev', 'test']

for split in splits:
    enc_fname = '{}_enc.json'.format(split)
    samples_encs = get_blink_enc(fpath, split)
    with open(fpath + enc_fname, mode='w') as f:
        json.dump(samples_encs, fp=f)

100%|██████████| 946/946 [00:00<00:00, 966.04it/s] 
100%|██████████| 216/216 [00:00<00:00, 1003.12it/s]
100%|██████████| 231/231 [00:00<00:00, 1074.61it/s]


In [2]:
# Read the dev-split encoding file back into a dict keyed by sample id.
with open('AIDA-YAGO2-wiki_content-NEW/tokenized/dev_enc.json', mode='r') as f:
    golden_cand_enc = json.loads(f.read())

In [3]:
# Number of sample ids stored in the dev encoding file.
len(golden_cand_enc)

216

In [4]:
# Id of the dev sample that the following cells inspect.
sample_id = '1127testa NATO'

In [6]:
# Number of encodings stored for this sample (get_blink_enc stores one per gold label id).
len(golden_cand_enc[sample_id])

21

In [7]:
# Parse the dev split: one JSON document per line of the .jsonl file.
fname = 'AIDA-YAGO2-wiki_content-NEW/tokenized/dev.jsonl'
with open(fname) as f:
    samples = [json.loads(line) for line in f.readlines()]

In [9]:
# Fetch the document by its id rather than by a hand-counted position
# (previously samples[-44:][8]), so the cell keeps selecting the same sample
# if the order or length of dev.jsonl changes. `sample_id` is set in an
# earlier cell; StopIteration here means no sample carries that id.
sample = next(s for s in samples if s['id'] == sample_id)
sample

{'id': '1127testa NATO',
 'text': 'NATO monitors Moslem move towards tense village . MAHALA , Bosnia 1996-08-31 NATO said it was closely monitoring the movement of about 75 Moslem men towards the village of Mahala in Bosnia \'s Serb republic on Saturday , two days after a violent confrontation with Serbs . " I have to report this morning that we have in fact received reports ... that up to 75 Moslem men are believed to be approaching Mahala , " NATO spokesman Lieutenant-Colonel Max Marriner said in Sarajevo . Marriner said that NATO troops had set up a checkpoint on the road between Tuzla and Mahala to establish the identities and intentions of the men headed towards the village . Mahala is a Moslem village on Bosnian Serb republic territory . Moslems were driven from the village during the 43- month Bosnian war and most of their houses were destroyed . Some Moslems began returning to rebuild their properties earlier in the week . Fights and shooting broke out between the Moslems and S

In [10]:
# Mention count of the same sample, for comparison with the number of stored encodings.
len(sample['mentions'])

21

## Candidate encodings based on the Longformer encoding

In [15]:
# Same export as for the wiki_content data, pointed at the Longformer directory:
# each split's encodings go to '<split>_enc.json' under the same path prefix.
fpath = 'AIDA-YAGO2_longformer/tokenized/'
splits = ['train', 'dev', 'test']

for split in splits:
    enc_fname = '{}_enc.json'.format(split)
    samples_encs = get_blink_enc(fpath, split)
    with open(fpath + enc_fname, mode='w') as f:
        json.dump(samples_encs, fp=f)

100%|██████████| 946/946 [00:01<00:00, 933.53it/s]
100%|██████████| 216/216 [00:00<00:00, 1013.14it/s]
100%|██████████| 231/231 [00:00<00:00, 1185.32it/s]


In [16]:
# Read the Longformer dev-split encoding file; this rebinds golden_cand_enc.
with open('AIDA-YAGO2_longformer/tokenized/dev_enc.json', mode='r') as f:
    golden_cand_enc = json.loads(f.read())

In [17]:
# Number of sample ids stored in the Longformer dev encoding file.
len(golden_cand_enc)

216

In [18]:
# Encodings stored for the same sample in the Longformer variant; `sample_id`
# is the value assigned in the earlier cell ('1127testa NATO').
# NOTE(review): the recorded count here (27) differs from the wiki_content
# variant (21) — confirm this difference is expected.
len(golden_cand_enc[sample_id])

27