In [1]:
import torch
import matplotlib.pyplot as plt

In [2]:
from transformers import BertTokenizer, get_linear_schedule_with_warmup, BertConfig, BertForMaskedLM
model_path = 'dmis-lab/biobert-base-cased-v1.2'
tokenizer = BertTokenizer.from_pretrained(model_path)
bert_lm = BertForMaskedLM.from_pretrained(model_path)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
inputs = ["The Battle of Dunbar", 
          "Wow that is so good!"]
inputs = tokenizer(inputs, max_length=10, padding='max_length', return_tensors='pt', truncation=True)

In [17]:
inputs

{'input_ids': tensor([[ 101, 1103, 2321, 1104, 3840, 1179, 6824,  102,    0,    0],
        [ 101,  192, 4064, 1115, 1110, 1177, 1363,  106,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [18]:
last_hidden_state = bert_lm.bert(**inputs).last_hidden_state
outputs = bert_lm.cls.predictions.transform(last_hidden_state)

In [20]:
emb1 = outputs[:1, 0]
emb2 = outputs[1:, 0]
torch.cosine_similarity(emb1, emb2)
def l2_dist(a, b, dim=-1):
    return ((a - b)**2).sum(dim=dim)

l2_dist(emb1, emb2)

tensor([215.8967], grad_fn=<SumBackward1>)

In [51]:
exp_path = '../exp/0hp_unseen_path2/'

In [53]:
mention2id = torch.load(exp_path+'mention2id')
pack = torch.load(exp_path+'pack.bin')
name_array = torch.load(exp_path+'name.bin')
ent_total = len(name_array)
triples = torch.load(exp_path+'triples.bin')

In [54]:
def get_nei(triples, max_length, ent_total):
    from collections import defaultdict
    import copy
    neis = [{} for i in range(max_length+1)] # neis[i] stores i-hop neighbors
    
    neis[0] = {e:{e} for e in range(ent_total)}

    for i in range(ent_total):
        neis[1][i] = set()
    for h, r, t in triples:
        neis[1][h].add(t)
        neis[1][t].add(h)
    
    for length in range(2, max_length+1):
        nei_1 = neis[1]
        nei_last = neis[length-1]
        nei = neis[length]
        for center in range(ent_total):
            nei[center] = copy.deepcopy(nei_1[center])
            for i in nei_1[center]:
                nei[center] = nei[center].union(nei_last[i])
    for i in range(5):
        for j in range(i+1, 6):
            for e in range(ent_total):
                neis[-i-1][e] -= neis[-j-1][e]

    return neis
neis = get_nei(triples, 5, ent_total)

In [24]:
labels = pack['labels'][:, 0]
N = len(labels)

results = []
for depth in range(6):
    result = 0
    for i in range(N):
        topk = pack['idx'][i][:1].tolist()
        result += len(neis[depth][int(labels[i])].intersection(topk)) / 1 # top 1
    result /= N
    print(f'{depth}-hop neighbor in top{1} = {result}')
    results.append(result)

print('sum = ', sum(results))

0-hop neighbor in top1 = 0.6859903381642513
1-hop neighbor in top1 = 0.04710144927536232
2-hop neighbor in top1 = 0.05404589371980676
3-hop neighbor in top1 = 0.02717391304347826
4-hop neighbor in top1 = 0.023852657004830916
5-hop neighbor in top1 = 0.021437198067632852
sum =  0.8596014492753624


In [25]:
from collections import defaultdict
child2parent = defaultdict(set)
for h, r, t in triples:
    if r == 'is_a':
        child2parent[h].add(t)

E = len(name_array)
siblings = set()
for a in range(E):
 for c in range(a+1, E):
  if len(child2parent[a].intersection(child2parent[c])) > 0:
   siblings.add((a,c))
   siblings.add((c,a))

grandpas = set()
grandsons = set()
for aa in range(E):
 for b in child2parent[aa]:
  for c in child2parent[b]:
   grandpas.add((aa,c))
   grandsons.add((c,aa))

In [35]:
# %%
from tqdm import tqdm
entity_set = {}
for aa, c in siblings:
    entity_set[aa] = None
    entity_set[c] = None
for aa, c in grandsons:
    entity_set[aa] = None
    entity_set[c] = None
entity_ids = list(entity_set.keys())
names = [name_array[i] for i in entity_ids]
inputs = tokenizer(names, return_tensors='pt', max_length=60, padding='max_length')
#%%
from torch.utils.data import Dataset, TensorDataset, DataLoader
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
dataloader = DataLoader(dataset=TensorDataset(input_ids, attention_mask), batch_size=32, shuffle=False)
name_emb = []

bert_lm.eval()
bert_lm.cuda()
with torch.no_grad():
    for i, (input_ids, attention_mask) in enumerate(dataloader):
        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()

        tmp = bert_lm.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0]
        tmp = bert_lm.cls.predictions.transform(tmp)
        name_emb.append(tmp.cpu())
name_emb = torch.cat(name_emb, dim=0)

In [36]:
siblings_l2 = l2_dist(name_emb[[aa for (aa, c) in siblings]], name_emb[[c for (aa, c) in siblings]])
grandpas_l2 = l2_dist(name_emb[[aa for (aa, c) in grandpas]], name_emb[[c for (aa, c) in grandpas]])
grandsons_l2 = l2_dist(name_emb[[aa for (aa, c) in grandsons]], name_emb[[c for (aa, c) in grandsons]])
child_parent = [(h,t) for h,r,t in triples if r == 'is_a']
parent_l2 = l2_dist(name_emb[[aa for (aa, c) in child_parent]], name_emb[[c for (aa, c) in child_parent]])
assert grandsons_l2.mean() == grandpas_l2.mean()

AssertionError: 

In [37]:
siblings_l2.mean()

tensor(43.1007)

In [38]:
grandpas_l2.mean()

tensor(43.4708)

In [40]:
parent_l2.mean()

tensor(43.1485)

In [50]:
import numpy as np
import Levenshtein
siblings_edit = np.mean([Levenshtein.distance(name_array[a], name_array[c]) for a,c in siblings])
grandpas_edit = np.mean([Levenshtein.distance(name_array[a], name_array[c]) for a,c in grandpas])
grandsons_edit = np.mean([Levenshtein.distance(name_array[a], name_array[c]) for a,c in grandsons])

child_parent = [(h,t) for h,r,t in triples if r == 'is_a']
parent_edit = np.mean([Levenshtein.distance(name_array[a], name_array[c]) for a,c in child_parent])
assert grandsons_edit.mean() == grandpas_edit.mean()

print(f'siblings_edit = {siblings_edit}')
print(f'grandpas_edit = {grandpas_edit}')
print(f'parent_edit = {parent_edit}')

siblings_edit = 19.415777908783966
grandpas_edit = 24.503273590310172
parent_edit = 18.355697810789678


In [48]:
import numpy as np
import Levenshtein
siblings_edit = {}
grandpas_edit = {}
grandsons_edit = {}
parent_edit = {}
child_edit = {}

for a, c in siblings:
    dist = Levenshtein.distance(name_array[a], name_array[c])
    if dist < siblings_edit[]

print(f'siblings_edit = {siblings_edit}')
print(f'grandpas_edit = {grandpas_edit}')
print(f'parent_edit = {parent_edit}')

In [57]:
import numpy as np
import Levenshtein

ret = []
for syn, ent in mention2id.items():
    ent = name_array[ent]
    if syn == ent:
        continue
    ret.append(Levenshtein.distance(syn, ent))
print(np.mean(ret))

16.89960139511709


In [63]:
synonym_list, dictionaries = torch.load('/tmp/synonym_list')
datasets = set(synonym_list.keys())
assert set(synonym_list.keys()) == set(dictionaries.keys())
ret = {}
for i in datasets:
    tmp = []
    for ent, syns in synonym_list[i].items():
        if ent not in dictionaries[i]:
            continue
        ent = dictionaries[i][ent]
        for syn in syns:
            if syn == ent:
                continue
            tmp.append(Levenshtein.distance(syn, ent)) 
    ret[i] = np.mean(tmp)

In [64]:
ret

{'bc5cdr-chemical': 19.318244170096023,
 'bc5cdr-disease': 13.852631578947369,
 'ncbi-disease': 17.92057761732852}