In [1]:
import json

import numpy as np

import torch

In [2]:
with open('../input/umls-kb/umls.2020AA.active.full.json',) as f:
  
# returns JSON object as 
# a dictionary
    kb = json.load(f)

In [3]:
kb['C4308010']

{'SAB': 'MSH',
 'STY': ['T116', 'T123'],
 'DEF': [],
 'STR': [],
 'Name': 'DCTN4 protein, human'}

In [4]:
akeys = []
for k, v in kb.items():
    for STR in kb[k]['STR']:
        akeys.append(STR)
    akeys.append(kb[k]['Name'])
len(akeys)

5015434

In [5]:
akeys

['(131)I-Macroaggregated Albumin',
 '1,2-Dipalmitoylphosphatidylcholine',
 '1,2 Dipalmitoylphosphatidylcholine',
 '1,2-dipalmitoylphosphatidylcholine',
 'Branching Enzyme, 1,4-alpha-Glucan',
 '1,4-alpha-Glucan branching enzyme',
 '1,4-Alpha glucan branching enzyme',
 'Enzyme, 1,4-alpha-Glucan Branching',
 '1,4 alpha Glucan Branching Enzyme',
 '1,4-alpha-Glucan Branching Enzyme',
 '1 Alkyl 2 Acylphosphatidates',
 '1-Alkyl-2-Acylphosphatidates',
 '1 Carboxyglutamic Acid',
 '1-Carboxyglutamic Acid',
 '1 Methyl 3 isobutylxanthine',
 '1-Methyl-3-isobutylxanthine',
 '1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine',
 '1 Methyl 4 phenylpyridinium',
 '1-Methyl-4-phenylpyridinium',
 '1-naphthylamine',
 '1 Naphthylamine',
 '1-Naphthylamine',
 '1 Naphthylisothiocyanate',
 '1-Naphthylisothiocyanate',
 'Angiotensin II, 1-Sarcosine-8-Isoleucine',
 '1 Sarcosine 8 Isoleucine Angiotensin II',
 '1-Sarcosine-8-Isoleucine Angiotensin II',
 '11 Hydroxycorticosteroids',
 '11-Hydroxycorticosteroids',
 'syntheti

## Data

In [6]:
def reformat_data(data_file):
    with open(data_file, 'r') as file:
        article_sentences, article_labels = [], []
        sentence_tokens, sentence_labels = [], []
        
        for line in file.readlines():
            if "-DOCSTART-" in line:
                if sentence_labels != []:
                    article_sentences.append(sentence_tokens)
                    article_labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
            else:
                try:
                    token = line.split("\t")[0]
                    label = line.split("\t")[3][:-1]
                    sentence_tokens.append(token)
                    sentence_labels.append(label)
                except:
                    if sentence_labels != []:
                        article_sentences.append(sentence_tokens)
                        article_labels.append(sentence_labels)
                        sentence_tokens, sentence_labels = [], []
                
    return article_sentences, article_labels
    
train_sentences, train_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.train.conll")
test_sentences, test_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.test.conll")
dev_sentences, dev_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.dev.conll")
_, train_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.train.conll")
_, test_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.test.conll")
_, dev_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.dev.conll")

## UMLSBert Model

In [7]:
%%capture

from transformers import BertTokenizer, BertConfig, TFBertForTokenClassification, TFBertModel, BertModel
!wget -O umlsbert.tar.xz https://www.dropbox.com/s/qaoq5gfen69xdcc/umlsbert.tar.xz?dl=0
!tar -xvf umlsbert.tar.xz
#https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
#https://github.com/billpku/NLP_In_Action/blob/master/NER_with_BERT.ipynb
tokenizer = BertTokenizer.from_pretrained('./umlsbert')

## Tokenizer

In [108]:
test1 = tokenizer.encode('Nyanza')
test2 = tokenizer.convert_tokens_to_ids('Pseudomonas aeruginosa (Pa) infection')
test5 = tokenizer.convert_tokens_to_ids('Pa')
print(test2, test5)
test3 = tokenizer.encode('Kenya')
test4 = tokenizer.encode('Pa')

print(test1)
print(test3)
## This is strange
print(tokenizer.convert_ids_to_tokens(test1))
print(test2)
print(tokenizer.convert_ids_to_tokens(1394))

[101, 23563, 24507, 1116, 170, 1200, 9610, 4559, 3202, 113, 185, 1161, 114, 8974, 102] [101, 185, 1161, 102]
[101, 183, 6582, 3293, 102]
[101, 180, 1424, 2315, 102]
['[CLS]', 'n', '##yan', '##za', '[SEP]']
[101, 23563, 24507, 1116, 170, 1200, 9610, 4559, 3202, 113, 185, 1161, 114, 8974, 102]
##in


In [83]:
config = BertConfig.from_json_file('./umlsbert/config.json')
config.output_hidden_states=True
umlsbert = BertModel.from_pretrained('./umlsbert', config=config)

# Set the device to GPU (cuda) if available, otherwise stick with CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
umlsbert = umlsbert.to(device)
umlsbert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [97]:
test3 = torch.tensor(test3)
test1 = torch.tensor(test1)
test4 = torch.tensor(test4)


In [98]:
test3.unsqueeze_(0)
test1.unsqueeze_(0)
test4.unsqueeze_(0)

tensor([[ 101,  185, 1161,  102]])

In [103]:
with torch.no_grad():
    out = umlsbert(input_ids=test1)
hidden_states1 = out.hidden_states[-1]
with torch.no_grad():
    out = umlsbert(input_ids=test3)
hidden_states3 = out.hidden_states[-1]
with torch.no_grad():
    out = umlsbert(input_ids=test4)
hidden_states4 = out.hidden_states[-1]

In [104]:
hidden_states1.shape, hidden_states3.shape, hidden_states4.shape

(torch.Size([1, 5, 768]), torch.Size([1, 5, 768]), torch.Size([1, 4, 768]))

In [106]:
hidden_states4[0]

tensor([[ 0.2660,  0.0167,  0.0315,  ..., -0.2036, -0.0999, -0.4413],
        [-0.0501, -0.0265, -0.0585,  ...,  0.6674, -0.0237,  0.1157],
        [ 0.4457, -0.4611, -0.0657,  ...,  0.1706,  0.1314, -0.1926],
        [ 0.2935, -0.4333, -0.1174,  ..., -0.4061, -0.7934, -0.1693]])

In [95]:
cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)
torch.mean(cos(hidden_states1, hidden_states3))
torch.mean(cos(hidden_states1, hidden_states4))

tensor(0.8102)