In [47]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
import torch
import numba
import math
import json
from bertviz import head_view
from numba import jit, cuda

**Download Model & Tokenizer**

In [48]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True, output_attentions=True)
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Read in & Process Tokenized Data (JSON)**

In [17]:
with open('/shared/data2/pk36/multidim/CORD-NER-corpus.json') as corpus:
    docs = corpus.readlines()

data = []
for d in tqdm(docs):
    data.append(json.loads(d))

100%|████████████████████████████████████| 29500/29500 [00:31<00:00, 951.28it/s]


In [70]:
def bert_text_preparation(text, tokenizer):
    text.insert(0, '[CLS]')
    text.append('[SEP]')
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(text)
    segments_ids = [1]*len(text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return text, tokens_tensor, segments_tensors

def bert_text_str(text, tokenizer):
    text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

def get_bert_embeddings(tokens_tensor, segments_tensors, model, text=None, viz=False):
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]
    
    if viz:
        head_view(outputs[-1], text)
        
    return list_token_embeddings

In [77]:
test_str = "should paramedics intubate patients with sars like symptoms ?"
text, tok, seg = bert_text_str(test_str, tokenizer)

In [79]:
text, tok

(['[CLS]',
  'should',
  'para',
  '##med',
  '##ics',
  'int',
  '##uba',
  '##te',
  'patients',
  'with',
  'sar',
  '##s',
  'like',
  'symptoms',
  '?',
  '[SEP]'],
 tensor([[  101,  2323, 11498,  7583,  6558, 20014, 19761,  2618,  5022,  2007,
          18906,  2015,  2066,  8030,  1029,   102]]))

In [72]:
test_emb = get_bert_embeddings(tok, seg, model, text, True)

<IPython.core.display.Javascript object>

In [59]:
text

['[CLS]',
 'should',
 'para',
 '##med',
 '##ics',
 'int',
 '##uba',
 '##te',
 'patients',
 'with',
 'sar',
 '##s',
 'like',
 'symptoms',
 '?',
 '[SEP]']

In [81]:
test_sent = ["should", "paramedics", "intubate", "patients", "with", "sars", "like", "symptoms", "?"]
text, tok, seg = bert_text_preparation(test_sent, tokenizer)
print(text, tok)
test_emb = get_bert_embeddings(tok, seg, model, text, True)

['[CLS]', 'should', 'paramedics', 'intubate', 'patients', 'with', 'sars', 'like', 'symptoms', '?', '[SEP]'] tensor([[ 101, 2323,  100,  100, 5022, 2007,  100, 2066, 8030, 1029,  102]])


<IPython.core.display.Javascript object>

In [43]:
tok

tensor([[ 101, 2323,  100,  100, 5022, 2007,  100, 2066, 8030, 1029,  102]])

In [27]:
word_embeddings = None # shape: (# words, 768)
all_words = []
add_topics = True
limit = 100

for i in tqdm(np.arange(len(data))):
    if i > limit:
        break
    for s in data[i]['sents']:
        tokens_tensor, segments_tensors = bert_text_preparation(s['sent_tokens'], tokenizer)
        emb = get_bert_embeddings(tokens_tensor, segments_tensors, model)
        all_words.extend(s)

    if word_embeddings is None:
        word_embeddings = emb
    else:
        word_embeddings = np.append(word_embeddings, emb, axis=0)

  0%|                                    | 101/29500 [06:01<29:14:20,  3.58s/it]


**Helper Functions**

In [3]:
def bert_text_preparation(text, tokenizer, clean=True):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    """
    marked_text = "[CLS] " + text + " [SEP]"
    if clean:
        tokenized_text = marked_text.split(" ")
    else:
        tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

**Split the Dataset in 512 Chunks & Tokenize**

In [19]:
virus_type = ["COVID-19", "sars", "MERS", "Ebola"]
virus_study = ["origin", "evolution", "symptom", "examination"]
age_group = ["infant", "adult", "elderly"]

#all_topics = ["covid_19", "sars", "mers", "ebola", "origin", "evolution", "symptom", "examination", "infant", "adult", "elderly"]
all_topics = ["sars", "symptom"]
topics_added = False

In [None]:
def generate_chunks(corpus, size=512, limit=None): #TODO: CHANGE TO GENERATOR and might need to introduce overlap
    # corpus is an array of strings where each item represents a document
    for doc in tqdm(corpus):
        if num == limit:
            break
        sents = doc.split(".")
        for s in sents:
            marked_s = "[CLS] " + s + " [SEP]"
            split_sent = marked_s.split(" ")
        
        ret_txt = np.array_split(split_sent, math.ceil(tokenized_text.shape[0]/size))
        if not topics_added:
            token_topics = [np.array(["[CLS]", w, "[SEP]"], dtype=str) for w in all_topics]
            ret_txt[:0] = token_topics
            topics_added = True
        
    
    return np.array(ret_txt, dtype=np.ndarray)

In [20]:
def chunkize(corpus, size=512, limit=None): #TODO: CHANGE TO GENERATOR and might need to introduce overlap
    tokenized_text = None
    num = 0
    # corpus is an array of strings where each item represents a document
    for doc in tqdm(corpus):
        if num == limit:
            break
        sents = doc.split(".")
        for s in sents:
            marked_s = "[CLS] " + s + " [SEP]"
            split_sent = marked_s.split(" ")
            if tokenized_text is None:
                tokenized_text = split_sent
            else:
                tokenized_text = np.append(tokenized_text, split_sent)
            
        num += 1
    print(tokenized_text.shape)
    ret_txt = np.array_split(tokenized_text, math.ceil(tokenized_text.shape[0]/size))
    token_topics = [np.array(["[CLS]", w, "[SEP]"], dtype=str) for w in all_topics]
    ret_txt[:0] = token_topics
    
    return np.array(ret_txt, dtype=np.ndarray)

def tokenize(chunk):        
    indexed_tokens = tokenizer.convert_tokens_to_ids(chunk)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokens_tensor, segments_tensors

**Read in Corpus & Define Topics**

In [6]:
with open('/shared/data2/pk36/multidim/covid_phrase_text.txt') as f:
    docs = f.readlines()

In [21]:
chunks = chunkize(docs, limit=100)
print(chunks.shape)

  0%|                                    | 100/29500 [02:43<13:20:50,  1.63s/it]

(286384,)
(562,)





**Compute Embeddings for All Words in Each Chunk**

In [22]:
word_embeddings = None # shape: (# words, 768)
all_words = []
add_topics = True

for chunk in tqdm(chunks):
    tokens_tensor, segments_tensors = tokenize(chunk)
    list_token_embeddings = np.array(get_bert_embeddings(tokens_tensor, segments_tensors, model)) # shape: (512 tokens in chunk, 768)
    all_words.extend(chunk)

    if word_embeddings is None:
        word_embeddings = list_token_embeddings
    else:
        word_embeddings = np.append(word_embeddings, list_token_embeddings, axis=0)

100%|█████████████████████████████████████████| 562/562 [06:57<00:00,  1.35it/s]


In [23]:
len(all_words)

286390

In [24]:
word_embeddings.shape

(286390, 768)

**Cosine-Similiarity**

_Word-to-Word:_

In [72]:
topic_col = []
word_col = []
cos_col = []
topic_emb = np.zeros((len(all_topics), 768))

for i in tqdm(np.arange(len(all_topics))):
    topic_col.extend(np.repeat(all_topics[i], len(all_words)))
    word_col.extend(all_words)
    #cos_col.extend(cosine_similarity(word_embeddings[3*i+1].reshape(1, -1), word_embeddings).reshape(-1, 1))

topic_col = np.array(topic_col)
word_col = np.array(word_col)
#cos_col = np.array(cos_col).reshape((-1, ))

data = np.array([topic_col, word_col]).T
cosine_df = pd.DataFrame(data, columns=['topic', 'word'])

# now we have the indices, let's compute cosine similarity

for i in tqdm(np.arange(len(all_topics))):
    topic_emb[i] = np.mean(word_embeddings[list(cosine_df[:len(all_words)][cosine_df[:len(all_words)]['word'] == all_topics[i]].index)])
    cos_col.extend(cosine_similarity(topic_emb[i].reshape(1, -1), word_embeddings).reshape(-1, 1))
cos_col = np.array(cos_col).reshape((-1, ))
cosine_df["cosine"] = cos_col

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 12.48it/s]
100%|█████████████████████████████████████████████| 2/2 [00:01<00:00,  1.02it/s]


In [73]:
cosine_df.head(10)

Unnamed: 0,topic,word,cosine
0,sars,[CLS],0.019248
1,sars,sars,0.024972
2,sars,[SEP],0.029875
3,sars,[CLS],0.019248
4,sars,symptom,0.024972
5,sars,[SEP],0.029875
6,sars,[CLS],0.020109
7,sars,angiotensin_converting_enzyme,0.021145
8,sars,2,0.019495
9,sars,(,0.0201


In [74]:
len(topic_col), len(word_col), len(cos_col)

(572780, 572780, 572780)

In [31]:
cosine_df

Unnamed: 0,topic,word,cosine
0,sars,[CLS],0.2028250618645076
1,sars,sars,1.000000000000001
2,sars,[SEP],0.08997175855176623
3,sars,[CLS],0.2028250618645076
4,sars,symptom,1.000000000000001
...,...,...,...
572775,symptom,,0.7400542686902241
572776,symptom,[SEP],0.11537445452665142
572777,symptom,[CLS],0.154220021546168
572778,symptom,\n,0.7534395201913552


In [77]:
cosine_df["cosine"] = pd.to_numeric(cosine_df["cosine"])
word_df = cosine_df[(cosine_df.topic == "sars") 
                    & (cosine_df.word != "[CLS]") 
                    & (cosine_df.word != "[SEP]")
                    & (cosine_df.word.str.len() > 3)
                    & ~(cosine_df.word.isin(all_topics))].sort_values(by=['cosine'], ascending=False)
word_df = word_df[word_df.word.str.isalnum()]
word_df.head(10)

Unnamed: 0,topic,word,cosine
46567,sars,quarantined,0.037342
111658,sars,other,0.037132
257522,sars,other,0.036575
223876,sars,respectively,0.036482
258232,sars,boulos,0.036235
219151,sars,other,0.036134
258231,sars,kamel,0.036049
61780,sars,peilun,0.035977
219794,sars,underestimation,0.035923
215969,sars,were,0.035893


In [32]:
cosine_df["cosine"] = pd.to_numeric(cosine_df["cosine"])
word_df = cosine_df[(cosine_df.topic == "sars") 
                    & (cosine_df.word != "[CLS]") 
                    & (cosine_df.word != "[SEP]")
                    & (cosine_df.word.str.len() > 3)
                    & ~(cosine_df.word.isin(all_topics))].sort_values(by=['cosine'], ascending=False)
word_df = word_df[word_df.word.str.isalnum()]
word_df.head(10)

Unnamed: 0,topic,word,cosine
89110,sars,mmons,0.640814
250640,sars,omics,0.63859
250639,sars,prote,0.638193
93718,sars,gretl2019d,0.637314
89109,sars,iveco,0.636099
50768,sars,meegid,0.635733
104826,sars,bigd,0.635045
250632,sars,logy,0.633736
50778,sars,104272,0.632624
93765,sars,sourceforge,0.632146


In [34]:
cosine_df["cosine"] = pd.to_numeric(cosine_df["cosine"])
word_df = cosine_df[(cosine_df.topic == "symptom") 
                    & (cosine_df.word != "[CLS]") 
                    & (cosine_df.word != "[SEP]")
                    & (cosine_df.word.str.len() > 3)
                    & ~(cosine_df.word.isin(all_topics))].sort_values(by=['cosine'], ascending=False)
word_df = word_df[word_df.word.str.isalnum()]
word_df.head(10)

Unnamed: 0,topic,word,cosine
375500,symptom,mmons,0.640814
537030,symptom,omics,0.63859
537029,symptom,prote,0.638193
380108,symptom,gretl2019d,0.637314
375499,symptom,iveco,0.636099
337158,symptom,meegid,0.635733
391216,symptom,bigd,0.635045
537022,symptom,logy,0.633736
337168,symptom,104272,0.632624
380155,symptom,sourceforge,0.632146


In [38]:
word_df.head(30)

Unnamed: 0,topic,word,cosine
115202,symptom,role,0.714869
102915,symptom,noted,0.710099
104451,symptom,virus,0.710076
71683,symptom,guarding,0.683049
274634,symptom,fixed,0.661575
204116,symptom,code,0.65788
97283,symptom,risk,0.657662
54275,symptom,virus,0.657354
87044,symptom,better,0.656019
151483,symptom,reported,0.650768


In [25]:
word_df[(word_df.topic == "ebola")]

Unnamed: 0,topic,word,cosine
974483,ebola,role,0.714869
962196,ebola,noted,0.710099
963732,ebola,virus,0.710076
930964,ebola,guarding,0.683049
1133915,ebola,fixed,0.661575
...,...,...,...
1078429,ebola,other,-0.140917
1116801,ebola,hand,-0.140965
970936,ebola,other,-0.146952
1119358,ebola,case,-0.149871


**Scratch Work (aka ignore)**

In [47]:
target_word_embeddings = []

for i in np.arange(0, len(text[0]), 512):
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text[0][i:min(i+512, len(text[0]))], tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    word = "covid_19" # virus_type[1]

    # Find the position 'bank' in list of tokens
    if word not in tokenized_text:
        if i == 0:
            print(tokenized_text)
        print(i, ": not found")
    else:
        print(i, ": found")
        word_index = tokenized_text.index(word)
        # Get the embedding for bank
        word_embedding = list_token_embeddings[word_index]
        target_word_embeddings.append(word_embedding)

['[CLS]', 'ang', '##iot', '##ens', '##in', '_', 'converting', '_', 'enzyme', '2', '(', 'ace', '##2', ')', 'as', 'a', 'sar', '##s', '_', 'co', '##v', '_', '2', 'receptor', 'molecular', '_', 'mechanisms', 'and', 'potential', 'therapeutic', '_', 'target', 'sar', '##s', '_', 'co', '##v', '_', '2', 'has', 'been', 'sequence', '##d', '3', '.', 'a', 'phylogenetic', 'analysis', '3', ',', '4', 'found', 'a', 'bat', 'origin', 'for', 'the', 'sar', '##s', '_', 'co', '##v', '_', '2', '.', 'there', 'is', 'a', 'diversity', 'of', 'possible', 'intermediate', '_', 'hosts', 'for', 'sar', '##s', '_', 'co', '##v', '_', '2', ',', 'including', 'pang', '##olin', '##s', ',', 'but', 'not', 'mice', 'and', 'rats', '5', '.', 'there', 'are', 'many', 'similarities', 'of', 'sar', '##s', '_', 'co', '##v', '_', '2', 'with', 'the', 'original', 'sar', '##s', 'co', '##v', '.', 'using', 'computer', 'modeling', ',', 'xu', 'et', 'al', '.', '6', 'found', 'that', 'the', 'spike', '_', 'proteins', 'of', 'sar', '##s', '_', 'co', '#

In [28]:
word_embeddings = None # shape: (# words, 768)
all_words = []
add_topics = True

for doc in tqdm(docs):
    split_text = doc.split(".") # split corpus into list of sentences
    if add_topics:
        split_text[:0] = all_topics
        add_topics = False

    for sentence in split_text:
        tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
        list_token_embeddings = np.array(get_bert_embeddings(tokens_tensor, segments_tensors, model))[1:-1] # shape: (# words in sentence, 768)
        all_words.extend(tokenized_text[1:-1])

        if word_embeddings is None:
            word_embeddings = list_token_embeddings
        else:
            word_embeddings = np.append(word_embeddings, list_token_embeddings, axis=0)
    
    # word_embeddings.append(list_token_embeddings[1:-1]) # ignore CLS and SEP tokens

  5%|█▍                              | 1328/29500 [5:54:55<125:29:12, 16.04s/it]


KeyboardInterrupt: 

In [10]:
# Calculating the distance between the embeddings of the word in all the given contexts of the word

list_of_distances = []
for text1, embed1 in zip(split_text, target_word_embeddings):
    for text2, embed2 in zip(split_text, target_word_embeddings):
        cos_dist = 1 - cosine(embed1, embed2)
        list_of_distances.append([text1, text2, cos_dist])

distances_df = pd.DataFrame(list_of_distances, columns=['text1', 'text2', 'distance'])

In [11]:
word_df = distances_df[distances_df.text1 == word].sort_values(by=['distance'], ascending=False)
word_df.head(10)

Unnamed: 0,text1,text2,distance
0,sars,sars,1.0
1,sars,angiotensin_converting_enzyme 2 ( ace2 ) as a ...,0.615406
3,sars,there is a diversity of possible intermediate...,0.602366
9,sars,"5 identity in amino_acid sequences 6 and , im...",0.547345
14,sars,this similarity with sars cov is critical bec...,0.53167
15,sars,it is required for host_cell entry and subseq...,0.527674
10,sars,wan et al,0.520226
11,sars,4 reported that residue 394 ( glutamine ) in ...,0.51776
13,sars,"thus , the sars_cov_2 spike_protein was predi...",0.512084
12,sars,further analysis even suggested that sars_cov...,0.509995


In [14]:
word_df["text2"][2]

' a phylogenetic analysis 3 , 4 found a bat origin for the sars_cov_2 '