In [1]:
import pandas as pd
import torch
import pickle
from transformers import AutoTokenizer,AutoModel
from tqdm import tqdm
import networkx as nx
import math


In [2]:
model_name='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def create_kegg_graph(data_df, include_reactions = True):
    
    G = nx.MultiDiGraph()  # At first create multigraph, later it can be converted
    
    unique_entries = []  # List of unique entries. Each gene, ompound, etc needs to be stored as 1 node
    
    # Iterate data to create all nodes
    for index, entry in tqdm(data_df.iterrows(), total=data_df.shape[0]):
        
        if not include_reactions and entry['link type']=='reaction':
            continue            
        
        # -- Handle node for Head --
        n1 = entry['head id']
        if n1 not in unique_entries:  # If there is no node for this id yet
            
            unique_entries.append(n1)
            
            #Add node n1
            G.add_node(n1)
            # Assign node type
            if n1[0:3] == 'hsa':
                t = 'gene'
            elif n1[0:3] == 'cpd':
                t = 'compound'
            else:
                temp = n1.split(":")
                t = temp[0]
            # Assign rest of node attributes
            if entry['head full name'] == '':
                full_name = n1
            else:
                full_name = entry['head full name']
            pathway = str(entry['pathway'])
            attr = {n1: {'type':t, 'full name': full_name, 'pathways': [pathway]}}
            nx.set_node_attributes(G, attr)
            
        else:  # If there is a node for this id already
            # Add the pathway info (if different pathway)
            temp = G.nodes[n1]['pathways']
            if entry['pathway'] not in temp:
                temp.append(entry['pathway'])
                G.nodes[n1]['pathways'] = temp

        # -- Handle node for Tail --
        n2 = entry['tail id']
        if n2 not in unique_entries:  # If there is no node for this id yet
            
            unique_entries.append(n2)
            
            #Add node n2
            G.add_node(n2)
            # Assign node type
            if n2[0:3] == 'hsa':
                t = 'gene'
            elif n2[0:3] == 'cpd':
                t = 'compound'
            else:
                temp = n2.split(":")
                t = temp[0]
            # Assign rest of node attributes
            if entry['tail full name'] == '':
                full_name = n2
            else:
                full_name = entry['tail full name']
            pathway = str(entry['pathway'])
            attr = {n2: {'type':t, 'full name': full_name, 'pathways': [pathway]}}
            nx.set_node_attributes(G, attr)
            
        else:  # If there is a node for this id already
            # Add the pathway info (if different pathway)
            temp = G.nodes[n2]['pathways']
            if entry['pathway'] not in temp:
                temp.append(entry['pathway'])
                G.nodes[n2]['pathways'] = temp
        
        
     # Iterate data to find all relations           
    for index, row in tqdm(data_df.iterrows(), total=data_df.shape[0]):
        
        if not include_reactions and row['link type']=='reaction':
            continue 
        
        head = str(row['head id'])
        tail = str(row['tail id'])
        pathway = str(row['pathway'])
        link_type = str(row['link type'])
        rel_name = str(row['relation name'])
        if head in G.nodes and tail in G.nodes:
            G.add_edge(head, tail,pathway = pathway, link_type= link_type, relation_name= rel_name)
        else:
            print('node not found @row '+str(index))
        
    return G
    

In [5]:
# === load data ===
df_relations = pd.read_csv('../KEGG Pathways Dataset Collection/All_relations-Curated-full_names.csv')
G_directed = create_kegg_graph(df_relations)
G_undirected = nx.Graph(G_directed)
print(G_undirected)

100%|██████████████████████████████████████████████████████████████████████████| 17448/17448 [00:02<00:00, 6840.09it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17448/17448 [00:01<00:00, 13498.29it/s]


Graph with 5187 nodes and 11804 edges


In [6]:
# Make a list of all the node full names
total_ls = []

for node in  G_undirected.nodes:
    total_ls.append(str(G_undirected.nodes[node]['full name']))

In [7]:
# Make a dictionary to map each full name to an embedding representation
d = {}
for node in tqdm(total_ls):
    inputs=tokenizer(node, return_tensors='pt')
    outputs=model(**inputs)
    print(outputs)
    break
    last_hidden_states=outputs.last_hidden_state
    cls_token=last_hidden_states[0][0].detach().numpy()
    d[node] = cls_token


  0%|                                                                                         | 0/5187 [00:00<?, ?it/s]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2292, -0.3776, -0.0703,  ..., -0.2435,  0.2029,  0.1597],
         [ 0.1506, -0.2711, -0.1019,  ..., -0.0709,  0.2035,  0.2837],
         [-0.0202, -0.1781,  0.0586,  ...,  0.0383, -0.1587,  0.1914],
         ...,
         [ 0.1623,  0.2243, -0.0082,  ...,  0.3017, -0.2608,  0.1315],
         [ 0.0165,  0.3520, -0.1775,  ..., -0.1298,  0.4358,  0.3520],
         [-0.0765, -0.2918, -0.0911,  ..., -0.1019,  0.1305,  0.1017]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.5192e-01,  1.0660e-01,  7.0065e-01, -4.3321e-02,  4.2025e-01,
         -3.7654e-01, -1.0566e-01, -8.0333e-02,  9.2267e-01,  5.9243e-02,
          8.2885e-02,  5.6558e-01,  7.8434e-01, -2.6069e-01,  6.3992e-02,
          7.6690e-02,  2.0907e-02, -1.3029e-01,  2.2313e-01,  1.3015e-01,
         -2.2066e-02, -9.4702e-02, -2.4681e-01,  1.0075e-02,  8.1839e-01,
         -2.9057e-01,  9.7355e-01,  2.7913e-02, -9.9390e-01,  2.825




In [None]:
# Save as pickle file
pickle_out=open('node_embeddings_dict_Last4.pkl','wb')
pickle.dump(d,pickle_out)
pickle_out.close()

In [None]:
# # Save as csv
# df = pd.DataFrame(d)
# df.to_csv('node_embeddings_dict.csv', index=False, header=True)