In [1]:
import pandas as pd
import torch
import pickle
from transformers import AutoTokenizer,AutoModel, BertConfig
from tqdm import tqdm
import networkx as nx
import math
import numpy as np

In [2]:
# Checkpoint used to embed the node names.
# Alternative checkpoints (swap one in to compare encoders):
#   'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
#   "bert-base-uncased"
#   "dmis-lab/biobert-base-cased-v1.2"
model_name = "bioformers/bioformer-16L"

# output_hidden_states=True makes the model also return the per-layer hidden
# states, in addition to last_hidden_state.
config = BertConfig.from_pretrained(model_name, output_hidden_states=True)
model = AutoModel.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/159M [00:00<?, ?B/s]

Some weights of the model checkpoint at bioformers/bioformer-16L were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

In [3]:
def create_kegg_graph(data_df, include_reactions = True):
    """Build a directed multigraph from a table of KEGG relations.

    Each row of ``data_df`` describes one relation between a head and a tail
    entity. Every distinct id becomes exactly one node; every row becomes one
    edge from head to tail.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Must provide the columns 'head id', 'tail id', 'head full name',
        'tail full name', 'pathway', 'link type' and 'relation name'.
    include_reactions : bool, default True
        When False, rows whose 'link type' equals 'reaction' are skipped
        (neither their nodes nor their edges are added).

    Returns
    -------
    networkx.MultiDiGraph
        Nodes carry the attributes 'type', 'full name' and 'pathways'
        (list of pathway strings). Edges carry 'pathway', 'link_type' and
        'relation_name' (all strings).
    """
    G = nx.MultiDiGraph()  # Multigraph first; callers may convert it later
    n_rows = data_df.shape[0]

    # Pass 1: create one node per distinct id (gene, compound, etc.)
    for _, entry in tqdm(data_df.iterrows(), total=n_rows):

        if not include_reactions and entry['link type']=='reaction':
            continue

        # Always handle pathways as strings so the per-node list stays
        # homogeneous and the duplicate check below is reliable.
        pathway = str(entry['pathway'])
        _upsert_kegg_node(G, entry['head id'], entry['head full name'], pathway)
        _upsert_kegg_node(G, entry['tail id'], entry['tail full name'], pathway)

    # Pass 2: create one edge per relation
    for index, row in tqdm(data_df.iterrows(), total=n_rows):

        if not include_reactions and row['link type']=='reaction':
            continue

        head = str(row['head id'])
        tail = str(row['tail id'])
        pathway = str(row['pathway'])
        link_type = str(row['link type'])
        rel_name = str(row['relation name'])
        if head in G.nodes and tail in G.nodes:
            G.add_edge(head, tail,pathway = pathway, link_type= link_type, relation_name= rel_name)
        else:
            print('node not found @row '+str(index))

    return G


def _upsert_kegg_node(G, node_id, raw_full_name, pathway):
    """Add ``node_id`` to ``G``, or record ``pathway`` on the existing node."""
    if node_id in G:
        # Known node: only extend its pathway list, without duplicates.
        known_pathways = G.nodes[node_id]['pathways']
        if pathway not in known_pathways:
            known_pathways.append(pathway)
        return

    # Node type comes from the id prefix: 'hsa' -> gene, 'cpd' -> compound,
    # anything else -> the text before the first ':'.
    prefix = node_id[0:3]
    if prefix == 'hsa':
        node_type = 'gene'
    elif prefix == 'cpd':
        node_type = 'compound'
    else:
        node_type = node_id.split(":")[0]

    # Fall back to the id when no name is given. pandas reads empty CSV cells
    # as NaN by default, so NaN/None must be treated like ''.
    if pd.isna(raw_full_name) or raw_full_name == '':
        full_name = node_id
    else:
        full_name = raw_full_name

    G.add_node(node_id)
    G.nodes[node_id].update(
        {'type': node_type, 'full name': full_name, 'pathways': [pathway]}
    )
    

In [4]:
# === load data ===
# Read the relation table, build the directed multigraph from it, then
# convert that to a plain nx.Graph for the node listing below.
relations_csv = '../KEGG Pathways Dataset Collection/All_relations-Curated-full_names.csv'
df_relations = pd.read_csv(relations_csv)
G_directed = create_kegg_graph(df_relations)
G_undirected = nx.Graph(G_directed)
print(G_undirected)

100%|██████████████████████████████████████████████████████████████████████████| 17448/17448 [00:02<00:00, 6670.71it/s]
100%|█████████████████████████████████████████████████████████████████████████| 17448/17448 [00:01<00:00, 12682.10it/s]


Graph with 5187 nodes and 11804 edges


In [5]:
# Collect the 'full name' attribute of every node (as a string), in node order
total_ls = [
    str(node_attrs['full name'])
    for _node_id, node_attrs in G_undirected.nodes(data=True)
]

In [6]:
# Make a dictionary to map each full name to an embedding representation
# (the [CLS] vector of the model's last hidden layer, as a numpy array).
d = {}
# Inference only: no_grad() avoids building an autograd graph for every
# forward pass, which saves memory and time without changing the values.
with torch.no_grad():
    for node in tqdm(total_ls):
        if node in d:
            # Same name already embedded (several nodes may share a name).
            continue
        # truncation=True keeps over-long names within the model's maximum
        # input length instead of failing inside the model.
        inputs = tokenizer(node, return_tensors='pt', truncation=True)
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        # [0][0]: first (only) sequence in the batch, first token ([CLS]).
        cls_token = last_hidden_states[0][0].detach().numpy()
        d[node] = cls_token


100%|██████████████████████████████████████████████████████████████████████████████| 5187/5187 [05:36<00:00, 15.41it/s]


In [7]:
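# Or concatenate the [CLS] vectors of 4 hidden-state outputs.
# NOTE: outputs[2][0..3] below are the embedding output and the first 3 layers;
# index with -4..-1 instead to get the last 4 layers.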
# d = {}
# for node in tqdm(total_ls):
#     inputs=tokenizer(node, return_tensors='pt')
#     outputs=model(**inputs)

#     cls1 = outputs[2][0][0][0].detach().numpy()
#     cls2 = outputs[2][1][0][0].detach().numpy()
#     cls3 = outputs[2][2][0][0].detach().numpy()
#     cls4 = outputs[2][3][0][0].detach().numpy()
#     cls_list = np.concatenate([cls1, cls2, cls3, cls4])
#     d[node] = cls_list

In [8]:
# Save as pickle file
# The context manager closes the file even if pickle.dump raises.
with open('node_embeddings_dict_BioFormer.pkl', 'wb') as pickle_out:
    pickle.dump(d, pickle_out)

In [9]:
# # Save as csv
# df = pd.DataFrame(d)
# df.to_csv('node_embeddings_dict.csv', index=False, header=True)