# PubMed

In [1]:
 # This code works in Python 3.10.6
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx
from torch_geometric.datasets.dblp import DBLP
import random
import torch
from torch import optim
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, Linear, SAGEConv
import csv

import warnings
warnings.filterwarnings('ignore')

Load Dataset

In [2]:
df_nodes = pd.read_table(('node_pubmed.dat'),names=['node_id', 'node_name', 'node_type', 'node_attributes'],quoting=csv.QUOTE_NONE)
#df_nodes

In [3]:
df_labels_train = pd.read_table(('label_pubmed.dat'),names=['node_id', 'node_name', 'node_type', 'node_label'])
#df_labels_train

In [4]:
df_labels_test = pd.read_table(('label_pubmed.dat.test'),names=['node_id', 'node_name', 'node_type', 'node_label'])
#df_labels_test

In [5]:
df_labels = pd.concat([df_labels_train, df_labels_test],ignore_index=True)
#df_labels

In [6]:
df_nodes[df_nodes['node_type'] == 1]

Unnamed: 0,node_id,node_name,node_type,node_attributes
9,9,RVF,1,"-1.072947,1.158542,0.498745,-0.085264,0.474166..."
10,10,diarrheal_enterotoxin,1,"-0.016865,0.004677,0.066797,-0.062516,-0.0147,..."
13,13,VUSE,1,"-0.075162,0.044893,0.004554,-0.070527,-0.00537..."
14,14,cardiac_auscultation,1,"0.07281,-0.075333,-0.027466,0.005992,-0.065542..."
15,15,intraoral_squamous_cell_carcinoma,1,"0.043272,-0.057455,0.068631,0.059395,-0.065671..."
...,...,...,...,...
63095,63095,Schistosoma_mansoni_infections,1,"-0.013617,-0.046871,0.036176,-0.063161,-0.0984..."
63096,63096,permeability_transition_pore,1,"0.027475,0.017549,0.04185,0.013369,-0.092689,-..."
63098,63098,nutrient-deficient,1,"0.012721,-0.355395,-0.074993,-0.000885,-0.0688..."
63104,63104,archival_tumor,1,"0.066291,0.020142,0.146529,-0.058354,0.11038,0..."


TYPE	MEANING
0		GENE
1		DISEASE
2		CHEMICAL
3		SPECIES

Data Preparation

In [7]:
#Select the nodes of type 1 which are labeled
df_disease=pd.merge(df_nodes, df_labels, on="node_id")[['node_id','node_attributes','node_label']]

In [8]:
#df_disease

In [9]:
df_gene = df_nodes[df_nodes['node_type'] == 0]
#df_gene

In [10]:
df_chemical = df_nodes[df_nodes['node_type'] == 2]
#df_chemical

In [11]:
df_species = df_nodes[df_nodes['node_type'] == 3]
#df_species

In [12]:
def convert_string_to_float(df):
    return df['node_attributes'].apply(lambda x: np.fromstring(x, dtype=float, sep=',' ))

In [13]:
def convert_to_tensor(df):
    return torch.tensor(df).to(dtype=torch.float32)

In [14]:
disease= convert_string_to_float(df_disease)
#print(disease)
x_disease = convert_to_tensor(disease)
#x_disease

In [15]:
y_disease = torch.tensor(np.array(df_disease['node_label']), dtype=torch.long)
#y_disease

In [16]:
gene = convert_string_to_float(df_gene)
gene = gene.reset_index(drop=True)
x_gene = convert_to_tensor(gene)
#x_gene

In [17]:
chemical = convert_string_to_float(df_chemical)
chemical = chemical.reset_index(drop=True)
x_chemical = convert_to_tensor(chemical)
#x_chemical

In [18]:
species = convert_string_to_float(df_species)
species = species.reset_index(drop=True)
x_species = convert_to_tensor(species)
#x_species

Create Hetero Data

In [19]:
data = HeteroData({'disease':{'x': x_disease, 'y':y_disease},'gene':{'x': x_gene},
                          'chemical':{'x': x_chemical},'species':{'x': x_species}})

In [30]:
df_edges = pd.read_table(('link_pubmed.dat'),names=['source', 'target', 'link_type', 'link_weight'])
df_edges

Unnamed: 0,source,target,link_type,link_weight
0,47789,32267,8,225
1,14228,31867,3,2
2,35405,31559,5,2
3,31559,35405,5,2
4,885,32267,8,474
...,...,...,...,...
236453,4079,62356,2,1
236454,30859,57440,2,1
236455,39493,62538,2,1
236456,39493,32267,2,1


In [21]:
#Delete unlabeled nodes from edge list
df_type1= df_nodes[df_nodes['node_type'] == 1]

new_list = list(set(list(df_type1['node_id'])).difference(list(df_disease['node_id'])))

df_edges = df_edges[~df_edges['source'].isin(new_list)]
df_edges = df_edges[~df_edges['target'].isin(new_list)]

df_edges = df_edges.reset_index(drop=True)

In [22]:
#Get lists of edges
batchsize = 500
gene_to_gene = []
gene_to_disease = []
disease_to_disease = []
chemical_to_gene = []
chemical_to_disease = []
chemical_to_chemical = []
chemical_to_species = []
species_to_gene = []
species_to_disease = []
species_to_species = []
remaining_edges = []


for i in range(0, len(df_edges), batchsize):
    batch = df_edges[i:i+batchsize]
    #print(batch)

    if (batch.loc[i, "source"] in list(df_gene['node_id'])) and (batch.loc[i, "target"] in list(df_gene['node_id'])):
            gene_to_gene.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_gene['node_id'])) and (batch.loc[i, "target"] in list(df_disease['node_id'])):
            gene_to_disease.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_disease['node_id'])) and (batch.loc[i, "target"] in list(df_disease['node_id'])):
            disease_to_disease.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_chemical['node_id'])) and (batch.loc[i, "target"] in list(df_gene['node_id'])):
            chemical_to_gene.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_chemical['node_id'])) and (batch.loc[i, "target"] in list(df_disease['node_id'])):
            chemical_to_disease.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_chemical['node_id'])) and (batch.loc[i, "target"] in list(df_chemical['node_id'])):
            chemical_to_chemical.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_chemical['node_id'])) and (batch.loc[i, "target"] in list(df_species['node_id'])):
            chemical_to_species.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_species['node_id'])) and (batch.loc[i, "target"] in list(df_gene['node_id'])):
            species_to_gene.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_species['node_id'])) and (batch.loc[i, "target"] in list(df_disease['node_id'])):
            species_to_disease.append((batch.loc[i, "source"],batch.loc[i, "target"]))
            
    elif (batch.loc[i, "source"] in list(df_species['node_id'])) and (batch.loc[i, "target"] in list(df_species['node_id'])):
            species_to_species.append((batch.loc[i, "source"],batch.loc[i, "target"]))
    else:
        remaining_edges.append((batch.loc[i, "source"],batch.loc[i, "target"]))

In [23]:
def preprocess_edges(edgelist,node_list):
    res = [[node_list[i] for i, j in edgelist],[node_list[j] for i, j in edgelist]] 
    node_from = torch.tensor(res[0])
    node_to = torch.tensor(res[1])
    edges = torch.concat((node_from,node_to)).reshape(-1,len(node_from))
    return edges

In [24]:
def remap_indices(node_list):
    val_list = [*range(0, len(node_list), 1)]
    return dict(zip(node_list,val_list))

In [25]:
 #Re-map indices to correct range
gene_nodes_mapping = remap_indices(list(df_gene["node_id"]))
disease_nodes_mapping = remap_indices(list(df_disease["node_id"]))
chemical_nodes_mapping = remap_indices(list(df_chemical["node_id"]))
species_nodes_mapping = remap_indices(list(df_species["node_id"]))

In [26]:
node_list = {}
for d in [gene_nodes_mapping, disease_nodes_mapping, chemical_nodes_mapping,species_nodes_mapping]:
    node_list.update(d)

In [27]:
#Prepare edge tensor for hetero data
if gene_to_gene:
    edge_index_gene_gene = preprocess_edges(gene_to_gene,node_list)
    data['gene','to','gene'].edge_index = edge_index_gene_gene
if gene_to_disease:
    edge_index_gene_disease = preprocess_edges(gene_to_disease,node_list)
    data['gene','to','disease'].edge_index = edge_index_gene_disease
if disease_to_disease:
    edge_index_disease_disease = preprocess_edges(disease_to_disease,node_list)
    data['disease','to','disease'].edge_index = edge_index_disease_disease
if chemical_to_gene:
    edge_index_chemical_gene = preprocess_edges(chemical_to_gene,node_list)
    data['chemical','to','gene'].edge_index = edge_index_chemical_gene
if chemical_to_disease:
    edge_index_chemical_disease = preprocess_edges(chemical_to_disease,node_list)
    data['chemical','to','disease'].edge_index = edge_index_chemical_disease
if chemical_to_chemical:
    edge_index_chemical_chemical = preprocess_edges(chemical_to_chemical,node_list)
    data['chemical','to','chemical'].edge_index = edge_index_chemical_chemical
if chemical_to_species:
    edge_index_chemical_species = preprocess_edges(chemical_to_species,node_list)
    data['chemical','to','species'].edge_index = edge_index_chemical_species
if species_to_gene:
    edge_index_species_gene = preprocess_edges(species_to_gene,node_list)
    data['species','to','gene'].edge_index = edge_index_species_gene
if species_to_disease:
    edge_index_species_disease = preprocess_edges(species_to_disease,node_list)
    data['species','to','disease'].edge_index = species_to_disease
if species_to_species:
    edge_index_species_species = preprocess_edges(species_to_species,node_list)
    data['species','to','species'].edge_index = edge_index_species_species

In [28]:
# transform = T.RandomNodeSplit(split='train_rest', num_val=0.15, num_test=0.15)
# data = transform(data)

In [29]:
#Hetero Data
print(data)

HeteroData(
  disease={
    x=[454, 200],
    y=[454],
  },
  gene={ x=[13561, 200] },
  chemical={ x=[26522, 200] },
  species={ x=[2863, 200] },
  (gene, to, gene)={ edge_index=[2, 40] },
  (gene, to, disease)={ edge_index=[2, 1] },
  (chemical, to, gene)={ edge_index=[2, 70] },
  (chemical, to, chemical)={ edge_index=[2, 108] },
  (chemical, to, species)={ edge_index=[2, 18] },
  (species, to, gene)={ edge_index=[2, 6] }
)
