In [1]:
import networkx as nx
import pandas as pd
import torch_geometric as geom
import torch

In [3]:
# Create entity indexes
#
# entity_ids.del is read as a headerless, tab-separated table: column 0 is the
# id used by the "selfloops" dataset, column 1 is the entity name.

node_list = pd.read_csv(
    '../kge/data/selfloops/entity_ids.del',
    sep='\t', header=None
)


def _entity_table(nodes, mask, name_column):
    """Return the rows of `nodes` selected by `mask` as a re-indexed table.

    The result has columns ['selfloops_id', name_column] and a fresh 0..n-1
    index; that index is what identifies the node in torch-geometric.
    """
    table = nodes.loc[mask].reset_index(drop=True)
    table.columns = ['selfloops_id', name_column]
    return table


entity_names = node_list[1]

# Drugs are the entities whose name starts with 'CID'.
is_drug = entity_names.str.startswith('CID')
# Genes are the entities whose name does not start with 'C' at all, so names
# that start with 'C' but not 'CID' end up in neither table.
is_gene = ~entity_names.str.startswith('C')

drug_ids_selfloops = node_list.index[is_drug].tolist()
drug_list = _entity_table(node_list, is_drug, 'drug_name')

gene_ids_selfloops = node_list.index[is_gene].tolist()
gene_list = _entity_table(node_list, is_gene, 'gene_name')


In [4]:
# Load the saved drug and gene embeddings (later used as node features).
# Note: torch.load unpickles the file, so only point it at trusted files.
drug_emb, gene_emb = (
    torch.load(path) for path in ('emb/drug_vecs.pt', 'emb/gene_vecs.pt')
)

In [23]:
# Load Decagon drug-targets edgelist

# Everything is cast to str so the values can be looked up by entity name.
targets = pd.read_csv('../Chapter2/data/raw/bio-decagon-targets.csv').astype(str)

# Get dicts to convert entity names to IDs (the row index of drug_list / gene_list).
# gene_name_to_id is also used by the PPI cell.
drug_name_to_id = dict(zip(drug_list.drug_name, drug_list.index))
gene_name_to_id = dict(zip(gene_list.gene_name, gene_list.index))

# Convert edgelist to a [2, n_edges] edge index: row 0 = drug id, row 1 = gene id.
# A name missing from the lookup dicts raises KeyError.
targets_edge_index = torch.tensor(
    [
        [drug_name_to_id[drug] for drug in targets.STITCH.values],
        [gene_name_to_id[gene] for gene in targets.Gene.values],
    ],
    dtype=torch.long,
)

# Move to GPU. Tensor.to returns the moved tensor rather than modifying the
# original, so the result has to be assigned back.
targets_edge_index = targets_edge_index.to('cuda')

targets_edge_index

tensor([[ 142,  142,  603,  ...,  157,  157,  157],
        [9062, 7978, 4726,  ..., 5810,  125, 1892]], device='cuda:0')

In [22]:
# Load PPI edges

# Everything is cast to str so the values can be looked up by gene name.
ppi = pd.read_csv('../Chapter2/data/raw/bio-decagon-ppi.csv').astype(str)

# Convert names to geom index and build a [2, n_edges] edge index.
# gene_name_to_id comes from the drug-targets cell; a gene name missing from
# it raises KeyError.
ppi_edge_index = torch.tensor(
    [
        [gene_name_to_id[gene] for gene in ppi['Gene 1'].values],
        [gene_name_to_id[gene] for gene in ppi['Gene 2'].values],
    ],
    dtype=torch.long,
)

# Move to GPU. Tensor.to returns the moved tensor rather than modifying the
# original, so the result has to be assigned back.
ppi_edge_index = ppi_edge_index.to('cuda')

ppi_edge_index

tensor([[15479, 15479, 15479,  ...,  8479,  6702,  3474],
        [ 6845, 15048,   158,  ...,  2671,  2671,  3244]], device='cuda:0')

In [7]:
# Create torch-geometric graph https://pytorch-geometric.readthedocs.io/en/latest/tutorial/heterogeneous.html

data = geom.data.HeteroData()

# Add drug nodes and features
data['drug'].node_id = drug_list.index.values
data['drug'].x = drug_emb

# Add gene nodes and features
data['gene'].node_id = gene_list.index.values
data['gene'].x = gene_emb

# Add drug-target edges. The reverse relation is the same edge list with its
# two rows (source, target) swapped, i.e. flipped along dim 0.
data['drug', 'targets', 'gene'].edge_index = targets_edge_index
data['gene', 'rev_targets', 'drug'].edge_index = targets_edge_index.flip(0)

# Add PPI edges, again with a row-swapped copy as the reverse relation
data['gene', 'interacts', 'gene'].edge_index = ppi_edge_index
data['gene', 'rev_interacts', 'gene'].edge_index = ppi_edge_index.flip(0)

# Add selfloops (may not be needed for this analysis)
data = geom.transforms.AddSelfLoops()(data)

# Move to GPU
data = data.to('cuda')

data

HeteroData(
  drug={
    node_id=[645],
    x=[645, 256],
  },
  gene={
    node_id=[19089],
    x=[19089, 256],
  },
  (drug, targets, gene)={ edge_index=[2, 18690] },
  (gene, rev_targets, drug)={ edge_index=[2, 18690] },
  (gene, interacts, gene)={ edge_index=[2, 734701] },
  (gene, rev_interacts, gene)={ edge_index=[2, 734701] }
)

# Note
Would it be possible to simply treat this as a homogeneous network with n_drugs + n_genes nodes? The drug and gene features have the same shape, so it might actually be better to do it that way. Or is it important to keep the differences between the edge types?

`data = data.to_homogeneous()` may do the trick.

In [13]:
from torch_geometric.nn import HGTConv, Linear

class HGT(torch.nn.Module):
    """One input projection per node type followed by a stack of HGTConv layers.

    Args:
        hidden_channels: Output width of the per-node-type input projections
            and input/output width of every HGTConv layer.
        out_channels: Output width of ``self.lin``. That layer is constructed
            here but is not applied in ``forward``, so this value does not
            affect what ``forward`` returns.
        num_heads: Passed to every HGTConv as its fourth positional argument.
        num_layers: Number of HGTConv layers to stack.
        metadata: Optional ``(node_types, edge_types)`` pair describing the
            graph. When omitted it is read from the notebook-level ``data``
            object via ``data.metadata()``.
    """

    def __init__(self, hidden_channels, out_channels, num_heads, num_layers,
                 metadata=None):
        super().__init__()

        if metadata is None:
            # Fall back to the notebook-level graph, as the class did originally.
            metadata = data.metadata()
        node_types = metadata[0]

        # Lazily-sized (-1) linear projection for each node type.
        self.lin_dict = torch.nn.ModuleDict()
        for node_type in node_types:
            self.lin_dict[node_type] = Linear(-1, hidden_channels)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HGTConv(hidden_channels, hidden_channels, metadata,
                           num_heads)
            self.convs.append(conv)

        # Kept so the module's parameters stay the same; unused in forward().
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        """Project each node type's features, then apply every HGTConv layer.

        Args:
            x_dict: Mapping from node type to its feature tensor. The mapping
                is not modified; projected features go into a new dict.
            edge_index_dict: Mapping from edge type to edge index, passed
                unchanged to every HGTConv layer.

        Returns:
            The dict returned by the last HGTConv layer (or the projected
            features when there are no layers).
        """
        # Build a new dict instead of overwriting the caller's entries.
        hidden = {
            node_type: self.lin_dict[node_type](x).relu_()
            for node_type, x in x_dict.items()
        }

        for conv in self.convs:
            hidden = conv(hidden, edge_index_dict)

        return hidden

In [28]:
# For every drug: zero out ("blind") its feature vector, push the whole graph
# through the HGT model, and keep the vector the model produces for that drug.
gcn_vecs = {}
dim = 256
n_drugs = len(drug_list)

# Build the model once, outside the loop. Creating a new randomly initialised
# model per drug would send every drug through a different random projection
# and repeat the construction for each of the drugs.
# NOTE(review): no training step is visible in this notebook, so the weights
# are whatever random initialisation produces; consider fixing a seed.
model = HGT(hidden_channels=dim, out_channels=dim, num_heads=1, num_layers=2)
model = model.to('cuda')
model.eval()

drug_x = data.x_dict['drug']
gene_x = data.x_dict['gene']

with torch.no_grad():
    for i, drug in drug_list.drug_name.items():

        # Blind the drug vector of interest on a copy so the graph's own
        # features stay intact. Gene features are never written to, so they
        # are shared rather than cloned.
        blinded_drug_x = drug_x.clone()
        blinded_drug_x[i] = 0

        # A fresh dict per drug, so nothing done to the mapping inside the
        # model can leak into the next iteration.
        features_temp = {'drug': blinded_drug_x, 'gene': gene_x}

        # Run convolutions over graph (the full model is applied twice, as before)
        for _ in range(2):
            features_temp = model(features_temp, data.edge_index_dict)

        # Store output
        gcn_vecs[drug] = features_temp['drug'][i].clone()
        print(len(gcn_vecs) / n_drugs)  # fraction of drugs processed so far

0.0015503875968992248
0.0031007751937984496
0.004651162790697674
0.006201550387596899
0.007751937984496124
0.009302325581395349
0.010852713178294573
0.012403100775193798
0.013953488372093023
0.015503875968992248
0.017054263565891473
0.018604651162790697
0.020155038759689922
0.021705426356589147
0.023255813953488372
0.024806201550387597
0.02635658914728682
0.027906976744186046
0.02945736434108527
0.031007751937984496
0.03255813953488372
0.034108527131782945
0.03565891472868217
0.037209302325581395
0.03875968992248062
0.040310077519379844
0.04186046511627907
0.043410852713178294
0.04496124031007752
0.046511627906976744
0.04806201550387597
0.04961240310077519
0.05116279069767442
0.05271317829457364
0.05426356589147287
0.05581395348837209
0.05736434108527132
0.05891472868217054
0.06046511627906977
0.06201550387596899
0.06356589147286822
0.06511627906976744
0.06666666666666667
0.06821705426356589
0.06976744186046512
0.07131782945736434
0.07286821705426356
0.07441860465116279
0.0759689922480

In [33]:
# Write the estimated vectors to CSV: one row per drug, the drug name followed
# by its `dim` vector components, with no header row and no index column.
# The frame is built once from a list of rows rather than appended to row by row.
rows = [
    [drug, *vec.cpu().numpy().tolist()]
    for drug, vec in gcn_vecs.items()
]
out = pd.DataFrame(rows, columns=range(dim + 1))

out.to_csv('GCN_est_vecs.csv', header=False, index=False)