# **In this project we use clinical trials data from https://clinicaltrials.gov/, which records which drugs have been used for which conditions on the way to approval. Our aim is to predict new links between drugs and conditions using deep learning methods (Graph Neural Networks).**

We use pretrained embeddings from the Drug Repurposing Knowledge Graph (https://github.com/gnn4dr/DRKG/blob/master/DRKG%20Drug%20Repurposing%20Knowledge%20Graph.pdf) as features for our drug and condition entities.

Install PyTorch Geometric Libraries with dependencies


In [None]:

# !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
# !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


Import Libraries

In [None]:
import os
import sys

from google.colab import drive
drive.mount('/content/drive')

main_path = "drive/MyDrive"
# sys.path.append('/project/uml_rachel_melamed/Affinity_Regression/Athena')
# sys.path.append('/project/uml_rachel_melamed/Panos/Mendelian_diseases_2/mamoon_project/data/updated/')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn.functional as F
import torch_geometric


Mounted at /content/drive


## Import Datasets

Clinical_Trial-Drug Data

In [None]:
# Clinical-trial -> drug-intervention table, de-duplicated so that each
# (trial, MeSH intervention code) pair appears once.
file_path = os.path.join(
    main_path,
    "clinical_trials_data/updated/clinical_trials_interventions_rxnorm_ingredients_no_drug_comb.txt",
)
# file_path = "clinical_trials_interventions_rxnorm_ingredients_no_drug_comb.txt"
df_drugs = (
    pd.read_csv(file_path, on_bad_lines='skip', sep="\t")
    .drop_duplicates(subset=["nct_id", "intervention_mesh_code"])
    .reset_index(drop=True)
)
df_drugs.head(3)

Unnamed: 0,nct_id,intervention_mesh_term,intervention_mesh_code,intervention_rxnorm_term,intervention_rxnorm_code
0,NCT02248428,thalidomide,D013792,thalidomide,10432
1,NCT02248428,clarithromycin,D017291,clarithromycin,21212
2,NCT02248428,dexamethasone,D003907,dexamethasone,3264


#####Clinical_trial-Conditions

In [None]:
# Clinical-trial -> condition table, de-duplicated so that each
# (trial, MeSH condition code) pair appears once.
file_path = os.path.join(
    main_path,
    "clinical_trials_data/updated/clinical_trials_conditions_snomedct.txt",
)
# file_path = "clinical_trials_conditions_snomedct.txt"
df_cond = (
    pd.read_csv(file_path, on_bad_lines='skip', sep="\t")
    .drop_duplicates(subset=["nct_id", "condition_mesh_code"])
    .reset_index(drop=True)
)
df_cond.head(3)

Unnamed: 0,nct_id,condition_mesh_term,condition_mesh_code,condition_snomedct_code,condition_snomedct_term
0,NCT02248428,multiple myeloma,D009101,94705007,Multiple myeloma
1,NCT02248428,"neoplasms, plasma cell",D054219,127580003,Plasma cell neoplasm (morphology)
2,NCT02248454,"diabetes mellitus, type 1",D003922,190322003,"Diabetes mellitus: [juvenile type, with no men..."


## Map Multiple SNOMED to Primary SNOMED

This section contains code to map the multiple SNOMED codes of a condition to their primary code. This is needed because mapping one MESH code for a condition to SNOMED creates multiple SNOMED codes for that condition. We would like to map all of these SNOMED codes for a given condition back to one SNOMED code. This can be done using the relationship data from ATHENA, which contains relationships between concept codes, including SNOMED codes. The relationship "Non-standard to Standard map (OMOP)" will try to map multiple SNOMED codes to their primary component.


In [None]:
# Athena concept table, reduced to the id / name / code columns.
file_path = os.path.join(main_path, "clinical_trials_data/CONCEPT.csv.bz2")
# file_path = "CONCEPT.csv.bz2"
concept = pd.read_csv(file_path, on_bad_lines='skip', sep="\t")[
    ['concept_id', 'concept_name', 'concept_code']
]

In [None]:
# Attach Athena concept rows to every SNOMED code used by the trial conditions.
snomed_conds = df_cond[['condition_snomedct_code', 'condition_snomedct_term']]
# snomed_conds['condition_snomedct_code'] = snomed_conds['condition_snomedct_code'].astype(str)
temp = snomed_conds.merge(
    concept,
    left_on='condition_snomedct_code',
    right_on='concept_code',
).drop_duplicates()  # no subset given: rows are compared on every column

In [None]:
# Athena concept-relationship table, reduced to the two concept ids and the relationship label.
file_path = os.path.join(main_path, "clinical_trials_data/CONCEPT_RELATIONSHIP.csv.bz2")
# file_path = "CONCEPT_RELATIONSHIP.csv.bz2"
rel = pd.read_csv(file_path, on_bad_lines='skip', sep="\t")
rel = rel[['concept_id_1', 'concept_id_2', 'relationship_id']]

## Get Drug Features from DRKG
Most of the drugs in DRKG have ids that belong to Drugbank. Given that our drugs are in either MESH or RxNorm format, we first want to map RxNorm/MESH to Drugbank. The file "clinical_trials_data/updated/[all_drugs]_drugbankid_to_rxnorm.txt" contains the mapping between Drugbank and RxNorm. Once the ids are mapped, we can use the Drugbank ids of the drugs in our CT project to get their features from DRKG.

In [None]:
# Read the DrugBank <-> RxNorm mapping and inner-join it onto the trial/drug
# table through the RxNorm ingredient code.
file_path = os.path.join(
    main_path,
    "clinical_trials_data/updated/[all_drugs]_drugbankid_to_rxnorm.txt",
)
db_rx = pd.read_csv(file_path, on_bad_lines='skip', sep="\t")
merged = pd.merge(
    df_drugs,
    db_rx,
    left_on='intervention_rxnorm_code',
    right_on='rxnorm_cui',
)

In [None]:
# Optional sanity check: count the RxNorm codes from the trial data that are
# missing from / present in the DrugBank-RxNorm mapping.
seta = set(df_drugs['intervention_rxnorm_code'].unique())
setb = set(db_rx['rxnorm_cui'].unique())
intersection = seta & setb
difference = seta - setb
len(difference), len(intersection)
# db_rx[db_rx['drugbank_id'] == 'DB00411'] check carbochol drug

(71, 1788)

**Note:**
71 Drugs from our CT-Drugs dataset did not match with the Drugbank-RxNorm dataset.
Some unmatched RxNorm codes could be found by first looking up their names in Drugbank to get their ATC codes. Then, in the Athena browser, we can use the ATC codes to get the primary RxNorm code.

Example: RxNorm 1999 (carbachol) does not have a drugbank id. From drugank we get the ATC Code:
N07AB01. And from there we get ATC to RxNorm/Extension primary lateral (OMOP): 1999

However, if instead of the ATC code we look at the Drugbank id of carbachol (DB00411), it matches the Drugbank-RxNorm dataset and gives Carbamoylcholine with RxNorm 1546387. But I cannot find a mapping from 1546387 to 1999 in Athena.

After my discussion with Panos, we think this can be resolved by working only with MESH codes. That is, we use the MESH code from the CT-Drugs data and, instead of the Drugbank-RxNorm mapping, a Drugbank-MESH mapping. For that we will have to create a dataset from Drugbank-MESH. This can be done at a later stage to ensure that all unique drugs in the CT-Drugs data are used.

Get Entity IDs

In this section, we create dictionaries for Compounds/Drugs and Diseases/Conditions from DRKG, where each key is a drug or condition and its value is its position in the embedding matrix that will be used as features.

In [None]:
# Read the DRKG entity table (drugs, genes, conditions, ...) and keep the
# unique entity names from its first column.
path = os.path.join(main_path, "clinical_trials_data/entities.tsv")
entity_df = pd.read_csv(path, sep="\t", header=None)
entity_unique_list = entity_df[0].unique().tolist()

# Read the pretrained DRKG TransE entity embedding matrix.
path = os.path.join(main_path, "clinical_trials_data/DRKG_TransE_l2_entity.npy")
drkg_embeddings = np.load(path)

In [None]:

entity_df

Unnamed: 0,0,1
0,Gene::2157,0
1,Gene::5264,1
2,Gene::2158,2
3,Gene::3309,3
4,Gene::28912,4
...,...,...
97233,Gene::400359,97233
97234,Gene::348751,97234
97235,Gene::101928147,97235
97236,Gene::23591,97236


In [None]:
# Every DRKG entity name carries its type. Build two lookups, one for
# Compound/Drug entities and one for Disease/Condition entities, mapping the
# part of the name after "::" to the entity's position in entity_unique_list.
compound_dict = {
    name.split("::")[1]: position
    for position, name in enumerate(entity_unique_list)
    if "Compound" in name
}

disease_dict = {
    name.split("::")[1]: position
    for position, name in enumerate(entity_unique_list)
    if "Disease" in name
}

DRKG Features for Drugs in CT

We match drugs from DRKG and CT-Drugs and get a list of drugs whose features we will use. In total we have 1847 drugs that matched to DRKG.

Additionally, some drugs in DRKG are identified by MESH codes as well. We will have to check how many of them match our own CT-Drugs data in case we need them.

In [None]:
# Take the unique DrugBank ids that the trial drugs were mapped to, keep the
# ones DRKG has a Compound entry for, and record each one's row index in the
# DRKG embedding matrix.
clinical_trial_drugbank_unique = list(merged['drugbank_id'].unique())
# Membership is tested on the dict itself (hash lookup). The previous
# `i in list(compound_dict.keys())` rebuilt and scanned a list for every id.
values = {
    i: compound_dict[i]
    for i in clinical_trial_drugbank_unique
    if i in compound_dict
}
final_drugs_list = list(values.values())
# Rows of drug_embeddings follow the insertion order of `values`.
drug_embeddings = drkg_embeddings[final_drugs_list]

# Addtional Mentioned above
# clinical_trial_mesh_unique = list(merged['intervention_mesh_code'].unique())
# values_mesh =  {i:compound_dict["MESH:"+i] for i in clinical_trial_mesh_unique if "MESH:"+i in list(compound_dict.keys())}

## DRKG Features for Conditions in CT

We follow the same steps as above and get features for the conditions that appear in both our CT-Conditions data and the DRKG dataset.


In [None]:
# Keep the trial conditions whose MeSH code exists in DRKG as
# "MESH:<code>" and record each one's row index in the embedding matrix.
unique_conditions_mesh = list(df_cond['condition_mesh_code'].unique())
# Membership is tested on the dict itself (hash lookup) instead of rebuilding
# list(disease_dict.keys()) for every code.
condition_mesh_dict = {
    i: disease_dict["MESH:" + i]
    for i in unique_conditions_mesh
    if "MESH:" + i in disease_dict
}
final_condition_list = list(condition_mesh_dict.values())
# Condition embeddings; rows follow the insertion order of condition_mesh_dict.
cond_embeddings = drkg_embeddings[final_condition_list]

## CT-Cond and CT-Drug Edges
In this section we create dataframes between CT-condition and CT-Drugs that will serve as edges to the Graph Data Object in PyG

First we create a CT-Condition dataframe with two columns, where each row indicates that the value in column 1 is connected to the value in column 2. Next we keep only the conditions that are present in DRKG. Then we prefix every trial and condition code with its entity type; for example, clinical trial "NCT02248428" becomes "trial_NCT02248428". This lets us identify the original string codes later, once they have been mapped to integers.

The edge between clinical trial and drugs also follow the same steps as above.

In [None]:
# --- trial -> condition edges -------------------------------------------
# One row per (trial, condition) pair, restricted to conditions that have a
# DRKG embedding. Ids are prefixed with their entity type so they stay
# distinguishable once all ids share a single label space.
trial_cond = df_cond[['nct_id', 'condition_mesh_code']].rename(
    columns={'nct_id': 'id1', 'condition_mesh_code': 'id2'}
)
trial_cond['type'] = "trial_cond"

final_condition_mesh_code = list(condition_mesh_dict.keys())
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view of the previous one.
trial_cond = trial_cond[trial_cond['id2'].isin(final_condition_mesh_code)].copy()

trial_cond['id1'] = "trial_" + trial_cond['id1']
trial_cond['id2'] = "cond_" + trial_cond['id2']

# --- trial -> drug edges ------------------------------------------------
# Same construction for drugs, restricted to DrugBank ids found in DRKG.
trial_drug = merged[['nct_id', 'drugbank_id']].rename(
    columns={'nct_id': 'id1', 'drugbank_id': 'id2'}
)
trial_drug['type'] = "trial_drug"

# Drug ids get their own name; they were previously stored in
# final_condition_mesh_code, overwriting the condition codes.
final_drugbank_ids = list(values.keys())
trial_drug = trial_drug[trial_drug['id2'].isin(final_drugbank_ids)].copy()

trial_drug['id1'] = "trial_" + trial_drug['id1']
trial_drug['id2'] = "drug_" + trial_drug['id2']



```
# This is formatted as code
```

## Approved Drugs

In [None]:
# file_path = os.path.join("/project/uml_rachel_melamed/Panos/Mendelian_diseases_2/mamoon_project/data/updated", "[approved_drugs]_indications_snomedct.txt")
# df_drug_cond =  pd.read_csv(file_path,  on_bad_lines='skip', sep="\t")
# df_drug_cond = df_drug_cond.drop_duplicates(subset=["nct_id", "condition_mesh_code"])
# df_drug_cond = df_drug_cond.reset_index(drop=True)
# df_drug_cond.head(3)

# Create Graph Data

In [None]:
# Stack both edge tables into one edge list with unique (id1, id2) pairs.
df_final = (
    pd.concat([trial_cond, trial_drug])
    .drop_duplicates(subset=['id1', 'id2'])
    .reset_index(drop=True)
)
# unique_trials = list(df_final['id1'].unique())
# df_final = df_final[df_final["id1"].isin(unique_trials[:3000])]
df_final.shape

(412501, 3)

In [None]:

df_final.head()

Unnamed: 0,id1,id2,type
0,trial_NCT02248428,cond_D009101,trial_cond
1,trial_NCT02248428,cond_D054219,trial_cond
2,trial_NCT02248454,cond_D003922,trial_cond
3,trial_NCT02248493,cond_D010149,trial_cond
4,trial_NCT05354076,cond_D009369,trial_cond


## Get features from attributes of clinical trials

In [None]:
# aact_attr = pd.read_csv("/project/uml_rachel_melamed/ref_data/AACT/studies.txt", sep="|")
# Pipe-delimited studies table holding per-trial attributes.
path = os.path.join(main_path, "clinical_trials_data/updated/studies.txt")
aact_attr = pd.read_csv(path, delimiter="|")

In [None]:
# Build one feature row per trial: one-hot encoded `phase`, zero-padded to the
# width of the DRKG embeddings so trial rows can be stacked with the drug and
# condition feature rows.
feature_dim = drkg_embeddings.shape[1]

# Strip the "trial_" prefix to recover the raw NCT ids used in the graph.
unique_trials = list(df_final['id1'].apply(lambda x: x.split("_", 1)[1]).unique())
trials_df = aact_attr[aact_attr['nct_id'].isin(unique_trials)]  # trials present in the graph
trials_df = trials_df[['nct_id', 'phase']].reset_index(drop=True)
trials_df = pd.get_dummies(trials_df, columns=["phase"])  # one-hot features for phase

# Column 0 is nct_id and becomes the index below, so it does not count as a
# feature. The pad width is derived from the real number of phase columns
# instead of assuming a fixed count.
n_phase_features = trials_df.shape[1] - 1
if n_phase_features > feature_dim:
    raise ValueError(
        f"{n_phase_features} phase features do not fit in {feature_dim} dimensions"
    )
trials_np = np.pad(trials_df.values, ((0, 0), (0, feature_dim - n_phase_features)))
trials_df = pd.DataFrame(trials_np).set_index(0)

# Free the full studies table; only the filtered features are needed from here.
del aact_attr

Label Encoding to create node ids

In [None]:
from sklearn.preprocessing import LabelEncoder

# Both endpoint columns are encoded with one shared encoder, so every trial,
# condition and drug label receives a single integer node id.
df_final[['id1', 'id2']] = df_final[['id1', 'id2']].astype('str')

le = LabelEncoder()
le.fit(np.concatenate((df_final['id1'], df_final['id2'])))

# Integer edge list: encoded endpoints plus the edge type.
df_graph = df_final.assign(
    id_1=le.transform(df_final['id1']),
    id_2=le.transform(df_final['id2']),
)[['id_1', 'id_2', 'type']]

In [None]:

df_graph

Unnamed: 0,id_1,id_2,type
0,66312,1043,trial_cond
1,66312,2342,trial_cond
2,66313,422,trial_cond
3,66315,1180,trial_cond
4,110712,1092,trial_cond
...,...,...,...
412496,86163,2762,trial_drug
412497,107095,3776,trial_drug
412498,107050,3761,trial_drug
412499,36940,4104,trial_drug


In [None]:

# unique_trial_ids = list(df_final['id1'].unique())
# unique_cond_ids = list(df_final[df_final["type"] == 'trial_cond']['id2'].unique())
# unique_drug_ids = list(df_final[df_final["type"] == 'trial_drug']['id2'].unique())

# le_trial = LabelEncoder()
# le_trial.fit(unique_trial_ids)

# le_cond = LabelEncoder()
# le_cond.fit(unique_cond_ids)

# le_drug = LabelEncoder()
# le_drug.fit(unique_drug_ids)


# # transform using label encoders
# df_graph_trial_cond = df_final[df_final["type"] == 'trial_cond'] #copy data and filter trial-cond
# df_graph_trial_cond['id1'] = le_trial.transform(df_graph_trial_cond['id1']) #transform trial
# df_graph_trial_cond['id2'] = le_cond.transform(df_graph_trial_cond['id2'] ) #transform cond

# df_graph_trial_drug = df_final[df_final["type"] == 'trial_drug'] #copy data and filter trial-drug
# df_graph_trial_drug['id1'] = le_trial.transform(df_graph_trial_drug['id1']) #transform trial
# df_graph_trial_drug['id2'] = le_cond.transform(df_graph_trial_drug['id2'] ) #transform cond


# df_graph_trial_drug= df_final[df_final["type"] == 'trial_drug']




In [None]:

# # new label encoder for hetereogeneous
# df_trial_cond = df_final[df_final['type'] == "trial_cond"]
# df_trial_cond['id1'] = df_trial_cond['id1'].astype('str')
# df_trial_cond['id2'] = df_trial_cond['id2'].astype('str')

# le_trial_cond = LabelEncoder()
# le_trial_cond.fit(np.concatenate((df_trial_cond['id1'], df_trial_cond['id2'])))

# df_graph_trial_cond = df_trial_cond.copy()
# df_graph_trial_cond['id_1']=le_trial_cond.transform(df_graph_trial_cond['id1'])
# df_graph_trial_cond['id_2']=le_trial_cond.transform(df_graph_trial_cond['id2'])
# df_graph_trial_cond = df_graph_trial_cond[['id_1', 'id_2', 'type']]


# df_trial_drug = df_final[df_final['type'] == "trial_drug"]
# df_trial_drug['id1'] = df_trial_drug['id1'].astype('str')
# df_trial_drug['id2'] = df_trial_drug['id2'].astype('str')

# le_trial_drug = LabelEncoder()
# le_trial_drug.fit(np.concatenate((df_trial_drug['id1'], df_trial_drug['id2'])))

# df_graph_trial_drug = df_trial_drug.copy()
# df_graph_trial_drug['id_1']=le_trial_drug.transform(df_graph_trial_drug['id1'])
# df_graph_trial_drug['id_2']=le_trial_drug.transform(df_graph_trial_drug['id2'])
# df_graph_trial_drug = df_graph_trial_drug[['id_1', 'id_2', 'type']]




In [None]:
# homogeneous data

import numpy as np
from torch_geometric.data import Data, DataLoader
import torch
import scipy.sparse as sp
from torch_geometric.utils import to_undirected
# Per-type edge lists (rows are [source, target] node-id pairs).
trial_cond_edges = torch.tensor(
    df_graph.loc[df_graph['type'] == "trial_cond", ['id_1', 'id_2']].to_numpy(),
    dtype=torch.long,
)
trial_drug_edges = torch.tensor(
    df_graph.loc[df_graph['type'] == "trial_drug", ['id_1', 'id_2']].to_numpy(),
    dtype=torch.long,
)

# Full directed edge list over all edge types.
edge_1 = torch.tensor(df_graph[['id_1', 'id_2']].to_numpy(), dtype=torch.long)
# edge_2 = torch.tensor(df_graph[['id_2', 'id_1']].values, dtype=torch.long)

# edge = torch.cat((edge_1, edge_2), 0)

# Integer code per edge type, stored on the graph as edge_attr.
edge_attr_dict = {'trial_cond':0,'trial_drug':1}
df_graph['num_type'] = df_graph['type'].map(edge_attr_dict.__getitem__)
edge_attr = torch.tensor(df_graph['num_type'].to_numpy(), dtype=torch.long)

# Directed graph: transposed edge list plus the per-edge type code.
data_hom = Data(edge_index=edge_1.t().contiguous())
data_hom.edge_attr = edge_attr

# Undirected copy of the same graph, carrying the edge type codes along.
edge_unidrected, edge_attr_unidrected = to_undirected(
    data_hom.edge_index, data_hom.edge_attr
)
data_hom_new = Data(
    edge_index=edge_unidrected.contiguous(),
    edge_attr=edge_attr_unidrected.contiguous(),
)


In [None]:

data_hom

Data(edge_index=[2, 412501], edge_attr=[412501])

In [None]:
# cond_features = torch.tensor(cond_embeddings, dtype=torch.float)
# drug_features = torch.tensor(drug_embeddings, dtype=torch.float)
# trial_features = torch.tensor(np.random.rand(109356, 2), dtype=torch.float)
# trial_features = F.pad(trial_features, (0, 400-2))

# x = torch.cat([cond_features, drug_features, trial_features], dim=0)
# x = x.numpy()


In [None]:

# Split every encoder label "<type>_<raw id>" into its two parts, in
# le.classes_ order (the order of the integer node ids).
# The split happens on the FIRST underscore only and the type is compared
# exactly. Substring tests such as `"cond_" in label`, or taking
# `split("_")[1]`, would misclassify or truncate raw ids containing "_".
_split_labels = [label.split("_", 1) for label in le.classes_]

node_type_names = [node_type for node_type, _ in _split_labels]
main_reindexing_list = [raw_id for _, raw_id in _split_labels]

# Raw ids per node type, in node-id order.
reindexing_cond_list = [raw_id for node_type, raw_id in _split_labels if node_type == "cond"]
reindexing_drug_list = [raw_id for node_type, raw_id in _split_labels if node_type == "drug"]
reindexing_trial_list = [raw_id for node_type, raw_id in _split_labels if node_type == "trial"]

# Integer node-type code for every node.
mapping = {'cond': 0, "drug": 1, "trial": 2}
node_type_num = [mapping[node_type] for node_type in node_type_names]


In [None]:
# Condition features: label each embedding row with its MeSH code, then order
# the rows like the condition nodes.
# NOTE: this rebinds df_cond, replacing the raw condition table loaded earlier.
df_cond = (
    pd.DataFrame(cond_embeddings)
    .assign(id=list(condition_mesh_dict.keys()))
    .set_index('id')
    .reindex(reindexing_cond_list)
)

# Drug features: same treatment, keyed by DrugBank id.
df_drug = (
    pd.DataFrame(drug_embeddings)
    .assign(id=list(values.keys()))
    .set_index('id')
    .reindex(reindexing_drug_list)
)

# Trial features: order like the trial nodes, cast to float and renumber the
# columns from 0.
trials_df = trials_df.reindex(reindexing_trial_list).astype('float')
trials_df.columns = range(len(trials_df.columns))

# Stack all three tables and order the rows by node id.
features_df = pd.concat([df_cond, df_drug, trials_df], axis=0).reindex(main_reindexing_list)

# Node feature matrix as a float tensor.
x_features = torch.tensor(features_df.to_numpy(), dtype=torch.float)

In [None]:
data_hom_new.x = x_features


Homogeneous Data Training

In [None]:

data_hom_new.x

tensor([[ 0.6079, -0.4999,  0.4021,  ..., -0.4506, -0.5400,  0.5435],
        [-0.1616,  0.7808, -0.7440,  ...,  0.2756,  0.6128, -0.5770],
        [ 0.6165,  0.4559, -0.1511,  ...,  0.5186,  0.6526, -0.1789],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  1.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [None]:
from torch_geometric.nn import GCNConv, GAE, RGCNConv, FastRGCNConv, SAGEConv, GATConv
from sklearn.model_selection import train_test_split

# Randomly hold out 1% of the edge columns for validation and record the split
# as boolean masks over the columns of data_hom_new.edge_index.
# NOTE(review): data_hom_new stores every edge in both directions (it is built
# with to_undirected), so the reverse of a held-out edge can still land in the
# training split. Confirm whether the split should be done on the directed
# edges instead.
indices = torch.arange(data_hom_new.edge_index.shape[1])
train_idx, val_idx = train_test_split(indices, test_size=0.01)
train_mask = torch.zeros_like(indices, dtype=torch.bool)
val_mask = torch.zeros_like(indices, dtype=torch.bool)
train_mask[train_idx] = True
val_mask[val_idx] = True
data_hom_new.val_mask = val_mask
data_hom_new.train_mask = train_mask


from torch_geometric.utils import negative_sampling
# One negative edge per validation edge. The result is stored on data_hom_new,
# which is the object the training loop reads it from. It was previously
# assigned to an undefined name `data`.
data_hom_new.val_neg_edge_index = negative_sampling(
    data_hom_new.edge_index,
    num_neg_samples=int(data_hom_new.val_mask.sum()),
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class GNN_HOM(torch.nn.Module):
    """Two-layer relational GCN encoder for the homogeneous graph.

    Args:
        hidden_dim: Output width of the first RGCNConv layer.
        out_dim: Output width of the second RGCNConv layer.
        in_dim: Width of the input node features. Defaults to 400, the value
            that was previously hard-coded.
        num_relations: Number of edge types. Defaults to 2, the value that was
            previously hard-coded.
    """

    def __init__(self, hidden_dim, out_dim, in_dim=400, num_relations=2):
        super().__init__()
        self.conv1 = RGCNConv(in_dim, hidden_dim, num_relations)
        self.conv2 = RGCNConv(hidden_dim, out_dim, num_relations)

    def forward(self, x, edge_index, edge_attr):
        """Encode nodes; `edge_attr` is passed to both layers as the edge type."""
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        return self.conv2(x, edge_index, edge_attr)



In [None]:

import torch
import torch.nn.functional as F


hdim = 16
# The model lives on the same device as the tensors fed to it below.
model = GAE(GNN_HOM(hidden_dim=hdim*2, out_dim=hdim)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
training_losses, validation_losses, val_auc, val_ap = [], [], [], []

# Loop-invariant inputs: selected and moved to the device once instead of on
# every epoch. `x` was previously never defined in this cell.
x = data_hom_new.x.to(device)
train_pos_edge_index = data_hom_new.edge_index[:, data_hom_new.train_mask].to(device)
train_pos_edge_attr = data_hom_new.edge_attr[data_hom_new.train_mask].to(device)
val_pos_edge_index = data_hom_new.edge_index[:, data_hom_new.val_mask].to(device)
val_neg_edge_index = data_hom_new.val_neg_edge_index.to(device)

for epoch in range(40):
    # ---- training step ----
    model.train()
    optimizer.zero_grad()

    z = model.encode(x, train_pos_edge_index, train_pos_edge_attr)

    loss = model.recon_loss(z, train_pos_edge_index)
    training_loss = loss.item()
    loss.backward()
    optimizer.step()

    # ---- validation ----
    # Nodes are encoded from the training edges only; the held-out edges are
    # used purely as prediction targets.
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index, train_pos_edge_attr)
        validation_loss = model.recon_loss(z, val_pos_edge_index).item()
        auc, ap = model.test(z, val_pos_edge_index, val_neg_edge_index)

    training_losses.append(training_loss)
    validation_losses.append(validation_loss)
    val_auc.append(auc)
    val_ap.append(ap)

#     if (epoch + 1) % 20 == 0:
    print("Epoch: {:03d}, Training loss: {:.4f}, Validation loss: {:.4f}, AUC: {:.4f}, AP: {:.4f}".format(
        epoch + 1, training_loss, validation_loss, auc, ap))

Epoch: 001, Training loss: 0.9062, Validation loss: 1.1639, AUC: 0.8791, AP: 0.9071
Epoch: 002, Training loss: 1.0507, Validation loss: 1.0311, AUC: 0.8942, AP: 0.9171
Epoch: 003, Training loss: 0.9154, Validation loss: 1.1038, AUC: 0.8667, AP: 0.8909
Epoch: 004, Training loss: 0.9819, Validation loss: 1.1163, AUC: 0.8750, AP: 0.8987
Epoch: 005, Training loss: 0.9772, Validation loss: 1.0577, AUC: 0.8931, AP: 0.9150
Epoch: 006, Training loss: 0.9353, Validation loss: 1.0311, AUC: 0.9015, AP: 0.9227
Epoch: 007, Training loss: 0.9167, Validation loss: 1.0239, AUC: 0.9017, AP: 0.9233
Epoch: 008, Training loss: 0.9265, Validation loss: 1.0446, AUC: 0.8994, AP: 0.9217
Epoch: 009, Training loss: 0.9328, Validation loss: 1.0312, AUC: 0.8976, AP: 0.9205
Epoch: 010, Training loss: 0.9254, Validation loss: 1.0277, AUC: 0.8960, AP: 0.9192
Epoch: 011, Training loss: 0.9150, Validation loss: 1.0399, AUC: 0.8944, AP: 0.9175
Epoch: 012, Training loss: 0.9112, Validation loss: 1.0387, AUC: 0.8936, AP:

In [None]:
data_het


HeteroData(
  [1mcond[0m={
    num_nodes=2517,
    x=[2517, 400]
  },
  [1mdrug[0m={
    num_nodes=1832,
    x=[1832, 400]
  },
  [1mtrial[0m={
    num_nodes=109008,
    x=[109008, 400]
  },
  [1m(trial, 0, cond)[0m={
    edge_index=[2, 217999],
    edge_attr=[217999]
  },
  [1m(trial, 1, drug)[0m={
    edge_index=[2, 194502],
    edge_attr=[194502]
  },
  [1m(cond, rev_0, trial)[0m={
    edge_index=[2, 217999],
    edge_attr=[217999]
  },
  [1m(drug, rev_1, trial)[0m={
    edge_index=[2, 194502],
    edge_attr=[194502]
  }
)

In [None]:

data_het.edge_index_dict

{('trial',
  '0',
  'cond'): tensor([[ 61963,  61963,  61964,  ..., 105954, 105954, 105952],
         [  1043,   2342,    422,  ...,    968,    970,   1092]]),
 ('trial',
  '1',
  'drug'): tensor([[ 61963,  81609,  52827,  ..., 102701,  32591,  32688],
         [   777,    777,    777,  ...,   1244,   1587,   1683]]),
 ('cond',
  'rev_0',
  'trial'): tensor([[  1043,   2342,    422,  ...,    968,    970,   1092],
         [ 61963,  61963,  61964,  ..., 105954, 105954, 105952]]),
 ('drug',
  'rev_1',
  'trial'): tensor([[   777,    777,    777,  ...,   1244,   1587,   1683],
         [ 61963,  81609,  52827,  ..., 102701,  32591,  32688]])}

Heterogeneous Data

In [None]:

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T


# Split the directed homogeneous graph into typed node and edge stores, using
# the per-node type codes and the per-edge type codes.
data_het = data_hom.to_heterogeneous(
    node_type=torch.tensor(node_type_num),
    edge_type=data_hom.edge_attr,
)

# Give the numeric node-type keys readable names.
for old_name, new_name in (("0", "cond"), ("1", "drug"), ("2", "trial")):
    data_het.rename(old_name, new_name)

# Add the reverse edge types.
make_undirected = T.ToUndirected()
data_het = make_undirected(data_het)

# Attach the per-type node feature matrices.
data_het['cond'].x = torch.tensor(df_cond.to_numpy(), dtype=torch.float)
data_het['drug'].x = torch.tensor(df_drug.to_numpy(), dtype=torch.float)
data_het['trial'].x = torch.tensor(trials_df.to_numpy(), dtype=torch.float)

In [None]:
# heterogeneous data from homogeneous data
# from torch_geometric.data import HeteroData
# import torch_geometric.transforms as T


# data = HeteroData()

# data['conditions'].x = torch.tensor(cond_embeddings, dtype=torch.float)
# data['drugs'].x = torch.tensor(drug_embeddings, dtype=torch.float)
# data['trial'].x = torch.tensor(np.random.rand(109356, 2), dtype=torch.float)

# data['trial', 'trial_cond', 'conditions'].edge_index = trial_cond_edges.t().contiguous()
# data['trial', 'trial_drug', 'drugs'].edge_index = trial_drug_edges.t().contiguous()

# # data['trial', 'trial_cond', 'conditions'].edge_attr = edge_attr[edge_attr == 0].contiguous()
# # data['trial', 'trial_drug', 'drugs'].edge_attr = edge_attr[edge_attr == 1].contiguous()


# data = T.ToUndirected()(data)


In [None]:
from torch_geometric.nn import GCNConv, GAE, RGCNConv, FastRGCNConv, SAGEConv, GATConv


class GNN(torch.nn.Module):
    """Two-layer GraphSAGE encoder, written to be wrapped with `to_hetero`.

    Args:
        hidden_dim: Output width of the first SAGEConv layer.
        out_dim: Output width of the second SAGEConv layer.
    """

    def __init__(self, hidden_dim, out_dim):
        super().__init__()
        # (-1, -1) lets the source/target input widths be inferred lazily on
        # the first forward pass. `add_self_loops` is not a SAGEConv option
        # (it belongs to attention layers such as GATConv), so it is no
        # longer passed.
        self.conv1 = SAGEConv((-1, -1), hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        """Return node embeddings after two SAGEConv layers (ReLU in between)."""
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)

def recon_loss(z1, z2, pos_edge_index, num_nodes, neg_edge_index=None, EPS=1e-15):
    r"""Binary cross-entropy reconstruction loss for one (bipartite) edge type.

    An edge ``(i, j)`` is scored as ``sigmoid(<z1[i], z2[j]>)``. Positive
    edges are pushed towards 1 and negative edges towards 0.

    Args:
        z1 (torch.Tensor): Latent vectors of the source node type.
        z2 (torch.Tensor): Latent vectors of the target node type.
        pos_edge_index (torch.Tensor): The positive edges to train against;
            row 0 indexes into ``z1`` and row 1 into ``z2``.
        num_nodes: Node count(s) forwarded to ``negative_sampling`` when
            negatives have to be drawn (the caller passes
            ``(num_source_nodes, num_target_nodes)``).
        neg_edge_index (torch.Tensor, optional): The negative edges to
            train against. If not given, uses negative sampling to
            calculate negative edges. (default: :obj:`None`)
        EPS (float): Small constant added inside the logarithms for
            numerical stability.

    Returns:
        torch.Tensor: Scalar tensor, ``pos_loss + neg_loss``.
    """
    def _edge_probability(edge_index):
        # Inner-product decoder evaluated on the edges it is GIVEN. It used to
        # read pos_edge_index regardless of its argument.
        row, col = edge_index
        return torch.sigmoid((z1[row] * z2[col]).sum(dim=1))

    pos_loss = -torch.log(_edge_probability(pos_edge_index) + EPS).mean()

    if neg_edge_index is None:
        # Imported here so the function does not depend on another cell having
        # imported it first.
        from torch_geometric.utils import negative_sampling
        neg_edge_index = negative_sampling(pos_edge_index, num_nodes)
    # The negative term is scored on the negative edges. It was previously
    # scored on the positive edges, which fought the positive term.
    neg_loss = -torch.log(1 - _edge_probability(neg_edge_index) + EPS).mean()

    return pos_loss + neg_loss

In [None]:

data_het

HeteroData(
  [1mcond[0m={
    num_nodes=2517,
    x=[2517, 400]
  },
  [1mdrug[0m={
    num_nodes=1832,
    x=[1832, 400]
  },
  [1mtrial[0m={
    num_nodes=109008,
    x=[109008, 400]
  },
  [1m(trial, 0, cond)[0m={
    edge_index=[2, 217999],
    edge_attr=[217999]
  },
  [1m(trial, 1, drug)[0m={
    edge_index=[2, 194502],
    edge_attr=[194502]
  },
  [1m(cond, rev_0, trial)[0m={
    edge_index=[2, 217999],
    edge_attr=[217999]
  },
  [1m(drug, rev_1, trial)[0m={
    edge_index=[2, 194502],
    edge_attr=[194502]
  }
)

In [1]:

from torch_geometric.nn import to_hetero
import torch
import torch.nn.functional as F

hdim = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Turn the homogeneous GNN into one with separate parameters per edge type,
# and keep the model and the graph on the same device.
model = GNN(hidden_dim=hdim*2, out_dim=hdim)
model = to_hetero(model, data_het.metadata()).to(device)
data_het = data_het.to(device)

# Layers declared with lazy (-1) input sizes only create their weights on the
# first forward pass, so run one before handing the parameters to the optimizer.
with torch.no_grad():
    model(data_het.x_dict, data_het.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

training_losses, validation_losses, val_auc, val_ap = [], [], [], []

for epoch in range(60):
    model.train()
    optimizer.zero_grad()

    # Per-node-type embeddings, keyed by node type name.
    z = model(data_het.x_dict, data_het.edge_index_dict)

    # Sum the reconstruction loss over every edge type (forward and reverse).
    loss = 0
    for edge_type, edge_index_per_edge_type in data_het.edge_index_dict.items():
        node1 = edge_type[0]  # source node type
        node2 = edge_type[2]  # target node type

        z1 = z[node1]
        z2 = z[node2]
        num_nodes = (z1.shape[0], z2.shape[0])
        loss += recon_loss(z1, z2, edge_index_per_edge_type, num_nodes)

    training_loss = loss.item()
    loss.backward()
    optimizer.step()

    training_losses.append(training_loss)
    print("Epoch:", epoch+1, "Training Loss:", training_loss)

    # TODO: add validation for the heterogeneous model. The block that was
    # here was copied from the homogeneous loop and referenced names that do
    # not exist for this model (`data`, `x`, `train_pos_edge_index`,
    # `model.encode`, `model.recon_loss`, `model.test`). It needs a held-out
    # edge split of data_het first. Until then validation_losses, val_auc and
    # val_ap stay empty.

## *********