# Data Dev Notebook

In [1]:
# Imports
import os
import glob
import anndata
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import torch
from torch_geometric.data import HeteroData

import plotly.express as px
import umap

### Data: iBKH

In [2]:
# Directory holding the iBKH CSV files, relative to the working directory.
# expanduser() only rewrites a leading "~", so this relative path is unchanged.
filepath = os.path.expanduser('data/iBKH')

In [3]:
# Load the drug / disease / gene vocabulary tables from the iBKH directory.
drug_vocab = pd.read_csv(os.path.join(filepath, 'drug_vocab.csv'))
disease_vocab = pd.read_csv(os.path.join(filepath, 'disease_vocab.csv'))
gene_vocab = pd.read_csv(os.path.join(filepath, 'gene_vocab.csv'))

# Map each value of the 'primary' column to the index label of its row.
# Zipping the column with the index yields the same mapping as looping over
# iterrows() (a repeated 'primary' value still resolves to its last row) but
# avoids constructing a Series object for every row of the table.
drug2id = dict(zip(drug_vocab['primary'], drug_vocab.index))
disease2id = dict(zip(disease_vocab['primary'], disease_vocab.index))
gene2id = dict(zip(gene_vocab['primary'], gene_vocab.index))

In [4]:
# Drug-Drug: read the result table and keep only its first three columns.
filename = 'D_D'
D_D_csv = os.path.join(filepath, f'{filename}_res.csv')
D_D_df = pd.read_csv(D_D_csv).iloc[:, :3]
D_D_df

Unnamed: 0,Drug_1,Drug_2,Interaction
0,DrugBank:DB00001,DrugBank:DB06605,1
1,DrugBank:DB00001,DrugBank:DB06695,1
2,DrugBank:DB00001,DrugBank:DB01254,1
3,DrugBank:DB00001,DrugBank:DB01609,1
4,DrugBank:DB00001,DrugBank:DB01586,1
...,...,...,...
2684677,DrugBank:DB00190,DrugBank:DB01064,0
2684678,DrugBank:DB00487,DrugBank:DB06771,0
2684679,DrugBank:DB01201,DrugBank:DB01220,0
2684680,DrugBank:DB06147,DrugBank:DB00664,0


In [5]:
# Drug-Disease: keep the first three columns and relabel the last of them
# as 'Interaction'.
filename = 'D_Di'
D_Di_csv = os.path.join(filepath, f'{filename}_res.csv')
df = pd.read_csv(D_Di_csv).iloc[:, :3]
D_Di_df = df.rename(columns={df.columns[-1]: 'Interaction'})
D_Di_df

Unnamed: 0,Drug,Disease,Interaction
0,DrugBank:DB00997,DOID:363,1
1,DrugBank:DB00206,DOID:10763,1
2,DrugBank:DB00960,DOID:10763,1
3,DrugBank:DB00665,DOID:10283,1
4,DrugBank:DB00290,DOID:2998,1
...,...,...,...
2717942,MeSH:D043168,DOID:9074,0
2717943,MeSH:D047188,DOID:2935,0
2717944,MeSH:D050822,DOID:3393,0
2717945,MeSH:D054428,DOID:9351,0


In [6]:
# Drug-Gene: keep the first three columns, relabel the last of them as
# 'Interaction', then drop rows with missing values and renumber the index.
filename = 'D_G'
D_G_csv = os.path.join(filepath, f'{filename}_res.csv')
df = pd.read_csv(D_G_csv).iloc[:, :3]
D_G_df = (
    df
    .rename(columns={df.columns[-1]: 'Interaction'})
    .dropna(ignore_index=True)
)
D_G_df

Unnamed: 0,Drug,Gene,Interaction
0,DrugBank:DB00114,HGNC:4855,1
1,DrugBank:DB00117,HGNC:4855,1
2,DrugBank:DB00142,HGNC:29570,1
3,DrugBank:DB02340,HGNC:3531,1
4,DrugBank:DB11300,HGNC:3531,1
...,...,...,...
1303741,DrugBank:DB00619,HGNC:29650,0
1303742,DrugBank:DB00619,HGNC:18591,0
1303743,DrugBank:DB00619,HGNC:11088,0
1303744,DrugBank:DB00619,HGNC:16870,0


In [7]:
# Disease-Disease: keep the first three columns, relabel the last of them as
# 'Interaction', then drop rows with missing values and renumber the index.
filename = 'Di_Di'
Di_Di_csv = os.path.join(filepath, f'{filename}_res.csv')
df = pd.read_csv(Di_Di_csv).iloc[:, :3]
Di_Di_df = (
    df
    .rename(columns={df.columns[-1]: 'Interaction'})
    .dropna(ignore_index=True)
)
Di_Di_df

Unnamed: 0,Disease_1,Disease_2,Interaction
0,DOID:0001816,DOID:175,1
1,DOID:175,DOID:176,1
2,DOID:0002116,DOID:10124,1
3,DOID:10124,DOID:5614,1
4,DOID:0014667,DOID:4,1
...,...,...,...
11067,DOID:219,DOID:8577,0
11068,DOID:2994,DOID:13499,0
11069,DOID:1793,DOID:10534,0
11070,DOID:219,DOID:3121,0


In [8]:
# Disease-Gene: keep the first three columns, relabel the last of them as
# 'Interaction', then drop rows with missing values and renumber the index.
filename = 'Di_G'
Di_G_csv = os.path.join(filepath, f'{filename}_res.csv')
df = pd.read_csv(Di_G_csv).iloc[:, :3]
Di_G_df = (
    df
    .rename(columns={df.columns[-1]: 'Interaction'})
    .dropna(ignore_index=True)
)
Di_G_df

Unnamed: 0,Disease,Gene,Interaction
0,DOID:263,HGNC:5417,1
1,DOID:1909,HGNC:2464,1
2,DOID:10763,HGNC:2357,1
3,DOID:2394,HGNC:7166,1
4,DOID:12849,HGNC:8529,1
...,...,...,...
27538769,MeSH:C567767,NCBI:852700,0
27538770,MeSH:C535468,NCBI:856241,0
27538771,DOID:7148,NCBI:887096,0
27538772,DOID:0111967,HGNC:1637,0


In [9]:
# Gene-Gene: keep the first three columns, relabel the last of them as
# 'Interaction', then drop rows with missing values and renumber the index.
filename = 'G_G'
G_G_csv = os.path.join(filepath, f'{filename}_res.csv')
df = pd.read_csv(G_G_csv).iloc[:, :3]
G_G_df = (
    df
    .rename(columns={df.columns[-1]: 'Interaction'})
    .dropna(ignore_index=True)
)
G_G_df

Unnamed: 0,Gene_1,Gene_2,Interaction
0,HGNC:14497,HGNC:27538,1
1,HGNC:3573,HGNC:26045,1
2,HGNC:44200,HGNC:14358,1
3,HGNC:9936,HGNC:20714,1
4,HGNC:8117,HGNC:3712,1
...,...,...,...
735151,HGNC:14605,HGNC:7,0
735152,HGNC:1504,HGNC:21498,0
735153,HGNC:1505,HGNC:25697,0
735154,HGNC:1507,HGNC:21498,0


## iBKH Dataset


In [12]:
from utils.datasets import IBKHDataset

In [13]:
# Instantiate the project dataset wrapper on the iBKH directory and call
# build_hetero_data() to obtain the graph object.
# TODO: double-check the result of build_hetero_data(). It looks fine, but
# confirm whether the edges are meant to be split into distinct edge types.
iKBH_dataset = IBKHDataset(data_dir='data/iBKH')
graph = iKBH_dataset.build_hetero_data()