# Data Processing

In [1]:
# Imports
import os
import torch
import numpy as np
import pandas as pd
from scipy.io import mmread

---
# iBKH

In [2]:
# DATA
# Directory containing the iBKH CSV files (vocabularies and `*_res.csv`
# relation tables) that the cells below read via f'{IBKH_PATH}/...'.
IBKH_PATH = 'data/iBKH'

In [3]:
# Disease vocabulary: lookup from the 'primary' identifier to its 'icd_9' value.
disease_vocab = pd.read_csv(f'{IBKH_PATH}/disease_vocab.csv')
disease_conv = {
    primary_id: icd9
    for primary_id, icd9 in zip(disease_vocab['primary'], disease_vocab['icd_9'])
}

# Gene vocabulary: lookup from the 'primary' identifier to its 'symbol' value.
gene_vocab = pd.read_csv(f'{IBKH_PATH}/gene_vocab.csv')
gene_conv = {
    primary_id: symbol
    for primary_id, symbol in zip(gene_vocab['primary'], gene_vocab['symbol'])
}

In [11]:
# Drug-Drug relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'D_D'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

df

Unnamed: 0,Drug_1,Drug_2,Interaction
0,DrugBank:DB00001,DrugBank:DB06605,1
1,DrugBank:DB00001,DrugBank:DB06695,1
2,DrugBank:DB00001,DrugBank:DB01254,1
3,DrugBank:DB00001,DrugBank:DB01609,1
4,DrugBank:DB00001,DrugBank:DB01586,1
...,...,...,...
2684677,DrugBank:DB00190,DrugBank:DB01064,0
2684678,DrugBank:DB00487,DrugBank:DB06771,0
2684679,DrugBank:DB01201,DrugBank:DB01220,0
2684680,DrugBank:DB06147,DrugBank:DB00664,0


In [12]:
# Drug-Disease relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'D_Di'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

# Translate the disease identifiers (second column) through disease_conv;
# entries with no usable mapping keep their original identifier.
disease_ids = df.iloc[:, 1]
df.iloc[:, 1] = disease_ids.map(disease_conv).fillna(disease_ids)

df

Unnamed: 0,Drug,Disease,Treats
0,DrugBank:DB00997,179,1
1,DrugBank:DB00206,401-405.99,1
2,DrugBank:DB00960,401-405.99,1
3,DrugBank:DB00665,185,1
4,DrugBank:DB00290,186,1
...,...,...,...
2717942,MeSH:D043168,710.0,0
2717943,MeSH:D047188,DOID:2935,0
2717944,MeSH:D050822,410-414.99,0
2717945,MeSH:D054428,250,0


In [13]:
# Drug-Gene relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'D_G'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

# Translate the gene identifiers (second column) through gene_conv;
# entries with no usable mapping keep their original identifier.
gene_ids = df.iloc[:, 1]
df.iloc[:, 1] = gene_ids.map(gene_conv).fillna(gene_ids)

df

Unnamed: 0,Drug,Gene,Target
0,DrugBank:DB00114,HDC,1
1,DrugBank:DB00117,HDC,1
2,DrugBank:DB00142,GLS2,1
3,DrugBank:DB02340,F13A1,1
4,DrugBank:DB11300,F13A1,1
...,...,...,...
1303742,DrugBank:DB00619,PKMYT1,0
1303743,DrugBank:DB00619,NEK9,0
1303744,DrugBank:DB00619,SLK,0
1303745,DrugBank:DB00619,MELK,0


In [14]:
# Disease-Disease relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'Di_Di'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

# Both endpoint columns hold disease identifiers: translate each through
# disease_conv, falling back to the original identifier where no mapping applies.
for col_pos in (0, 1):
    disease_ids = df.iloc[:, col_pos]
    df.iloc[:, col_pos] = disease_ids.map(disease_conv).fillna(disease_ids)

df

Unnamed: 0,Disease_1,Disease_2,is_a
0,DOID:0001816,DOID:175,1
1,DOID:175,DOID:176,1
2,DOID:0002116,371.9,1
3,371.9,379.90,1
4,277.9,DOID:4,1
...,...,...,...
11067,153,556,0
11068,DOID:2994,152.1,0
11069,157.0,151,0
11070,153,156.0,0


In [None]:
# Disease-Gene relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'Di_G'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

# First column is translated through disease_conv, second through gene_conv;
# entries with no usable mapping keep their original identifier.
for col_pos, label_conv in ((0, disease_conv), (1, gene_conv)):
    raw_ids = df.iloc[:, col_pos]
    df.iloc[:, col_pos] = raw_ids.map(label_conv).fillna(raw_ids)

df

In [9]:
# Gene-Gene relation table: load the CSV, then keep its first three columns.
FILE_NAME = 'G_G'
df = pd.read_csv(f'{IBKH_PATH}/{FILE_NAME}_res.csv')
df = df.iloc[:, :3]
# Optional filter (disabled): keep rows whose third column equals 1 and drop that column.
# df = df[df.iloc[:, 2] == 1].reset_index(drop=True).iloc[:, 0:2]

# Both endpoint columns hold gene identifiers: translate each through
# gene_conv, falling back to the original identifier where no mapping applies.
for col_pos in (0, 1):
    gene_ids = df.iloc[:, col_pos]
    df.iloc[:, col_pos] = gene_ids.map(gene_conv).fillna(gene_ids)

df

Unnamed: 0,Gene_1,Gene_2,Covaries
0,IMP3,OR8U8,1
1,FADD,C1orf56,1
2,TRABD2B,IRX1,1
3,OPN1LW,ZDHHC16,1
4,TENM1,FKBP1B,1
...,...,...,...
735151,ADAMTS12,A2M,0
735152,CASP3,ATG16L1,0
735153,CASP4,GSDMD,0
735154,CASP6,ATG16L1,0


## Dataset


In [4]:
from utils.datasets import IBKHDataset

In [5]:
# Build the iBKH dataset from the same directory the cells above read from.
# IBKH_PATH (defined in the DATA cell) replaces the duplicated 'data/iBKH'
# literal so the two locations cannot drift apart.
# NOTE: the variable is spelled `iKBH_dataset` (not `iBKH_dataset`); the name is
# kept because the EHR and PBMC cells below reference it.
# TODO: Save row2entity, so I don't have to re-run this every time.
iKBH_dataset = IBKHDataset(data_dir=IBKH_PATH)
input_triplet = iKBH_dataset.build_data()

Building triples...:   0%|          | 0/6 [00:00<?, ?relations/s]

In [6]:
# Inputs processing
# Persist the triples built above. The target directory is created first,
# because torch.save does not create missing parent directories.
triplet_path = 'inputs/KGE/input_triplet.pt'
os.makedirs(os.path.dirname(triplet_path), exist_ok=True)
torch.save(input_triplet, triplet_path)

---
# EHR

In [6]:
# DATA
# Location of the MIMIC-III CSV files; a leading '~' (if any) is expanded.
filepath = os.path.expanduser('data/MIMIC-III')

In [7]:
# ICD-9 diagnosis dictionary (codes with short and long titles); saved later
# as optional metadata next to the EHR inputs.
icd_metadata_path = os.path.join(filepath, 'D_ICD_DIAGNOSES.csv')
icd_metadata_df = pd.read_csv(icd_metadata_path)
icd_metadata_df

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,55,0065,Amebic brain abscess,Amebic brain abscess
1,56,0066,Amebic skin ulceration,Amebic skin ulceration
2,57,0068,Amebic infection NEC,Amebic infection of other sites
3,58,0069,Amebiasis NOS,"Amebiasis, unspecified"
4,59,0070,Balantidiasis,Balantidiasis
...,...,...,...,...
14562,10053,V1368,"Hx-cong malform-skin,ms",Personal history of (corrected) congenital mal...
14563,11201,79508,Unsat cerv cytlogy smear,Unsatisfactory cervical cytology smear
14564,11202,79509,Abn pap cervix HPV NEC,Other abnormal Papanicolaou smear of cervix an...
14565,11203,79510,Abn gland pap smr vagina,Abnormal glandular Papanicolaou smear of vagina


In [8]:
# Load one (patient, diagnosis code) pair per row from the MIMIC-III diagnoses table.
# ICD9_CODE is pinned to `str` so codes are never parsed as numbers: numeric
# parsing would strip leading zeros (e.g. '0065' -> 65) and could leave the
# column with mixed int/str values, which breaks matching on code strings.
# Rows with a missing patient id or code are dropped, and the columns are
# renamed in the same chain (no in-place mutation).
ehr_df = (
    pd.read_csv(os.path.join(filepath, 'DIAGNOSES_ICD.csv'),
                usecols=['SUBJECT_ID', 'ICD9_CODE'],
                dtype={'ICD9_CODE': str})
    .dropna(ignore_index=True)
    .rename(columns={
        'SUBJECT_ID': 'patient_id',
        'ICD9_CODE': 'icd9_code',
    })
)

# Isolate Immune conditions # TODO: Discuss immune condition issues
# immune_icd9_codes = [ '7100', '7140', '725', '2790', '2791', '9953', '2880', '5550', '5560', '99527', '2792', '2387', '2885', '2886', '9952' ]
# ehr_df = ehr_df[ehr_df['icd9_code'].isin(immune_icd9_codes)]

# Make into Bag-of-Words (BoW): one row per patient, one column per ICD-9 code,
# each cell counting how often that code occurs for that patient (0 if never).
ehr_df = (ehr_df
          .pivot_table(index='patient_id',
                       columns='icd9_code',
                       aggfunc='size',
                       fill_value=0)
          .sort_index())

In [12]:
# Re-order based on iBKH
# Walk the iBKH entities in row order and collect every disease whose name
# is also a column of the EHR table (present in iBKH AND in EHR).
disease_order = []
for entity_type, entity_name in iKBH_dataset.row2entity:
    if entity_type != 'disease':
        continue
    if entity_name in ehr_df.columns:
        disease_order.append(entity_name)

# Keep only the shared diseases, in iBKH order, then drop patients that
# have no remaining non-zero count.
ehr_df = ehr_df[disease_order]
has_any_code = (ehr_df != 0).any(axis=1)
ehr_df = ehr_df.loc[has_any_code]

# DataFrame Shape(patient_id, icd_9)
ehr_data = ehr_df
# Tensor
X_ehr = torch.Tensor(ehr_data.values)

In [13]:
# Inputs processing
input_filepath = 'inputs/GraphETM'

# Create the output folders up front (including the 'optional' subfolder):
# neither torch.save nor DataFrame.to_csv creates missing directories.
os.makedirs(os.path.join(input_filepath, 'optional'), exist_ok=True)

torch.save(X_ehr, os.path.join(input_filepath, 'X_ehr.pt'))
# index=False: the patient_id index is not written, so CSV rows are
# identified only by position (same row order as X_ehr).
ehr_data.to_csv(os.path.join(input_filepath, 'input_EHR.csv'), index=False)

# (Optional) Metadata
icd_metadata_df.to_csv(os.path.join(input_filepath, 'optional/input_EHR_metadata.csv'), index=False)

In [14]:
# Save disease indices (for rho)
# Positions within iKBH_dataset.row2entity of every disease entity whose
# name is also an EHR column (present in iBKH AND in EHR).
id_embed_ehr = []
for row_idx, (entity_type, entity_name) in enumerate(iKBH_dataset.row2entity):
    if entity_type == 'disease' and entity_name in ehr_df.columns:
        id_embed_ehr.append(row_idx)

id_embed_ehr_path = os.path.join(input_filepath, 'id_embed_ehr.npy')
np.save(id_embed_ehr_path, np.array(id_embed_ehr))

---
# PBMC

In [6]:
# DATA
# Location of the PBMC files; a leading '~' (if any) is expanded.
filepath = os.path.expanduser('data/PBMC')

In [12]:
# Read the Matrix Market count matrix and densify it.
mtx_path = os.path.join(filepath, 'matrix.mtx')
pbmc_dense = mmread(mtx_path).tocsc().toarray()

# genes.tsv has no header row; its column 1 supplies the column labels.
genes_path = os.path.join(filepath, 'genes.tsv')
pbmc_names = pd.read_csv(genes_path, sep='\t', header=None)

# Transpose so that each DataFrame column corresponds to one row of genes.tsv.
pbmc_data = pd.DataFrame(pbmc_dense.T, columns=pbmc_names[1])

In [13]:
# Make dataframe unique
# Flag column labels that repeat an earlier label and keep only the
# first occurrence of each.
is_repeated_label = pbmc_data.columns.duplicated()
pbmc_data = pbmc_data.loc[:, ~is_repeated_label]

In [14]:
# Re-order based on iBKH
# Walk the iBKH entities in row order and collect every gene whose name
# is also a column of the PBMC table (present in iBKH AND in PBMC).
gene_order = []
for entity_type, entity_name in iKBH_dataset.row2entity:
    if entity_type != 'gene':
        continue
    if entity_name in pbmc_data.columns:
        gene_order.append(entity_name)

# DataFrame restricted to the shared genes, in iBKH order
pbmc_data = pbmc_data[gene_order]
# Tensor
X_sc = torch.Tensor(pbmc_data.values)

In [15]:
# PBMC Input Processing
input_filepath = 'inputs/GraphETM'

# Create the output folder up front: neither torch.save nor
# DataFrame.to_csv creates missing directories.
os.makedirs(input_filepath, exist_ok=True)

torch.save(X_sc, os.path.join(input_filepath, 'X_sc.pt'))
# index=False: only the gene columns are written, not the row index.
pbmc_data.to_csv(os.path.join(input_filepath, 'input_PBMC.csv'), index=False)

In [16]:
# Save gene indices (for rho)
# Positions within iKBH_dataset.row2entity of every gene entity whose
# name is also a PBMC column (present in iBKH AND in PBMC).
id_embed_sc = []
for row_idx, (entity_type, entity_name) in enumerate(iKBH_dataset.row2entity):
    if entity_type == 'gene' and entity_name in pbmc_data.columns:
        id_embed_sc.append(row_idx)

id_embed_sc_path = os.path.join(input_filepath, 'id_embed_sc.npy')
np.save(id_embed_sc_path, np.array(id_embed_sc))

In [None]:
# DONE