# Merging 30 Datasets

## Imports

In [None]:
from tdc.multi_pred import DTI
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

## Drug-Target Interaction Prediction

https://tdcommons.ai/multi_pred_tasks/dti/

### Data Extraction

In [None]:
# Load BindingDB_Kd through its own loader object. convert_to_log is called
# for its effect on the loader (its return value is not used); the value kept
# is whatever harmonize_affinities(mode='max_affinity') returns, which later
# cells index with ['Y'] and call .describe() on.
bindingdb_kd_loader = DTI(name='BindingDB_Kd')
bindingdb_kd_loader.convert_to_log(form='standard')
BindingDB_Kd = bindingdb_kd_loader.harmonize_affinities(mode='max_affinity')

In [None]:
# Load DAVIS through its own loader object. convert_to_log is called for its
# effect on the loader (its return value is not used); get_data() then yields
# the table that later cells index with ['Y'].
davis_loader = DTI(name='DAVIS')
davis_loader.convert_to_log(form='standard')
DAVIS = davis_loader.get_data()

In [None]:
# Load KIBA through its own loader object. convert_to_log is called for its
# effect on the loader (its return value is not used); get_data() then yields
# the table that later cells index with ['Y'].
kiba_loader = DTI(name='KIBA')
kiba_loader.convert_to_log(form='standard')
KIBA = kiba_loader.get_data()

In [None]:
# Histogram of the 'Y' column of each DTI dataset (one plot per cell).
# convert_to_log(form='standard') was called on every loader before these
# tables were produced.
sns.histplot(BindingDB_Kd['Y'])

In [None]:
sns.histplot(DAVIS['Y'])

In [None]:
sns.histplot(KIBA['Y'])

In [None]:
# Summary of BindingDB_Kd via .describe().
BindingDB_Kd.describe()

### Data Cleaning

In [None]:
# DTI tables to clean. The order matters: the cleaning cell unpacks its
# results positionally as BindingDB_Kd_clean, DAVIS_clean, KIBA_clean.
interactive_ds = [BindingDB_Kd, DAVIS, KIBA]

In [None]:
# Turn every DTI table into a Drug x Target matrix of binned 'Y' values.
clean_ds = []

# Codes for four equal-width bins, listed in ascending-Y order:
# the lowest-Y bin is coded 3 and the highest-Y bin is coded 0.
bin_codes = [3, 2, 1, 0]

for frame in interactive_ds:
    y_values = frame['Y']
    bin_edges = np.linspace(y_values.min(), y_values.max(), num=len(bin_codes) + 1)

    # The binned column is written onto the source frame itself, so each
    # table in interactive_ds gains a 'Y_cat' column.
    frame['Y_cat'] = pd.cut(y_values, bins=bin_edges, labels=bin_codes, include_lowest=True)

    # Keep the first row of every (Drug, Target) pair so the pivot below
    # sees unique index/column combinations.
    unique_pairs = frame[['Drug', 'Target', 'Y_cat']].drop_duplicates(
        subset=['Drug', 'Target'], keep='first'
    )
    matrix = unique_pairs.pivot(index='Drug', columns='Target', values='Y_cat')

    # Pairs absent from the table are filled with 0, the same code as the
    # highest-Y bin.
    clean_ds.append(matrix.fillna(0))

BindingDB_Kd_clean, DAVIS_clean, KIBA_clean = clean_ds[:3]

In [None]:
BindingDB_Kd_clean

In [None]:
DTI_merged = pd.merge(BindingDB_Kd_clean,DAVIS_clean, how ='outer', on = 'Drug').merge(
    KIBA_clean, how='outer',on ='Drug').fillna(0)

In [None]:
DTI_merged

## ADME Property Prediction

https://tdcommons.ai/single_pred_tasks/adme/

### Data Extraction

In [None]:
from tdc.single_pred import ADME

In [None]:
# Each cell below requests one ADME dataset by name and keeps the result of
# .get_data() in a variable named after that dataset. The cleaning cell
# further down collects these variables into the ADME_dfs dict, so the
# variable names must stay in sync with it.
Caco2_Wang = ADME(name = 'Caco2_Wang').get_data()

In [None]:
HIA_Hou = ADME(name = 'HIA_Hou').get_data()

In [None]:
Pgp_Broccatelli = ADME(name = 'Pgp_Broccatelli').get_data()

In [None]:
Bioavailability_Ma = ADME(name = 'Bioavailability_Ma').get_data()

In [None]:
Lipophilicity_AstraZeneca = ADME(name = 'Lipophilicity_AstraZeneca').get_data()

In [None]:
Solubility_AqSolDB = ADME(name = 'Solubility_AqSolDB').get_data()

In [None]:
HydrationFreeEnergy_FreeSolv = ADME(name = 'HydrationFreeEnergy_FreeSolv').get_data()

In [None]:
BBB_Martins = ADME(name = 'BBB_Martins').get_data()

In [None]:
PPBR_AZ = ADME(name = 'PPBR_AZ').get_data()

In [None]:
VDss_Lombardo = ADME(name = 'VDss_Lombardo').get_data()

In [None]:
CYP2C19_Veith = ADME(name = 'CYP2C19_Veith').get_data()

In [None]:
CYP2D6_Veith = ADME(name = 'CYP2D6_Veith').get_data()

In [None]:
CYP3A4_Veith = ADME(name = 'CYP3A4_Veith').get_data()

In [None]:
CYP1A2_Veith= ADME(name = 'CYP1A2_Veith').get_data()

In [None]:
CYP2C9_Veith = ADME(name = 'CYP2C9_Veith').get_data()

In [None]:
CYP2C9_Substrate_CarbonMangels = ADME(name = 'CYP2C9_Substrate_CarbonMangels').get_data()

In [None]:
CYP2D6_Substrate_CarbonMangels = ADME(name = 'CYP2D6_Substrate_CarbonMangels').get_data()

In [None]:
CYP3A4_Substrate_CarbonMangels = ADME(name = 'CYP3A4_Substrate_CarbonMangels').get_data()

In [None]:
Half_Life_Obach = ADME(name = 'Half_Life_Obach').get_data()

In [None]:
Clearance_Hepatocyte_AZ = ADME(name = 'Clearance_Hepatocyte_AZ').get_data()

### Data Cleaning

In [None]:


from functools import reduce

# Raw ADME tables keyed by dataset name; the key becomes the name of that
# dataset's label column in the merged table.
ADME_dfs = {
    'Caco2_Wang': Caco2_Wang,
    'HIA_Hou': HIA_Hou,
    'Pgp_Broccatelli': Pgp_Broccatelli,
    'Bioavailability_Ma': Bioavailability_Ma,
    'Lipophilicity_AstraZeneca': Lipophilicity_AstraZeneca,
    'Solubility_AqSolDB': Solubility_AqSolDB,
    'HydrationFreeEnergy_FreeSolv': HydrationFreeEnergy_FreeSolv,
    'BBB_Martins': BBB_Martins,
    'PPBR_AZ': PPBR_AZ,
    'VDss_Lombardo': VDss_Lombardo,
    'CYP2C19_Veith': CYP2C19_Veith,
    'CYP2D6_Veith': CYP2D6_Veith,
    'CYP3A4_Veith': CYP3A4_Veith,
    'CYP1A2_Veith': CYP1A2_Veith,
    'CYP2C9_Veith': CYP2C9_Veith,
    'CYP2C9_Substrate_CarbonMangels': CYP2C9_Substrate_CarbonMangels,
    'CYP2D6_Substrate_CarbonMangels': CYP2D6_Substrate_CarbonMangels,
    'CYP3A4_Substrate_CarbonMangels': CYP3A4_Substrate_CarbonMangels,
    'Half_Life_Obach': Half_Life_Obach,
    'Clearance_Hepatocyte_AZ': Clearance_Hepatocyte_AZ,
}

# Rename each table's 'Y' column to the dataset name and drop 'Drug_ID' so the
# tables are joined on 'Drug' only (presumably every table is just
# Drug_ID / Drug / Y -- confirm).
# rename/drop return new frames here instead of mutating the raw tables in
# place: with in-place edits, re-running this cell fails with a KeyError
# because 'Drug_ID' has already been removed.
ADME_df_clean = []
for name, frame in ADME_dfs.items():
    print(name)
    ADME_df_clean.append(frame.rename(columns={'Y': name}).drop(columns='Drug_ID'))

# Keep ADME_dfs pointing at the cleaned tables, as it did before.
ADME_dfs = dict(zip(ADME_dfs, ADME_df_clean))

# Outer-join every table on 'Drug'; a drug missing from a dataset gets 0 in
# that dataset's column.
# NOTE(review): 0 may also be a genuine label value, so after fillna(0)
# "not measured" cannot be told apart from a real 0 -- confirm this is intended.
# The variable name AMDE_merged (sic) is kept because the final merge cell
# refers to it.
AMDE_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['Drug'], how='outer'),
    ADME_df_clean,
).fillna(0)

AMDE_merged

## Toxicity Prediction Task

https://tdcommons.ai/single_pred_tasks/tox/

### Data Extraction

In [None]:
from tdc.single_pred import Tox

In [None]:
# Each cell below requests one toxicity dataset by name and keeps the result
# of .get_data() in a variable named after that dataset. The cleaning cell
# further down collects these variables into the TOX_dfs dict.
LD50_Zhu = Tox(name = 'LD50_Zhu').get_data()

In [None]:
hERG = Tox(name = 'hERG').get_data()

In [None]:
from tdc.utils import retrieve_label_name_list
# For herg_central, Tox21 and ToxCast only the first entry of the label list
# (label_list[0]) is loaded. label_list is rebound in each of those cells.
label_list = retrieve_label_name_list('herg_central')
herg_central = Tox(name = 'herg_central', label_name = label_list[0]).get_data()

In [None]:
AMES = Tox(name = 'AMES').get_data()

In [None]:
DILI = Tox(name = 'DILI').get_data()

In [None]:
Skin_Reaction = Tox(name = 'Skin Reaction').get_data()

In [None]:
Carcinogens_Lagunin = Tox(name = 'Carcinogens_Lagunin').get_data()

In [None]:
label_list = retrieve_label_name_list('Tox21')
Tox21 = Tox(name = 'Tox21', label_name = label_list[0]).get_data()

In [None]:
# NOTE(review): the name is spelled 'Toxcast' here but 'ToxCast' in the Tox()
# call below -- confirm the label lookup accepts both spellings.
label_list = retrieve_label_name_list('Toxcast')
ToxCast = Tox(name = 'ToxCast', label_name = label_list[0]).get_data()

In [None]:
ClinTox = Tox(name = 'ClinTox').get_data()

### Data Cleaning

In [None]:
from functools import reduce

# Raw toxicity tables keyed by dataset name; the key becomes the name of that
# dataset's label column in the merged table.
TOX_dfs = {
    'LD50_Zhu': LD50_Zhu,
    'hERG': hERG,
    'herg_central': herg_central,
    'AMES': AMES,
    'DILI': DILI,
    'Skin_Reaction': Skin_Reaction,
    'Carcinogens_Lagunin': Carcinogens_Lagunin,
    'Tox21': Tox21,
    'ToxCast': ToxCast,
    'ClinTox': ClinTox,
}

# Rename each table's 'Y' column to the dataset name and drop 'Drug_ID' so the
# tables are joined on 'Drug' only (presumably every table is just
# Drug_ID / Drug / Y -- confirm).
# rename/drop return new frames here instead of mutating the raw tables in
# place: with in-place edits, re-running this cell fails with a KeyError
# because 'Drug_ID' has already been removed.
TOX_df_clean = []
for name, frame in TOX_dfs.items():
    print(name)
    TOX_df_clean.append(frame.rename(columns={'Y': name}).drop(columns='Drug_ID'))

# Keep TOX_dfs pointing at the cleaned tables, as it did before.
TOX_dfs = dict(zip(TOX_dfs, TOX_df_clean))

# Outer-join every table on 'Drug'; a drug missing from a dataset gets 0 in
# that dataset's column.
# NOTE(review): 0 may also be a genuine label value, so after fillna(0)
# "not measured" cannot be told apart from a real 0 -- confirm this is intended.
TOX_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['Drug'], how='outer'),
    TOX_df_clean,
).fillna(0)
TOX_merged

## Major Merge

In [None]:
merged_df = DTI_merged.merge(AMDE_merged, on ='Drug', how='outer').merge(TOX_merged, on ='Drug', how='outer').fillna(0)

In [None]:
merged_df

In [None]:
merged_df_dropped = merged_df.copy()

In [None]:
# Count the columns in which more than 99.99% of the values are 0.
# The share of zeros comes from a boolean mask rather than value_counts()[0]:
# that lookup raises KeyError when a column contains no zeros at all.
# NOTE(review): the first column is skipped, presumably because it holds the
# 'Drug' identifier -- confirm the column order after the merge.
count = 0
for col in merged_df_dropped.columns[1:]:
    zero_share = (merged_df_dropped[col] == 0).mean()
    if zero_share > 0.9999:
        count += 1
print(count)

In [None]:
# Drop the columns in which more than 99.99% of the values are 0.
# The share of zeros comes from a boolean mask rather than value_counts()[0]:
# that lookup raises KeyError when a column contains no zeros at all.
# NOTE(review): the first column is skipped, presumably because it holds the
# 'Drug' identifier -- confirm the column order after the merge.
sparse_cols = [
    col
    for col in merged_df_dropped.columns[1:]
    if (merged_df_dropped[col] == 0).mean() > 0.9999
]
# A single drop call removes all selected columns at once instead of
# modifying the frame once per column.
merged_df_dropped.drop(columns=sparse_cols, inplace=True)

In [None]:
# Write the pruned table to CSV. index= is left at its default, so the index
# is written as well.
# NOTE(review): the destination is an absolute path on one user's machine --
# adjust it before running this notebook elsewhere.
merged_df_dropped.to_csv('/Users/george/Desktop/LW-DDI Project/xgb_merged_df_dropped.csv')

In [None]:
# Preview the first rows. head has to be called: the bare attribute only
# displays the bound method, not a preview of the table.
merged_df_dropped.head()