In [26]:
import numpy as np
import pandas as pd
import pathlib as path
import os
import requests as rq
from tqdm.notebook import tqdm

### IDでHarvard DataverseからTAB形式でダウンロード
### 区切り文字は\t

In [25]:
# TDC metadata.pyより name2id_dictionary
name2id = {
    "bbb_adenot": 4259565,
    "bbb_martins": 4259566,
    "b3db_classification": 7878566,
    "b3db_regression": 7878567,
    "bindingdb_ic50": 4291560,
    "bindingdb_kd": 4291555,
    "bindingdb_ki": 4291556,
    "bindingdb_patent": 4724851,
    "bioavailability_ma": 4259567,
    "caco2_wang": 4259569,
    "pampa_ncats": 6695858,
    "approved_pampa_ncats": 6695857,
    "clearance_edrug3d": 4259571,
    "clintox": 4259572,
    "cyp1a2_veith": 4259573,
    "cyp2c19_veith": 4259576,
    "cyp2c9_veith": 4259577,
    "cyp2d6_veith": 4259580,
    "cyp3a4_veith": 4259582,
    "cyp2c9_substrate_carbonmangels": 4259584,
    "cyp2d6_substrate_carbonmangels": 4259578,
    "cyp3a4_substrate_carbonmangels": 4259581,
    "carcinogens_lagunin": 4259570,
    "davis": 5219748,
    "drugbank": 4139573,
    "drugcomb": 4215720,
    "f20_edrug3d": 4259586,
    "f30_edrug3d": 4259589,
    "halflife_edrug3d": 4259587,
    "hia_hou": 4259591,
    "hiv": 4259593,
    "huri": 4139567,
    "hydrationfreeenergy_freesolv": 4259594,
    "kiba": 5255037,
    "lipophilicity_astrazeneca": 4259595,
    "pgp_broccatelli": 4259597,
    "ppbr_edrug3d": 4259600,
    "ppbr_ma": 4259603,
    "sarscov2_3clpro_diamond": 4259606,
    "sarscov2_vitro_touret": 4259607,
    "orexin1_receptor_butkiewicz": 6894447,
    "m1_muscarinic_receptor_agonists_butkiewicz": 6894443,
    "m1_muscarinic_receptor_antagonists_butkiewicz": 6894446,
    "potassium_ion_channel_kir2.1_butkiewicz": 6894442,
    "kcnq2_potassium_channel_butkiewicz": 6894444,
    "cav3_t-type_calcium_channels_butkiewicz": 6894445,
    "choline_transporter_butkiewicz": 6894441,
    "serine_threonine_kinase_33_butkiewicz": 6894448,
    "tyrosyl-dna_phosphodiesterase_butkiewicz": 6894440,
    "solubility_aqsoldb": 4259610,
    "tox21": 4259612,
    "toxcast": 4259613,
    "twosides": 4139574,
    "vd_edrug3d": 4259618,
    "mhc1_iedb-imgt_nielsen": 4167073,
    "mhc2_iedb_jensen": 4167074,
    "zinc": 4170963,
    "moses": 4170962,
    "chembl": 4170965,
    "chembl_v29": 5767979,
    "qed": 4170959,
    "drd2": 4170957,
    "logp": 4170961,
    "gdsc1": 4165726,
    "gdsc2": 4165727,
    "iedb_jespersen": 4165725,
    "pdb_jespersen": 4165724,
    "qm7": 6358510,
    "qm7b": 6358512,
    "qm8": 6358513,
    "qm9": 6179310,  ### 4167112, 6175612
    #  'scpdb': None,
    #  'dude': None,
    #  'crossdock': None,
    "tap": 4167113,
    "sabdab_chen": 4167164,
    "protein_sabdab": 4167357,
    "oncopolypharmacology": 4167358,
    "mirtarbase": 4167359,
    "disgenet": 4168282,
    "sabdab_liberis": 4168425,
    "uspto50k": 4171823,
    "buchwald-hartwig": 6175640,
    "uspto_yields": 4186956,
    "uspto_catalyst": 4171574,
    "uspto": 4171642,
    "hetionet": 4201734,
    "herg": 4259588,
    "herg_central": 5740618,
    "herg_karim": 6822246,
    "dili": 4259585,
    "ppbr_az": 6413140,
    "ames": 4259564,
    "skin_reaction": 4259609,
    "clearance_microsome_az": 4266186,
    "clearance_hepatocyte_az": 4266187,
    "ld50_zhu": 4267146,
    "half_life_obach": 4266799,
    "vdss_lombardo": 4267387,
    "leenay": 4279966,
    "test_single_pred": 4832455,
    "test_multi_pred": 4832456,
    "gdsc_gene_symbols": 5255026,
    "weber": 5790963,
    "primekg": 6180626,
    "primekg_drug_feature": 6180619,
    "primekg_disease_feature": 6180618,
    "drug_comb_meta_data": 7104245,
    "phase1": 7331305,
    "phase2": 7331306,
    "phase3": 7331307,
    "brown_mdm2_ace2_12ca5": 9649623,
    "scperturb_drug_AissaBenevolenskaya2021": 9845396,
    "scperturb_drug_SrivatsanTrapnell2020_sciplex2": 9845394,
    "scperturb_drug_SrivatsanTrapnell2020_sciplex3": 9845397,
    "scperturb_drug_SrivatsanTrapnell2020_sciplex4": 9845395,
    "scperturb_drug_ZhaoSims2021": 9845393,
    "scperturb_gene_NormanWeissman2019": 10133995,
    "scperturb_gene_ReplogleWeissman2022_rpe1": 10133996,
    "scperturb_gene_ReplogleWeissman2022_k562_essential": 10134031,
    "opentargets_ra_data_splits": 10141152,
    "opentargets_ibd_data_splits": 10141151,
    "opentargets_ra_data_splits_idx": 10143574,
    "opentargets_ibd_data_splits_idx": 10143573,
    "opentargets_ra_drug_evidence": 10141153,
    "opentargets_ibd_drug_evidence": 10141154,
    "hlm": 10218426,
    "rlm": 10218425,
    "tchard_full": 10228321,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_test-0": 10228304,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_test-1": 10228296,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_test-2": 10228328,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_test-3": 10228299,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_test-4": 10228330,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_train-0": 10228331,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_train-1": 10228334,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_train-2": 10228324,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_train-3": 10228325,
    "tchard_pep_cdr3b_cdr3a_mhc_only_neg_assays_train-4": 10228327,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_test-0": 10228320,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_test-1": 10228295,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_test-2": 10228297,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_test-3": 10228294,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_test-4": 10228309,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_train-0": 10228301,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_train-1": 10228310,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_train-2": 10228315,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_train-3": 10228311,
    "tchard_pep_cdr3b_cdr3a_mhc_only_sampled_negs_train-4": 10228335,
    "tchard_pep_cdr3b_only_neg_assays_test-0": 10228300,
    "tchard_pep_cdr3b_only_neg_assays_test-1": 10228302,
    "tchard_pep_cdr3b_only_neg_assays_test-2": 10228305,
    "tchard_pep_cdr3b_only_neg_assays_test-3": 10228298,
    "tchard_pep_cdr3b_only_neg_assays_test-4": 10228319,
    "tchard_pep_cdr3b_only_neg_assays_train-0": 10228312,
    "tchard_pep_cdr3b_only_neg_assays_train-1": 10228317,
    "tchard_pep_cdr3b_only_neg_assays_train-2": 10228333,
    "tchard_pep_cdr3b_only_neg_assays_train-3": 10228318,
    "tchard_pep_cdr3b_only_neg_assays_train-4": 10228314,
    "tchard_pep_cdr3b_only_sampled_negs_test-0": 10228329,
    "tchard_pep_cdr3b_only_sampled_negs_test-1": 10228332,
    "tchard_pep_cdr3b_only_sampled_negs_test-2": 10228303,
    "tchard_pep_cdr3b_only_sampled_negs_test-3": 10228306,
    "tchard_pep_cdr3b_only_sampled_negs_test-4": 10228308,
    "tchard_pep_cdr3b_only_sampled_negs_train-0": 10228323,
    "tchard_pep_cdr3b_only_sampled_negs_train-1": 10228313,
    "tchard_pep_cdr3b_only_sampled_negs_train-2": 10228322,
    "tchard_pep_cdr3b_only_sampled_negs_train-3": 10228316,
    "tchard_pep_cdr3b_only_sampled_negs_train-4": 10228326,
    "cell_tissue_mg_edgelist": 10407107,
    "pinnacle_global_ppi_edgelist": 10407108,
    "pinnacle_protein_embed": 10407128,
    "pinnacle_labels_dict": 10409635,
    "panpep": 10428565,
    "pinnacle_output1": 10431072,
    "pinnacle_output2": 10431073,
    "pinnacle_output3": 10431078,
    "pinnacle_output4": 10431080,
    "pinnacle_output5": 10431077,
    "pinnacle_output6": 10431076,
    "pinnacle_output7": 10431079,
    "pinnacle_output8": 10431074,
    "pinnacle_output9": 10431075,
    "pinnacle_output10": 10431081,
    "geneformer_gene_median_dictionary": 10626278,
    "geneformer_gene_name_id_dict": 10626276,
    "geneformer_token_dictionary": 10626277,
}

In [24]:
tox_key = ["herg", "herg_karim", "herg_central", "ames", "dili", "skin_reaction", "ld50_zhu", "carcinogens_lagunin", "clintox_t", "tox21_t", "toxcast_t"]
cyp_key = ["cyp1a2_veith", "cyp2c19_veith", "cyp2c9_veith", "cyp2d6_veith", "cyp3a4_veith", "cyp2c9_substrate_carbonmangels", "cyp2d6_substrate_carbonmangels", "cyp3a4_substrate_carbonmangels"]

### herg_central(化合物306893種)は重すぎてrequest中にconnection broken → 手動DL

In [38]:
for key in tqdm(tox_key[:2] + tox_key[3:]):
    id = name2id[key]
    url = "https://dataverse.harvard.edu/api/access/datafile/" + str(id)
    data = rq.get(url).content
    with open(f"./Tox/{key}.csv", mode="wb") as f:
        f.write(data)
    df = pd.read_csv(f"./Tox/{key}.csv", sep="\t", index_col=0)
    df.to_csv(f"./Tox/{key}.csv")

  0%|          | 0/10 [00:00<?, ?it/s]

In [45]:
herg_central = pd.read_csv("./Tox/herg_central.csv", sep="\t", index_col=0)
herg_central.to_csv("./Tox/herg_central.csv")

In [41]:
for key in tqdm(cyp_key):
    id = name2id[key]
    url = "https://dataverse.harvard.edu/api/access/datafile/" + str(id)
    data = rq.get(url).content
    with open(f"./CYP/{key}.csv", mode="wb") as f:
        f.write(data)
    df = pd.read_csv(f"./CYP/{key}.csv", sep="\t", index_col=0)
    df.to_csv(f"./CYP/{key}.csv")

  0%|          | 0/8 [00:00<?, ?it/s]

# TEST

In [15]:
name2id["ames"]

4259564

In [16]:
test_url = "https://dataverse.harvard.edu/api/access/datafile/4259564"
test_data = rq.get(test_url).content

In [17]:
with open("test.csv", mode="wb") as f:
    f.write(test_data)

In [20]:
test_df = pd.read_csv("test.csv", sep="\t")
display(test_df.head())

Unnamed: 0,Drug_ID,Drug,Y
0,Drug 0,O=[N+]([O-])c1ccc2ccc3ccc([N+](=O)[O-])c4c5ccc...,1
1,Drug 1,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1
2,Drug 2,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0
3,Drug 3,[N-]=[N+]=CC(=O)NCC(=O)NN,1
4,Drug 4,[N-]=[N+]=C1C=NC(=O)NC1=O,1


In [23]:
os.remove("test.csv")

In [46]:
display(pd.read_csv("./Tox/herg_central.csv"))

Unnamed: 0,ID,X,hERG_at_1uM,hERG_at_10uM,hERG_inhib
0,22416348,Cc1occc1C(=O)NCc1ccco1,20.17528,30.99165,0
1,26665387,COc1ccc(/C=C2\SC(=S)N(N3CCOCC3)C2=O)c(OC)c1,10.22630,13.05888,0
2,862531,C[C@H](NC(=O)Nc1cccc(C(F)(F)F)c1)C(=O)O,2.04420,0.06288,0
3,26732361,COc1cc(OC)c2ccc(=O)oc2c1C(CC(=O)N1CCOCC1)c1ccc...,21.80250,17.87858,0
4,49735227,COc1cccc(NC(=O)C2C3C=CC4(O3)C2C(=O)N(CCCN2CCCC...,8.33980,19.03128,0
...,...,...,...,...,...
306888,26728232,CC(=O)CSc1nnc(CN2C(=O)CSc3ccc(C(F)(F)F)cc32)n1C,7.94890,-12.31042,0
306889,7969890,O=C(COc1ccc(Cl)cc1)Nc1nnc(-c2ccc3c(c2)CCCC3)o1,15.81910,17.11278,0
306890,49732430,COc1ccc(C(=O)N2CCCC(CO)(Cc3ccccc3C)C2)o1,1.61830,-8.30242,0
306891,49718253,CCOC(=O)[C@@H]1[C@H]2COc3ccc(Br)cc3[C@H]2N2C(=...,4.93990,-4.14222,0
