# Get the ChEMBL ID

In [1]:
# import pkgs
import pandas as pd

from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

from pd_process import str2list

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
approved_drug_list_path = "../Get_approved_drug_from_cancer_gov/acute_myeloid_leukemia_approved_drugs.csv"
approved_drug_df = pd.read_csv(approved_drug_list_path)
approved_drug_list = approved_drug_df['drug_name'].str.lower().to_list()

In [3]:
chembl_id = []
molecule = new_client.molecule
for each_drug in tqdm(approved_drug_list):
    each_drug_name_list = each_drug.replace(")", "").replace("\xa0", " ").split(" (")
    
    temp_chembl_id_list = []
    for each_drug_name in each_drug_name_list:
        mols = molecule.filter(pref_name__iexact=each_drug_name)
        if len(mols) == 0:
            continue
        else:
            for mol in mols:
                mol_chembl_id = mol['molecule_hierarchy']['parent_chembl_id']
                mol_chembl_name = mol['pref_name']
                mol_chembl_max_phase = mol['max_phase']
                
                if mol_chembl_max_phase is not None and float(mol_chembl_max_phase) == 4.0:
                    temp_chembl_id_list.append(mol_chembl_id)
        
    if len(temp_chembl_id_list) == 0:
        chembl_id.append("Not found")
    elif len(temp_chembl_id_list) == 1:
        chembl_id.append(temp_chembl_id_list[0])
    else:
        chembl_id.append(str(temp_chembl_id_list))
approved_drug_df['chembl_id_hits'] = chembl_id

100%|██████████| 25/25 [00:00<00:00, 253.14it/s]


# Manually validation

In [4]:
approved_drug_df[approved_drug_df['chembl_id_hits'].str.contains("\[")]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,"['CHEMBL88', 'CHEMBL88']"


In [5]:
chembl_id[3] = "CHEMBL88"
# Reason: 
# The drug name "Cytarabine" has two chembl_id hits, but CHEMBL88 is the main structure. 
# The other one is a alternative form with one more water molecule.

In [6]:
approved_drug_df[approved_drug_df['chembl_id_hits'] == "Not found"]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found
16,Olutasidenib,https://www.cancer.gov/about-cancer/treatment/...,Not found
19,Quizartinib Dihydrochloride,https://www.cancer.gov/about-cancer/treatment/...,Not found


In [7]:
chembl_id[5] = "Not found"
# Reason:
# Daunorubicin hydrochloride and cytarabine liposome is a combination form of daunorubicin hydrochloride and cytarabine contained inside liposomes (very tiny particles of fat). 
# Thus, this combination form is not found in ChEMBL database.
# Furthermore, daunorubicin hydrochloride and cytarabine have been included in our list of approved drugs for acute myeloid leukemia.

In [8]:
mols = molecule.filter(pref_name__iexact="Olutasidenib")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[16] = "CHEMBL4297610"
# Reason:
# Olutasidenib is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL4297610 OLUTASIDENIB 1.0
CHEMBL4297610


In [9]:
mols = molecule.filter(pref_name__iexact="Quizartinib Dihydrochloride")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[19] = "CHEMBL576982"
# Reason:
# Quizartinib Dihydrochloride is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL2105709 QUIZARTINIB DIHYDROCHLORIDE 3.0
CHEMBL576982


In [10]:
approved_drug_df.drop(columns=['chembl_id_hits'], inplace=True)

In [11]:
approved_drug_df['parent_chembl_id'] = chembl_id
chembl_id_synonyms = []
for each_parent_chembl_id in tqdm(chembl_id):
    if each_parent_chembl_id == "Not found":
        chembl_id_synonyms.append("Not found")
    else:
        mols = molecule.filter(molecule_hierarchy__parent_chembl_id__iexact=each_parent_chembl_id).only('molecule_chembl_id')
        chembl_id_synonyms.append(str([mol['molecule_chembl_id'] for mol in mols]))
approved_drug_df['chembl_id_synonyms'] = chembl_id_synonyms

100%|██████████| 25/25 [00:00<00:00, 340.41it/s]


In [12]:
approved_drug_df.head()

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms
0,Arsenic Trioxide,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL2362016,['CHEMBL2362016']
1,Azacitidine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL1489,"['CHEMBL1489', 'CHEMBL3250420', 'CHEMBL3250421']"
2,Cerubidine (Daunorubicin Hydrochloride),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL178,"['CHEMBL178', 'CHEMBL1563', 'CHEMBL1200475']"
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL88,"['CHEMBL88', 'CHEMBL1200796', 'CHEMBL2364721']"
4,Cytarabine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL803,"['CHEMBL803', 'CHEMBL1256472']"


# Extract the assay ChEMBL ID from the activity data

In [13]:
approved_drug_df[approved_drug_df['chembl_id_synonyms'] == "Not found"]

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found


In [14]:
activity = new_client.activity
assay_chembl_id_list = []
for each_chembl_id_synonyms_str in tqdm(approved_drug_df['chembl_id_synonyms']):
    if each_chembl_id_synonyms_str == "Not found":
        assay_chembl_id_list.append("Not found")
    else:
        assay_chembl_ids = []
        for each_chembl_id in str2list(each_chembl_id_synonyms_str):
            found_activity = activity.filter(molecule_chembl_id=each_chembl_id).only('assay_chembl_id')
            if len(found_activity) == 0:
                continue
            temp = pd.DataFrame(found_activity).drop_duplicates()
            assay_chembl_ids.extend(temp['assay_chembl_id'].to_list())
        
        if len(assay_chembl_ids) > 0:
            assay_chembl_id_list.append(str(assay_chembl_ids))
        else:
            assay_chembl_id_list.append("Not found")

100%|██████████| 25/25 [00:23<00:00,  1.07it/s]


In [15]:
approved_drug_df['assay_chembl_id'] = assay_chembl_id_list

# Manual validation

In [17]:
approved_drug_df[approved_drug_df['assay_chembl_id'] == "Not found"]
# Reason:
# The assay_chembl_id is not found in ChEMBL database for these drugs.

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms,assay_chembl_id
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found,Not found
22,Tisagenlecleucel (Kymriah),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL3301574,['CHEMBL3301574'],Not found


# Save csv

In [16]:
approved_drug_df.to_csv("acute_myeloid_leukemia_approved_drugs_with_chembl_id.csv", index=False)

In [3]:
approved_drug_df = pd.read_csv("acute_myeloid_leukemia_approved_drugs_with_chembl_id.csv")

In [9]:
for i in range(len(approved_drug_df)):
    if approved_drug_df.iloc[i]['assay_chembl_id'] != "Not found":
        print(approved_drug_df.iloc[i]['drug_name'], ":", len(str2list(approved_drug_df.iloc[i]['assay_chembl_id'])))

Arsenic Trioxide : 52
Azacitidine : 485
Cerubidine (Daunorubicin Hydrochloride) : 1172
Cyclophosphamide : 864
Cytarabine : 1818
Daurismo (Glasdegib Maleate) : 39
Dexamethasone : 1615
Doxorubicin Hydrochloride : 12801
Enasidenib Mesylate : 29
Gemtuzumab Ozogamicin : 19
Gilteritinib Fumarate : 355
Idamycin PFS (Idarubicin Hydrochloride) : 450
Ivosidenib : 32
Midostaurin : 2669
Mitoxantrone Hydrochloride : 1495
Olutasidenib : 32
Pemazyre (Pemigatinib) : 134
Prednisone : 376
Quizartinib Dihydrochloride : 1613
Rituxan (Rituximab) : 19
Tabloid (Thioguanine) : 586
Venclexta (Venetoclax) : 147
Vincristine Sulfate : 1594
