# Get the ChEMBL ID

In [1]:
# import pkgs
import pandas as pd

from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the list of approved AML drugs and derive the lower-cased names
# that are later used as ChEMBL search terms.
# NOTE(review): the displayed frames contain an "Unnamed: 0" column, which suggests
# the CSV was written together with its index -- confirm whether index_col=0 is wanted.
approved_drug_list_path = "../Get_approved_drug_from_cancer_gov/acute_myeloid_leukemia_approved_drugs.csv"
approved_drug_df = pd.read_csv(approved_drug_list_path)
lowercase_drug_names = approved_drug_df['drug_name'].str.lower()
approved_drug_list = lowercase_drug_names.tolist()

In [3]:
# Resolve every approved drug name to ChEMBL molecule IDs.
#
# A list entry may have the form "brand name (generic name)"; each part is searched
# separately as an exact, case-insensitive match on ChEMBL's `pref_name`.
# Only molecules whose `max_phase` equals 4.0 are kept as hits.
#
# Result per drug (one element of `chembl_id`, stored in the `chembl_id_hits` column):
#   - "Not found"            -> no phase-4 match
#   - "<CHEMBL ID>"          -> exactly one distinct phase-4 match
#   - "['ID1', 'ID2', ...]"  -> several distinct matches, stringified for manual review
chembl_id = []
molecule = new_client.molecule
for each_drug in tqdm(approved_drug_list):
    # Drop closing parentheses, turn non-breaking spaces into plain spaces, then split
    # "brand (generic" into its candidate names. Surrounding whitespace is stripped and
    # empty candidates are skipped so that no blank search term is sent to ChEMBL.
    raw_names = each_drug.replace(")", "").replace("\xa0", " ").split(" (")
    each_drug_name_list = [name.strip() for name in raw_names if name.strip()]

    temp_chembl_id_list = []
    for each_drug_name in each_drug_name_list:
        mols = molecule.filter(pref_name__iexact=each_drug_name)
        # Iterating an empty result simply does nothing, so no length check is needed.
        for mol in mols:
            mol_chembl_id = mol['molecule_chembl_id']
            mol_chembl_max_phase = mol['max_phase']

            is_approved = mol_chembl_max_phase is not None and float(mol_chembl_max_phase) == 4.0
            # Record each ID once: if several name parts resolve to the same molecule,
            # the drug must not be reported as an ambiguous multi-hit.
            if is_approved and mol_chembl_id not in temp_chembl_id_list:
                temp_chembl_id_list.append(mol_chembl_id)

    if len(temp_chembl_id_list) == 0:
        chembl_id.append("Not found")
    elif len(temp_chembl_id_list) == 1:
        chembl_id.append(temp_chembl_id_list[0])
    else:
        # Kept as the string form of the list; the validation step looks for "[" in it.
        chembl_id.append(str(temp_chembl_id_list))
approved_drug_df['chembl_id_hits'] = chembl_id

100%|██████████| 25/25 [00:00<00:00, 262.10it/s]


# Manual validation

In [4]:
# Show drugs with more than one ChEMBL hit: those were stored as a stringified list,
# so they are the only values containing "[".
# regex=False matches the bracket literally and avoids the invalid "\[" escape sequence
# that a non-raw string literal would need for a regex pattern.
approved_drug_df[approved_drug_df['chembl_id_hits'].str.contains("[", regex=False)]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,"['CHEMBL88', 'CHEMBL1200796']"


In [5]:
chembl_id[3] = "CHEMBL88"
# Reason:
# The drug name "Cyclophosphamide" (row 3 in the table above) has two chembl_id hits,
# CHEMBL88 and CHEMBL1200796, but CHEMBL88 is the main structure.
# The other one is an alternative form with one more water molecule.
# NOTE: the index 3 is positional, so it relies on the row order of the input CSV.

In [6]:
# Rows for which the automated lookup produced no phase-4 ChEMBL hit.
not_found_mask = approved_drug_df['chembl_id_hits'].eq("Not found")
approved_drug_df[not_found_mask]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found
16,Olutasidenib,https://www.cancer.gov/about-cancer/treatment/...,Not found
19,Quizartinib Dihydrochloride,https://www.cancer.gov/about-cancer/treatment/...,Not found


In [7]:
chembl_id[5] = "Not found"
# Reason:
# Row 5 already holds "Not found" from the automated lookup; the value is set again here
# to record that it was reviewed and deliberately kept.
# Daunorubicin hydrochloride and cytarabine liposome is a combination of daunorubicin
# hydrochloride and cytarabine contained inside liposomes (very tiny particles of fat).
# Thus, this combination form is not found in the ChEMBL database.
# Furthermore, daunorubicin hydrochloride and cytarabine are each already included
# in our list of approved drugs for acute myeloid leukemia.

In [8]:
# Inspect what ChEMBL holds for olutasidenib without the phase-4 restriction:
# print ID, preferred name and max_phase of every exact name match.
mols = molecule.filter(pref_name__iexact="Olutasidenib")
shown_fields = ('molecule_chembl_id', 'pref_name', 'max_phase')
for mol in mols:
    print(*(mol[field] for field in shown_fields))

# Manual override for row 16.
# Reason:
# Olutasidenib is a new drug that is on the approved-drug list, so its max_phase
# should be 4.0. ChEMBL reports a lower max_phase (see the printed output), which is
# considered incorrect, so the automated lookup skipped this record.
chembl_id[16] = "CHEMBL4297610"

CHEMBL4297610 OLUTASIDENIB 1.0


In [9]:
# Inspect what ChEMBL holds for quizartinib dihydrochloride without the phase-4
# restriction: print ID, preferred name and max_phase of every exact name match.
mols = molecule.filter(pref_name__iexact="Quizartinib Dihydrochloride")
shown_fields = ('molecule_chembl_id', 'pref_name', 'max_phase')
for mol in mols:
    print(*(mol[field] for field in shown_fields))

# Manual override for row 19.
# Reason:
# Quizartinib Dihydrochloride is a new drug that is on the approved-drug list, so its
# max_phase should be 4.0. ChEMBL reports a lower max_phase (see the printed output),
# which is considered incorrect, so the automated lookup skipped this record.
chembl_id[19] = "CHEMBL2105709"

CHEMBL2105709 QUIZARTINIB DIHYDROCHLORIDE 3.0


In [10]:
# Store the manually curated IDs in their own column, next to the raw lookup
# results in `chembl_id_hits`, and display the resulting table.
approved_drug_df.loc[:, 'chembl_id'] = chembl_id
approved_drug_df

Unnamed: 0,drug_name,resource_url,chembl_id_hits,chembl_id
0,Arsenic Trioxide,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL2362016,CHEMBL2362016
1,Azacitidine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL1489,CHEMBL1489
2,Cerubidine (Daunorubicin Hydrochloride),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL1563,CHEMBL1563
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,"['CHEMBL88', 'CHEMBL1200796']",CHEMBL88
4,Cytarabine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL803,CHEMBL803
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found
6,Daurismo (Glasdegib Maleate),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL4297534,CHEMBL4297534
7,Dexamethasone,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL384467,CHEMBL384467
8,Doxorubicin Hydrochloride,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL359744,CHEMBL359744
9,Enasidenib Mesylate,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL3989931,CHEMBL3989931


In [11]:
# Write the annotated table to the current working directory, omitting the DataFrame index.
output_csv_path = "acute_myeloid_leukemia_approved_drugs_with_chembl_id.csv"
approved_drug_df.to_csv(output_csv_path, index=False)