# Get the ChEMBL ID

In [1]:
# import pkgs
import os
import pandas as pd

from chembl_webresource_client.new_client import new_client
from tqdm import tqdm


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
disease_name = "acute_myeloid_leukemia"
filename = f"{disease_name}_approved_drugs.csv"
approved_drug_list_path = f"../Get_approved_drug_from_cancer_gov/approved_drug_by_disease/{filename}"
approved_drug_df = pd.read_csv(approved_drug_list_path)
approved_drug_list = approved_drug_df['drug_name'].str.lower().to_list()

In [3]:
chembl_id = []
molecule = new_client.molecule
for each_drug in tqdm(approved_drug_list):
    each_drug_name_list = each_drug.replace(")", "").replace("\xa0", " ").split(" (")
    
    temp_chembl_id_list = []
    for each_drug_name in each_drug_name_list:
        mols = molecule.filter(pref_name__iexact=each_drug_name)
        if len(mols) == 0:
            continue
        else:
            for mol in mols:
                mol_chembl_id = mol['molecule_hierarchy']['parent_chembl_id']
                # mol_chembl_id = mol['molecule_chembl_id']
                mol_chembl_name = mol['pref_name']
                mol_chembl_max_phase = mol['max_phase']
                
                if mol_chembl_max_phase is not None and float(mol_chembl_max_phase) == 4.0:
                    temp_chembl_id_list.append(mol_chembl_id)
        
    if len(temp_chembl_id_list) == 0:
        chembl_id.append("Not found")
    elif len(temp_chembl_id_list) == 1:
        chembl_id.append(temp_chembl_id_list[0])
    else:
        chembl_id.append(str(temp_chembl_id_list))
approved_drug_df['chembl_id_hits'] = chembl_id

  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:00<00:00, 268.77it/s]


# Manually validation

In [4]:
approved_drug_df[approved_drug_df['chembl_id_hits'].str.contains("\[")]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,"['CHEMBL88', 'CHEMBL88']"


In [5]:
chembl_id[3] = "CHEMBL88"
# Reason: 
# The drug name "Cytarabine" has two chembl_id hits, but CHEMBL88 is the parent structure. 
# The other one is a alternative form with one more water molecule.

In [6]:
approved_drug_df[approved_drug_df['chembl_id_hits'] == "Not found"]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found
16,Olutasidenib,https://www.cancer.gov/about-cancer/treatment/...,Not found
19,Quizartinib Dihydrochloride,https://www.cancer.gov/about-cancer/treatment/...,Not found


In [7]:
chembl_id[5] = "Not found"
# Reason:
# Daunorubicin hydrochloride and cytarabine liposome is a combination form of daunorubicin hydrochloride and cytarabine contained inside liposomes (very tiny particles of fat). 
# Thus, this combination form is not found in ChEMBL database.
# Furthermore, daunorubicin hydrochloride and cytarabine have been included in our list of approved drugs for acute myeloid leukemia.

In [8]:
mols = molecule.filter(pref_name__iexact="Olutasidenib")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[16] = "CHEMBL4297610"
# Reason:
# Olutasidenib is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL4297610 OLUTASIDENIB 1.0
CHEMBL4297610


In [9]:
mols = molecule.filter(pref_name__iexact="Quizartinib Dihydrochloride")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[19] = "CHEMBL576982"
# Reason:
# Quizartinib Dihydrochloride is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL2105709 QUIZARTINIB DIHYDROCHLORIDE 3.0
CHEMBL576982


In [10]:
approved_drug_df.drop(columns=['chembl_id_hits'], inplace=True)

In [11]:
approved_drug_df['parent_chembl_id'] = chembl_id
chembl_id_synonyms = []
for each_parent_chembl_id in tqdm(chembl_id):
    if each_parent_chembl_id == "Not found":
        chembl_id_synonyms.append("Not found")
    else:
        mols = molecule.filter(molecule_hierarchy__parent_chembl_id__iexact=each_parent_chembl_id).only('molecule_chembl_id')
        chembl_id_synonyms.append(str([mol['molecule_chembl_id'] for mol in mols]))
approved_drug_df['chembl_id_synonyms'] = chembl_id_synonyms

100%|██████████| 25/25 [00:00<00:00, 336.03it/s]


# Manually check whether it is a specific cancer drug

In [12]:
for each in approved_drug_df['resource_url']:
    print(each)

https://www.cancer.gov/about-cancer/treatment/drugs/arsenictrioxide
https://www.cancer.gov/about-cancer/treatment/drugs/azacitidine
https://www.cancer.gov/about-cancer/treatment/drugs/daunorubicinhydrochloride
https://www.cancer.gov/about-cancer/treatment/drugs/cyclophosphamide
https://www.cancer.gov/about-cancer/treatment/drugs/cytarabine
https://www.cancer.gov/about-cancer/treatment/drugs/daunorubicinhydrochlorideandcytarabineliposome
https://www.cancer.gov/about-cancer/treatment/drugs/glasdegibmaleate
https://www.cancer.gov/about-cancer/treatment/drugs/dexamethasone
https://www.cancer.gov/about-cancer/treatment/drugs/doxorubicinhydrochloride
https://www.cancer.gov/about-cancer/treatment/drugs/enasidenibmesylate
https://www.cancer.gov/about-cancer/treatment/drugs/gemtuzumabozogamicin
https://www.cancer.gov/about-cancer/treatment/drugs/gilteritinibfumarate
https://www.cancer.gov/about-cancer/treatment/drugs/idarubicinhydrochloride
https://www.cancer.gov/about-cancer/treatment/drugs/iv

In [13]:
# specific: drug for the exact AML.
# acceptable: drug for not only AML but just multiple types of leukemia or lymphoma.
# wide-used: drug for not only AML or leukemia or lymphoma, but also other types of cancer.
labels = ['specific', 'acceptable', 'acceptable', 'wide-used', 'acceptable', 'specific', 'specific', 'acceptable', 'wide-used', 'specific', 'specific', 'specific', 'specific', 'wide-used', 'specific', 'wide-used', 'specific', 'wide-used', 'acceptable', 'specific', 'acceptable', 'specific', 'acceptable', 'acceptable', 'wide-used']

In [14]:
approved_drug_df['specificity'] = labels

# Save csv (checkpoint)

In [15]:
approved_drug_df.to_csv(f"{disease_name}_approved_drugs_with_chembl_id.csv", index=False)