# Get the ChEMBL ID

In [1]:
# import pkgs
import os
import pandas as pd
import numpy as np
import zipfile

from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

from pd_process import str2list

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
approved_drug_list_path = "../Get_approved_drug_from_cancer_gov/acute_myeloid_leukemia_approved_drugs.csv"
approved_drug_df = pd.read_csv(approved_drug_list_path)
approved_drug_list = approved_drug_df['drug_name'].str.lower().to_list()

In [3]:
chembl_id = []
molecule = new_client.molecule
for each_drug in tqdm(approved_drug_list):
    each_drug_name_list = each_drug.replace(")", "").replace("\xa0", " ").split(" (")
    
    temp_chembl_id_list = []
    for each_drug_name in each_drug_name_list:
        mols = molecule.filter(pref_name__iexact=each_drug_name)
        if len(mols) == 0:
            continue
        else:
            for mol in mols:
                mol_chembl_id = mol['molecule_hierarchy']['parent_chembl_id']
                mol_chembl_name = mol['pref_name']
                mol_chembl_max_phase = mol['max_phase']
                
                if mol_chembl_max_phase is not None and float(mol_chembl_max_phase) == 4.0:
                    temp_chembl_id_list.append(mol_chembl_id)
        
    if len(temp_chembl_id_list) == 0:
        chembl_id.append("Not found")
    elif len(temp_chembl_id_list) == 1:
        chembl_id.append(temp_chembl_id_list[0])
    else:
        chembl_id.append(str(temp_chembl_id_list))
approved_drug_df['chembl_id_hits'] = chembl_id

100%|██████████| 25/25 [00:00<00:00, 245.27it/s]


# Manually validation

In [4]:
approved_drug_df[approved_drug_df['chembl_id_hits'].str.contains("\[")]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,"['CHEMBL88', 'CHEMBL88']"


In [5]:
chembl_id[3] = "CHEMBL88"
# Reason: 
# The drug name "Cytarabine" has two chembl_id hits, but CHEMBL88 is the main structure. 
# The other one is a alternative form with one more water molecule.

In [6]:
approved_drug_df[approved_drug_df['chembl_id_hits'] == "Not found"]

Unnamed: 0,drug_name,resource_url,chembl_id_hits
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found
16,Olutasidenib,https://www.cancer.gov/about-cancer/treatment/...,Not found
19,Quizartinib Dihydrochloride,https://www.cancer.gov/about-cancer/treatment/...,Not found


In [7]:
chembl_id[5] = "Not found"
# Reason:
# Daunorubicin hydrochloride and cytarabine liposome is a combination form of daunorubicin hydrochloride and cytarabine contained inside liposomes (very tiny particles of fat). 
# Thus, this combination form is not found in ChEMBL database.
# Furthermore, daunorubicin hydrochloride and cytarabine have been included in our list of approved drugs for acute myeloid leukemia.

In [8]:
mols = molecule.filter(pref_name__iexact="Olutasidenib")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[16] = "CHEMBL4297610"
# Reason:
# Olutasidenib is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL4297610 OLUTASIDENIB 1.0
CHEMBL4297610


In [9]:
mols = molecule.filter(pref_name__iexact="Quizartinib Dihydrochloride")
for mol in mols:
    print(mol['molecule_chembl_id'], mol['pref_name'], mol['max_phase'])
    print(mol['molecule_hierarchy']['parent_chembl_id'])

chembl_id[19] = "CHEMBL576982"
# Reason:
# Quizartinib Dihydrochloride is a new drug and its max_phase should be 4.0. 
# The max_phase in ChEMBL database is incorrect.

CHEMBL2105709 QUIZARTINIB DIHYDROCHLORIDE 3.0
CHEMBL576982


In [10]:
approved_drug_df.drop(columns=['chembl_id_hits'], inplace=True)

In [11]:
approved_drug_df['parent_chembl_id'] = chembl_id
chembl_id_synonyms = []
for each_parent_chembl_id in tqdm(chembl_id):
    if each_parent_chembl_id == "Not found":
        chembl_id_synonyms.append("Not found")
    else:
        mols = molecule.filter(molecule_hierarchy__parent_chembl_id__iexact=each_parent_chembl_id).only('molecule_chembl_id')
        chembl_id_synonyms.append(str([mol['molecule_chembl_id'] for mol in mols]))
approved_drug_df['chembl_id_synonyms'] = chembl_id_synonyms

100%|██████████| 25/25 [00:00<00:00, 329.22it/s]


In [12]:
approved_drug_df.head()

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms
0,Arsenic Trioxide,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL2362016,['CHEMBL2362016']
1,Azacitidine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL1489,"['CHEMBL1489', 'CHEMBL3250420', 'CHEMBL3250421']"
2,Cerubidine (Daunorubicin Hydrochloride),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL178,"['CHEMBL178', 'CHEMBL1563', 'CHEMBL1200475']"
3,Cyclophosphamide,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL88,"['CHEMBL88', 'CHEMBL1200796', 'CHEMBL2364721']"
4,Cytarabine,https://www.cancer.gov/about-cancer/treatment/...,CHEMBL803,"['CHEMBL803', 'CHEMBL1256472']"


# Extract the assay ChEMBL ID from the activity data

In [13]:
approved_drug_df[approved_drug_df['chembl_id_synonyms'] == "Not found"]

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found


In [14]:
activity = new_client.activity
assay_chembl_id_list = []
for each_chembl_id_synonyms_str in tqdm(approved_drug_df['chembl_id_synonyms']):
    if each_chembl_id_synonyms_str == "Not found":
        assay_chembl_id_list.append("Not found")
    else:
        assay_chembl_ids = []
        for each_chembl_id in str2list(each_chembl_id_synonyms_str):
            found_activity = activity.filter(molecule_chembl_id=each_chembl_id).only('assay_chembl_id')
            if len(found_activity) == 0:
                continue
            temp = pd.DataFrame(found_activity).drop_duplicates()
            assay_chembl_ids.extend(temp['assay_chembl_id'].to_list())
        
        # remove duplicates
        assay_chembl_ids = pd.Series(assay_chembl_ids).drop_duplicates().to_list()
        if len(assay_chembl_ids) > 0:
            assay_chembl_id_list.append(str(assay_chembl_ids))
        else:
            assay_chembl_id_list.append("Not found")

100%|██████████| 25/25 [00:02<00:00, 10.90it/s]


In [15]:
approved_drug_df['assay_chembl_id'] = assay_chembl_id_list

# Manual validation

In [16]:
approved_drug_df[approved_drug_df['assay_chembl_id'] == "Not found"]
# Reason:
# The assay_chembl_id is not found in ChEMBL database for these drugs.

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms,assay_chembl_id
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found,Not found
22,Tisagenlecleucel (Kymriah),https://www.cancer.gov/about-cancer/treatment/...,CHEMBL3301574,['CHEMBL3301574'],Not found


# Save csv (checkpoint)

In [17]:
approved_drug_df.to_csv("acute_myeloid_leukemia_approved_drugs_with_chembl_id.csv", index=False)

# Read csv

In [2]:
approved_drug_df = pd.read_csv("acute_myeloid_leukemia_approved_drugs_with_chembl_id.csv")

In [57]:
for i in range(len(approved_drug_df)):
    if approved_drug_df.iloc[i]['assay_chembl_id'] != "Not found":
        print(f"{approved_drug_df.iloc[i]['drug_name']} ({approved_drug_df.iloc[i]['parent_chembl_id']}): {len(str2list(approved_drug_df.iloc[i]['assay_chembl_id']))}")

Idamycin PFS (Idarubicin Hydrochloride) (CHEMBL1117): 439
Gemtuzumab Ozogamicin (CHEMBL1201506): 19
Rituxan (Rituximab) (CHEMBL1201576): 19
Azacitidine (CHEMBL1489): 481
Cerubidine (Daunorubicin Hydrochloride) (CHEMBL178): 1142
Daurismo (Glasdegib Maleate) (CHEMBL2043437): 39
Arsenic Trioxide (CHEMBL2362016): 52
Venclexta (Venetoclax) (CHEMBL3137309): 147
Gilteritinib Fumarate (CHEMBL3301622): 355
Dexamethasone (CHEMBL384467): 1615
Enasidenib Mesylate (CHEMBL3989908): 27
Ivosidenib (CHEMBL3989958): 32
Pemazyre (Pemigatinib) (CHEMBL4297522): 134
Olutasidenib (CHEMBL4297610): 32
Doxorubicin Hydrochloride (CHEMBL53463): 12789
Quizartinib Dihydrochloride (CHEMBL576982): 1613
Mitoxantrone Hydrochloride (CHEMBL58): 1361
Midostaurin (CHEMBL608533): 2669
Prednisone (CHEMBL635): 376
Tabloid (Thioguanine) (CHEMBL727): 586
Cytarabine (CHEMBL803): 1804
Cyclophosphamide (CHEMBL88): 856
Vincristine Sulfate (CHEMBL90555): 1585


# Access the assays informations

## Direct access via chembl API

In [26]:
# Directory to extract the files
assays_dir = './assay_infos_filed_by_compound_using_api'
assay = new_client.assay

os.makedirs(assays_dir, exist_ok=True)
for i in tqdm(range(len(approved_drug_df))):
    parent_chembl_id = approved_drug_df.iloc[i]['parent_chembl_id']
    assay_chembl_id_list = str2list(approved_drug_df.iloc[i]['assay_chembl_id'])

    if parent_chembl_id == "Not found":
        continue
    else:
        assays_list = assay.filter(assay_chembl_id__in=assay_chembl_id_list)
            
        temp = pd.DataFrame(assays_list)
        temp.to_csv(f"{assays_dir}/{parent_chembl_id}.csv", sep=";", index=False)

 16%|█▌        | 4/25 [00:54<05:46, 16.52s/it]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff47f78ca90>>
Traceback (most recent call last):
  File "/home/yuyang/miniconda3/envs/aidd_proj/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


## (alternatives) Unzip backup assays

In [9]:
# Directory containing the chembl assay summary files
zip_dir = './assay_infos_zip_backup'

# Directory to extract the files
extract_dir = './assay_infos_filed_by_compound'

# Iterate through each .zip file in the directory
for filename in os.listdir(zip_dir):
    if filename.endswith('.zip'):
        # Extract the ChEMBL ID from the filename
        chembl_id = filename.split('.')[0]  # Assuming the format is 'CHEMBLxxx.zip'
        
        # Create the extract directory for the current ChEMBL ID
        extract_path = os.path.join(extract_dir, chembl_id)
        os.makedirs(extract_path, exist_ok=True)
        
        # Open the .zip file
        with zipfile.ZipFile(os.path.join(zip_dir, filename), 'r') as zip_ref:
            # Extract all contents to the extract directory
            zip_ref.extractall(extract_path)
        org_filename = os.listdir(extract_path)[0]
        os.rename(f"{extract_path}/{org_filename}", f"{extract_dir}/{chembl_id}.csv")
        os.removedirs(f"{extract_path}")

## Access the target chembl id if you using the unziped assay data  