In [1]:
# import pkgs
import os
import pandas as pd
import zipfile

from chembl_webresource_client.new_client import new_client
from tqdm import tqdm

from pd_process import str2list

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Extract the assay ChEMBL ID from the activity data

In [2]:
# Disease slug used as the prefix of the CSV file names built with f"{disease_name}_..." below.
disease_name = "acute_myeloid_leukemia"

In [3]:
# Load the approved-drug table for the selected disease.
# Later cells add columns to this frame and write it back to this same file.
approved_drug_df = pd.read_csv(f"{disease_name}_approved_drugs_with_chembl_id.csv")

In [4]:
# Show the drugs whose ChEMBL ID synonyms are marked with the "Not found" sentinel.
missing_id_mask = approved_drug_df['chembl_id_synonyms'].eq("Not found")
approved_drug_df[missing_id_mask]

Unnamed: 0,drug_name,resource_url,parent_chembl_id,chembl_id_synonyms,specificity
5,Daunorubicin Hydrochloride and Cytarabine Lipo...,https://www.cancer.gov/about-cancer/treatment/...,Not found,Not found,specific


In [5]:
# Build one entry per drug: the string form of its de-duplicated assay ChEMBL IDs,
# or the "Not found" sentinel when the drug has no ChEMBL ID or no recorded activity.
activity = new_client.activity
assay_chembl_id_list = []
for synonyms_str in tqdm(approved_drug_df['chembl_id_synonyms']):
    # Drugs without any ChEMBL ID keep the sentinel and need no API call.
    if synonyms_str == "Not found":
        assay_chembl_id_list.append("Not found")
        continue

    # Gather the assay IDs reported for every ChEMBL ID synonym of this drug.
    collected_ids = []
    for molecule_id in str2list(synonyms_str):
        hits = activity.filter(molecule_chembl_id=molecule_id).only('assay_chembl_id')
        if len(hits) != 0:
            hits_df = pd.DataFrame(hits).drop_duplicates()
            collected_ids.extend(hits_df['assay_chembl_id'].to_list())

    # Collapse IDs shared between synonyms; drop_duplicates keeps the first occurrence.
    unique_ids = pd.Series(collected_ids).drop_duplicates().to_list()
    assay_chembl_id_list.append(str(unique_ids) if len(unique_ids) > 0 else "Not found")

100%|██████████| 25/25 [00:17<00:00,  1.42it/s]


In [6]:
# Attach the per-drug assay ID strings ("Not found" where none exist) as a new column.
# assay_chembl_id_list must still be the list built by the collection loop,
# i.e. one entry per row of approved_drug_df.
approved_drug_df['assay_chembl_id'] = assay_chembl_id_list

In [7]:
# Report how many assay IDs were collected for each drug that has any.
for _, drug_row in approved_drug_df.iterrows():
    assay_ids_str = drug_row['assay_chembl_id']
    if assay_ids_str == "Not found":
        continue
    n_assays = len(str2list(assay_ids_str))
    print(f"({drug_row['parent_chembl_id']}) {drug_row['drug_name']}: {n_assays}")

(CHEMBL2362016) Arsenic Trioxide: 52
(CHEMBL1489) Azacitidine: 481
(CHEMBL178) Cerubidine (Daunorubicin Hydrochloride): 1142
(CHEMBL88) Cyclophosphamide: 856
(CHEMBL803) Cytarabine: 1804
(CHEMBL2043437) Daurismo (Glasdegib Maleate): 39
(CHEMBL384467) Dexamethasone: 1615
(CHEMBL53463) Doxorubicin Hydrochloride: 12789
(CHEMBL3989908) Enasidenib Mesylate: 27
(CHEMBL1201506) Gemtuzumab Ozogamicin: 19
(CHEMBL3301622) Gilteritinib Fumarate: 355
(CHEMBL1117) Idamycin PFS (Idarubicin Hydrochloride): 439
(CHEMBL3989958) Ivosidenib: 32
(CHEMBL608533) Midostaurin: 2669
(CHEMBL58) Mitoxantrone Hydrochloride: 1361
(CHEMBL4297610) Olutasidenib: 32
(CHEMBL4297522) Pemazyre (Pemigatinib): 134
(CHEMBL635) Prednisone: 376
(CHEMBL576982) Quizartinib Dihydrochloride: 1613
(CHEMBL1201576) Rituxan (Rituximab): 19
(CHEMBL727) Tabloid (Thioguanine): 586
(CHEMBL3137309) Venclexta (Venetoclax): 147
(CHEMBL90555) Vincristine Sulfate: 1585


# Access the assay information

## Direct access via chembl API

In [20]:
# Directory that receives one assay table per parent compound
assays_dir = './assay_infos_filed_by_compound_using_api'
assay = new_client.assay

os.makedirs(assays_dir, exist_ok=True)
drug_columns = zip(approved_drug_df['parent_chembl_id'], approved_drug_df['assay_chembl_id'])
for parent_chembl_id, assay_chembl_id_str in tqdm(drug_columns, total=len(approved_drug_df)):
    # Nothing to download for drugs without any recorded assay.
    if assay_chembl_id_str == "Not found":
        continue

    # Deliberately NOT named `assay_chembl_id_list`: that notebook-global holds the
    # per-drug list assigned to approved_drug_df['assay_chembl_id'], and rebinding it
    # here would break a re-run of that assignment.
    drug_assay_ids = str2list(assay_chembl_id_str)
    assays_list = assay.filter(assay_chembl_id__in=drug_assay_ids)

    # One ';'-separated file per parent compound, named after its parent ChEMBL ID.
    assays_df = pd.DataFrame(assays_list)
    assays_df.to_csv(f"{assays_dir}/{parent_chembl_id}.csv", sep=";", index=False)

100%|██████████| 25/25 [15:11<00:00, 36.45s/it] 


In [None]:
# TODO: send the queries to the ChEMBL API in parallel when a compound has a large number of assays (>1000)

## (Alternative) Unzip the backup assay files

In [21]:
# Directory containing the zipped ChEMBL assay summary files
zip_dir = './assay_infos_zip_backup'

# Directory that receives one '<ChEMBL ID>.csv' per archive
extract_dir = './assay_infos_filed_by_compound'
os.makedirs(extract_dir, exist_ok=True)

# Iterate through each .zip file in the directory (sorted for a reproducible order)
for filename in sorted(os.listdir(zip_dir)):
    if not filename.endswith('.zip'):
        continue

    # Extract the ChEMBL ID from the filename
    chembl_id = filename.split('.')[0]  # Assuming the format is 'CHEMBLxxx.zip'

    with zipfile.ZipFile(os.path.join(zip_dir, filename), 'r') as zip_ref:
        # Only regular files can become the CSV; directory entries are ignored.
        file_members = [info for info in zip_ref.infolist() if not info.is_dir()]
        if not file_members:
            # An empty archive previously crashed with IndexError; skip it visibly instead.
            print(f"Skipping {filename}: the archive contains no files")
            continue

        # Stream the first file of the archive straight to '<extract_dir>/<ChEMBL ID>.csv'.
        # Writing the target directly (instead of extract -> rename -> removedirs) means a
        # re-run simply overwrites the CSV, no temporary directory is left behind when the
        # archive holds several entries, and no parent directory is ever pruned.
        with zip_ref.open(file_members[0]) as src, open(f"{extract_dir}/{chembl_id}.csv", 'wb') as dst:
            for chunk in iter(lambda: src.read(1 << 20), b''):
                dst.write(chunk)

### Access the target ChEMBL ID if you are using the unzipped assay data

In [None]:
# TO DO

# (compound - assay - target) relationship

In [8]:
# For every drug, read its downloaded assay table and record the distinct target
# ChEMBL IDs as a string (or "Not found" when the drug has no assays).
assays_dir = './assay_infos_filed_by_compound_using_api'
target_chembl_id = []
drug_columns = zip(approved_drug_df['parent_chembl_id'], approved_drug_df['assay_chembl_id'])
for each_parent_chembl_id, assay_chembl_id_str in tqdm(drug_columns, total=len(approved_drug_df)):
    if assay_chembl_id_str == "Not found":
        target_chembl_id.append("Not found")
        continue

    # The assay IDs themselves are not needed here, only the per-compound file.
    # (The former `assay_chembl_id_list = str2list(...)` was unused and overwrote the
    # notebook-global list that feeds approved_drug_df['assay_chembl_id'].)
    each_assays = pd.read_csv(f"{assays_dir}/{each_parent_chembl_id}.csv", sep=";")
    each_target_chembl_id = each_assays['target_chembl_id'].drop_duplicates().to_list()
    target_chembl_id.append(str(each_target_chembl_id))

approved_drug_df['target_chembl_id'] = target_chembl_id

100%|██████████| 25/25 [00:00<00:00, 86.38it/s]


## save csv

In [9]:
# Write the enriched table back to the same CSV path it was loaded from, overwriting that file.
approved_drug_df.to_csv(f"{disease_name}_approved_drugs_with_chembl_id.csv", index=False)

## save subset of specific drug for AML

In [12]:
def save_specificity_subset(drug_df, specificity, file_tag, disease):
    """Save the drugs of one specificity class that have at least one assay.

    Parameters
    ----------
    drug_df : pandas.DataFrame
        Table with a 'specificity' and an 'assay_chembl_id' column.
    specificity : str
        Value of the 'specificity' column to keep.
    file_tag : str
        Tag inserted into the output file name.
    disease : str
        Disease slug used as the file-name prefix.

    Returns
    -------
    pandas.DataFrame
        The saved subset, re-indexed from 0.
    """
    keep = (drug_df['specificity'] == specificity) & (drug_df['assay_chembl_id'] != "Not found")
    subset = drug_df[keep].reset_index(drop=True)
    subset.to_csv(f"{disease}_{file_tag}_approved_drugs_with_chembl_id.csv", index=False)
    return subset


# 'specificity' value -> tag used in the output file name (written in this order)
specificity_file_tags = {
    "specific": "specific",
    "wide-used": "wideused",
    "acceptable": "acceptable",
}

# save one CSV per specificity class
approved_drug_subsets = {
    specificity: save_specificity_subset(approved_drug_df, specificity, file_tag, disease_name)
    for specificity, file_tag in specificity_file_tags.items()
}

# Backward compatibility: the original cell reused this name for all three subsets and
# therefore left it bound to the last one written ("acceptable").
approved_drug_df_specific = approved_drug_subsets["acceptable"]

# save the relationship

In [35]:
# Build the long (parent compound, assay, target) table from the per-compound assay files.
assays_dir = './assay_infos_filed_by_compound_using_api'
relationship_columns = ['parent_compound_chembl_id', 'assay_chembl_id', 'target_chembl_id']

# Collect the per-compound pieces and concatenate once at the end: concatenating inside
# the loop re-copies the growing frame on every iteration and starts from an empty frame,
# which pandas deprecates.
relationship_frames = []
drug_columns = zip(approved_drug_df['parent_chembl_id'], approved_drug_df['assay_chembl_id'])
for each_parent_chembl_id, assay_chembl_id_str in tqdm(drug_columns, total=len(approved_drug_df)):
    # Drugs without assays have no per-compound file to read.
    if assay_chembl_id_str == "Not found":
        continue

    each_assays = pd.read_csv(f"{assays_dir}/{each_parent_chembl_id}.csv", sep=";")
    each_assays['parent_compound_chembl_id'] = each_parent_chembl_id
    relationship_frames.append(each_assays[relationship_columns])

if relationship_frames:
    # ignore_index gives the combined table a unique 0..n-1 index instead of repeating
    # each file's own row numbers (the saved CSV is unaffected: it is written without index).
    relationship_df = pd.concat(relationship_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the empty, correctly-labelled table instead.
    relationship_df = pd.DataFrame(columns=relationship_columns)

100%|██████████| 25/25 [00:00<00:00, 83.60it/s]


In [37]:
# Save the relationship table; the file name is derived from disease_name (instead of a
# hard-coded disease) so it follows the disease selected at the top of the notebook.
relationship_df.to_csv(f"{disease_name}_approved_drugs_assay_target_chembl_id_relationship.csv", index=False)

# Other testing (group by target ID?)

In [6]:
# Reload the saved relationship table (file name derived from disease_name rather than a
# hard-coded disease) and show the ten targets with the most rows.
relationship_df = pd.read_csv(f"{disease_name}_approved_drugs_assay_target_chembl_id_relationship.csv")
relationship_df.groupby('target_chembl_id').size().sort_values(ascending=False).head(10)

target_chembl_id
CHEMBL612545     1896
CHEMBL3879801    1562
CHEMBL387        1191
CHEMBL375        1129
CHEMBL612558      804
CHEMBL392         754
CHEMBL395         514
CHEMBL399         478
CHEMBL400         418
CHEMBL386         406
dtype: int64