# Manually download assay surrounding the compounds and extract the target id

In [1]:
import os
import pandas as pd
import numpy as np
import zipfile
from chembl_webresource_client.new_client import new_client

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Directory containing the chembl assay summary files
zip_dir = './assay_groups_zip'

# Directory to extract the files
extract_dir = './unziped_groups'

# Iterate through each .zip file in the directory
for filename in os.listdir(zip_dir):
    if filename.endswith('.zip'):
        # Extract the ChEMBL ID from the filename
        chembl_id = filename.split('.')[0]  # Assuming the format is 'CHEMBLxxx.zip'
        
        # Create the extract directory for the current ChEMBL ID
        extract_path = os.path.join(extract_dir, chembl_id)
        os.makedirs(extract_path, exist_ok=True)
        
        # Open the .zip file
        with zipfile.ZipFile(os.path.join(zip_dir, filename), 'r') as zip_ref:
            # Extract all contents to the extract directory
            zip_ref.extractall(extract_path)

In [3]:
# Define the directory containing the assay summary folders
root_directory = './unziped_groups'

# Initialize a dictionary to store ChEMBL IDs based on folder names
chembl_ids_dict = {}

# Determine the maximum length of ChEMBL IDs across all folders
max_length = 0

# Walk through the root directory
for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    
    # Check if the item is a directory
    if os.path.isdir(folder_path):
        # Initialize a list to store ChEMBL IDs for this folder
        folder_chembl_ids = []
        
        # Look for CSV files within the directory
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.csv'):
                csv_file_path = os.path.join(folder_path, file_name)
                
                try:
                    # Read the assay information from the CSV file into a pandas dataframe
                    df = pd.read_csv(csv_file_path, usecols=[0])
                    
                    # Extract the values from the assay information
                    if not df.empty:
                        folder_chembl_ids.extend(df.iloc[:, 0])
                except Exception as e:
                    print(f"Error processing file {csv_file_path}: {e}")
        
        # Update the maximum length
        max_length = max(max_length, len(folder_chembl_ids))
        
        # Save ChEMBL IDs for this folder in the dictionary
        chembl_ids_dict[folder_name] = [value.split(';"')[0] for value in folder_chembl_ids]

# Ensure that all lists have the same length by padding with NaNs if necessary
for folder_name, chembl_ids in chembl_ids_dict.items():
    chembl_ids_dict[folder_name] += [np.nan] * (max_length - len(chembl_ids))

# Create a dataframe from the dictionary
assay_df = pd.DataFrame(chembl_ids_dict)

# Save the dataframe to a CSV file
assay_df.to_csv('./assay_ids.csv', index=False)
assay_df

Unnamed: 0,CHEMBL1200976,CHEMBL1201506,CHEMBL1201576,CHEMBL1417019,CHEMBL1489,CHEMBL1563,CHEMBL2105709,CHEMBL2362016,CHEMBL3137309,CHEMBL3301603,...,CHEMBL3989958,CHEMBL4297522,CHEMBL4297534,CHEMBL4297610,CHEMBL501867,CHEMBL608533,CHEMBL635,CHEMBL727,CHEMBL803,CHEMBL88
0,CHEMBL1614410,CHEMBL3137738,CHEMBL3137738,CHEMBL1614410,CHEMBL1613997,CHEMBL1614410,CHEMBL1908760,CHEMBL1697722,CHEMBL4672389,CHEMBL3991692,...,CHEMBL4409428,CHEMBL4810226,CHEMBL2045161,CHEMBL4325705,CHEMBL739006,CHEMBL1908760,CHEMBL646313,CHEMBL909307,CHEMBL707922,CHEMBL734245
1,CHEMBL1614045,CHEMBL3137736,CHEMBL3137736,CHEMBL763715,CHEMBL1614216,CHEMBL1647773,CHEMBL1908508,CHEMBL1794580,CHEMBL4672391,CHEMBL3991717,...,CHEMBL4309332,CHEMBL5135731,CHEMBL2045167,CHEMBL4325706,CHEMBL741835,CHEMBL1051316,CHEMBL771537,CHEMBL1964065,CHEMBL1613914,CHEMBL1613914
2,CHEMBL1613914,CHEMBL3137729,CHEMBL3137729,CHEMBL1614045,CHEMBL1614008,CHEMBL810265,CHEMBL1908414,CHEMBL931512,CHEMBL3795380,CHEMBL3991771,...,CHEMBL4309333,CHEMBL4810190,CHEMBL2045183,CHEMBL4325710,CHEMBL817773,CHEMBL1037553,CHEMBL635931,CHEMBL910504,CHEMBL679422,CHEMBL734250
3,CHEMBL1794311,CHEMBL3137727,CHEMBL3137727,CHEMBL1614331,CHEMBL1697722,CHEMBL1613914,CHEMBL1908660,CHEMBL931509,CHEMBL4182826,CHEMBL3991653,...,CHEMBL4409436,CHEMBL4810168,CHEMBL2045169,CHEMBL4325730,CHEMBL739904,CHEMBL1908508,CHEMBL1794311,CHEMBL1614445,CHEMBL970837,CHEMBL1964065
4,CHEMBL1613900,CHEMBL3137731,CHEMBL3137731,CHEMBL1613914,CHEMBL1794584,CHEMBL1794311,CHEMBL1908706,CHEMBL1794345,CHEMBL3865731,CHEMBL3991678,...,CHEMBL4409440,CHEMBL4810258,CHEMBL2045142,CHEMBL4325729,CHEMBL739001,CHEMBL1908414,CHEMBL1964065,CHEMBL965578,CHEMBL844951,CHEMBL965578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12766,,,,,,,,,,,...,,,,,,,,,,
12767,,,,,,,,,,,...,,,,,,,,,,
12768,,,,,,,,,,,...,,,,,,,,,,
12769,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# extract the assay target using chembl api
##CHEMBL3301574 data not available
#example of finding the assay target
assays = new_client.assay
extracted_assay = assays.filter(assay_chembl_id ='CHEMBL4672389')
extracted_assay_target = extracted_assay.only('target_chembl_id')[0]
extracted_assay_target

{'target_chembl_id': 'CHEMBL4680030'}

In [6]:
# Define the directory containing the assay summary folders
root_directory = './unziped_groups'

# Initialize an empty DataFrame to store the target IDs
target_df = pd.DataFrame(index=range(len(assay_df)), columns=assay_df.columns)

# Determine the maximum number of elements
max_length = assay_df.apply(lambda x: x.dropna().size).max()

# Iterate through each column in the DataFrame
for column_name in assay_df.columns:
    # Get the assay IDs from the column
    assay_ids = assay_df[column_name].dropna().unique()
    
    # Initialize a list to store the target IDs for this column
    target_ids = []
    
    # Iterate through each assay ID
    for assay_id in assay_ids:
        # If assay_id is null, append null values to the list
        if pd.isnull(assay_id):
            target_ids.extend([None] * (max_length - len(target_ids)))
            continue
        
        # Otherwise, proceed with API call
        extracted_assay = assays.filter(assay_chembl_id=assay_id)
        extracted_assay_target = extracted_assay.only('target_chembl_id')[0]['target_chembl_id']  # Extract only the target ID
        
        # Append the assay target ID to the list
        target_ids.append(extracted_assay_target)
    
    # Append null values to reach the maximum length
    target_ids.extend([None] * (max_length - len(target_ids)))
    
    # Update the corresponding values in target_df with the target IDs
    target_df[column_name] = target_ids

# Save the new DataFrame to a CSV file
target_df.to_csv('./assay_target_ids.csv', index=False)

In [7]:
target_df

Unnamed: 0,CHEMBL1200976,CHEMBL1201506,CHEMBL1201576,CHEMBL1417019,CHEMBL1489,CHEMBL1563,CHEMBL2105709,CHEMBL2362016,CHEMBL3137309,CHEMBL3301603,...,CHEMBL3989958,CHEMBL4297522,CHEMBL4297534,CHEMBL4297610,CHEMBL501867,CHEMBL608533,CHEMBL635,CHEMBL727,CHEMBL803,CHEMBL88
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12766,,,,,,,,,,,...,,,,,,,,,,
12767,,,,,,,,,,,...,,,,,,,,,,
12768,,,,,,,,,,,...,,,,,,,,,,
12769,,,,,,,,,,,...,,,,,,,,,,
