In [None]:
import os
import pandas as pd
import numpy as np
import zipfile
from chembl_webresource_client.new_client import new_client

In [24]:
# Location of the downloaded ChEMBL assay-summary archives (one .zip per ChEMBL ID)
zip_dir = '/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/assay_groups_zip'

# Parent folder that receives one sub-folder of extracted files per archive
extract_dir = '/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/unziped_groups'

# Only .zip entries are unpacked; anything else in the folder is ignored
archive_names = [entry for entry in os.listdir(zip_dir) if entry.endswith('.zip')]

for archive_name in archive_names:
    # The ChEMBL ID is the part of the file name before the first dot
    # (file names are assumed to look like 'CHEMBLxxx.zip')
    chembl_id = archive_name.partition('.')[0]

    # Each archive is unpacked into its own folder named after the ChEMBL ID
    destination = os.path.join(extract_dir, chembl_id)
    os.makedirs(destination, exist_ok=True)

    archive_path = os.path.join(zip_dir, archive_name)
    with zipfile.ZipFile(archive_path, mode='r') as archive:
        archive.extractall(path=destination)

In [23]:
# Build a table with one column per extracted folder, holding the assay
# ChEMBL IDs found in the first column of that folder's CSV files.

# Define the directory containing the assay summary folders
root_directory = '/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/unziped_groups'

# Folder name -> list of assay ChEMBL IDs collected from that folder
chembl_ids_dict = {}

for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)

    # Only sub-directories are assay-summary folders; skip stray files
    if not os.path.isdir(folder_path):
        continue

    folder_chembl_ids = []
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        csv_file_path = os.path.join(folder_path, file_name)

        try:
            # Only the first column is read
            df = pd.read_csv(csv_file_path, usecols=[0])
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest
            print(f"Error processing file {csv_file_path}: {e}")
            continue

        # Drop missing cells and coerce to str before splitting: a NaN or
        # numeric value has no .split() and would otherwise abort the whole cell.
        first_column = df.iloc[:, 0].dropna().astype(str)

        # Keep only the text before the first ';"' of each value
        folder_chembl_ids.extend(value.split(';"')[0] for value in first_column)

    chembl_ids_dict[folder_name] = folder_chembl_ids

# Length of the longest ID list (0 when no folder was found)
max_length = max((len(ids) for ids in chembl_ids_dict.values()), default=0)

# Pad the shorter lists with NaN so every column has the same number of rows
chembl_ids_dict = {
    name: ids + [np.nan] * (max_length - len(ids))
    for name, ids in chembl_ids_dict.items()
}

# Create a dataframe from the dictionary
assay_df = pd.DataFrame(chembl_ids_dict)

# Save the dataframe to a CSV file
assay_df.to_csv('/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/assay_ids.csv', index=False)
assay_df

Unnamed: 0,CHEMBL3137309,CHEMBL3989931,CHEMBL1201506,CHEMBL384467,CHEMBL1489,CHEMBL2362016,CHEMBL803,CHEMBL2105709,CHEMBL608533,CHEMBL1563,...,CHEMBL4297534,CHEMBL727,CHEMBL501867,CHEMBL635,CHEMBL1201576,CHEMBL3301603,CHEMBL359744,CHEMBL1200976,CHEMBL3989958,CHEMBL4297522
0,CHEMBL4672389,CHEMBL4708483,CHEMBL3137738,CHEMBL686335,CHEMBL1613997,CHEMBL1697722,CHEMBL707922,CHEMBL1908760,CHEMBL1908760,CHEMBL1614410,...,CHEMBL2045161,CHEMBL909307,CHEMBL739006,CHEMBL646313,CHEMBL3137738,CHEMBL3991692,CHEMBL919903,CHEMBL1614410,CHEMBL4409428,CHEMBL4810226
1,CHEMBL4672391,CHEMBL4628666,CHEMBL3137736,CHEMBL685815,CHEMBL1614216,CHEMBL1794580,CHEMBL1613914,CHEMBL1908508,CHEMBL1051316,CHEMBL1647773,...,CHEMBL2045167,CHEMBL1964065,CHEMBL741835,CHEMBL771537,CHEMBL3137736,CHEMBL3991717,CHEMBL763715,CHEMBL1614045,CHEMBL4309332,CHEMBL5135731
2,CHEMBL3795380,CHEMBL4338883,CHEMBL3137729,CHEMBL635931,CHEMBL1614008,CHEMBL931512,CHEMBL679422,CHEMBL1908414,CHEMBL1037553,CHEMBL810265,...,CHEMBL2045183,CHEMBL910504,CHEMBL817773,CHEMBL635931,CHEMBL3137729,CHEMBL3991771,CHEMBL961174,CHEMBL1613914,CHEMBL4309333,CHEMBL4810190
3,CHEMBL4182826,CHEMBL4708484,CHEMBL3137727,CHEMBL919410,CHEMBL1697722,CHEMBL931509,CHEMBL970837,CHEMBL1908660,CHEMBL1908508,CHEMBL1613914,...,CHEMBL2045169,CHEMBL1614445,CHEMBL739904,CHEMBL1794311,CHEMBL3137727,CHEMBL3991653,CHEMBL906318,CHEMBL1794311,CHEMBL4409436,CHEMBL4810168
4,CHEMBL3865731,CHEMBL4628664,CHEMBL3137731,CHEMBL1964065,CHEMBL1794584,CHEMBL1794345,CHEMBL844951,CHEMBL1908706,CHEMBL1908414,CHEMBL1794311,...,CHEMBL2045142,CHEMBL965578,CHEMBL739001,CHEMBL1964065,CHEMBL3137731,CHEMBL3991678,CHEMBL947664,CHEMBL1613900,CHEMBL4409440,CHEMBL4810258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12766,,,,,,,,,,,...,,,,,,,CHEMBL4260995,,,
12767,,,,,,,,,,,...,,,,,,,CHEMBL4260998,,,
12768,,,,,,,,,,,...,,,,,,,CHEMBL4273898,,,
12769,,,,,,,,,,,...,,,,,,,CHEMBL3508160,,,


In [20]:
# Example: look up the target of a single assay through the ChEMBL API.
# Author's note kept from the original cell: CHEMBL3301574 - data not available.
assays = new_client.assay

# Assay used for the demonstration
example_assay_id = 'CHEMBL4672389'

# Restrict the query to that assay and request only its target ID field
extracted_assay = assays.filter(assay_chembl_id=example_assay_id)
target_only = extracted_assay.only('target_chembl_id')

# First (index 0) record of the query result
extracted_assay_target = target_only[0]
extracted_assay_target

{'target_chembl_id': 'CHEMBL4680030'}

In [34]:
# Look up the target ChEMBL ID of every assay listed in assay_df and store the
# results in target_df (same columns and row count as assay_df).
# Relies on `pd` from the import cell, and on `assay_df` and `assays` defined
# in earlier cells.

# Define the directory containing the assay summary folders
root_directory = '/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/unziped_groups'

# Every column is padded to the row count of assay_df so that target_df has
# the same shape as assay_df.
max_length = len(assay_df)

# Assay ID -> target ID, shared by all columns so that an assay appearing in
# several columns is requested from the API only once. Failed lookups are
# stored as None and are not retried.
target_cache = {}

# Column name -> padded list of target IDs
target_columns = {}

for column_name in assay_df.columns:
    # Distinct assay IDs of this column in order of first appearance; the NaN
    # padding is dropped here, so no null check is needed inside the loop.
    # NOTE: because duplicates are collapsed, row i of target_df is not
    # guaranteed to describe the assay in row i of assay_df.
    assay_ids = assay_df[column_name].dropna().unique()

    target_ids = []
    for assay_id in assay_ids:
        if assay_id not in target_cache:
            try:
                record = assays.filter(assay_chembl_id=assay_id).only('target_chembl_id')[0]
                target_cache[assay_id] = record['target_chembl_id']
            except Exception as e:
                # One missing or failing assay must not discard the results of
                # the whole run: report it and leave the cell empty.
                print(f"Error fetching target for assay {assay_id}: {e}")
                target_cache[assay_id] = None
        target_ids.append(target_cache[assay_id])

    # Append null values to reach the common column length
    target_ids.extend([None] * (max_length - len(target_ids)))
    target_columns[column_name] = target_ids

# Build the frame once, with the same index and column order as assay_df
target_df = pd.DataFrame(target_columns, index=range(max_length), columns=assay_df.columns)

# Save the new DataFrame to a CSV file
target_df.to_csv('/data1/SBCS-ForniliLab/daved/Group_P/finding_assays/assay_target_ids.csv', index=False)

In [36]:
# Display the table of target IDs built in the previous cell
target_df

Unnamed: 0,CHEMBL3137309,CHEMBL3989931,CHEMBL1201506,CHEMBL384467,CHEMBL1489,CHEMBL2362016,CHEMBL803,CHEMBL2105709,CHEMBL608533,CHEMBL1563,...,CHEMBL4297534,CHEMBL727,CHEMBL501867,CHEMBL635,CHEMBL1201576,CHEMBL3301603,CHEMBL359744,CHEMBL1200976,CHEMBL3989958,CHEMBL4297522
0,CHEMBL4680030,CHEMBL3991501,CHEMBL612545,CHEMBL3368,CHEMBL276,CHEMBL1697861,CHEMBL386,CHEMBL5938,CHEMBL5938,CHEMBL1293299,...,CHEMBL2362975,CHEMBL2360,CHEMBL375,CHEMBL3253,CHEMBL612545,CHEMBL4105814,CHEMBL612558,CHEMBL1293299,CHEMBL2007625,CHEMBL227
1,CHEMBL4680030,CHEMBL3991501,CHEMBL3879801,CHEMBL3368,CHEMBL614696,CHEMBL364,CHEMBL1293226,CHEMBL4482,CHEMBL6003,CHEMBL384,...,CHEMBL614058,CHEMBL614300,CHEMBL375,CHEMBL376,CHEMBL3879801,CHEMBL3650,CHEMBL390,CHEMBL364,CHEMBL612558,CHEMBL614524
2,CHEMBL4625,CHEMBL3991501,CHEMBL612545,CHEMBL2362975,CHEMBL614696,CHEMBL387,CHEMBL613509,CHEMBL4179,CHEMBL5331,CHEMBL352,...,CHEMBL612558,CHEMBL612545,CHEMBL3394,CHEMBL2362975,CHEMBL612545,CHEMBL3961,CHEMBL614096,CHEMBL1293226,CHEMBL612558,CHEMBL226
3,CHEMBL612545,CHEMBL3991501,CHEMBL612545,CHEMBL375,CHEMBL1697861,CHEMBL6035,CHEMBL2094266,CHEMBL6167,CHEMBL4482,CHEMBL1293226,...,CHEMBL351,CHEMBL4331,CHEMBL375,CHEMBL1977,CHEMBL612545,CHEMBL1795192,CHEMBL3879801,CHEMBL1977,CHEMBL612558,CHEMBL4729
4,CHEMBL3879801,CHEMBL2007625,CHEMBL3879801,CHEMBL614300,CHEMBL1293258,CHEMBL364,CHEMBL612545,CHEMBL4954,CHEMBL4179,CHEMBL1977,...,CHEMBL6080,CHEMBL3681,CHEMBL375,CHEMBL614300,CHEMBL3879801,CHEMBL2543,CHEMBL392,CHEMBL3879801,CHEMBL612558,CHEMBL612558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12766,,,,,,,,,,,...,,,,,,,CHEMBL612656,,,
12767,,,,,,,,,,,...,,,,,,,CHEMBL614088,,,
12768,,,,,,,,,,,...,,,,,,,CHEMBL612545,,,
12769,,,,,,,,,,,...,,,,,,,CHEMBL612558,,,
