# ChEMBL data retrieval using SMILES strings from a CSV file

In [6]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from rdkit import Chem

def _target_pref_name(target_id, name_cache):
    """Return the ``pref_name`` of a ChEMBL target, or ``''`` if unavailable.

    Results are memoised in ``name_cache`` (a dict keyed by target ID) so a
    target that appears in many activities is only looked up once.
    """
    if target_id not in name_cache:
        # NOTE(review): the filter keyword is kept exactly as the original
        # code had it; confirm the field name against the ChEMBL target schema.
        hits = new_client.target.filter(chembl_id=target_id)
        # A record may exist without a usable name; normalise that to ''.
        name_cache[target_id] = (hits[0].get('pref_name') or '') if hits else ''
    return name_cache[target_id]


def _annotate_smiles(smiles, name_cache, max_activities):
    """Return ``(ic50, target_name, target_id, all_targets)`` for one SMILES.

    Every element is ``None`` when the value is not a usable SMILES string,
    cannot be parsed, or has no matching ChEMBL molecule.
    """
    no_match = (None, None, None, None)

    # Missing cells arrive as None/NaN (non-str); blank strings carry no
    # structure either. Skip them before touching RDKit or the network.
    if not isinstance(smiles, str) or not smiles.strip():
        return no_match

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None.
        return no_match

    # Use canonical SMILES for querying
    canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
    molecules = new_client.molecule.filter(
        molecule_structures__canonical_smiles=canonical_smiles
    )
    if not molecules:
        return no_match

    chembl_id = molecules[0]['molecule_chembl_id']
    activities = list(
        new_client.activity.filter(molecule_chembl_id=chembl_id)[:max_activities]
    )

    ic50_text = None
    main_target_name = None
    main_target_id = None
    known_targets = set()

    for activity in activities:
        target_id = activity.get('target_chembl_id')
        target_name = _target_pref_name(target_id, name_cache) if target_id else ''
        if target_name:
            known_targets.add(target_name)

        # Only the first IC50 record with a value is reported.
        if (
            ic50_text is None
            and activity.get('standard_type') == 'IC50'
            and activity.get('standard_value')
        ):
            # Missing units must not show up as the text "None".
            units = activity.get('standard_units') or ''
            ic50_text = f"{activity['standard_value']} {units}".rstrip()
            main_target_name = target_name
            main_target_id = target_id

    # Add all known targets as a semicolon-separated string
    all_targets = "; ".join(sorted(known_targets)) if known_targets else None
    return ic50_text, main_target_name, main_target_id, all_targets


def annotate_with_chembl_ic50_and_targets(df, smiles_column="smiles", max_activities=20):
    """Annotate DataFrame with IC50, main target info, and all known targets for each SMILES.

    For every value in ``df[smiles_column]`` the SMILES is canonicalised with
    RDKit, the first ChEMBL molecule matching that canonical SMILES is taken,
    and up to ``max_activities`` of its activity records are inspected.

    Four columns are added to ``df`` in place:

    * ``IC50`` -- ``"<standard_value> <standard_units>"`` of the first IC50
      activity that has a value, else ``None``.
    * ``target_name`` -- preferred name of that activity's target (``''`` if
      the target could not be resolved), else ``None``.
    * ``target_id`` -- ``target_chembl_id`` of that activity, else ``None``.
    * ``all_known_targets`` -- sorted, ``"; "``-joined preferred names of all
      targets seen in the inspected activities, else ``None``.

    Rows whose SMILES is missing, blank, unparsable, or unknown to ChEMBL get
    ``None`` in all four columns instead of aborting the whole run.

    Args:
        df: DataFrame containing the SMILES column; it is modified in place.
        smiles_column: Name of the column holding SMILES strings.
        max_activities: Maximum number of activity records fetched per
            molecule (integer).

    Returns:
        The same ``df`` object, with the four annotation columns added.
    """
    name_cache = {}
    rows = [
        _annotate_smiles(smiles, name_cache, max_activities)
        for smiles in df[smiles_column]
    ]

    # Transpose the per-row tuples into one list per output column.
    df['IC50'] = [row[0] for row in rows]
    df['target_name'] = [row[1] for row in rows]
    df['target_id'] = [row[2] for row in rows]
    df['all_known_targets'] = [row[3] for row in rows]
    return df

# Load the filtered model table, annotate it with ChEMBL data, and write the
# annotated table to a new CSV (index=False keeps the row index out of the file).
df = annotate_with_chembl_ic50_and_targets(
    pd.read_csv("Final_Filtered_models.csv"),
    smiles_column="smiles",
)
df.to_csv("Final_Filtered_models_with_ChemBL.csv", index=False)
