In [1]:
import os

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

real_csv = "/home/nikki/egfr_lowdata_scoring/data/ligands_real/EGFR_activities.csv"

# Load the real EGFR ligands and keep one complete (ID, SMILES) record per row.
id_and_smiles = ["molecule_chembl_id", "canonical_smiles"]
real_df = pd.read_csv(real_csv)
real_df = real_df[id_and_smiles].dropna().drop_duplicates()

# ChEMBL IDs of the real ligands, used later to exclude them from the background set.
real_ids = set(real_df["molecule_chembl_id"])

# One [MW, logP, HBD, HBA] record per SMILES that RDKit can parse;
# unparseable SMILES are dropped.
parsed_mols = (Chem.MolFromSmiles(smi) for smi in real_df["canonical_smiles"])
props = [
    [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
    ]
    for mol in parsed_mols
    if mol
]

props_df = pd.DataFrame(props, columns=["MW", "logP", "HBD", "HBA"])

# Define filters (mean ± tolerance): each window is centred on the mean of the
# real ligands and extends by a fixed tolerance on both sides.
tolerances = {"MW": 50, "logP": 1, "HBD": 1, "HBA": 1}
bounds = {
    col: (props_df[col].mean() - tol, props_df[col].mean() + tol)
    for col, tol in tolerances.items()
}
mw_min, mw_max = bounds["MW"]
logp_min, logp_max = bounds["logP"]
hbd_min, hbd_max = bounds["HBD"]
hba_min, hba_max = bounds["HBA"]


In [2]:
# Fetch ChEMBL molecules
import requests

def fetch_chembl_molecules(limit=5000):
    """Download molecules with a canonical SMILES from the ChEMBL REST API.

    Pages through the ``molecule`` endpoint, ``page_size`` records at a time,
    until ``limit`` usable molecules have been collected or the API returns an
    empty page.

    Parameters
    ----------
    limit : int
        Maximum number of molecules to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``molecule_chembl_id`` and ``smiles``; at most ``limit`` rows.
        The columns are present even when no molecule was collected.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status.
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/molecule"
    molecules = []
    offset = 0
    page_size = 1000

    while len(molecules) < limit:
        params = {
            "limit": page_size,
            "offset": offset,
            "format": "json"
        }
        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(url, params=params, timeout=60)
        r.raise_for_status()
        data = r.json()["molecules"]

        # An empty page means there is nothing left to fetch; without this
        # check the loop would spin forever because len(molecules) stops growing.
        if not data:
            break

        for mol in data:
            structures = mol.get("molecule_structures") or {}
            smiles = structures.get("canonical_smiles")
            # Skip records with no structure or a null SMILES so downstream
            # parsing only ever sees real strings.
            if smiles:
                molecules.append({
                    "molecule_chembl_id": mol["molecule_chembl_id"],
                    "smiles": smiles
                })

        offset += page_size
        print(f"Fetched {len(molecules)} molecules")

    return pd.DataFrame(
        molecules[:limit],
        columns=["molecule_chembl_id", "smiles"]
    )

# Background pool: up to 5000 ChEMBL molecules that carry structure data.
# This hits the live ChEMBL API, so the result depends on the service state.
chembl_df = fetch_chembl_molecules(limit=5000)

Fetched 969 molecules
Fetched 1968 molecules
Fetched 2968 molecules
Fetched 3967 molecules
Fetched 4963 molecules
Fetched 5963 molecules


In [3]:
# Exclude every fetched molecule whose ChEMBL ID belongs to the real EGFR
# ligand set, then renumber the remaining rows from zero.
is_real_ligand = chembl_df["molecule_chembl_id"].isin(real_ids)
chembl_df = chembl_df.loc[~is_real_ligand].reset_index(drop=True)


In [4]:
def _passes_property_filter(smi):
    """Return True when `smi` parses and all four descriptors lie in their windows.

    Non-string values (e.g. a missing SMILES) and SMILES that RDKit cannot
    parse are rejected instead of raising.
    """
    if not isinstance(smi, str):
        return False
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return False
    return (
        mw_min <= Descriptors.MolWt(mol) <= mw_max and
        logp_min <= Descriptors.MolLogP(mol) <= logp_max and
        hbd_min <= Descriptors.NumHDonors(mol) <= hbd_max and
        hba_min <= Descriptors.NumHAcceptors(mol) <= hba_max
    )


# Select matching rows with a boolean mask. Unlike collecting row Series in a
# list, this keeps the `smiles` / `molecule_chembl_id` columns even when no
# molecule passes the filter.
keep_mask = chembl_df["smiles"].map(_passes_property_filter).astype(bool)
synthetic_df = chembl_df.loc[keep_mask]

# Sample up to 1000 synthetic ligands (fixed seed for reproducibility)
synthetic_df = synthetic_df.sample(
    n=min(1000, len(synthetic_df)),
    random_state=42
).reset_index(drop=True)

# Assign synthetic IDs: SYN000001, SYN000002, ...
synthetic_df["synthetic_id"] = [
    f"SYN{i:06d}" for i in range(1, len(synthetic_df) + 1)
]

# Output path
out_smi = "/home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/synthetic.smi"

# The target directory may not exist yet on a fresh checkout; open() would
# otherwise fail with FileNotFoundError.
os.makedirs(os.path.dirname(out_smi), exist_ok=True)

# Write .smi file (one "SMILES ID" pair per line)
with open(out_smi, "w") as f:
    for smi, syn_id in zip(synthetic_df["smiles"], synthetic_df["synthetic_id"]):
        f.write(f"{smi} {syn_id}\n")

print(f"Saved {len(synthetic_df)} synthetic ligands to {out_smi}")


Saved 194 synthetic ligands to /home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/synthetic.smi


In [5]:
# Runs after synthetic_df has been sampled; the sequential SYN IDs are
# (re)assigned here so the column is present before export.
n_synthetic = len(synthetic_df)
synthetic_df["synthetic_id"] = [
    f"SYN{idx:06d}" for idx in range(1, n_synthetic + 1)
]

# Destination of the ChEMBL background table (synthetic ID + SMILES only)
chembl_out_csv = "/home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/chembl_background.csv"

export_columns = ["synthetic_id", "smiles"]
synthetic_df_out = synthetic_df.loc[:, export_columns]
synthetic_df_out.to_csv(chembl_out_csv, index=False)

print(f"Saved {len(synthetic_df_out)} synthetic ligands to {chembl_out_csv}")


Saved 194 synthetic ligands to /home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/chembl_background.csv


In [6]:
#smi to sdf conversion

from rdkit import Chem
from rdkit.Chem import AllChem
import os

input_smi = "/home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/synthetic.smi"
out_sdf_dir = "/home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/sdf"

os.makedirs(out_sdf_dir, exist_ok=True)

# Fixed seed so the generated 3D conformers are reproducible between runs.
# The parameters are loop-invariant, so they are built once.
embed_params = AllChem.ETKDG()
embed_params.randomSeed = 42

with open(input_smi) as f:
    for line in f:
        fields = line.split()
        # Each record must be exactly "SMILES ID"; blank lines are skipped
        # silently, anything else is reported and skipped instead of crashing
        # the tuple unpack.
        if len(fields) != 2:
            if fields:
                print(f"Skipping malformed line: {line.strip()}")
            continue
        smi, lig_id = fields

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Skipping invalid SMILES: {lig_id}")
            continue

        # Explicit hydrogens are added before 3D embedding
        mol = Chem.AddHs(mol)

        # Embed 3D
        status = AllChem.EmbedMolecule(mol, embed_params)
        if status != 0:
            print(f"Embedding failed: {lig_id}")
            continue

        # Minimize. A non-zero status means the optimisation did not converge
        # or the force field could not be set up; the structure is still
        # written (as before), but the problem is now reported.
        opt_status = AllChem.UFFOptimizeMolecule(mol)
        if opt_status != 0:
            print(f"UFF optimization incomplete (status {opt_status}): {lig_id}")

        # Write SDF, one file per ligand; close the writer even if write() raises
        out_sdf = os.path.join(out_sdf_dir, f"{lig_id}.sdf")
        writer = Chem.SDWriter(out_sdf)
        try:
            writer.write(mol)
        finally:
            writer.close()


[23:30:40] UFFTYPER: Unrecognized charge state for atom: 7
[23:30:40] UFFTYPER: Unrecognized charge state for atom: 7
[23:30:43] UFFTYPER: Unrecognized charge state for atom: 8
[23:30:43] UFFTYPER: Unrecognized charge state for atom: 8
