In [4]:
!pip install requests



In [2]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import requests
import pandas as pd
import os

In [8]:
import requests
import pandas as pd
import os

# 1. Define and Create the path
output_dir = "/home/nikki/egfr_lowdata_scoring/data/ligands_real"
os.makedirs(output_dir, exist_ok=True)  # This creates the folder if it doesn't exist

# 2. API URL
url = "https://www.ebi.ac.uk/chembl/api/data/activity?target_chembl_id=CHEMBL203&limit=1000&format=json"

# 3. Send request
r = requests.get(url)
if r.status_code != 200:
    raise Exception(f"Failed to fetch data: {r.status_code}")

# 4. Convert JSON to pandas DataFrame
data = r.json()
df = pd.json_normalize(data['activities'])

# 5. Save as CSV
output_file = os.path.join(output_dir, "EGFR_activities.csv")
df.to_csv(output_file, index=False)


ConnectionError: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /chembl/api/data/activity?target_chembl_id=CHEMBL203&limit=1000&format=json (Caused by NameResolutionError("HTTPSConnection(host='www.ebi.ac.uk', port=443): Failed to resolve 'www.ebi.ac.uk' ([Errno -3] Temporary failure in name resolution)"))

In [3]:
output_dir = "/home/nikki/egfr_lowdata_scoring/data/ligands_real"
os.makedirs(output_dir, exist_ok=True)

egfr_activities_csv = os.path.join(output_dir, "EGFR_activities.csv")

base_dir = output_dir
in_csv  = egfr_activities_csv
out_csv = os.path.join(base_dir, "egfr_real_noisy.csv")
out_smi = os.path.join(base_dir, "egfr_real_noisy.smi")

df = pd.read_csv(in_csv)

# Keep only entries with SMILES & numeric standard_value in nM
df = df.dropna(subset=["canonical_smiles", "standard_value"])
df = df[df["standard_units"] == "nM"]

# Optional: drop non-equal relations to avoid >, <, ~ artifacts
df = df[df["relation"] == "="]

# Aggregate per ligand using *mean* IC50 (this already blurs things)
df_agg = (
    df.groupby(["molecule_chembl_id", "canonical_smiles"], as_index=False)
      .agg({"standard_value": "mean"})
)

# Define base labels with a *softer* threshold (e.g., 1 µM = 1000 nM)
threshold_nM = 1000.0  # adjust if you want easier/harder
df_agg["label"] = (df_agg["standard_value"] <= threshold_nM).astype(int)

# -----------------------
# 3. Add label noise around the decision boundary
# -----------------------
# Define a "borderline" band around the threshold where we inject noise
lower_band = threshold_nM / 3.0   # e.g. ~333 nM
upper_band = threshold_nM * 3.0   # e.g. ~3000 nM

band = (df_agg["standard_value"] > lower_band) & (df_agg["standard_value"] < upper_band)

# Flip labels for a fraction of ligands in the band
noise_rate = 0.3  # 30% flips in the band
rng = np.random.RandomState(42)
noise_mask = band & (rng.rand(len(df_agg)) < noise_rate)

df_agg.loc[noise_mask, "label"] = 1 - df_agg.loc[noise_mask, "label"]

print(f"Total ligands after aggregation: {len(df_agg)}")
print(f"Ligands in noisy band: {band.sum()}, labels flipped: {noise_mask.sum()}")

# -----------------------
# 4. Sample a subset (OPTIONAL) – do NOT enforce perfect balance
# -----------------------
# You can keep all, or downsample if it's too large
max_ligands = 80  # adjust as you like; more ligands = more stable metrics

if len(df_agg) > max_ligands:
    df_small = df_agg.sample(max_ligands, random_state=42)
else:
    df_small = df_agg.copy()

# No balancing; keep natural class ratio
print("Class counts in sampled set:")
print(df_small["label"].value_counts())

# -----------------------
# 5. Save CSV and SMI
# -----------------------
df_small_out = df_small[[
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_value",   # keep for inspection
    "label"
]]

df_small_out.to_csv(out_csv, index=False)

with open(out_smi, "w") as f:
    for _, row in df_small_out.iterrows():
        f.write(f"{row['canonical_smiles']} {row['molecule_chembl_id']}\n")

print(f"Saved {len(df_small_out)} ligands")
print(f"CSV: {out_csv}")
print(f"SMI:  {out_smi}")

Total ligands after aggregation: 531
Ligands in noisy band: 109, labels flipped: 28
Class counts in sampled set:
label
1    43
0    37
Name: count, dtype: int64
Saved 80 ligands
CSV: /home/nikki/egfr_lowdata_scoring/data/ligands_real/egfr_real_noisy.csv
SMI:  /home/nikki/egfr_lowdata_scoring/data/ligands_real/egfr_real_noisy.smi


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
import os

smiles_file = "/home/nikki/egfr_lowdata_scoring/data/ligands_real/egfr_real_noisy.smi"
out_folder = "/home/nikki/egfr_lowdata_scoring/data/ligands_real/prepared_3D/"
os.makedirs(out_folder, exist_ok=True)

with open(smiles_file, 'r') as f:
    lines = f.readlines()

for line in lines:
    parts = line.strip().split()  # split by whitespace
    if len(parts) < 2:
        continue  # skip bad lines

    smi = parts[0]          # SMILES
    ligand_id = parts[1]    # CHEMBL ID

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        print(f"Failed to parse: {smi}")
        continue

    mol = Chem.AddHs(mol)  # add hydrogens

    # Embed 3D conformer
    AllChem.EmbedMolecule(mol, randomSeed=42)
    AllChem.UFFOptimizeMolecule(mol)  # energy minimization

    # Save as SDF
    writer = Chem.SDWriter(os.path.join(out_folder, f"{ligand_id}.sdf"))
    writer.write(mol)
    writer.close()

print("All ligands processed!")


All ligands processed!
