In [2]:
import os
import pandas as pd
import re

LOG_DIR = "/home/nikki/egfr_lowdata_scoring/results/synth/logs"
SMI_FILE = "/home/nikki/egfr_lowdata_scoring/data/ligands_synthetic/synthetic.smi"
OUT_CSV = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth_raw.csv"

# Load SMILES
smiles_dict = {}
with open(SMI_FILE) as f:
    for line in f:
        smi, sid = line.strip().split()
        smiles_dict[sid] = smi

rows = []

for log_file in os.listdir(LOG_DIR):
    if not log_file.endswith(".log"):
        continue

    lig_id = log_file.replace(".log", "")
    log_path = os.path.join(LOG_DIR, log_file)

    with open(log_path) as f:
        for line in f:
            # Match mode 1 docking score from Vina output
            match = re.match(r"\s*1\s+(-?\d+\.\d+)", line)
            if match:
                score = float(match.group(1))
                rows.append({
                    "synthetic_id": lig_id,
                    "smiles": smiles_dict.get(lig_id, None),
                    "docking_score": score
                })
                break  # only take best pose

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)

print(f"Saved {len(df)} rows to {OUT_CSV}")


Saved 188 rows to /home/nikki/egfr_lowdata_scoring/results/synth/features_synth_raw.csv


In [3]:
# compute_features_synth_high_conf.py
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors, AllChem
from rdkit.DataStructs import ConvertToNumpyArray
import numpy as np

# Paths
RAW_CSV = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth_raw.csv"
OUT_CSV = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth.csv"

# Load docking results
df = pd.read_csv(RAW_CSV)

rows = []

for _, row in df.iterrows():
    lig_id = row["synthetic_id"]
    smi = row["smiles"]
    docking_score = row["docking_score"]

    # ---- High-confidence pseudo-labels ----

    if docking_score <= -9.0:
        label = 1
    elif docking_score >= -8.0:
        label = 0
    else:
        # skip medium-confidence ligands
        continue

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue

    # ---- PhysChem descriptors ----
    MW = Descriptors.MolWt(mol)
    LogP = Crippen.MolLogP(mol)
    HBD = rdMolDescriptors.CalcNumHBD(mol)
    HBA = rdMolDescriptors.CalcNumHBA(mol)
    TPSA = rdMolDescriptors.CalcTPSA(mol)

    # ---- Morgan fingerprint ----
    fp = AllChem.GetMorganFingerprintAsBitVect(
        mol,
        radius=2,
        nBits=1024
    )
    fp_array = np.zeros((1024,), dtype=int)
    ConvertToNumpyArray(fp, fp_array)

    # ---- Combine everything ----
    feature_row = {
        "ligand_id": lig_id,
        "label": label,
        "docking_score": docking_score,
        "MW": MW,
        "LogP": LogP,
        "HBD": HBD,
        "HBA": HBA,
        "TPSA": TPSA
    }

    # Add fingerprint bits
    for i in range(1024):
        feature_row[f"fp_{i}"] = fp_array[i]

    rows.append(feature_row)

# Save
features_df = pd.DataFrame(rows)
features_df.to_csv(OUT_CSV, index=False)

print(f"Saved {len(features_df)} high-confidence synthetic feature rows to {OUT_CSV}")


Saved 110 high-confidence synthetic feature rows to /home/nikki/egfr_lowdata_scoring/results/synth/features_synth.csv


In [4]:
# compute_features_synth
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors, AllChem
from rdkit.DataStructs import ConvertToNumpyArray
import numpy as np

# Paths
RAW_CSV = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth_raw.csv"
OUT_CSV = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth.csv"

# Load docking results
df = pd.read_csv(RAW_CSV)

rows = []

for _, row in df.iterrows():
    lig_id = row["synthetic_id"]
    smi = row["smiles"]
    docking_score = row["docking_score"]

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue

    # ---- Assign label based on docking score ----
    label = 1 if docking_score <= -8.0 else 0

    # ---- PhysChem descriptors ----
    MW = Descriptors.MolWt(mol)
    LogP = Crippen.MolLogP(mol)
    HBD = rdMolDescriptors.CalcNumHBD(mol)
    HBA = rdMolDescriptors.CalcNumHBA(mol)
    TPSA = rdMolDescriptors.CalcTPSA(mol)

    # ---- Morgan fingerprint ----
    fp = AllChem.GetMorganFingerprintAsBitVect(
        mol,
        radius=2,
        nBits=1024
    )
    fp_array = np.zeros((1024,), dtype=int)
    ConvertToNumpyArray(fp, fp_array)

    # ---- Combine everything ----
    feature_row = {
        "ligand_id": lig_id,
        "label": label,
        "docking_score": docking_score,
        "MW": MW,
        "LogP": LogP,
        "HBD": HBD,
        "HBA": HBA,
        "TPSA": TPSA
    }

    # Add fingerprint bits
    for i in range(1024):
        feature_row[f"fp_{i}"] = fp_array[i]

    rows.append(feature_row)

# Save
features_df = pd.DataFrame(rows)
features_df.to_csv(OUT_CSV, index=False)

print(f"Saved {len(features_df)} synthetic feature rows to {OUT_CSV}")


Saved 188 synthetic feature rows to /home/nikki/egfr_lowdata_scoring/results/synth/features_synth.csv


In [5]:
#ML model training

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

In [6]:
import pandas as pd
import os

# Paths
synth_path = "/home/nikki/egfr_lowdata_scoring/results/synth/features_synth.csv"
real_path = "/home/nikki/egfr_lowdata_scoring/results/real/features_real.csv"
output_path = "/home/nikki/egfr_lowdata_scoring/results/real+synth_features.csv"

# Load data
synth_df = pd.read_csv(synth_path)
real_df = pd.read_csv(real_path)

# Step 1: Rename ligand_id to molecule_chembl_id in synth_df
synth_df = synth_df.rename(columns={'ligand_id': 'molecule_chembl_id'})

# Step 2: Combine both files (row-wise)
combined_df = pd.concat([real_df, synth_df], ignore_index=True)

# Save combined file
combined_df.to_csv(output_path, index=False)



In [7]:
real_df = pd.read_csv("/home/nikki/egfr_lowdata_scoring/results/real/features_real.csv")
synth_df = pd.read_csv("/home/nikki/egfr_lowdata_scoring/results/real+synth_features.csv")

descriptor_cols = ["docking_score"]
fp_cols = [c for c in real_df.columns if c.startswith("fp_")]

feature_cols = descriptor_cols + fp_cols

In [8]:
X_real = real_df[feature_cols].copy()
y_real = real_df["label"].astype(int)

X_synth = synth_df[feature_cols].copy()

X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(
    X_real,
    y_real,
    test_size=0.2,
    stratify=y_real,
    random_state=42
)

In [9]:
rf_real = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)

rf_real.fit(X_real_train, y_real_train)

probs_real = rf_real.predict_proba(X_real_test)[:, 1]

auc_real = roc_auc_score(y_real_test, probs_real)
pr_real = average_precision_score(y_real_test, probs_real)
acc_real = accuracy_score(y_real_test, probs_real > 0.5)

print("Baseline 1 – Real only")
print("ROC-AUC:", auc_real)
print("PR-AUC:", pr_real)
print("Accuracy:", acc_real)


Baseline 1 – Real only
ROC-AUC: 0.8125
PR-AUC: 0.8197781385281386
Accuracy: 0.75


In [10]:
X_synth_train = synth_df[feature_cols]
y_synth_train = synth_df["label"].astype(int)

X_train_aug = pd.concat([X_real_train, X_synth_train], axis=0)
y_train_aug = pd.concat([y_real_train, y_synth_train], axis=0)

In [11]:
rf_aug = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)

rf_aug.fit(X_train_aug, y_train_aug)

probs_aug = rf_aug.predict_proba(X_real_test)[:, 1]

auc_aug = roc_auc_score(y_real_test, probs_aug)
pr_aug = average_precision_score(y_real_test, probs_aug)
acc_aug = accuracy_score(y_real_test, probs_aug > 0.5)

print("\nBaseline 2 - Real + Synthetic (labels)")
print("ROC-AUC:", auc_aug)
print("PR-AUC:", pr_aug)
print("Accuracy:", acc_aug)



Baseline 2 - Real + Synthetic (labels)
ROC-AUC: 1.0
PR-AUC: 1.0
Accuracy: 1.0


In [12]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve

# 1) ROC curves
fpr_real, tpr_real, _ = roc_curve(y_real_test, probs_real)
fpr_aug,  tpr_aug,  _ = roc_curve(y_real_test, probs_aug)

plt.figure(figsize=(5, 5))
plt.plot(fpr_real, tpr_real, label=f"Real only (AUC = {auc_real:.2f})")
plt.plot(fpr_aug,  tpr_aug,  label=f"Real + Synth (AUC = {auc_aug:.2f})", linestyle="--")
plt.plot([0, 1], [0, 1], "k--", alpha=0.5)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.tight_layout()
plt.savefig("roc_comparison.png", dpi=300)
plt.close()

# 2) Precision–Recall curves
prec_real, rec_real, _ = precision_recall_curve(y_real_test, probs_real)
prec_aug,  rec_aug,  _ = precision_recall_curve(y_real_test, probs_aug)

plt.figure(figsize=(5, 5))
plt.plot(rec_real, prec_real, label=f"Real only (PR-AUC = {pr_real:.2f})")
plt.plot(rec_aug,  prec_aug,  label=f"Real + Synth (PR-AUC = {pr_aug:.2f})", linestyle="--")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curves")
plt.legend()
plt.tight_layout()
plt.savefig("pr_comparison.png", dpi=300)
plt.close()

# 3) Accuracy bar plot
models = ["Real only", "Real + Synth"]
accs   = [acc_real, acc_aug]

plt.figure(figsize=(4, 4))
bars = plt.bar(models, accs, color=["C0", "C1"])
plt.ylim(0, 1.05)
plt.ylabel("Accuracy")
plt.title("Accuracy Comparison")

# annotate bars
for b, a in zip(bars, accs):
    plt.text(b.get_x() + b.get_width() / 2, a + 0.02, f"{a:.2f}",
             ha="center", va="bottom", fontsize=10)

plt.tight_layout()
plt.savefig("accuracy_comparison.png", dpi=300)
plt.close()
