In [20]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

from xgboost import XGBClassifier

In [22]:
# Read the three inputs used throughout the notebook: a feature matrix
# (presumably Morgan fingerprints, judging by the file name), the label
# vector, and the cleaned compound table.
X = np.load("X_morgan.npy")
y = np.load("y_labels.npy")
df = pd.read_csv("qs_inhibitors_cleaned.csv")

# Later cells index X and y with row indices derived from df, so all three
# must have the same number of rows, and df must carry the SMILES column.
assert len(X) == len(df) and len(y) == len(X)
assert "smiles_canonical" in df.columns

In [23]:
def get_murcko_scaffold(smiles):
    """Return the Murcko scaffold SMILES for one molecule, or ``None``.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    str or None
        The value produced by ``MurckoScaffold.MurckoScaffoldSmiles`` for the
        parsed molecule. ``None`` is returned when ``smiles`` is not a string
        (for example a missing table cell that arrives as ``None`` or float
        ``NaN``) or when ``Chem.MolFromSmiles`` cannot parse it.
    """
    # Missing cells reach this function as None / NaN when it is applied to a
    # DataFrame column. Route them to the same "no scaffold" result as
    # unparseable strings rather than handing a non-string to RDKit.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

# Add a "scaffold" column holding, for every compound, whatever
# get_murcko_scaffold returns for its canonical SMILES.
df["scaffold"] = df["smiles_canonical"].map(get_murcko_scaffold)

In [24]:
def scaffold_split(df, test_fraction=0.2, seed=42):
    """Split rows into train/test so that no scaffold occurs in both sets.

    The unique values of ``df["scaffold"]`` are shuffled with a generator
    seeded by ``seed`` and moved into the test set one whole group at a time
    until the test set holds at least ``test_fraction`` of the rows (or the
    scaffolds run out). Groups are never divided, so the final test share can
    exceed ``test_fraction``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"scaffold"`` column. Rows whose scaffold is missing
        (``None``/``NaN``) are treated as a single group.
    test_fraction : float, default 0.2
        Minimum share of rows to place in the test set.
    seed : int, default 42
        Seed passed to ``numpy.random.default_rng``.

    Returns
    -------
    train_idx, test_idx : numpy.ndarray
        Zero-based row positions into ``df``, each in ascending order. They
        are positions rather than index labels, so they can index arrays that
        are aligned with ``df`` row by row whatever ``df.index`` contains.
    """
    rng = np.random.default_rng(seed)

    scaffold_col = df["scaffold"]
    is_missing = scaffold_col.isna().to_numpy()
    n_missing = int(is_missing.sum())

    # Size every scaffold group once up front instead of re-filtering the
    # frame for each scaffold inside the loop. value_counts leaves out missing
    # values; those are accounted for through n_missing.
    group_sizes = scaffold_col.value_counts()

    scaffolds = scaffold_col.unique()
    rng.shuffle(scaffolds)

    test_scaffolds = set()
    missing_in_test = False
    test_size = 0
    n_rows = len(df)

    for scaffold in scaffolds:
        if pd.isna(scaffold):
            # Count the missing-scaffold rows only once, even if the column
            # mixes several missing markers.
            if not missing_in_test:
                missing_in_test = True
                test_size += n_missing
        else:
            test_scaffolds.add(scaffold)
            test_size += int(group_sizes.loc[scaffold])

        if test_size / n_rows >= test_fraction:
            break

    in_test = scaffold_col.isin(test_scaffolds).to_numpy()
    if missing_in_test:
        in_test = in_test | is_missing

    # Positions, not labels: callers use these to index NumPy arrays.
    test_idx = np.flatnonzero(in_test)
    train_idx = np.flatnonzero(~in_test)

    return train_idx, test_idx


In [25]:
# Hold out whole scaffolds with the default settings of scaffold_split, then
# slice the feature matrix and the labels with the resulting row indices.
train_idx, test_idx = scaffold_split(df)

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Report the size of each partition and the mean of its labels.
for caption, value in (
    ("Train size:", len(train_idx)),
    ("Test size:", len(test_idx)),
    ("Train class balance:", np.mean(y_train)),
    ("Test class balance:", np.mean(y_test)),
):
    print(caption, value)

Train size: 131
Test size: 37
Train class balance: 0.5419847328244275
Test class balance: 0.5675675675675675


In [26]:
def evaluate(model, X, y):
    """Score a fitted classifier on ``X`` against the reference labels ``y``.

    Returns a dict with three entries: ``"ROC_AUC"`` and ``"PR_AUC"`` are
    computed from the second column of ``model.predict_proba(X)``, and
    ``"F1"`` is computed from ``model.predict(X)``.
    """
    # Column 1 of predict_proba is used as the ranking score for both
    # threshold-free metrics.
    scores = model.predict_proba(X)[:, 1]
    hard_labels = model.predict(X)

    metrics = {}
    metrics["ROC_AUC"] = roc_auc_score(y, scores)
    metrics["PR_AUC"] = average_precision_score(y, scores)
    metrics["F1"] = f1_score(y, hard_labels)
    return metrics

In [27]:
# Baseline 1: class-weighted logistic regression on the single scaffold split.
# n_jobs is intentionally not passed: in scikit-learn it only parallelises
# LogisticRegression across classes in one-vs-rest fitting, so for the binary
# target used here it would change nothing about the fitted model.
logreg = LogisticRegression(
    max_iter=500,
    class_weight="balanced",
)
logreg.fit(X_train, y_train)

# Score on the held-out scaffolds and print the metric dict.
logreg_results = evaluate(logreg, X_test, y_test)
print("Logistic Regression:", logreg_results)

Logistic Regression: {'ROC_AUC': 0.6666666666666666, 'PR_AUC': 0.7465080889892921, 'F1': 0.68}


In [28]:
# Baseline 2: class-weighted random forest with a fixed seed, trained on the
# single scaffold split using all available cores.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=500,
    min_samples_leaf=2,
    class_weight="balanced",
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Score on the held-out scaffolds and print the metric dict.
rf_results = evaluate(rf, X_test, y_test)
print("Random Forest:", rf_results)

Random Forest: {'ROC_AUC': 0.6339285714285714, 'PR_AUC': 0.6761395259592724, 'F1': 0.45714285714285713}


In [29]:
# Weight for the positive class: (rows - label sum) / label sum on the
# training labels, i.e. negatives per positive if y is coded 0/1
# (assumed — TODO confirm).
n_pos = y_train.sum()
n_neg = len(y_train) - n_pos
scale_pos_weight = n_neg / n_pos

# Baseline 3: gradient-boosted trees with row and column subsampling and a
# fixed seed, trained on the single scaffold split.
xgb = XGBClassifier(
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
)
xgb.fit(X_train, y_train)

# Score on the held-out scaffolds and print the metric dict.
xgb_results = evaluate(xgb, X_test, y_test)
print("XGBoost:", xgb_results)

XGBoost: {'ROC_AUC': 0.5059523809523809, 'PR_AUC': 0.5941239847171181, 'F1': 0.5833333333333334}


In [30]:
# One row per baseline model, one column per metric, for the single split.
results = pd.DataFrame(
    [logreg_results, rf_results, xgb_results],
    index=["Logistic Regression", "Random Forest", "XGBoost"],
)

# Save the table (row labels included) and echo it as plain text.
results.to_csv("baseline_scaffold_results.csv")
print(results)

                      ROC_AUC    PR_AUC        F1
Logistic Regression  0.666667  0.746508  0.680000
Random Forest        0.633929  0.676140  0.457143
XGBoost              0.505952  0.594124  0.583333


In [33]:
from collections import defaultdict

N_SPLITS = 5
SEED_BASE = 42

# Repeated scaffold-split benchmark. Split k uses seed SEED_BASE + k both for
# scaffold_split and for the seeded models, and contributes one record per
# model to all_results. The list is rebuilt from scratch each time this cell
# runs, so re-running it does not duplicate rows.
#
# NOTE: the loop rebinds the notebook-level names train_idx, test_idx,
# X_train, X_test, y_train, y_test, logreg, rf, xgb and scale_pos_weight;
# once it finishes they refer to the last split.
all_results = []

for split_id in range(N_SPLITS):
    seed = SEED_BASE + split_id

    train_idx, test_idx = scaffold_split(df, seed=seed)
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # n_jobs is intentionally not passed: in scikit-learn it only parallelises
    # LogisticRegression across classes in one-vs-rest fitting, so for the
    # binary target used here it would change nothing about the fitted model.
    logreg = LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    )
    logreg.fit(X_train, y_train)
    logreg_res = evaluate(logreg, X_test, y_test)

    rf = RandomForestClassifier(
        n_estimators=500,
        min_samples_leaf=2,
        class_weight="balanced",
        random_state=seed,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)
    rf_res = evaluate(rf, X_test, y_test)

    # (rows - label sum) / label sum on this split's training labels, i.e.
    # negatives per positive if y is coded 0/1 (assumed — TODO confirm).
    n_pos = y_train.sum()
    scale_pos_weight = (len(y_train) - n_pos) / n_pos
    xgb = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        eval_metric="logloss",
        random_state=seed,
    )
    xgb.fit(X_train, y_train)
    xgb_res = evaluate(xgb, X_test, y_test)

    # One flat record per (split, model), keeping exactly the three metrics.
    for model_name, res in (
        ("LogReg", logreg_res),
        ("RF", rf_res),
        ("XGB", xgb_res),
    ):
        all_results.append({
            "split": split_id,
            "model": model_name,
            **{metric: res[metric] for metric in ("ROC_AUC", "PR_AUC", "F1")},
        })

In [34]:
# Turn the per-split records collected in all_results into one table and save
# it to CSV without the row index; the bare name on the last line displays it.
results_df = pd.DataFrame(all_results)
results_df.to_csv("baseline_scaffold_per_split.csv", index=False)

results_df

Unnamed: 0,split,model,ROC_AUC,PR_AUC,F1
0,0,LogReg,0.666667,0.746508,0.68
1,0,RF,0.633929,0.67614,0.457143
2,0,XGB,0.505952,0.594124,0.583333
3,1,LogReg,0.657895,0.671756,0.631579
4,1,RF,0.717105,0.721915,0.714286
5,1,XGB,0.598684,0.656349,0.666667
6,2,LogReg,0.56179,0.643705,0.645161
7,2,RF,0.696733,0.766222,0.62069
8,2,XGB,0.577415,0.68306,0.65625
9,3,LogReg,0.895833,0.852634,0.787879


In [35]:
# Aggregate across splits: for each model, the mean and the standard deviation
# (pandas "std") of every metric. The result has two-level column labels.
metric_columns = ["ROC_AUC", "PR_AUC", "F1"]
per_model = results_df.groupby("model")[metric_columns]
summary = per_model.agg(["mean", "std"])

summary

Unnamed: 0_level_0,ROC_AUC,ROC_AUC,PR_AUC,PR_AUC,F1,F1
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
LogReg,0.692437,0.122934,0.706242,0.095113,0.694378,0.064026
RF,0.714637,0.127965,0.74353,0.106808,0.622236,0.155027
XGB,0.632382,0.114917,0.671771,0.08048,0.660884,0.079574


In [36]:
# Save the per-model mean/std summary table to CSV (row labels included).
summary.to_csv("baseline_summary.csv")