In [2]:
import glob
import pandas as pd
import os

# -----------------------
# Paths
# -----------------------
log_dir = "/home/nikki/egfr_lowdata_scoring/results/real/logs"
labels_file = "/home/nikki/egfr_lowdata_scoring/data/ligands_real/egfr_real_noisy.csv"
out_csv = "/home/nikki/egfr_lowdata_scoring/results/real/egfr_real_docking_scores.csv"

# -----------------------
# Parse docking logs
# -----------------------
# Each log file is named <ligand_id>.log. The first line whose first
# whitespace-separated field is "1" is treated as the best-ranked pose and its
# second field is taken as the docking score.
scores = []
unparsed_logs = []  # ligand ids whose log contained no usable best-pose row

# sorted() keeps the row order independent of the filesystem listing order.
for log_file in sorted(glob.glob(os.path.join(log_dir, "*.log"))):
    # splitext removes only the trailing extension; str.replace(".log", "")
    # would also strip ".log" occurring elsewhere in the file name.
    ligand_id = os.path.splitext(os.path.basename(log_file))[0]

    score = None
    with open(log_file) as f:
        for line in f:
            fields = line.split()
            # Comparing the first field (instead of startswith("1 ")) also
            # accepts tab-separated rows.
            if len(fields) >= 2 and fields[0] == "1":
                try:
                    score = float(fields[1])
                except ValueError:
                    # Not a numeric results row (e.g. a text line that happens
                    # to start with "1"); keep scanning.
                    continue
                break

    if score is None:
        unparsed_logs.append(ligand_id)
    else:
        scores.append({
            "ligand_id": ligand_id,
            "docking_score": score
        })

# Explicit columns keep the merge below valid even when no log could be parsed
# (an empty DataFrame built from [] would have no "ligand_id" column).
df_scores = pd.DataFrame(scores, columns=["ligand_id", "docking_score"])

if unparsed_logs:
    print(f"Warning: no best-pose score found in {len(unparsed_logs)} log(s): {unparsed_logs}")

# -----------------------
# Load ligand metadata
# -----------------------
df_labels = pd.read_csv(labels_file)

# -----------------------
# Merge correctly
# ligand_id == molecule_chembl_id
# -----------------------
# The inner join keeps only ligands that have both a label and a score, so
# report the labelled ligands that are about to be dropped.
unscored_ids = df_labels.loc[
    ~df_labels["molecule_chembl_id"].isin(df_scores["ligand_id"]),
    "molecule_chembl_id"
].tolist()
if unscored_ids:
    print(f"Warning: {len(unscored_ids)} labelled ligand(s) have no docking score and are dropped")

df_final = df_labels.merge(
    df_scores,
    left_on="molecule_chembl_id",
    right_on="ligand_id",
    how="inner"
)

# -----------------------
# Clean final columns
# -----------------------
df_final = df_final[[
    "molecule_chembl_id",
    "canonical_smiles",
    "label",
    "docking_score"
]]

df_final.to_csv(out_csv, index=False)

print(f"Saved {len(df_final)} ligands with docking scores")
print(f"Output: {out_csv}")


Saved 77 ligands with docking scores
Output: /home/nikki/egfr_lowdata_scoring/results/real/egfr_real_docking_scores.csv


In [3]:
# Feature extraction script
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

def extract_features(input_csv, output_csv, radius=2, n_bits=1024):
    """Compute Morgan fingerprints and physchem descriptors and save them as CSV.

    Reads ``input_csv`` (must contain the columns ``molecule_chembl_id``,
    ``canonical_smiles``, ``label`` and ``docking_score``), computes one
    feature row per input row and writes the combined table to ``output_csv``.

    Output columns, in order: ``molecule_chembl_id``, ``label``,
    ``docking_score``, ``MW``, ``LogP``, ``HBD``, ``HBA``, ``TPSA``,
    ``fp_0`` ... ``fp_{n_bits-1}``.

    Rows whose SMILES is missing or cannot be parsed are kept, with every
    computed feature set to 0, and are reported in a printed warning so they
    can be filtered before modeling.

    Args:
        input_csv: Path of the CSV to read.
        output_csv: Path of the CSV to write.
        radius: Morgan fingerprint radius (2 is roughly equivalent to ECFP4).
        n_bits: Length of the fingerprint bit vector.

    Returns:
        None. The result is written to ``output_csv`` and its shape printed.
    """
    # 1. Load the data
    df = pd.read_csv(input_csv)

    fps = []
    physchem_data = []
    failed_ids = []

    for mol_id, smiles in zip(df['molecule_chembl_id'], df['canonical_smiles']):
        # A blank CSV field is read as NaN (a float); handing that to RDKit
        # raises instead of returning None, so treat it as a failed parse here.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        if mol is None:
            # Keep the row so the features stay aligned with `df`; integer
            # zeros keep the fingerprint columns integer-typed like real rows.
            failed_ids.append(mol_id)
            fps.append(np.zeros(n_bits, dtype=int))
            physchem_data.append([0, 0, 0, 0, 0])
            continue

        # --- A. Morgan Fingerprints (Structural Features) ---
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        fps.append(np.array(fp))

        # --- B. PhysChem Descriptors (General Properties) ---
        physchem_data.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.TPSA(mol),
        ])

    if failed_ids:
        print(
            f"Warning: {len(failed_ids)} molecule(s) had missing or unparseable "
            f"SMILES and were zero-filled: {failed_ids}"
        )

    # 2. Convert to DataFrames
    fp_cols = [f'fp_{i}' for i in range(n_bits)]
    df_fps = pd.DataFrame(fps, columns=fp_cols)

    phys_cols = ['MW', 'LogP', 'HBD', 'HBA', 'TPSA']
    df_phys = pd.DataFrame(physchem_data, columns=phys_cols)

    # 3. Combine everything
    # All three frames share the default 0..n-1 index, so axis=1 concatenation
    # lines the rows up positionally.
    final_df = pd.concat([
        df[['molecule_chembl_id', 'label', 'docking_score']],
        df_phys,
        df_fps
    ], axis=1)

    # 4. Save for Week 2 modeling
    final_df.to_csv(output_csv, index=False)
    print(f"Feature extraction complete. Shape: {final_df.shape}")

# Run the feature extraction: docking-score table in, feature table out
features_input_csv = '/home/nikki/egfr_lowdata_scoring/results/real/egfr_real_docking_scores.csv'
features_output_csv = '/home/nikki/egfr_lowdata_scoring/results/real/features_real.csv'
extract_features(features_input_csv, features_output_csv)

Feature extraction complete. Shape: (77, 1032)


In [4]:
# Model training and evaluation

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# 1. Read the feature table written by the feature-extraction step
df = pd.read_csv('/home/nikki/egfr_lowdata_scoring/results/real/features_real.csv')

# Feature groups (metadata and the label are left out of every group)
physchem_cols = ['MW', 'LogP', 'HBD', 'HBA', 'TPSA']
fp_cols = list(filter(lambda name: name.startswith('fp_'), df.columns))
docking_col = ['docking_score']

# 2. Feature sets
feature_cols_2d = physchem_cols + fp_cols

# Set A: 2D only (PhysChem + fingerprints)
X_A = df[feature_cols_2d]

# Set B: the same 2D features plus the docking score
X_B = df[feature_cols_2d + docking_col]

y = df['label']

# 3. 80/20 train/test split
# Both feature sets are split in a single call so they share the same rows;
# stratify=y keeps the active/inactive balance similar in train and test.
split_parts = train_test_split(
    X_A,
    X_B,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_A_train, X_A_test, X_B_train, X_B_test, y_train, y_test = split_parts

# 4. Train and Evaluate
def evaluate_model(X_train, X_test, y_train, y_test, name, n_estimators=100, random_state=42):
    """Fit a random forest on the training split and print test-set metrics.

    Prints a header with ``name`` followed by the accuracy and ROC-AUC obtained
    on ``(X_test, y_test)``.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix, with the same columns as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.
        name: Label printed in the report header.
        n_estimators: Number of trees in the forest.
        random_state: Seed passed to the classifier for reproducibility.

    Returns:
        None. The metrics are printed.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score; this
    # assumes a binary label whose positive class sorts second in
    # model.classes_ (e.g. 0/1) -- TODO confirm for other label encodings.
    probs = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)

    # roc_auc_score raises ValueError when it cannot be computed (e.g. a tiny
    # test split containing a single class). Report that as undefined rather
    # than substituting 0.5, which would read as a measured chance-level score.
    try:
        auc = roc_auc_score(y_test, probs)
    except ValueError as exc:
        auc = float("nan")
        print(f"ROC-AUC undefined for '{name}': {exc}")

    print(f"--- Model: {name} ---")
    print(f"Accuracy: {acc:.2f}")
    print(f"ROC-AUC:  {auc:.2f}\n")

# Evaluate both feature sets against the same labels, in order A then B
for X_train_set, X_test_set, model_name in (
    (X_A_train, X_A_test, "A: 2D Descriptors Only"),
    (X_B_train, X_B_test, "B: 2D + Docking Score"),
):
    evaluate_model(X_train_set, X_test_set, y_train, y_test, model_name)

--- Model: A: 2D Descriptors Only ---
Accuracy: 0.75
ROC-AUC:  0.79

--- Model: B: 2D + Docking Score ---
Accuracy: 0.75
ROC-AUC:  0.80

