In [4]:
# -*- coding: utf-8 -*-
"""
Seed-level protein regression (3 models)
- PLSR & RF: seed-level training -> sample-level (median) evaluation
- SVR: sample-level training/evaluation using seed-median spectra
- CV: Outer 5-fold GroupKFold; Inner 3-fold GroupKFold (PLSR and SVR tuning only; RF uses fixed hyperparameters)
- Output: print metrics only (no plots, no saved files)
"""

import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

RANDOM_STATE = 42
AGG = "median"

# =======================
# Utilities
# =======================
def rmse(y_true, y_pred):
    """Return the root-mean-square error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    plain lists/tuples are accepted and pandas Series are compared element by
    element in positional order instead of being aligned on index labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (same shape, or broadcastable).

    Returns
    -------
    float
        sqrt(mean((y_true - y_pred) ** 2)).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def snv(X):
    """Apply standard normal variate scaling to each row of ``X``.

    Every row is centred on its own mean and divided by its own (population)
    standard deviation. Rows with zero standard deviation are divided by 1
    instead, so they come out as all zeros rather than NaN.

    Parameters
    ----------
    X : array-like, 2-D
        One spectrum per row.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``.
    """
    spectra = np.asarray(X, dtype=float)
    row_mean = spectra.mean(axis=1, keepdims=True)
    row_std = spectra.std(axis=1, keepdims=True)
    # Constant rows: substitute a unit divisor to avoid 0/0.
    divisor = np.where(row_std == 0, 1.0, row_std)
    return (spectra - row_mean) / divisor

def find_protein_col(columns):
    """Return the first column whose name contains "protein" (any case).

    Parameters
    ----------
    columns : iterable
        Column labels; each one is converted with ``str`` before matching.

    Raises
    ------
    KeyError
        If no label contains the substring "protein".
    """
    hit = next((col for col in columns if "protein" in str(col).lower()), None)
    # A matching label can never be None itself ("none" has no "protein"),
    # so None unambiguously means "nothing matched".
    if hit is None:
        raise KeyError("Protein column not found.")
    return hit

def sort_spec_cols(cols):
    """Return ``cols`` sorted by the wavelength encoded in each column name.

    The sort key is the column name with every "nm" removed, parsed as a
    float. Names that do not parse as a number (or that parse to NaN) are
    given the key ``inf`` so they are placed after all numeric names, keeping
    their relative input order (``sorted`` is stable).

    Parameters
    ----------
    cols : iterable
        Column labels such as ``"450.5nm"``.

    Returns
    -------
    list
        A new list; the input is not modified.
    """
    def _wavelength(col):
        try:
            value = float(str(col).replace("nm", ""))
        except ValueError:
            # Only a failed numeric parse is expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return np.inf
        # A NaN key would make the comparison-based ordering undefined.
        return np.inf if np.isnan(value) else value

    return sorted(cols, key=_wavelength)

def sample_level_metrics(y_true_seed, y_pred_seed, sample_ids):
    """Score seed-level predictions after collapsing them to one value per sample.

    Seed-level targets and predictions are grouped by ``sample_ids`` and each
    group is reduced to its median; R2 and RMSE are then computed on those
    per-sample medians.

    Parameters
    ----------
    y_true_seed, y_pred_seed : array-like
        Observed and predicted values, one entry per seed.
    sample_ids : array-like
        Sample identifier for each seed (same length as the value arrays).

    Returns
    -------
    tuple
        ``(r2, rmse)`` evaluated at sample level.
    """
    per_seed = pd.DataFrame({
        "sample_id": sample_ids,
        "y_true": y_true_seed,
        "y_pred": y_pred_seed
    })
    per_sample = per_seed.groupby("sample_id")[["y_true", "y_pred"]].agg("median")

    truth = per_sample["y_true"]
    estimate = per_sample["y_pred"]
    r2 = r2_score(truth, estimate)
    error = rmse(truth, estimate)
    return r2, error

def seed_to_sample_spectra(X_seed, y_seed, sample_ids):
    """Collapse seed-level spectra and targets to one median row per sample.

    Parameters
    ----------
    X_seed : array-like, 2-D
        One spectrum per seed.
    y_seed : array-like
        Target value per seed.
    sample_ids : array-like
        Sample identifier per seed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_sample, y_sample, ids)`` where row ``i`` of ``X_sample`` and
        ``y_sample[i]`` are the per-sample medians for ``ids[i]``. Samples
        follow the order of the grouped target index.
    """
    spectra = pd.DataFrame(X_seed)
    spectra["sample_id"] = sample_ids
    median_spectra = spectra.groupby("sample_id").median(numeric_only=True)

    targets = pd.DataFrame({"sample_id": sample_ids, "y": y_seed})
    median_targets = targets.groupby("sample_id")["y"].median()

    # Re-index the spectra by the target index so both outputs share row order.
    ordered_ids = median_targets.index
    aligned_spectra = median_spectra.loc[ordered_ids]
    return aligned_spectra.values, median_targets.values, ordered_ids.values

# =======================
# Load data
# =======================
# Resolve the dataset relative to this script. `__file__` is not defined when
# the code is executed as an interactive/notebook cell, so fall back to the
# current working directory in that case.
try:
    _BASE_DIR = Path(__file__).resolve().parent
except NameError:
    _BASE_DIR = Path.cwd()

DATA_PATH = _BASE_DIR / "Pea" / "seed" / "pea_patch_dataset_seed.csv"
df = pd.read_csv(DATA_PATH)

if "sample_id" not in df.columns:
    raise KeyError("CSV must contain sample_id column.")

# Spectral bands are the columns whose name ends with "nm", ordered by wavelength.
spec_cols = sort_spec_cols([c for c in df.columns if str(c).endswith("nm")])
if not spec_cols:
    # Fail here with a clear message instead of deep inside a model fit.
    raise KeyError("CSV must contain spectral columns ending with 'nm'.")
protein_col = find_protein_col(df.columns)

# Rows without a protein reference value cannot be used for regression.
df = df.loc[df[protein_col].notna()].copy()

# Seed-level design matrix, target vector and grouping labels (one row per seed).
X_seed = df[spec_cols].values.astype(float)
y_seed = df[protein_col].values.astype(float)
g_seed = df["sample_id"].astype(str).values

print(f"[INFO] seeds={len(df)} | samples={df['sample_id'].nunique()} | bands={len(spec_cols)}")

# Sample-level spectra for SVR
X_s, y_s, g_s = seed_to_sample_spectra(X_seed, y_seed, g_seed)
print(f"[INFO] SVR uses sample-level spectra: X={X_s.shape}")

# Both CV levels split by sample id so seeds of one sample never straddle a split.
outer_cv = GroupKFold(n_splits=5)
inner_cv = GroupKFold(n_splits=3)

# =======================
# PLSR
# =======================
print("\n===== PLSR (seed-level) =====")
r2s, rmses = [], []


def _build_plsr(n_components):
    """Assemble the SNV -> standardise -> PLS pipeline for a component count."""
    return Pipeline([
        ("snv", FunctionTransformer(snv, validate=False)),
        ("scaler", StandardScaler()),
        ("model", PLSRegression(n_components=n_components)),
    ])


for fold, (tr, te) in enumerate(outer_cv.split(X_seed, y_seed, g_seed), 1):
    # Outer split on seed rows, grouped by sample id.
    Xtr, ytr, gtr = X_seed[tr], y_seed[tr], g_seed[tr]
    Xte, yte, gte = X_seed[te], y_seed[te], g_seed[te]

    # Component counts to try: at most 20, capped by the number of bands and
    # by (number of distinct training samples - 1).
    max_k = min(20, Xtr.shape[1], len(np.unique(gtr)) - 1)

    # Inner grouped CV: keep the k with the highest mean sample-level R2.
    best_r2, best_k = -np.inf, None
    for k in range(1, max_k + 1):
        scores = []
        for itr, iva in inner_cv.split(Xtr, ytr, gtr):
            mdl = _build_plsr(k)
            mdl.fit(Xtr[itr], ytr[itr])
            pred = mdl.predict(Xtr[iva]).ravel()
            r2, _ = sample_level_metrics(ytr[iva], pred, gtr[iva])
            scores.append(r2)
        mean_r2 = np.mean(scores)
        if mean_r2 > best_r2:
            best_r2, best_k = mean_r2, k

    # Refit on the full outer-training split with the selected k.
    final = _build_plsr(best_k)
    final.fit(Xtr, ytr)
    pred = final.predict(Xte).ravel()

    # Seed predictions are reduced to per-sample medians before scoring.
    r2, e = sample_level_metrics(yte, pred, gte)
    r2s.append(r2)
    rmses.append(e)
    print(f"Fold {fold}: R2={r2:.3f}, RMSE={e:.3f}, k={best_k}")

print(f"PLSR mean R2={np.mean(r2s):.3f} ± {np.std(r2s):.3f}")

# =======================
# SVR (sample-level)
# =======================
print("\n===== SVR (sample-level) =====")
r2s, rmses = [], []

# Hyperparameter candidates evaluated in the inner CV.
SVR_GRID = [
    {"C":10, "gamma":"scale", "epsilon":0.05},
    {"C":100,"gamma":"scale","epsilon":0.05},
]


def _build_svr(params):
    """Assemble the SNV -> standardise -> RBF-SVR pipeline for one grid entry."""
    return Pipeline([
        ("snv", FunctionTransformer(snv, validate=False)),
        ("scaler", StandardScaler()),
        ("model", SVR(kernel="rbf", **params)),
    ])


for fold, (tr, te) in enumerate(outer_cv.split(X_s, y_s, g_s), 1):
    # Rows here are already per-sample median spectra.
    Xtr, ytr = X_s[tr], y_s[tr]
    Xte, yte = X_s[te], y_s[te]
    gtr = g_s[tr]

    # Inner CV: keep the grid entry with the highest mean R2.
    best_r2, best_p = -np.inf, None
    for p in SVR_GRID:
        scores = []
        for itr, iva in inner_cv.split(Xtr, ytr, gtr):
            mdl = _build_svr(p)
            mdl.fit(Xtr[itr], ytr[itr])
            scores.append(r2_score(ytr[iva], mdl.predict(Xtr[iva])))
        mean_r2 = np.mean(scores)
        if mean_r2 > best_r2:
            best_r2, best_p = mean_r2, p

    # Refit on the full outer-training split with the selected parameters.
    final = _build_svr(best_p)
    final.fit(Xtr, ytr)
    pred = final.predict(Xte)

    r2 = r2_score(yte, pred)
    e = rmse(yte, pred)
    r2s.append(r2)
    rmses.append(e)
    print(f"Fold {fold}: R2={r2:.3f}, RMSE={e:.3f}")

print(f"SVR mean R2={np.mean(r2s):.3f} ± {np.std(r2s):.3f}")

# =======================
# RF
# =======================
print("\n===== RF (seed-level) =====")
r2s, rmses = [], []

for fold, (tr, te) in enumerate(outer_cv.split(X_seed, y_seed, g_seed), 1):
    # Outer split on seed rows, grouped by sample id.
    Xtr, ytr, gtr = X_seed[tr], y_seed[tr], g_seed[tr]
    Xte, yte, gte = X_seed[te], y_seed[te], g_seed[te]

    # Fixed hyperparameters: no inner tuning loop for the forest.
    forest_steps = [
        ("snv", FunctionTransformer(snv, validate=False)),
        ("model", RandomForestRegressor(
            n_estimators=600,
            max_depth=None,
            min_samples_leaf=1,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )),
    ]
    mdl = Pipeline(forest_steps)
    mdl.fit(Xtr, ytr)
    pred = mdl.predict(Xte)

    # Seed predictions are reduced to per-sample medians before scoring.
    r2, e = sample_level_metrics(yte, pred, gte)
    r2s.append(r2)
    rmses.append(e)
    print(f"Fold {fold}: R2={r2:.3f}, RMSE={e:.3f}")

print(f"RF mean R2={np.mean(r2s):.3f} ± {np.std(r2s):.3f}")


[INFO] seeds=12129 | samples=113 | bands=272
[INFO] SVR uses sample-level spectra: X=(113, 272)

===== PLSR (seed-level) =====
Fold 1: R2=0.262, RMSE=1.101, k=15
Fold 2: R2=0.323, RMSE=1.202, k=20
Fold 3: R2=0.087, RMSE=1.235, k=14
Fold 4: R2=0.329, RMSE=1.384, k=20
Fold 5: R2=0.431, RMSE=0.986, k=19
PLSR mean R2=0.286 ± 0.114

===== SVR (sample-level) =====
Fold 1: R2=0.269, RMSE=1.109
Fold 2: R2=0.046, RMSE=1.252
Fold 3: R2=0.226, RMSE=1.480
Fold 4: R2=0.003, RMSE=1.363
Fold 5: R2=-0.384, RMSE=1.669
SVR mean R2=0.032 ± 0.231

===== RF (seed-level) =====
Fold 1: R2=0.080, RMSE=1.230
Fold 2: R2=0.162, RMSE=1.337
Fold 3: R2=0.072, RMSE=1.245
Fold 4: R2=0.149, RMSE=1.559
Fold 5: R2=0.233, RMSE=1.145
RF mean R2=0.139 ± 0.059
