<a href="https://colab.research.google.com/github/nacha-suk/LLM-ML-OralBioavailability-Predictive-Models/blob/main/src/pk_analysis_and_logo_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd

# Read the three Excel workbooks (descriptors, dissolution, in-vivo) in that order
MNL_descriptors, MNL_dissolution, MNL_inVIVO = (
    pd.read_excel(workbook)
    for workbook in (
        'manual_API_ID_descriptor.xlsx',
        'manual_dissolution_TQ.xlsx',
        'manual_in_vivo.xlsx',
    )
)

# The additional-data table is a semicolon-separated text file
MNL_add_data = pd.read_csv("manual_additional_data.csv", sep=";")

In [None]:
# Pick the dissolution columns that will later be gap-filled.
# A column qualifies when its header has the form "T<number>%" and the number
# is a multiple of 5.
pattern = re.compile(r"^T(\d+)%$")          # group 1 = the integer between "T" and "%"

# Pair every header with its regex match (None when the header does not fit)
matches = ((header, pattern.match(header)) for header in MNL_dissolution.columns)

# Keep the qualifying headers, ordered by the number embedded in the header
# (the fill step relies on this order to find the "last" measured value)
diss_cols = sorted(
    (header for header, hit in matches if hit and int(hit.group(1)) % 5 == 0),
    key=lambda c: int(pattern.match(c).group(1)),
)

# -------------------------------------------------------------
# 3. Row-wise rule:
#    • last measured value ≥ 100 → propagate that value
#    • otherwise                → fill with 100
# -------------------------------------------------------------
def fill_row(row, cols=None):
    """Return a copy of ``row`` with the missing dissolution entries filled.

    The fill value is derived from the last non-missing entry among ``cols``
    (taken in the order given):

    * last entry >= 100  -> that entry is used as the fill value
    * otherwise          -> 100 is used
    * no entry at all    -> every column in ``cols`` is set to 100

    Every missing entry in ``cols`` receives the fill value, not only the
    ones after the last measurement.

    Parameters
    ----------
    row : pandas.Series
        One record, as passed by ``DataFrame.apply(..., axis=1)``.
    cols : list of column labels, optional
        Columns to inspect and fill. Defaults to the module-level
        ``diss_cols`` list.

    Returns
    -------
    pandas.Series
        A new Series; the input ``row`` is left untouched.
    """
    if cols is None:
        cols = diss_cols
    cols = list(cols)

    # Work on a copy: pandas does not support functions that mutate the
    # object handed over by DataFrame.apply.
    row = row.copy()
    if not cols:
        return row

    measured = row[cols]

    # Label of the last non-NaN entry within the selected columns
    last_idx = measured.last_valid_index()

    # Nothing measured at all -> every target column becomes 100
    if last_idx is None:
        row.loc[cols] = 100
        return row

    last_val = measured[last_idx]
    fill_val = last_val if last_val >= 100 else 100

    # Assign through a label mask instead of fillna(): this needs no global
    # pandas option and triggers no downcasting warning on object-dtype rows.
    missing = measured.index[measured.isna().to_numpy()]
    if len(missing):
        row.loc[list(missing)] = fill_val
    return row

# Run the row rule over every record; the filled table replaces the raw one
MNL_dissolution = MNL_dissolution.apply(fill_row, axis="columns")

In [None]:
# In-vivo table without its "ID" column
MNL_inVIVO_clean = MNL_inVIVO.drop(columns=["ID"])

# Total number of missing cells (shown as the cell output)
MNL_inVIVO_clean.isna().sum().sum()

np.int64(18)

In [None]:
# Dissolution table without the identifier and model-fit bookkeeping columns
MNL_dissolution_clean = MNL_dissolution.drop(
    columns=[
        "ID",
        "Best_Model",
        "Best_MSE",
        "Param_1",
        "Param_2",
        "Param_3",
        "Param_4",
    ]
)

# Total number of missing cells (shown as the cell output)
MNL_dissolution_clean.isna().sum().sum()

np.int64(0)

In [None]:
# Descriptor table without the identifier / name / structure columns
MNL_descriptors_clean = MNL_descriptors.drop(columns=['ID', 'No', 'API', 'SMILES'])

# Total number of missing cells (shown as the cell output)
MNL_descriptors_clean.isna().sum().sum()

np.int64(0)

In [None]:
# Display the additional-data table as this cell's output
MNL_add_data

Unnamed: 0,Dose [mg],Fast-0/Fed-1,pH,T [C],V [ml],solubilizers [%] - SLS,"appartus (basket =1,paddle=2,IV=4)",rpm/min,flow [ml/min]
0,2.000,0,5.8,37,500,0.0,1,100,0
1,2.000,0,5.8,37,500,0.0,1,100,0
2,2.000,0,5.8,37,500,0.0,1,100,0
3,2.000,0,5.8,37,500,0.0,1,100,0
4,1.000,0,5.8,37,500,0.0,1,100,0
...,...,...,...,...,...,...,...,...,...
134,200.000,1,7.5,37,200,0.0,1,150,0
135,200.000,1,7.5,37,200,0.0,1,150,0
136,0.375,0,6.8,37,500,0.0,1,100,0
137,75.000,1,7.2,37,1000,0.0,2,75,0


In [None]:
# Build the feature matrix by placing the three tables side by side.
feature_blocks = [MNL_add_data, MNL_dissolution_clean, MNL_descriptors_clean]

# The tables are matched purely by row position (their indexes are reset
# below), so a differing row count would silently produce NaN-padded rows.
# Fail loudly instead.
block_lengths = [len(block) for block in feature_blocks]
if len(set(block_lengths)) != 1:
    raise ValueError(
        "Feature tables must have the same number of rows, got "
        f"{block_lengths}"
    )

df_all = pd.concat(
    [block.reset_index(drop=True) for block in feature_blocks],
    axis=1,        # axis=1 => append columns side by side
)

# Keep only the first occurrence of any repeated column label
MNL_X_data = df_all.loc[:, ~df_all.columns.duplicated()]


In [None]:
# Pull the three in-vivo target columns out of the cleaned in-vivo table.
# (The former bare `MNL_X_data` expression was dropped: it was not the last
# expression of the cell, so it displayed nothing.)
MNL_Cmax = MNL_inVIVO_clean["Cmax (ng/ml)"]
MNL_Tmax = MNL_inVIVO_clean["Tmax (h)"]
MNL_AUC = MNL_inVIVO_clean["AUClast (ng/ml*h)"]

In [None]:
"""
Manual example: Random-Forest regression for Cmax
-------------------------------------------------
• Classic train / test split  (80 % / 20 %)
• 10-fold CV hyper-parameter search on the train part
• External evaluation on the held-out test part
• Leave-One-Group-Out validation (LOGO) – groups defined by the identical
  (Cmax, Tmax, AUC) triple
• Global R² and RMSE for the LOGO experiment
• Export of predictions + errors to Excel
• Pickle export of the tuned model
"""

# =============== 1. Imports ===================================================
from pathlib import Path
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, LeaveOneGroupOut
from sklearn.ensemble        import RandomForestRegressor
from sklearn.metrics         import (
    r2_score,
    mean_absolute_error,
    root_mean_squared_error,         # sklearn ‚â• 1.3
    mean_squared_error
)

# =============== 2. Data ======================================================
# Work on copies of the objects built in earlier cells:
#    MNL_X_data -> feature matrix;  MNL_Cmax / MNL_Tmax / MNL_AUC -> targets
X = MNL_X_data.copy()
yC, yT, yA = MNL_Cmax.copy(), MNL_Tmax.copy(), MNL_AUC.copy()

# =============== 3. Train / Test split + CV grid search ======================
# Hold back 20 % of the rows for the final test; tune on the remaining 80 %.
# NOTE(review): the split is purely random, so rows that share the same
# (Cmax, Tmax, AUC) triple (the LOGO groups built further down) can end up on
# both sides of it - confirm that this is intended.
X_train, X_test, yC_train, yC_test = train_test_split(
    X,
    yC,
    test_size=0.20,
    random_state=42,
)

# Base estimator plus the small hyper-parameter grid explored by the search.
# Parameters not listed here keep the RandomForestRegressor defaults.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid = dict(
    n_estimators=[10, 20, 50, 100, 200],     # number of trees
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2, 3, 4],
)

# Shuffled 10-fold cross-validation on the training subset, scored by R^2
cv10 = KFold(n_splits=10, shuffle=True, random_state=42)
gscv = GridSearchCV(rf, grid, scoring="r2", cv=cv10, n_jobs=-1, verbose=1)
gscv.fit(X_train, yC_train)

print("\n=== Hyper-parameter search results ===")
print(f"Best parameters      : {gscv.best_params_}")
print(f"Mean CV R¬≤ on TRAIN  : {gscv.best_score_:.3f}")

# Estimator refitted by the search with the winning parameters
best_rf = gscv.best_estimator_

# =============== 3a. Evaluation on the 20 % TEST split ========================
y_pred_test = best_rf.predict(X_test)
print("\n=== 20 % hold-out TEST performance ===")
print(f"R¬≤  : {r2_score(yC_test, y_pred_test):.3f}")
print(f"MAE : {mean_absolute_error(yC_test, y_pred_test):.3f}")

# =============== 3b. Persist the tuned model =================================
# Make sure the target folder exists, then pickle the tuned estimator into it
Path("models").mkdir(exist_ok=True)
with open("models/random_forest_Cmax.pkl", "wb") as f:
    pickle.dump(best_rf, f)
print("Pickled model saved to models/random_forest_Cmax.pkl")

# =============== 4. Leave-One-Group-Out validation ===========================
# 4.1 Build a group key: all rows sharing identical (C, T, A) values get the
#     same key, so they are held out together in one LOGO split.
triples   = pd.concat([yC, yT, yA], axis=1)
triples.columns = ["C", "T", "A"]
# String key: the three stringified values joined with "||"
group_id  = triples.astype(str).agg("||".join, axis=1)

logo = LeaveOneGroupOut()

# Column layout of the final prediction table
result_cols = ["group_id", "C_obs", "T_obs", "A_obs",
               "C_pred",  "abs_err"]

# Per-fold tables are collected in a list and concatenated once after the
# loop; growing a DataFrame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame raises a pandas FutureWarning.
fold_frames = []

# NOTE(review): the hyper-parameters in gscv.best_params_ come from an
# earlier search whose training rows may include the group held out here -
# confirm this is acceptable for the reported LOGO metrics.
for train_idx, test_idx in logo.split(X, yC, groups=group_id):

    X_tr, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_tr        = yC.iloc[train_idx]

    # Train a fresh RF with the best hyper-parameters found earlier
    model = RandomForestRegressor(**gscv.best_params_,
                                  random_state=42, n_jobs=-1)
    model.fit(X_tr, y_tr)

    # Predict for the held-out group
    y_val_pred = model.predict(X_val)

    # Store predictions together with true values and the group key.
    # test_idx holds positions, so every column is selected with .iloc.
    fold_df = pd.DataFrame({
        "group_id": group_id.iloc[test_idx].values,
        "C_obs"   : yC.iloc[test_idx].values,
        "T_obs"   : yT.iloc[test_idx].values,
        "A_obs"   : yA.iloc[test_idx].values,
        "C_pred"  : y_val_pred
    })
    fold_df["abs_err"] = (fold_df["C_pred"] - fold_df["C_obs"]).abs()
    fold_frames.append(fold_df)

results = pd.concat(fold_frames, ignore_index=True)[result_cols]

# =============== 4a. Global metrics for the whole LOGO experiment ============
# Metrics are computed once over the pooled predictions of all folds
r2_full   = r2_score(results["C_obs"], results["C_pred"])
rmse_full = root_mean_squared_error(results["C_obs"], results["C_pred"])

print("\n=== LOGO overall metrics (all predictions combined) ===")
print(f"Global R¬≤   : {r2_full: .3f}")
print(f"Global RMSE : {rmse_full: .3f}")

# =============== 4b. Export detailed results to Excel ========================
out_path = "logo_predictions_Cmax_summary.xlsx"
results.to_excel(out_path, index=False)
print(f"Prediction table with metrics saved to {out_path}")

Fitting 10 folds for each of 120 candidates, totalling 1200 fits

=== Hyper-parameter search results ===
Best parameters      : {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 20}
Mean CV R¬≤ on TRAIN  : 0.758

=== 20 % hold-out TEST performance ===
R¬≤  : 0.889
MAE : 486.596
Pickled model saved to models/random_forest_Cmax.pkl


  results = pd.concat([results, fold_df], ignore_index=True)



=== LOGO overall metrics (all predictions combined) ===
Global R¬≤   :  0.774
Global RMSE :  1244.621
Prediction table with metrics saved to logo_predictions_Cmax_summary.xlsx


In [None]:
#2nd approach - more flexible for new models

# ============================================================
# DATA PREPARATION
# ============================================================
import json, pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import make_scorer, r2_score

# ---------- 1. load / copy data --------------------------------
X   = MNL_X_data.copy()
yC  = MNL_Cmax.copy()
yT  = MNL_Tmax.copy()
yA  = MNL_AUC.copy()

TARGET_NAME = "Cmax"                   # <---- change when needed
y = {"Cmax": yC, "Tmax": yT, "AUC": yA}[TARGET_NAME]

# ---------- 2. build numeric LOGO group IDs --------------------
# Rows with the same rounded (Cmax, Tmax, AUC) triple share one integer ID (1-based)
triples   = pd.concat([yC.round(4), yT.round(4), yA.round(4)], axis=1)
group_id  = pd.factorize(triples.apply(tuple, axis=1))[0] + 1
logo      = LeaveOneGroupOut()

# ---------- 3. common objects ----------------------------------
# sklearn's r2_score is undefined for a test fold with fewer than two samples
# and returns NaN there. LOGO folds can contain a single row, which made every
# mean CV score NaN and the grid-search ranking meaningless. The scorer below
# measures each fold's squared error against the variance of the *full*
# target, which is defined for any fold size. Averaged over folds it equals
# the pooled R^2 when all folds have the same size.
_Y_VARIANCE = float(np.nanvar(y.to_numpy(dtype=float)))
if not np.isfinite(_Y_VARIANCE) or _Y_VARIANCE == 0.0:
    raise ValueError(
        f"Target {TARGET_NAME!r} has no usable variance; cannot build an R2 scorer"
    )


def _global_variance_r2(y_true, y_pred, *, y_variance):
    """Return 1 - MSE(y_true, y_pred) / y_variance for one CV fold."""
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return 1.0 - float(np.mean(errors ** 2)) / y_variance


R2 = make_scorer(_global_variance_r2, y_variance=_Y_VARIANCE)
OUT_DIR = Path("logo_models_single")
OUT_DIR.mkdir(exist_ok=True)

print(f"Prepared data for target = {TARGET_NAME!r}")
print(f"n_samples = {len(X)},  n_groups = {len(np.unique(group_id))}")

Prepared data for target = 'Cmax'
n_samples = 139,  n_groups = 75


In [None]:
# Helper = train-summary  +  persistence  +  OOF predictions

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error # Import root_mean_squared_error

def save_result(name: str, gscv):
    """
    Summarise, evaluate and persist one fitted grid search.

    Steps:
      1. read the best parameters and the best CV score from ``gscv``
      2. compute out-of-fold (OOF) predictions with the global LOGO splitter,
         fitting a fresh clone of the best estimator on every training fold
      3. print R^2 and RMSE of the OOF predictions over all samples that have
         both an observed value and a prediction
      4. write
           - the best estimator refitted on the full data  -> .pkl
           - meta-data                                     -> .json
           - observed / predicted value per sample         -> predictions_*.csv
      5. record the run in the global ``RESULTS`` list read by the leaderboard

    Parameters
    ----------
    name : str
        Algorithm label used in the printed output, the file names and the
        leaderboard entry.
    gscv : fitted search object
        Must expose ``best_estimator_``, ``best_score_`` and ``best_params_``.

    Relies on the notebook globals ``X``, ``y``, ``group_id``, ``logo``,
    ``OUT_DIR`` and ``TARGET_NAME``.
    """
    # Local import keeps the helper self-contained within this cell
    from sklearn.base import clone

    best_cv_r2 = gscv.best_score_

    # ------- OOF predictions -------------------------------------
    y_true = y.values                                  # ensure numpy
    preds  = np.full(y_true.shape, np.nan, dtype=float)

    # Each fold gets its own unfitted clone, so the estimator stored in
    # gscv.best_estimator_ is never refitted behind the caller's back.
    for train_idx, test_idx in logo.split(X, y, groups=group_id):
        fold_model = clone(gscv.best_estimator_)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        preds[test_idx] = fold_model.predict(X.iloc[test_idx])

    # ------- OOF metrics -----------------------------------------
    n_nan_pred = np.isnan(preds).sum()
    n_nan_true = np.isnan(y_true).sum()
    if n_nan_pred or n_nan_true:
        print(f"‚ö†Ô∏è  Warning: {n_nan_pred} NaNs in predictions, {n_nan_true} NaNs in true values."
               " OOF R¬≤ / RMSE might be affected or set to NaN.")

    # A sample counts only when both its observation and its prediction are
    # present; the metric functions reject NaN in either array.
    valid_indices = ~np.isnan(preds) & ~np.isnan(y_true)
    if valid_indices.any():
        oof_r2   = r2_score(y_true[valid_indices], preds[valid_indices])
        oof_rmse = root_mean_squared_error(y_true[valid_indices], preds[valid_indices])
    else:
        oof_r2   = np.nan
        oof_rmse = np.nan

    # ------- pretty print ----------------------------------------
    print(f"\n===== {name} summary =====")
    print("Best grid-search params  :", gscv.best_params_)
    print(f"Best LOGO CV R¬≤ (grid)   : {best_cv_r2:.3f}")
    print(f"Full data OOF   R¬≤       : {oof_r2:.3f}")
    print(f"Full data OOF   RMSE     : {oof_rmse:.3f}")

    # ------- save artefacts --------------------------------------
    pkl_path  = OUT_DIR / f"{TARGET_NAME}_{name}.pkl"
    meta_path = OUT_DIR / f"{TARGET_NAME}_{name}.json"
    pred_path = OUT_DIR / f"predictions_{TARGET_NAME}_{name}.csv"

    # The pickled model is the best configuration fitted on the full data
    best_pipe_full_data = clone(gscv.best_estimator_).fit(X, y)
    with open(pkl_path, "wb") as fh:
        pickle.dump(best_pipe_full_data, fh)

    # meta information
    meta = {
        "target"          : TARGET_NAME,
        "algorithm"       : name,
        "best_grid_params": gscv.best_params_,
        "grid_LOGO_R2"    : best_cv_r2,   # score reported by the grid search
        "OOF_R2"          : oof_r2,       # R^2 of the pooled OOF predictions
        "OOF_RMSE"        : oof_rmse,     # RMSE of the pooled OOF predictions
        # Count distinct ids instead of trusting max(): works for any id scheme
        "n_groups"        : int(len(np.unique(group_id)))
    }
    with open(meta_path, "w") as fh:
        json.dump(meta, fh, indent=2)

    # obs / pred per sample
    pd.DataFrame({
        "group_id"  : group_id,
        "observed"  : y_true,
        "predicted" : preds
    }).to_csv(pred_path, index=False)

    print(f"Files saved ‚Üí\n  ‚Ä¢ {pkl_path.name}\n  ‚Ä¢ {meta_path.name}"
          f"\n  ‚Ä¢ {pred_path.name}")

    # ------- leaderboard bookkeeping -----------------------------
    # The entry carries the meta-data, the artefact paths and the OOF R^2
    # under the extra "LOGO CV R¬≤" key.
    entry = meta | {"pkl": str(pkl_path), "pred_csv": str(pred_path), "LOGO CV R¬≤": oof_r2}

    # Create RESULTS on first use (the helper may run before the list has been
    # initialised) and replace any previous entry for the same target and
    # algorithm so that re-running a model cell does not duplicate its row.
    leaderboard = globals().setdefault("RESULTS", [])
    leaderboard[:] = [
        previous for previous in leaderboard
        if (previous.get("target"), previous.get("algorithm")) != (TARGET_NAME, name)
    ]
    leaderboard.append(entry)

In [None]:
# ============================================================
# MLP  (feed-forward neural network)
# ============================================================
from sklearn.pipeline      import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Pipeline: standardise the features, then fit a feed-forward network
PIPE = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", MLPRegressor(early_stopping=True, random_state=42)),
])

# Hyper-parameter grid; the "model__" prefix routes each key to the MLP step
GRID = dict(
    model__hidden_layer_sizes=[(50,)],
    model__alpha=[1e-4, 1e-3],
    model__learning_rate_init=[0.001],
    model__max_iter=[20, 40],
)

print("\nüîπ MLP ‚Äì grid-search starting ‚Ä¶")
# Leave-One-Group-Out search; verbose=2 prints the progress of every fit
gscv = GridSearchCV(PIPE, GRID, scoring=R2, cv=logo, n_jobs=-1, verbose=2)
gscv.fit(X, y, groups=group_id)

print("\n===== MLP summary =====")
print("Best params :", gscv.best_params_)
print(f"LOGO CV R¬≤  : {gscv.best_score_:.3f}")

# NOTE(review): save_result records the run in the global RESULTS list, whose
# initialising cell appears further down the notebook - make sure RESULTS
# exists before this cell is run.
save_result("MLP", gscv)


üîπ MLP ‚Äì grid-search starting ‚Ä¶
Fitting 75 folds for each of 4 candidates, totalling 300 fits





===== MLP summary =====
Best params : {'model__alpha': 0.0001, 'model__hidden_layer_sizes': (50,), 'model__learning_rate_init': 0.001, 'model__max_iter': 20}
LOGO CV R¬≤  : nan

===== MLP summary =====
Best grid-search params  : {'model__alpha': 0.0001, 'model__hidden_layer_sizes': (50,), 'model__learning_rate_init': 0.001, 'model__max_iter': 20}
Best LOGO CV R¬≤ (grid)   : nan
Full data OOF   R¬≤       : -0.740
Full data OOF   RMSE     : 3453.166
Files saved ‚Üí
  ‚Ä¢ Cmax_MLP.pkl
  ‚Ä¢ Cmax_MLP.json
  ‚Ä¢ predictions_Cmax_MLP.csv


In [None]:
# Create the shared leaderboard list only if it does not exist yet.
# This cell sits after the first model cell in the notebook, so an
# unconditional `RESULTS = []` would discard entries recorded before it.
# To start a fresh leaderboard on purpose, call RESULTS.clear().
if "RESULTS" not in globals():
    RESULTS = []

In [None]:
# ============================================================
# Ridge regression #Add LASSO
# ============================================================
from sklearn.pipeline      import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import Ridge
from sklearn.model_selection import GridSearchCV

# Pipeline: standardise the features, then fit a ridge regression
PIPE = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", Ridge()),
])

# Only the regularisation strength of the "model" step is tuned
GRID = dict(model__alpha=[0.001, 0.01, 0.1, 1.0, 10.0])

print("\nüîπ Ridge ‚Äì grid-search starting ‚Ä¶")
# Leave-One-Group-Out search; verbose=2 prints the progress of every fit
gscv = GridSearchCV(PIPE, GRID, scoring=R2, cv=logo, n_jobs=-1, verbose=2)
gscv.fit(X, y, groups=group_id)

# Print the summary, write the artefacts and record the run
save_result("Ridge", gscv)


üîπ Ridge ‚Äì grid-search starting ‚Ä¶
Fitting 75 folds for each of 5 candidates, totalling 375 fits

===== Ridge summary =====
Best grid-search params  : {'model__alpha': 0.001}
Best LOGO CV R¬≤ (grid)   : nan
Full data OOF   R¬≤       : -84748.101
Full data OOF   RMSE     : 762114.352
Files saved ‚Üí
  ‚Ä¢ Cmax_Ridge.pkl
  ‚Ä¢ Cmax_Ridge.json
  ‚Ä¢ predictions_Cmax_Ridge.csv


In [None]:
# ============================================================
# kNN (k-Nearest Neighbors)
# ============================================================
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Pipeline: standardise the features, then fit a k-nearest-neighbours regressor
PIPE = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", KNeighborsRegressor()),
])

# Hyper-parameter grid; the "model__" prefix routes each key to the kNN step
GRID = dict(
    model__n_neighbors=[3, 5],
    model__weights=['uniform', 'distance'],
    model__metric=['euclidean', 'manhattan', 'minkowski'],
)

print("\nüîπ kNN ‚Äì grid-search starting ‚Ä¶")
# Leave-One-Group-Out search; verbose=2 prints the progress of every fit
gscv = GridSearchCV(PIPE, GRID, scoring=R2, cv=logo, n_jobs=-1, verbose=2)
gscv.fit(X, y, groups=group_id)

print("\n===== kNN summary =====")
print("Best params :", gscv.best_params_)
print(f"LOGO CV R¬≤  : {gscv.best_score_:.3f}")

# Print the detailed summary, write the artefacts and record the run
save_result("kNN", gscv)


üîπ kNN ‚Äì grid-search starting ‚Ä¶
Fitting 75 folds for each of 12 candidates, totalling 900 fits

===== kNN summary =====
Best params : {'model__metric': 'euclidean', 'model__n_neighbors': 3, 'model__weights': 'uniform'}
LOGO CV R¬≤  : nan

===== kNN summary =====
Best grid-search params  : {'model__metric': 'euclidean', 'model__n_neighbors': 3, 'model__weights': 'uniform'}
Best LOGO CV R¬≤ (grid)   : nan
Full data OOF   R¬≤       : 0.749
Full data OOF   RMSE     : 1311.499
Files saved ‚Üí
  ‚Ä¢ Cmax_kNN.pkl
  ‚Ä¢ Cmax_kNN.json
  ‚Ä¢ predictions_Cmax_kNN.csv


In [None]:
# ============================================================
# Random-Forest regression
# ============================================================
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler        # scaler is optional
from sklearn.ensemble        import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Pipeline: scaler + random forest. The scaler step is kept only so that this
# pipeline has the same step names as the other model cells.
PIPE = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1)),   # n_jobs=-1: all CPU cores
])

# Hyper-parameter grid; the "model__" prefix routes each key to the forest step
GRID = dict(
    model__n_estimators=[20, 40, 60],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[2, 5],
)

print("\nüîπ Random-Forest ‚Äì grid-search starting ‚Ä¶")
# Leave-One-Group-Out search; verbose=2 prints the progress of every fit
gscv = GridSearchCV(PIPE, GRID, scoring=R2, cv=logo, n_jobs=-1, verbose=2)
gscv.fit(X, y, groups=group_id)

# Print the summary, write the artefacts and record the run
save_result("RandomForest", gscv)


üîπ Random-Forest ‚Äì grid-search starting ‚Ä¶
Fitting 75 folds for each of 18 candidates, totalling 1350 fits

===== RandomForest summary =====
Best grid-search params  : {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 20}
Best LOGO CV R¬≤ (grid)   : nan
Full data OOF   R¬≤       : 0.800
Full data OOF   RMSE     : 1171.937
Files saved ‚Üí
  ‚Ä¢ Cmax_RandomForest.pkl
  ‚Ä¢ Cmax_RandomForest.json
  ‚Ä¢ predictions_Cmax_RandomForest.csv


In [None]:
# ============================================================
# Leaderboard
# ============================================================
import pandas as pd, textwrap, tabulate
import numpy as np # Import numpy for nan check

# Build, print and save the leaderboard from the runs collected in RESULTS.
if not RESULTS:
    print("No models have been run yet.")
else:
    df = pd.DataFrame(RESULTS)

    # Guarantee the columns used below exist; absent ones are filled with NaN
    required_cols = ['algorithm', 'grid_LOGO_R2', 'OOF_R2']
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    # OOF_R2 is the ranking metric, so runs without it cannot be ranked
    df_cleaned = df.dropna(subset=['OOF_R2'])

    if df_cleaned.empty:
        print("No models with valid 'OOF_R2' scores have been run yet.")
    else:
        # Best model first
        df_sorted = df_cleaned.sort_values('OOF_R2', ascending=False)

        # Compact view with both R^2 scores for printing and the markdown file
        df_display = df_sorted[['algorithm', 'grid_LOGO_R2', 'OOF_R2']].copy()
        df_display = df_display.rename(columns={'grid_LOGO_R2': 'Grid LOGO CV R¬≤', 'OOF_R2': 'OOF R¬≤'})

        print("\n================ Leaderboard ================")
        print(tabulate.tabulate(df_display, headers="keys",
                                floatfmt=".3f", showindex=range(1, len(df_display)+1)))

        # Save to disk: full sorted table as CSV, compact view as markdown
        csv_path = OUT_DIR / f"{TARGET_NAME}_leaderboard_OOF_R2.csv"
        md_path  = OUT_DIR / f"{TARGET_NAME}_leaderboard_OOF_R2.md"
        df_sorted.to_csv(csv_path, index=False)
        df_display.to_markdown(md_path, index=False)
        print(f"\nLeaderboard written to\n  ‚Ä¢ {csv_path}\n  ‚Ä¢ {md_path}")

        # df_sorted is non-empty in this branch, so the first row is the
        # champion; the grid score may still be NaN and is printed as 'NaN'.
        champ = df_sorted.iloc[0]
        grid_r2_display = f"{champ['grid_LOGO_R2']:.3f}" if pd.notna(champ['grid_LOGO_R2']) else 'NaN'
        oof_r2_display = f"{champ['OOF_R2']:.3f}" if pd.notna(champ['OOF_R2']) else 'NaN'

        print(f"\nüèÜ Current champion: {champ.algorithm}  (Grid LOGO CV R¬≤ = {grid_r2_display}, OOF R¬≤ = {oof_r2_display})")


    algorithm       Grid LOGO CV R¬≤      OOF R¬≤
--  ------------  -----------------  ----------
 1  RandomForest                nan       0.800
 2  kNN                         nan       0.749
 3  MLP                         nan      -0.740
 4  Ridge                       nan  -84748.101

Leaderboard written to
  ‚Ä¢ logo_models_single/Cmax_leaderboard_OOF_R2.csv
  ‚Ä¢ logo_models_single/Cmax_leaderboard_OOF_R2.md

üèÜ Current champion: RandomForest  (Grid LOGO CV R¬≤ = NaN, OOF R¬≤ = 0.800)


To do: implement more models from scikit-learn and add more hyperparameters. Repeat the same for the manual database.