### 0. Libraries and global setup
The code requires scikit-learn version 1.1 or later and uses a fixed random seed.

In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import (mean_absolute_error, mean_squared_error, median_absolute_error, r2_score)

from sklearn.model_selection import (GridSearchCV, GroupShuffleSplit, ParameterGrid, StratifiedGroupKFold)
from sklearn.base import clone

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from pathlib import Path

RND = 42  # fixed seed shared by the outer split, the CV splitter and the model

# Paths are resolved against the current working directory: the project root
# is taken to be its parent directory.
PROJECT_ROOT = Path("..").resolve()
DATA_PROCESSED = PROJECT_ROOT / "data_processed"

# Feature table. It is not read here: the "Load and basic wrangling" step
# loads it, so reading it in this cell as well would only parse the file twice.
FEATURE_FILE = DATA_PROCESSED / "mamun_HER_features.csv"

### 1. Load and basic wrangling

In [None]:
# dataset load; the energy column is renamed to the short target name used below
df = pd.read_csv(FEATURE_FILE).rename(columns={"reactionEnergy_eV": "Eads"})

# mark Au-containing rows: True when either surface species equals "Au"
# (vectorised comparison instead of a Python-level row-by-row apply)
df["is_Au"] = df[["surf_A", "surf_B"]].eq("Au").any(axis=1)

# target and features definition
target = "Eads"
feature_cols = ["facet", "SA", "SB", "GCN", "WAR", "WEN", "WIE", "Psi", "vol_per_atom"]
num_cols = feature_cols[:]  # independent copy; all features are treated as numeric in this pipeline

# group ID definitions for "systems" (order-invariant surf pair + facet + site):
# the two species names are sorted before joining, so A-B and B-A map to the same ID
surf_pair = df[["surf_A", "surf_B"]].astype(str).apply(lambda pair: "-".join(sorted(pair)), axis=1)
sys_id = surf_pair + "_" + df["facet"].astype(str) + "_" + df["site_simple_collapsed"].astype(str)
# sys_id is used for both the outer train/test split and CV


### 2. Data split
Group-aware 80/20 outer train/test split by system; the share of Au-containing systems in the train and test sets is emergent (it is not controlled by the split).

In [None]:
# Single group-aware shuffle split: all rows sharing a sys_id end up on the same side
gss = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=RND)
tr_idx, te_idx = next(gss.split(df, groups=sys_id))

# Positional row selection; the copies are independent of df
train_df, test_df = df.iloc[tr_idx].copy(), df.iloc[te_idx].copy()

# Feature matrices and target vectors for both sides of the split
X_train = train_df[feature_cols]
y_train = train_df[target]
X_test = test_df[feature_cols]
y_test = test_df[target]

# group IDs for cross-validation on the training set, looked up by the training row labels
groups_train = sys_id.loc[X_train.index].values

### 3. Pre-processing

In [None]:
# Min-max scaling applied to the numeric feature columns (optional for Extra Trees)
pre = ColumnTransformer(
    transformers=[("num", MinMaxScaler(), num_cols)],
)

### 4. ExtraTrees Regressor
Extra Trees Regressor with the hyperparameter grid.

In [None]:
extr = ExtraTreesRegressor(random_state=RND)

# Hyperparameter grid. Keys carry the "extr__" prefix so that the search routes
# them to the "extr" step of the pipeline defined below.
param_grid = {
    "extr__n_estimators": [1000],
    # Every integer from 1 to the number of features. "sqrt" is deliberately not
    # listed: scikit-learn resolves it to int(sqrt(n_features)), which is always
    # one of these integers, so with the fixed random_state it would only refit
    # a candidate that is already in the grid.
    "extr__max_features": list(range(1, len(feature_cols) + 1)),
    # NOTE(review): these depth caps may all behave like "unlimited" if the trees
    # never grow that deep on this dataset - confirm before trimming the list.
    "extr__max_depth": [100, 300, 600, 900],
    "extr__min_samples_split": [2, 3, 4],
}

# Pre-processing followed by the regressor, tuned as one estimator
pipe = Pipeline([("pre", pre), ("extr", extr)])

### 5. Cross-validation

In [None]:
# 0/1 Au flag of every training row; used only to stratify the folds
strat_labels = train_df["is_Au"].astype(int).values
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=RND)

# Materialise the folds up front: the splitter stratifies on is_Au, while the
# search below is fitted on y_train; the stored folds can also be reused later.
cv = [*sgkf.split(X_train, y=strat_labels, groups=groups_train)]

# Exhaustive search over param_grid, scored by (negated) mean absolute error
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

### 6. Evaluation

In [None]:
# Predictions on the held-out test set from the best pipeline found by the search
y_pred = grid.best_estimator_.predict(X_test)

# Boolean row masks that split the test set into Au and non-Au rows
mask_au = np.asarray(test_df["is_Au"], dtype=bool)
mask_non_au = np.logical_not(mask_au)

def show_metrics(label, y_true, y_hat):
    """Print MAE, RMSE, R^2 and median absolute error for one subset of rows.

    Args:
        label: Text shown at the start of the printed line.
        y_true: Observed target values (array-like).
        y_hat: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        None. The metrics are printed as a single line.
    """
    # An empty subset (e.g. no Au rows ended up in the test set) has no defined
    # metrics; report that instead of letting the metric functions raise.
    if len(y_true) == 0:
        print(f"{label:12s} | no rows in this subset; metrics are undefined")
        return

    mae = mean_absolute_error(y_true, y_hat)
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    r2 = r2_score(y_true, y_hat)
    mad = median_absolute_error(y_true, y_hat)  # printed under the "MAD" label
    # "\u00b2" is the superscript two; the escape keeps the label intact even if
    # the source file is re-encoded.
    print(f"{label:12s} | MAE: {mae:.4f}  RMSE: {rmse:.4f}  R\u00b2: {r2:.4f}  MAD: {mad:.4f}")

# Size of the test set and how many of its rows are Au / non-Au
print(
    f"\nGroup-aware test set: {len(test_df)} rows "
    f"({mask_au.sum()} Au, {mask_non_au.sum()} non-Au)"
)

# Same metric report for the full test set and for each subset
for subset_label, subset_rows in (
    ("All rows", slice(None)),
    ("Au only", mask_au),
    ("non-Au only", mask_non_au),
):
    show_metrics(subset_label, y_test[subset_rows], y_pred[subset_rows])


### 7. Per-fold metrics

In [None]:
# 0/1 Au flag of every training row, used to pick the Au rows of each validation fold
strat_labels = train_df["is_Au"].astype(int).values

rows = []
for fold_no, (fit_idx, val_idx) in enumerate(cv, start=1):  # reuse precomputed SGKF splits
    # Unfitted copy of the best pipeline, trained on this fold's training part only
    fold_model = clone(grid.best_estimator_)
    fold_model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

    val_true = y_train.iloc[val_idx]
    val_pred = fold_model.predict(X_train.iloc[val_idx])

    # MAE restricted to the Au rows of the validation part; NaN when there are none
    is_au = strat_labels[val_idx] == 1
    if is_au.any():
        fold_mae_au = mean_absolute_error(val_true[is_au], val_pred[is_au])
    else:
        fold_mae_au = np.nan

    rows.append(
        {
            "Fold": fold_no,
            "MAE_all": mean_absolute_error(val_true, val_pred),
            "MAE_Au": fold_mae_au,
        }
    )

# One row per fold, followed by the averages (folds without Au rows are skipped in the Au mean)
per_fold = pd.DataFrame(rows, columns=["Fold", "MAE_all", "MAE_Au"])
print(per_fold.to_string(index=False))
print("Mean MAE_all:", per_fold["MAE_all"].mean())
print("Mean MAE_Au:", np.nanmean(per_fold["MAE_Au"]))