Load and clean data

In [None]:
import sys
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV, LeaveOneGroupOut

# Project root is taken to be the parent of the current working directory
# (NOTE(review): this presumes the notebook is launched from a sub-folder of
# the project — confirm). It is put at the front of sys.path, unless already
# listed, so the SnowDepth package imports below can be resolved.
ROOT = Path.cwd().parent
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

import SnowDepth.data_loader as DL
import SnowDepth.data_splitter as DS
import SnowDepth.optimal_features as OF

In [7]:
# Random seed reused by the data split and the model searches further down
seed = 18

# Directory holding the input TIFF files
data_dir = ROOT / "data" / "tif_files"

# AOI kept out of model development and reserved as the hold-out set
holdout_aoi = "ID_BS"

# Number of features to request from each feature-selection algorithm
top_k = 10

# Load dataframe
# NOTE(review): the meaning/units of upper_threshold=3 are defined inside
# DL.build_df — confirm against the loader before changing it.
df = DL.build_df(str(data_dir), drop_invalid=True, upper_threshold=3)

# Split by AOI name: every AOI except the hold-out goes to development.
dev_df  = df[df['aoi_name'] != holdout_aoi].copy()
hold_df = df[df['aoi_name'] == holdout_aoi].copy()

# Fail early on a mistyped or missing hold-out AOI; otherwise the notebook
# would carry on with an empty hold-out frame and only break much later.
if hold_df.empty:
    raise ValueError(
        f"Hold-out AOI {holdout_aoi!r} not found in 'aoi_name'; "
        f"available AOIs: {sorted(df['aoi_name'].unique())}"
    )

In [8]:
# Run the feature-selection algorithms on the development AOIs only, so the
# hold-out AOI does not influence which features are chosen.
# top_k comes from the configuration cell (it was previously hard-coded here,
# which made the config value a no-op).
ff_algos = OF.optimal_feature_sets(dev_df, top_k=top_k, n_per_aoi=10000)

# Columns kept alongside the selected features in every subset
base_cols = ["aoi_name", "row", "col", "SD"]

# One development / hold-out frame pair per feature-selection method, each
# restricted to the base columns plus that method's selected features.

# HSIC
dev_df_HSIC  = dev_df[base_cols + ff_algos["HSIC"]].copy()
hold_df_HSIC = hold_df[base_cols + ff_algos["HSIC"]].copy()

# PCC
dev_df_PCC  = dev_df[base_cols + ff_algos["PCC"]].copy()
hold_df_PCC = hold_df[base_cols + ff_algos["PCC"]].copy()

# MI
dev_df_MI  = dev_df[base_cols + ff_algos["MI"]].copy()
hold_df_MI = hold_df[base_cols + ff_algos["MI"]].copy()


Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.
HSIC selected: ['IAFE', 'Gamma_VH_RTC', 'cos_Aspect', 'Gamma_VV_RTC', 'Beta_ratio', 'Slope', 'LIA', 'Gamma_RTC_ratio', 'Beta_VH', 'sin_Aspect']
PCC selected: ['IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Elevation', 'Beta_VH', 'Slope', 'LIA']
MI selected): ['IAFE', 'Elevation', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Gamma_RTC_sum', 'cos_Aspect', 'Beta_VH', 'Slope', 'Gamma_ratio']


Split data for training XGBoost

In [9]:
# Settings shared by all three splits: same seed and the same per-AOI pixel
# budget, so the feature sets are compared on equally sized samples.
split_kwargs = {"seed": seed, "pxs_per_aoi": 10000}

# HSIC
(X_dev_HSIC, y_dev_HSIC, groups_HSIC,
 X_hold_HSIC, y_hold_HSIC) = DS.ML_split(
    dev_df=dev_df_HSIC, hold_df=hold_df_HSIC, **split_kwargs
)

# PCC
(X_dev_PCC, y_dev_PCC, groups_PCC,
 X_hold_PCC, y_hold_PCC) = DS.ML_split(
    dev_df=dev_df_PCC, hold_df=hold_df_PCC, **split_kwargs
)

# MI
(X_dev_MI, y_dev_MI, groups_MI,
 X_hold_MI, y_hold_MI) = DS.ML_split(
    dev_df=dev_df_MI, hold_df=hold_df_MI, **split_kwargs
)


Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'Gamma_VH_RTC', 'cos_Aspect', 'Gamma_VV_RTC', 'Beta_ratio', 'Slope', 'LIA', 'Gamma_RTC_ratio', 'Beta_VH', 'sin_Aspect']
X_dev shape: (50000, 10)
X_hold shape: (1655811, 10)
Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Elevation', 'Beta_VH', 'Slope', 'LIA']
X_dev shape: (50000, 8)
X_hold shape: (1655811, 8)
Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'Elevation', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Gamma_RTC_sum', 'cos_Aspect', 'Beta_VH', 'Slope', 'Gamma_ratio']
X_dev shape: (50000, 9)
X_hold shape: (1655811, 9)


Train XGBoost and tune hyperparameters

In [10]:
# XGBoost with HSIC feature set

# Base regressor. Each fit is single-threaded (n_jobs=1) while the search
# below uses n_jobs=-1 — presumably so the parallelism happens across CV fits
# rather than inside each one.
xgb = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=seed,
    n_jobs=1,
)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[400, 600, 800, 1000],
    learning_rate=[0.03, 0.05, 0.07, 0.1, 0.15],
    max_depth=[4, 5, 6, 8, 10],
    min_child_weight=[1, 2, 4, 6, 8, 12, 16],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    reg_lambda=[0, 0.5, 1, 2, 5, 10],
    reg_alpha=[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
    max_bin=[256, 512],
)

# One CV fold per group passed to fit() (here: groups_HSIC)
logo = LeaveOneGroupOut()

# 30 random candidates, scored by negative RMSE (higher is better)
HSIC_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=logo,
    random_state=seed,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the HSIC development data
HSIC_search.fit(X_dev_HSIC, y_dev_HSIC, groups=groups_HSIC)

# Report the winner; the score is negated to turn it back into a plain RMSE
print("Results - XGBoost with HSIC feature set")
print("HSIC — Best hyperparameters:", HSIC_search.best_params_)
print("HSIC — Best CV RMSE:", -HSIC_search.best_score_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Results - XGBoost with HSIC feature set
HSIC — Best hyperparameters: {'subsample': 0.9, 'reg_lambda': 0.5, 'reg_alpha': 1, 'n_estimators': 600, 'min_child_weight': 2, 'max_depth': 10, 'max_bin': 512, 'learning_rate': 0.03, 'colsample_bytree': 0.7}
HSIC — Best CV RMSE: 0.5146892786026


In [11]:
# XGBoost with PCC feature set

# Base regressor. Each fit is single-threaded (n_jobs=1) while the search
# below uses n_jobs=-1 — presumably so the parallelism happens across CV fits
# rather than inside each one.
xgb = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=seed,
    n_jobs=1,
)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[400, 600, 800, 1000],
    learning_rate=[0.03, 0.05, 0.07, 0.1, 0.15],
    max_depth=[4, 5, 6, 8, 10],
    min_child_weight=[1, 2, 4, 6, 8, 12, 16],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    reg_lambda=[0, 0.5, 1, 2, 5, 10],
    reg_alpha=[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
    max_bin=[256, 512],
)

# One CV fold per group passed to fit() (here: groups_PCC)
logo = LeaveOneGroupOut()

# 30 random candidates, scored by negative RMSE (higher is better)
PCC_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=logo,
    random_state=seed,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the PCC development data
PCC_search.fit(X_dev_PCC, y_dev_PCC, groups=groups_PCC)

# Report the winner; the score is negated to turn it back into a plain RMSE
print("Results - XGBoost with PCC feature set")
print("PCC — Best hyperparameters:", PCC_search.best_params_)
print("PCC — Best CV RMSE:", -PCC_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Results - XGBoost with PCC feature set
PCC — Best hyperparameters: {'subsample': 1.0, 'reg_lambda': 10, 'reg_alpha': 1, 'n_estimators': 1000, 'min_child_weight': 2, 'max_depth': 6, 'max_bin': 256, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
PCC — Best CV RMSE: 0.5190581262111664


In [12]:
# PCC results recap (no refit)
#
# The PCC randomized search is already run by the preceding cell with the same
# data, estimator, parameter candidates, CV splitter and seed. Repeating it
# here only spends another 150 fits to reproduce the identical best parameters
# and score, so this cell just reports the already-fitted PCC_search.
print("Results - XGBoost with PCC feature set")
print("PCC — Best hyperparameters:", PCC_search.best_params_)
# best_score_ is a negative RMSE; negate it to print a plain RMSE
print("PCC — Best CV RMSE:", -PCC_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Results - XGBoost with PCC feature set
PCC — Best hyperparameters: {'subsample': 1.0, 'reg_lambda': 10, 'reg_alpha': 1, 'n_estimators': 1000, 'min_child_weight': 2, 'max_depth': 6, 'max_bin': 256, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
PCC — Best CV RMSE: 0.5190581262111664


In [1]:
# XGBoost with MI feature set

# Base regressor. Each fit is single-threaded (n_jobs=1) while the search
# below uses n_jobs=-1 — presumably so the parallelism happens across CV fits
# rather than inside each one.
xgb = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=seed,
    n_jobs=1,
)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[400, 600, 800, 1000],
    learning_rate=[0.03, 0.05, 0.07, 0.1, 0.15],
    max_depth=[4, 5, 6, 8, 10],
    min_child_weight=[1, 2, 4, 6, 8, 12, 16],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
    reg_lambda=[0, 0.5, 1, 2, 5, 10],
    reg_alpha=[0, 1e-4, 1e-3, 1e-2, 0.1, 1],
    max_bin=[256, 512],
)

# One CV fold per group passed to fit() (here: groups_MI)
logo = LeaveOneGroupOut()

# 30 random candidates, scored by negative RMSE (higher is better)
MI_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=logo,
    random_state=seed,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the MI development data
MI_search.fit(X_dev_MI, y_dev_MI, groups=groups_MI)

# Report the winner; the score is negated to turn it back into a plain RMSE
print("Results - XGBoost with MI feature set")
print("MI — Best hyperparameters:", MI_search.best_params_)
print("MI — Best CV RMSE:", -MI_search.best_score_)

NameError: name 'XGBRegressor' is not defined

In [None]:
# Compare the tuned searches and report the best-performing feature set

# Map each feature-selection method to (CV RMSE, best hyperparameters).
# best_score_ is negated so that the stored value is a plain RMSE.
xgb_results = {
    method: (-search.best_score_, search.best_params_)
    for method, search in (
        ("HSIC", HSIC_search),
        ("PCC", PCC_search),
        ("MI", MI_search),
    )
}

print("\nCross-validation RMSE results (XGBoost):")
for method, result in xgb_results.items():
    cv_rmse, tuned_params = result
    print(f"\n{method} — CV RMSE: {cv_rmse:.4f}")
    print(f"{method} — Best hyperparameters: {tuned_params}")

# The winner is the entry with the lowest CV RMSE
best_xgb_method, (best_xgb_rmse, best_xgb_params) = min(
    xgb_results.items(), key=lambda item: item[1][0]
)

print(f"\n🏆 Best feature set with XGBoost: {best_xgb_method} (CV RMSE = {best_xgb_rmse:.4f})")
print(f"Best XGB hyperparameters: {best_xgb_params}")
