In [1]:
import sys
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, LeaveOneGroupOut
import time

# Project root is the parent of the current working directory; put it at the
# front of the import path (once) so the SnowDepth package can be imported.
ROOT = Path.cwd().parent
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path[:0] = [str(ROOT)]

import SnowDepth.data_loader as DL
import SnowDepth.data_splitter as DS
import SnowDepth.optimal_features as OF
from SnowDepth.config import HOLDOUT_AOI
from SnowDepth.config import SEED

In [2]:
# Seed taken from the project config
seed = SEED

# Location of the TIFF files, relative to the project root
data_dir = ROOT.joinpath("data", "tif_files")

# AOI that is excluded from the development data
holdout_aoi = HOLDOUT_AOI

# Maximum number of features the feature-filtering algorithms may select
top_k = 10

# Build the full dataframe from the TIFF directory
df = DL.build_df(str(data_dir), drop_invalid=True, upper_threshold=3)

# Development dataframe (every row whose AOI is not the holdout) used for training models
dev_df = df.loc[df["aoi_name"].ne(holdout_aoi)].copy()

In [3]:
# Run the feature-filtering algorithms on the development data.
# `top_k` comes from the configuration cell instead of a repeated literal, so
# changing the configured value actually changes the selection.
ff_algos = OF.optimal_feature_sets(dev_df, top_k=top_k, n_per_aoi=20000)

# Columns kept in every per-method dataframe regardless of the selected features
base_cols = ["aoi_name", "row", "col", "SD"]

# One development dataframe per feature-selection method: the base columns
# plus the features that method selected.
# HSIC
dev_df_HSIC = dev_df[base_cols + ff_algos["HSIC"]].copy()
# PCC
dev_df_PCC = dev_df[base_cols + ff_algos["PCC"]].copy()
# MI
dev_df_MI = dev_df[base_cols + ff_algos["MI"]].copy()

Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.
HSIC selected: ['Veg_height', 'IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Sigma_ratio', 'Beta_VH', 'Gamma_RTC_ratio', 'Slope', 'LIA']
PCC selected: ['Veg_height', 'IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'sin_Aspect', 'Gamma_RTC_ratio', 'Beta_VH']
MI selected): ['IAFE', 'Elevation', 'Veg_height', 'cos_Aspect', 'Gamma_RTC_sum', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Slope', 'Beta_VH']


Split data for training Random Forest

In [4]:
# Every feature set is split with the same per-AOI pixel budget
ml_split_kwargs = {"pxs_per_aoi": 10000}

# HSIC
X_dev_HSIC, y_dev_HSIC, groups_HSIC = DS.ML_split(dev_df=dev_df_HSIC, **ml_split_kwargs)
# PCC
X_dev_PCC, y_dev_PCC, groups_PCC = DS.ML_split(dev_df=dev_df_PCC, **ml_split_kwargs)
# MI
X_dev_MI, y_dev_MI, groups_MI = DS.ML_split(dev_df=dev_df_MI, **ml_split_kwargs)

Total samples: 50000 across 5 AOIs
Features used: ['Veg_height', 'IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Sigma_ratio', 'Beta_VH', 'Gamma_RTC_ratio', 'Slope', 'LIA']
X_dev shape: (50000, 10)
Total samples: 50000 across 5 AOIs
Features used: ['Veg_height', 'IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'sin_Aspect', 'Gamma_RTC_ratio', 'Beta_VH']
X_dev shape: (50000, 7)
Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'Elevation', 'Veg_height', 'cos_Aspect', 'Gamma_RTC_sum', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Slope', 'Beta_VH']
X_dev shape: (50000, 9)


Train RF and tune hyperparameters

In [5]:

# Hyperparameter search space sampled by the randomized search.
# Integer entries are absolute values; float entries are fractions
# (scikit-learn accepts both for max_features and min_samples_leaf).
param_dist = dict(
    n_estimators=[200, 300, 400, 600],
    max_depth=[12, 14, 16, 18],
    max_features=["sqrt", 0.3, 0.5, 3],
    min_samples_leaf=[2, 5, 10, 20, 50, 0.005, 0.01],
    max_samples=[0.3, 0.5],
)

def run_rf_search(tag, X, y, groups, seed, param_distributions=None, n_iter=30):
    """Tune a RandomForestRegressor with a leave-one-group-out randomized search.

    Parameters
    ----------
    tag : str
        Label of the feature set; used in the printed report and returned unchanged.
    X, y
        Features and target, passed straight to ``RandomizedSearchCV.fit``.
    groups
        Group label per sample; ``LeaveOneGroupOut`` holds out one group per fold.
    seed
        Value used as ``random_state`` for both the forest and the parameter sampler.
    param_distributions : dict, optional
        Search space. When omitted, the module-level ``param_dist`` is used.
    n_iter : int, default 30
        Number of parameter settings sampled by the search.

    Returns
    -------
    tuple
        ``(tag, best_rmse, best_params, elapsed)`` where ``best_rmse`` is the
        best mean cross-validated RMSE, ``best_params`` the corresponding
        hyperparameters and ``elapsed`` the search duration in minutes.
    """
    if param_distributions is None:
        param_distributions = param_dist

    # NOTE(review): both the forest and the search request all cores
    # (n_jobs=-1); confirm this nesting does not oversubscribe the machine.
    rf = RandomForestRegressor(
        random_state=seed,
        bootstrap=True,
        n_jobs=-1
    )
    logo = LeaveOneGroupOut()
    search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=logo,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        verbose=2,
        random_state=seed,
        # Only best_score_ / best_params_ are read below and the search object
        # is discarded, so skip the final refit on the full data.
        refit=False,
    )

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during a multi-minute search.
    start = time.perf_counter()
    search.fit(X, y, groups=groups)
    elapsed = (time.perf_counter() - start) / 60.0  # minutes

    # The scorer is negated RMSE (higher is better); flip the sign back.
    best_rmse = -search.best_score_
    best_params = search.best_params_

    print(f"\nResults - RF with {tag} feature set")
    print(f"Best hyperparameters: {best_params}")
    print(f"Best CV RMSE: {best_rmse:.4f}")
    print(f"Training time: {elapsed:.2f} minutes")

    return tag, best_rmse, best_params, elapsed


# Evaluate every feature-selection variant with the same search routine.
# Dict insertion order fixes the run order: HSIC, then PCC, then MI.
feature_sets = {
    "HSIC": (X_dev_HSIC, y_dev_HSIC, groups_HSIC),
    "PCC": (X_dev_PCC, y_dev_PCC, groups_PCC),
    "MI": (X_dev_MI, y_dev_MI, groups_MI),
}

# tag -> (best CV RMSE, best hyperparameters) and tag -> search time in minutes
rf_results, timings = {}, {}

for fs_name, (X_fs, y_fs, groups_fs) in feature_sets.items():
    result_tag, cv_rmse, best_cfg, minutes = run_rf_search(fs_name, X_fs, y_fs, groups_fs, seed)
    rf_results[result_tag] = (cv_rmse, best_cfg)
    timings[result_tag] = minutes


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Results - RF with HSIC feature set
Best hyperparameters: {'n_estimators': 400, 'min_samples_leaf': 20, 'max_samples': 0.5, 'max_features': 0.3, 'max_depth': 16}
Best CV RMSE: 0.4692
Training time: 5.43 minutes
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Results - RF with PCC feature set
Best hyperparameters: {'n_estimators': 200, 'min_samples_leaf': 5, 'max_samples': 0.5, 'max_features': 0.3, 'max_depth': 18}
Best CV RMSE: 0.4761
Training time: 4.74 minutes
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Results - RF with MI feature set
Best hyperparameters: {'n_estimators': 300, 'min_samples_leaf': 2, 'max_samples': 0.3, 'max_features': 3, 'max_depth': 16}
Best CV RMSE: 0.4700
Training time: 4.93 minutes


In [6]:
# Leaderboard: one three-line report per feature set, in insertion order
print("\nCross-validation RMSE results (Random Forest):")
for name in rf_results:
    rmse, params = rf_results[name]
    print(
        f"\n{name} — CV RMSE: {rmse:.4f}",
        f"{name} — Best hyperparameters: {params}",
        f"{name} — Training time: {timings[name]:.2f} minutes",
        sep="\n",
    )

# Overall winner: the entry with the lowest CV RMSE (first one wins on ties)
best_rf_method, (best_rf_rmse, best_rf_params) = min(
    rf_results.items(), key=lambda item: item[1][0]
)

print(f"\n🏆 Best feature set with RF: {best_rf_method} "
      f"(CV RMSE = {best_rf_rmse:.4f}, time = {timings[best_rf_method]:.2f} min)")
print(f"Best RF hyperparameters: {best_rf_params}")


Cross-validation RMSE results (Random Forest):

HSIC — CV RMSE: 0.4692
HSIC — Best hyperparameters: {'n_estimators': 400, 'min_samples_leaf': 20, 'max_samples': 0.5, 'max_features': 0.3, 'max_depth': 16}
HSIC — Training time: 5.43 minutes

PCC — CV RMSE: 0.4761
PCC — Best hyperparameters: {'n_estimators': 200, 'min_samples_leaf': 5, 'max_samples': 0.5, 'max_features': 0.3, 'max_depth': 18}
PCC — Training time: 4.74 minutes

MI — CV RMSE: 0.4700
MI — Best hyperparameters: {'n_estimators': 300, 'min_samples_leaf': 2, 'max_samples': 0.3, 'max_features': 3, 'max_depth': 16}
MI — Training time: 4.93 minutes

🏆 Best feature set with RF: HSIC (CV RMSE = 0.4692, time = 5.43 min)
Best RF hyperparameters: {'n_estimators': 400, 'min_samples_leaf': 20, 'max_samples': 0.5, 'max_features': 0.3, 'max_depth': 16}
