In [None]:
import sys
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, LeaveOneGroupOut
from sklearn.metrics import mean_squared_error

# Put the parent of the current working directory on sys.path (once) so that
# sibling project directories can be imported from this notebook.
ROOT = Path.cwd().parent
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    # Prepend so the project copy takes precedence over other sys.path entries.
    sys.path.insert(0, _root_entry)

import SnowDepth.data_loader as DL
import SnowDepth.data_splitter as DS
import SnowDepth.optimal_features as OF

In [2]:
# Random seed reused by the splitting and model-fitting cells below
seed = 18

# Path to TIFF files
data_dir = ROOT/"data"/"tif_files"

# AOI that is excluded from development and kept as the hold-out set
holdout_aoi = "ID_BS"

# Number of features to request from each feature-selection algorithm
top_k = 10

# Load dataframe
# NOTE(review): the meaning of drop_invalid / upper_threshold is defined in
# SnowDepth.data_loader — confirm there before changing these values.
df = DL.build_df(str(data_dir), drop_invalid=True, upper_threshold=3)

# Partition rows by AOI name: everything except the hold-out AOI is used for
# development; the hold-out AOI is kept aside.
dev_df = df[df['aoi_name'] != holdout_aoi].copy()
hold_df = df[df['aoi_name'] == holdout_aoi].copy()

# Fail fast on a mistyped / missing AOI name: otherwise hold_df would be
# silently empty and every AOI would end up in the development set.
if hold_df.empty:
    raise ValueError(
        f"holdout_aoi={holdout_aoi!r} matches no rows; "
        f"available AOIs: {sorted(df['aoi_name'].dropna().unique())}"
    )
if dev_df.empty:
    raise ValueError(
        f"No development rows remain after removing holdout_aoi={holdout_aoi!r}"
    )

In [4]:
# Run the feature-selection algorithms on the development rows only, so the
# hold-out AOI does not take part in choosing features.
# Use the configured `top_k` rather than a hard-coded literal so that changing
# the config cell actually changes the number of requested features.
ff_algos = OF.optimal_feature_sets(dev_df, top_k=top_k, n_per_aoi=10000)

# Columns kept alongside the selected features in every subset
base_cols = ["aoi_name", "row", "col", "SD"]

# HSIC
dev_df_HSIC = dev_df[base_cols + ff_algos["HSIC"]].copy()
hold_df_HSIC = hold_df[base_cols + ff_algos["HSIC"]].copy()

# PCC
dev_df_PCC = dev_df[base_cols + ff_algos["PCC"]].copy()
hold_df_PCC = hold_df[base_cols + ff_algos["PCC"]].copy()

# MI
dev_df_MI = dev_df[base_cols + ff_algos["MI"]].copy()
hold_df_MI = hold_df[base_cols + ff_algos["MI"]].copy()


Block HSIC Lasso B = 20.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.
HSIC (top 10): ['IAFE', 'Gamma_VH_RTC', 'cos_Aspect', 'Gamma_VV_RTC', 'Beta_ratio', 'Slope', 'LIA', 'Gamma_RTC_ratio', 'Beta_VH', 'sin_Aspect']
PCC  (top 10): ['IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Elevation', 'Beta_VH', 'Slope', 'LIA']
MI   (top 10): ['IAFE', 'Elevation', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Gamma_RTC_sum', 'cos_Aspect', 'Beta_VH', 'Slope', 'Gamma_ratio']


In [5]:
# Keyword arguments shared by all three splits, so every feature set is split
# with the same seed and the same per-AOI pixel budget.
split_kwargs = dict(seed=seed, pxs_per_aoi=10000)

# HSIC feature set
(X_dev_HSIC, y_dev_HSIC, groups_HSIC,
 X_hold_HSIC, y_hold_HSIC) = DS.RF_split(
    dev_df=dev_df_HSIC, hold_df=hold_df_HSIC, **split_kwargs
)

# PCC feature set
(X_dev_PCC, y_dev_PCC, groups_PCC,
 X_hold_PCC, y_hold_PCC) = DS.RF_split(
    dev_df=dev_df_PCC, hold_df=hold_df_PCC, **split_kwargs
)

# MI feature set
(X_dev_MI, y_dev_MI, groups_MI,
 X_hold_MI, y_hold_MI) = DS.RF_split(
    dev_df=dev_df_MI, hold_df=hold_df_MI, **split_kwargs
)


Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'Gamma_VH_RTC', 'cos_Aspect', 'Gamma_VV_RTC', 'Beta_ratio', 'Slope', 'LIA', 'Gamma_RTC_ratio', 'Beta_VH', 'sin_Aspect']
X_dev shape: (50000, 10)
X_hold shape: (1655811, 10)
Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'cos_Aspect', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Elevation', 'Beta_VH', 'Slope', 'LIA']
X_dev shape: (50000, 8)
X_hold shape: (1655811, 8)
Total samples: 50000 across 5 AOIs
Features used: ['IAFE', 'Elevation', 'Gamma_VH_RTC', 'Gamma_VV_RTC', 'Gamma_RTC_sum', 'cos_Aspect', 'Beta_VH', 'Slope', 'Gamma_ratio']
X_dev shape: (50000, 9)
X_hold shape: (1655811, 9)


Train RF and tune hyperparameters

In [None]:
''' RF with features from HSIC '''

# Base estimator; the hyperparameters listed in param_dist are overridden by
# the search for each sampled candidate.
rf = RandomForestRegressor(
    random_state=seed,
    bootstrap=True,
    n_jobs=-1
)

# Hyperparameter search space (float entries for min_samples_leaf /
# max_features / max_samples are fractions, integer entries are counts)
param_dist = {
    "n_estimators": [200, 300, 400, 600],
    "max_depth": [12, 14, 16, 18],
    "max_features": ["sqrt", 0.3, 0.5, 3],
    "min_samples_leaf": [2, 5, 10, 20, 50, 0.005, 0.01],
    "max_samples": [0.3, 0.5],
}

# One CV fold per distinct value in `groups`: each group is held out in turn
logo = LeaveOneGroupOut()

# Randomized search over 30 sampled candidates
HSIC_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    cv=logo,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
    random_state=seed
)

# Fit
HSIC_search.fit(X_dev_HSIC, y_dev_HSIC, groups=groups_HSIC)

# Results (the scorer is negated RMSE, so flip the sign for reporting)
print("Results - RF with HSIC feature set")
print("HSIC — Best hyperparameters:", HSIC_search.best_params_)
print("HSIC — Best CV RMSE:", -HSIC_search.best_score_)


Fitting 5 folds for each of 40 candidates, totalling 200 fits
HSIC — Best hyperparameters: {'n_estimators': 300, 'min_samples_leaf': 50, 'max_samples': 0.5, 'max_features': 3, 'max_depth': 18}
HSIC — Best CV RMSE: 0.48799733352329416
HSIC — Hold-out RMSE: 0.4976368738047478




In [None]:
''' RF with features from PCC '''

# Splitter: one CV fold per distinct value in `groups`
logo = LeaveOneGroupOut()

# Base forest; the searched hyperparameters are set per candidate by the search
rf = RandomForestRegressor(bootstrap=True, n_jobs=-1, random_state=seed)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[200, 300, 400, 600],
    max_depth=[12, 14, 16, 18],
    max_features=["sqrt", 0.3, 0.5, 3],
    min_samples_leaf=[2, 5, 10, 20, 50, 0.005, 0.01],
    max_samples=[0.3, 0.5],
)

# Draw 30 candidates and score each by (negated) RMSE on every fold
PCC_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=logo,
    random_state=seed,
    n_jobs=-1,
    verbose=2,
)
PCC_search.fit(X_dev_PCC, y_dev_PCC, groups=groups_PCC)

# Report the winner; best_score_ is negative RMSE, hence the sign flip
print("Results - RF with PCC feature set")
print("PCC — Best hyperparameters:", PCC_search.best_params_)
print("PCC — Best CV RMSE:", -PCC_search.best_score_)


In [None]:
''' RF with features from MI '''

# Estimator whose hyperparameters are tuned below
rf = RandomForestRegressor(n_jobs=-1, bootstrap=True, random_state=seed)

# Search space for the randomized search
param_dist = {
    "n_estimators": [200, 300, 400, 600],
    "max_depth": [12, 14, 16, 18],
    "max_features": ["sqrt", 0.3, 0.5, 3],
    "min_samples_leaf": [2, 5, 10, 20, 50, 0.005, 0.01],
    "max_samples": [0.3, 0.5],
}

# Cross-validation that holds out one group (as given by `groups`) per fold
logo = LeaveOneGroupOut()

# Search settings collected in one place, then unpacked into the constructor
search_settings = {
    "estimator": rf,
    "param_distributions": param_dist,
    "n_iter": 30,
    "cv": logo,
    "scoring": "neg_root_mean_squared_error",
    "n_jobs": -1,
    "verbose": 2,
    "random_state": seed,
}
MI_search = RandomizedSearchCV(**search_settings)

# Fit on the MI-selected development features
MI_search.fit(X_dev_MI, y_dev_MI, groups=groups_MI)

# Results: the scorer returns negative RMSE, so negate it for display
best_rmse = -MI_search.best_score_
print("Results - RF with MI feature set")
print("Best hyperparameters:", MI_search.best_params_)
print("Best CV RMSE:", best_rmse)
