Modelado (paramétrico, no paramétrico y ML) para estimar parámetros de la vegetación a partir de variables SAR

**Objetivo (Y)**: estimar parámetros biofísicos de vegetación medidos en campo (biomasa, contenido de agua, índice de biomasa por altura IBA).

**Predictores (X)**: derivados de SAR Sentinel‑1

# 1- Carga y preparación de datos

In [1]:
from utils.model_prep import load_dataset

# Load the processed dataset and preview its first rows.
data_path = "../../data/processed/SARGrass.csv"
df = load_dataset(data_path)
df.head()

Unnamed: 0,id_point,datetime,VWC,total_biomass,IBA,Sigma0_RATIO_VH_VV,Gamma0_RATIO_VH_VV,PC4,Alpha
0,G1-1,2025-02-19 11:11:00,315.945035,1263.78014,1853.050059,0.809757,0.800494,1.222716,25.555786
1,G1-13,2025-02-19 11:48:00,166.89669,667.58676,755.188643,0.809795,0.80054,1.29496,30.238604
2,G1-17,2025-02-19 11:59:00,152.69669,610.78676,690.935249,0.809814,0.800561,1.309476,28.057308
3,G1-25,2025-02-19 12:17:00,188.19669,752.78676,865.272138,0.809852,0.800572,1.36153,22.238356
4,G1-9,2025-02-19 11:30:00,283.145035,1132.58014,1926.156701,0.809778,0.800519,1.462057,24.861599


# 2- Modelado
## Lineal

In [2]:
from utils.model_prep import random_splits, select_xy

# Target columns (field-measured vegetation parameters) and feature
# columns (Sentinel-1 SAR derived predictors).
# NOTE(review): in the df.head() preview total_biomass equals 4 x VWC, so
# these two targets look perfectly collinear -- confirm whether modelling
# both is intended.
target_col = ["VWC", "total_biomass", "IBA"]
feature_cols = [
    "Sigma0_RATIO_VH_VV",
    "Gamma0_RATIO_VH_VV",
    "PC4",
    "Alpha",
]
X, y = select_xy(df, target_col, feature_cols)
num_features = list(X.columns)

# Split into training and validation sets (val_size=0.3, random_state=42).
splits = random_splits(X, y, val_size=0.3, random_state=42)
X_train, y_train = splits["train"]
X_val, y_val = splits["val"]

In [3]:
from models.linear import run_baseline_from_splits

# Directory passed to the linear baseline as its `outdir`.
out_dir = "../../results/grass/artifacts_linear"

# Run the ridge baseline (alpha=1.0) once per target column and print the
# metrics returned for each one.
for target in target_col:
    print(f"Ejecutando LR para: {target}")
    metrics_LR = run_baseline_from_splits(
        X_train,
        y_train[target],
        X_val,
        y_val[target],
        target,
        num_features,
        model_type="ridge",
        alpha=1.0,
        outdir=out_dir,
    )
    print(f"Resultados para {target}: {metrics_LR}")

Ejecutando LR para: VWC
Resultados para VWC: {'train': {'RMSE': 97.23853675457148, 'MAE': 75.77603384945944, 'R2': 0.24734444898153207, 'rRMSE(%)': 42.646550162941246}, 'val': {'RMSE': 98.56567851142259, 'MAE': 83.30143823038335, 'R2': 0.09409565903495942, 'rRMSE(%)': 48.92987800813021}}
Ejecutando LR para: total_biomass
Resultados para total_biomass: {'train': {'RMSE': 388.9541470182859, 'MAE': 303.1041353978377, 'R2': 0.24734444898153207, 'rRMSE(%)': 42.64655016294124}, 'val': {'RMSE': 394.2627140456904, 'MAE': 333.2057529215335, 'R2': 0.09409565903495931, 'rRMSE(%)': 48.92987800813022}}
Ejecutando LR para: IBA
Resultados para IBA: {'train': {'RMSE': 1406.4941678956948, 'MAE': 952.4627345691533, 'R2': 0.36371793978272826, 'rRMSE(%)': 61.46970245606338}, 'val': {'RMSE': 1088.0183264923426, 'MAE': 828.3224805358325, 'R2': -0.18890173871931037, 'rRMSE(%)': 62.477798825445674}}


<Figure size 640x480 with 0 Axes>

## No paramétrico

In [5]:
import numpy as np

from models.nonparam import run_nw_from_splits

# Dedicated output directory for the Nadaraya-Watson runs.
outdir = '../../results/grass/artifacts_nadaraya-watson'
# 50 candidate bandwidths, log-spaced between 1e-3 and 1e1.
bandwidth_grid = np.logspace(-3, 1, 50)

# Run the Nadaraya-Watson regression once per target column and print the
# metrics returned for each one.
for target in target_col:
    print(f"Ejecutando NW para {target}")
    metrics_nw = run_nw_from_splits(
        X_train, y_train[target],
        X_val, y_val[target],
        target_col=target,
        num_features=num_features,
        bandwidth_grid=bandwidth_grid,
        reg_type="lc",
        # Use this cell's `outdir`, not the linear baseline's `out_dir`,
        # so NW artifacts do not end up in the linear-model directory.
        outdir=outdir
    )
    print(f'Metricas Nadaraya-Watson para {target}: {metrics_nw}')

Ejecutando NW para VWC
Metricas Nadaraya-Watson para VWC: {'train': {'RMSE': 82.72496791318007, 'MAE': 62.52106178825335, 'R2': 0.4552557511688826, 'rRMSE(%)': 36.28123799046451}, 'val': {'RMSE': 91.25269625737661, 'MAE': 78.8680981874229, 'R2': 0.22353422094000663, 'rRMSE(%)': 45.29957448899374}}
Ejecutando NW para total_biomass
Metricas Nadaraya-Watson para total_biomass: {'train': {'RMSE': 330.89987165272026, 'MAE': 250.0842471530134, 'R2': 0.4552557511688827, 'rRMSE(%)': 36.28123799046451}, 'val': {'RMSE': 365.0107850295065, 'MAE': 315.4723927496916, 'R2': 0.22353422094000652, 'rRMSE(%)': 45.29957448899375}}
Ejecutando NW para IBA
Metricas Nadaraya-Watson para IBA: {'train': {'RMSE': 1375.7011829252244, 'MAE': 854.9812578562891, 'R2': 0.39127375208165616, 'rRMSE(%)': 60.12391968136422}, 'val': {'RMSE': 974.1628941281334, 'MAE': 756.4840495637523, 'R2': 0.04690386709234706, 'rRMSE(%)': 55.939823659744036}}


## Random Forest

In [6]:
from models.random_forest import run_rf_from_splits

# Dedicated output directory for the Random Forest runs.
outdir = '../../results/grass/artifacts_random_forest'

# Hyperparameter search space handed to run_rf_from_splits.
# np.arange excludes its stop value, so the ranges below end one step
# before the stop argument.
param_dist = {
    "n_estimators": np.arange(100, 1500, 100),  # 100, 200, ..., 1400 trees
    "max_depth": [None] + list(np.arange(10, 50, 5)),  # None or 10, 15, ..., 45
    "min_samples_split": np.arange(2, 21, 2),  # 2, 4, ..., 20
    "min_samples_leaf": np.arange(1, 21, 2),  # 1, 3, ..., 19
    "max_features": [None, "sqrt", "log2"],  # features considered per split
    "bootstrap": [True, False]  # sample with or without replacement
}

# Run the Random Forest search once per target column and print the
# metrics returned for each one.
for target in target_col:
    print(f"Ejecutando RF para {target}")
    metrics_rf = run_rf_from_splits(
        X_train=X_train,
        y_train=y_train[target],
        X_val=X_val,
        y_val=y_val[target],
        target_col=target,
        param_dist=param_dist,
        num_features=num_features,
        # Use this cell's `outdir`, not the linear baseline's `out_dir`,
        # so RF artifacts do not end up in the linear-model directory.
        outdir=outdir
    )
    print(f'Metricas Random Forest para {target}: {metrics_rf}')

Ejecutando RF para VWC
Fitting 5 folds for each of 100 candidates, totalling 500 fits




Metricas Random Forest para VWC: {'train': {'RMSE': 66.5904734856184, 'MAE': 50.65666150699758, 'R2': 0.6470252454320325, 'rRMSE(%)': 29.205025730140104}, 'val': {'RMSE': 83.08465975146959, 'MAE': 70.14755648433736, 'R2': 0.35631616849881154, 'rRMSE(%)': 41.24480577197356}}
Ejecutando RF para total_biomass
Fitting 5 folds for each of 100 candidates, totalling 500 fits




Metricas Random Forest para total_biomass: {'train': {'RMSE': 266.36189394247356, 'MAE': 202.62664602799032, 'R2': 0.6470252454320327, 'rRMSE(%)': 29.205025730140093}, 'val': {'RMSE': 332.47255611984605, 'MAE': 280.6656836144401, 'R2': 0.35579731430451655, 'rRMSE(%)': 41.26142552275446}}
Ejecutando RF para IBA
Fitting 5 folds for each of 100 candidates, totalling 500 fits




Metricas Random Forest para IBA: {'train': {'RMSE': 1383.1801166472806, 'MAE': 811.4371847653186, 'R2': 0.38463713887074524, 'rRMSE(%)': 60.4507804967711}, 'val': {'RMSE': 971.2887573705914, 'MAE': 701.8023380624339, 'R2': 0.05251953493389261, 'rRMSE(%)': 55.77478072456348}}


## LightGBM

In [11]:
from models.lightgbm import run_lgbm_from_splits

# Dedicated output directory for the LightGBM runs.
outdir = "../../results/grass/artifacts_lgbm"

# Hyperparameter search space handed to run_lgbm_from_splits.
# NOTE(review): these keys are scikit-learn RandomForest parameter names
# (min_samples_split, max_features, bootstrap, ...). Confirm that
# run_lgbm_from_splits / LightGBM actually honour them; otherwise part of
# this search space may have no effect.
param_dist = dict(
    n_estimators=np.arange(100, 1500, 100),  # 100, 200, ..., 1400
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=np.arange(2, 21, 2),  # 2, 4, ..., 20
    min_samples_leaf=np.arange(1, 21, 2),  # 1, 3, ..., 19
    max_features=[None, "sqrt", "log2"],
    bootstrap=[True, False],
)

# Run the LightGBM search once per target column and print the metrics
# returned for each one.
for target in target_col:
    print(f"Ejecutando LGBM para: {target}")
    metrics_LGBM = run_lgbm_from_splits(
        X_train,
        y_train[target],
        X_val,
        y_val[target],
        target_col=target,
        param_dist=param_dist,
        num_features=num_features,
        outdir=outdir,
    )
    print(f"Resultados para {target}: {metrics_LGBM}")

Ejecutando LGBM para: VWC
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Resultados para VWC: {'train': {'RMSE': 63.42428084518919, 'MAE': 48.78378652860449, 'R2': 0.679793206115118, 'rRMSE(%)': 27.81640761871781}, 'val': {'RMSE': 82.2558133224513, 'MAE': 70.44282784506282, 'R2': 0.3690947943823878, 'rRMSE(%)': 40.8333506359483}}
Ejecutando LGBM para: total_biomass
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Resultados para total_biomass: {'train': {'RMSE': 253.69712338075675, 'MAE': 195.13514611441795, 'R2': 0.679793206115118, 'rRMSE(%)': 27.81640761871781}, 'val': {'RMSE': 329.02325328980527, 'MAE': 281.7713113802513, 'R2': 0.3690947943823878, 'rRMSE(%)': 40.833350635948314}}
Ejecutando LGBM para: IBA
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Resultados para IBA: {'train': {'RMSE': 976.0321872708165, 'MAE': 615.0703289927521, 'R2': 0.6935905981064212, 'rRMSE(%)': 42.65670594911926}, 'val': {'RMSE': 957.1450664295775, 'MAE': 716.

# Prediction maps