In [1]:
# Data manipulation
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio

# Machine learning
import lightgbm as lgb

# Optimization
import optuna
import optuna.visualization as vis

# System utilities
import os
from PIL import Image
import joblib

from pygam import LinearGAM
from sklearn.metrics import r2_score

In [2]:
#lidar-based target data
lidar_path = 'data/raster/lidar_products/2019_lidar_cover.tif'

#rap data (predictor)
rap_path = 'data/raster/rap/woody_cover_2019.tif'

#static terrain predictors
dem_path = 'data/raster/ned/ned.tif'
slope_path = 'data/raster/terrain/slope.tif'
hli_path = 'data/raster/terrain/hli.tif'
twi_path = 'data/raster/terrain/twi.tif'
covertype_path = 'data/raster/terrain/covertype.tif'
#train sample locations
train_points_path = 'data/vector/train_points_with_folds.geojson'

gpd_train_points = gpd.read_file(train_points_path)

# Read raster data for all points first
with rasterio.open(lidar_path) as src:
    lidar = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(rap_path) as src:
    rap = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(dem_path) as src:
    dem = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(slope_path) as src:
    slope = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(hli_path) as src:
    hli = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(twi_path) as src:
    twi = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

Y = np.array(lidar)

# Create feature dataframe for all points, including tpi values for each ngb_val
X = pd.DataFrame({
    'rap': rap,
    'dem': dem, 
    'slope': slope,
    'hli': hli,
    'twi': twi
})

# Pre-allocate arrays for tpi and weights
tpi_values = {}

ngb_vals = [30, 60, 120, 240, 480, 960, 1920]

# Iterate over possible parameter values and read corresponding rasters into X
for ngb_val in ngb_vals:
    tpi_path = f'data/raster/terrain/tpi_{ngb_val}.tif'
    with rasterio.open(tpi_path) as src:
        tpi_values[ngb_val] = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]



In [3]:
def normalized_gini(y_pred, y_true):
    """
    Computes the normalized Gini coefficient.
    
    Parameters:
    -----------
    y_pred : array-like
        The predicted values (e.g., model predictions).
    y_true : array-like
        The true target values.
    
    Returns:
    --------
    float
        The normalized Gini coefficient.
    """
    def sum_gini(a_pred, a_true):
        # Sort true values by descending predicted values
        order = np.argsort(-a_pred)
        a_true_sorted = a_true[order]
        
        # Construct the random cumulative array
        n = len(a_pred)
        random_cum = np.arange(1, n+1) / n
        
        # Calculate the Lorentz curve (cumulative) for the sorted true values
        lorentz = np.cumsum(a_true_sorted) / np.sum(a_true_sorted)
        
        # Gini is the area between the Lorentz curve and the uniform distribution
        return np.sum(lorentz - random_cum)
    
    return sum_gini(np.array(y_pred).astype(int), np.array(y_true).astype(int)) / sum_gini(np.array(y_true).astype(int), np.array(y_true).astype(int))

#def objective(trial):
def objective(trial, X, tpi_values, Y, gpd_train_points):
    params = {
        'objective': 'tweedie',
        'metric': None,
        'random_state': 42,
        'n_jobs': 1,
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 250, 3000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 30),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.95),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1e1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e1, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1e-6, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e1, log=True),
        'tpi_ngb': trial.suggest_categorical('tpi_ngb', ngb_vals),
        'n_splines': trial.suggest_int('n_splines', 10, 30),
    }
    
    def process_fold(fold_idx, params, X, tpi_values, Y, gpd_train_points):
        train_mask = (gpd_train_points['fold'] != fold_idx)
        test_mask = (gpd_train_points['fold'] == fold_idx)

        # Add tpi to the feature dataframe
        X_fold = X.copy()
        X_fold['tpi'] = tpi_values[params['tpi_ngb']]

        X_train = X_fold[train_mask]
        X_test = X_fold[test_mask]
        y_train = np.array(Y)[train_mask]
        y_test = np.array(Y)[test_mask]

        # Create the LightGBM model
        lgb_model = lgb.LGBMRegressor(**params)

        # Fit the LightGBM model
        lgb_model.fit(X_train, y_train)

        # 1. Predict with LightGBM on train and test sets
        lgb_train_preds = lgb_model.predict(X_train)
        lgb_test_preds = lgb_model.predict(X_test)

        # 2. Stack a GAM
        gam = LinearGAM(n_splines=params['n_splines'])
        gam.fit(lgb_train_preds.reshape(-1, 1), y_train)

        # 4. Predict with the GAM model
        final_preds = gam.predict(lgb_test_preds.reshape(-1, 1))
        final_preds = np.clip(final_preds, 0, 100)

        # 5. Evaluate the stacked model
        giniscore = normalized_gini(final_preds, y_test)
        r2score = r2_score(y_test, final_preds)

        return np.mean([giniscore, r2score])

    num_folds = 5

    scores = joblib.Parallel(n_jobs=num_folds)(
        joblib.delayed(process_fold)(fold_idx, params, X, tpi_values, Y, gpd_train_points)
        for fold_idx in range(num_folds)
    )
    final_score = np.mean(scores)
    if final_score > 0 and final_score < 1:
        return final_score
    else:
        return 0


In [4]:
results_dir = f'results/optimization'
os.makedirs(results_dir, exist_ok=True)

# Define the save_study_callback function
def save_study_callback(study, trial):
    joblib.dump(study, os.path.join(results_dir, "study.pkl"))

    values = [t.value for t in study.trials]
    #min_value = np.median(values) * 0.8
    #max_value = np.max(values) + 0.01

    fig_history = vis.plot_optimization_history(study)
    #fig_history.update_yaxes(range=[min_value, max_value])  # Set y-axis range
    fig_history.write_image(os.path.join(results_dir, "optimization_history.png"))

    fig_slice = vis.plot_slice(study)
    #fig_slice.update_yaxes(range=[min_value, max_value])  # Set y-axis range
    fig_slice.write_image(os.path.join(results_dir, "slice_plot.png"))

trial_count = 150

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X, tpi_values, Y, gpd_train_points),
                n_trials=trial_count, callbacks=[save_study_callback])

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-01-15 22:38:56,427] A new study created in memory with name: no-name-9fa5f8f3-4e15-4337-ad8a-77253b693d1d
[I 2025-01-15 22:39:16,555] Trial 0 finished with value: 0.6888555177468159 and parameters: {'n_estimators': 1377, 'subsample': 0.7046372292735499, 'max_depth': 15, 'subsample_freq': 1, 'colsample_bytree': 0.5016902247730652, 'tweedie_variance_power': 1.5177153882044048, 'learning_rate': 0.0006077494557096616, 'num_leaves': 444, 'min_child_samples': 24, 'reg_alpha': 0.07567437756066178, 'reg_lambda': 1.2829226813567438, 'min_split_gain': 7.32502894801912e-08, 'min_child_weight': 0.0001253704323556288, 'tpi_ngb': 240, 'n_splines': 23}. Best is trial 0 with value: 0.6888555177468159.
[I 2025-01-15 22:39:42,381] Trial 1 finished with value: 0.6643747303214316 and parameters: {'n_estimators': 2584, 'subsample': 0.7027785321588713, 'max_depth': 10, 'subsample_freq': 9, 'colsample_bytree': 0.9847527416402613, 'tweedie_variance_power': 1.7975342707985185, 'learning_rate': 0.018023

Best trial:
  Value:  0.7347756890534961
  Params: 
    n_estimators: 1149
    subsample: 0.9761096708362259
    max_depth: 34
    subsample_freq: 16
    colsample_bytree: 0.9473248299715323
    tweedie_variance_power: 1.407663068135031
    learning_rate: 0.003065732381164207
    num_leaves: 23
    min_child_samples: 46
    reg_alpha: 0.30629005023048184
    reg_lambda: 5.2217107812405485
    min_split_gain: 4.752879344329507e-08
    min_child_weight: 0.026347269517283937
    tpi_ngb: 240
    n_splines: 12
