In [1]:
# Data manipulation
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio

# Machine learning
import lightgbm as lgb

# Optimization
import optuna
import optuna.visualization as vis

# System utilities
import os
from PIL import Image
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#lidar-based target data
lidar_path = 'data/raster/lidar_products/2019_lidar_cover.tif'

#rap data (predictor)
rap_path = 'data/raster/rap/woody_cover_2019.tif'

#static terrain predictors
dem_path = 'data/raster/ned/ned.tif'
slope_path = 'data/raster/terrain/slope.tif'
hli_path = 'data/raster/terrain/hli.tif'
twi_path = 'data/raster/terrain/twi.tif'
covertype_path = 'data/raster/terrain/covertype.tif'
#train sample locations
train_points_path = 'data/vector/train_points_with_folds.geojson'

gpd_train_points = gpd.read_file(train_points_path)

# Read raster data for all points first
with rasterio.open(lidar_path) as src:
    lidar = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(rap_path) as src:
    rap = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(dem_path) as src:
    dem = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(slope_path) as src:
    slope = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(hli_path) as src:
    hli = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

with rasterio.open(twi_path) as src:
    twi = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]

Y = np.array(lidar)

# Create feature dataframe for all points, including tpi values for each ngb_val
X = pd.DataFrame({
    'rap': rap,
    'dem': dem, 
    'slope': slope,
    'hli': hli,
    'twi': twi
})

# Pre-allocate arrays for tpi and weights
tpi_values = {}

ngb_vals = [30, 60, 120, 240, 480, 960, 1920]

# Iterate over possible parameter values and read corresponding rasters into X
for ngb_val in ngb_vals:
    tpi_path = f'data/raster/terrain/tpi_{ngb_val}.tif'
    with rasterio.open(tpi_path) as src:
        tpi_values[ngb_val] = [x[0] for x in src.sample([(p.x, p.y) for p in gpd_train_points.geometry])]



In [3]:
#def objective(trial):
def objective(trial, X, tpi_values, Y, gpd_train_points):
    params = {
        'objective': 'tweedie',
        'metric': 'l1',
        'random_state': 42,
        'n_jobs': 1,
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 250, 3000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 30),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.1, 1.95),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1e1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e1, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 1e-6, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e1, log=True),
        'tpi_ngb': trial.suggest_categorical('tpi_ngb', ngb_vals),
    }
    
    def process_fold(fold_idx, params, X, tpi_values, Y, gpd_train_points):
        train_mask = (gpd_train_points['fold'] != fold_idx)
        test_mask = (gpd_train_points['fold'] == fold_idx)
        
        # Add tpi to the feature dataframe
        X_fold = X.copy()
        X_fold['tpi'] = tpi_values[params['tpi_ngb']]

        X_train = X_fold[train_mask]
        X_test = X_fold[test_mask]
        y_train = np.array(Y)[train_mask]
        y_test = np.array(Y)[test_mask]
        # Create the LightGBM model
        lgb_model = lgb.LGBMRegressor(**params)

        lgb_model.fit(X_train, y_train,
                      eval_set=[(X_test, y_test)],
                      eval_metric=params['metric'])
        score = lgb_model.evals_result_['valid_0'][params['metric']][-1]
        return score

    num_folds = 5

    scores = joblib.Parallel(n_jobs=num_folds)(
        joblib.delayed(process_fold)(fold_idx, params, X, tpi_values, Y, gpd_train_points)
        for fold_idx in range(num_folds)
    )

    return np.mean(scores)

In [4]:
for covertype in sorted(gpd_train_points['covertype'].unique()):
    results_dir = f'results/optimization/{covertype}'
    os.makedirs(results_dir, exist_ok=True)
    
    # Filter X and Y for the current covertype
    covertype_mask = (gpd_train_points['covertype'] == covertype)
    X_covertype = X[covertype_mask]
    Y_covertype = Y[covertype_mask]
    gpd_train_points_covertype = gpd_train_points.loc[covertype_mask, ['geometry', 'fold']]

    # Filter tpi_values for the current covertype
    tpi_values_covertype = {}
    for ngb_val in ngb_vals:
        tpi_values_covertype[ngb_val] = np.array(tpi_values[ngb_val])[covertype_mask]

    # Define the save_study_callback function
    def save_study_callback(study, trial):
        joblib.dump(study, os.path.join(results_dir, "study.pkl"))

        values = [t.value for t in study.trials]
        min_value = min(values) * 0.95
        max_value = np.median(values) * 1.25

        fig_history = vis.plot_optimization_history(study)
        fig_history.update_yaxes(range=[min_value, max_value])  # Set y-axis range
        fig_history.write_image(os.path.join(results_dir, "optimization_history.png"))

        fig_slice = vis.plot_slice(study)
        fig_slice.update_yaxes(range=[min_value, max_value])  # Set y-axis range
        fig_slice.write_image(os.path.join(results_dir, "slice_plot.png"))

    trial_count = 500

    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, X_covertype, tpi_values_covertype, Y_covertype, gpd_train_points_covertype),
                    n_trials=trial_count, callbacks=[save_study_callback])

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2025-01-14 15:39:32,412] A new study created in memory with name: no-name-da5a1268-6a4d-45fc-8bbd-7ba99eeea536
[I 2025-01-14 15:39:46,185] Trial 0 finished with value: 3.3763605637013954 and parameters: {'n_estimators': 2671, 'subsample': 0.8724195110386592, 'max_depth': 15, 'subsample_freq': 28, 'colsample_bytree': 0.5121906603521176, 'tweedie_variance_power': 1.5926647794698015, 'learning_rate': 0.01464146499311861, 'num_leaves': 366, 'min_child_samples': 20, 'reg_alpha': 0.0165389479873754, 'reg_lambda': 0.13609023768888243, 'min_split_gain': 7.921946918657805e-08, 'min_child_weight': 1.4781840062888323, 'tpi_ngb': 240}. Best is trial 0 with value: 3.3763605637013954.
[I 2025-01-14 15:39:58,466] Trial 1 finished with value: 3.2385830492735392 and parameters: {'n_estimators': 1720, 'subsample': 0.8349788633683186, 'max_depth': 32, 'subsample_freq': 17, 'colsample_bytree': 0.9175217915857055, 'tweedie_variance_power': 1.2158557626205357, 'learning_rate': 0.006141895690095589, 'num_

Best trial:
  Value:  3.066643967973585
  Params: 
    n_estimators: 270
    subsample: 0.8528717694200082
    max_depth: 36
    subsample_freq: 6
    colsample_bytree: 0.9150092117343438
    tweedie_variance_power: 1.6771806929236144
    learning_rate: 0.029544109189416138
    num_leaves: 329
    min_child_samples: 16
    reg_alpha: 0.03397859128532658
    reg_lambda: 0.002295770144335798
    min_split_gain: 3.9564783185510735e-08
    min_child_weight: 0.016153125267266727
    tpi_ngb: 480


[I 2025-01-14 16:21:19,482] Trial 0 finished with value: 12.173001339935277 and parameters: {'n_estimators': 2162, 'subsample': 0.6655760363453623, 'max_depth': 13, 'subsample_freq': 13, 'colsample_bytree': 0.8387214506585167, 'tweedie_variance_power': 1.6783276275197672, 'learning_rate': 0.00022347002030592545, 'num_leaves': 663, 'min_child_samples': 27, 'reg_alpha': 9.307365879850568, 'reg_lambda': 0.0028384116085730707, 'min_split_gain': 5.120017905952177e-07, 'min_child_weight': 0.00892328104319157, 'tpi_ngb': 1920}. Best is trial 0 with value: 12.173001339935277.
[I 2025-01-14 16:21:21,772] Trial 1 finished with value: 10.552594493396374 and parameters: {'n_estimators': 1525, 'subsample': 0.57454061677008, 'max_depth': 38, 'subsample_freq': 14, 'colsample_bytree': 0.6809839901828456, 'tweedie_variance_power': 1.1537305685462602, 'learning_rate': 0.0006767235278234046, 'num_leaves': 619, 'min_child_samples': 21, 'reg_alpha': 1.546890362939461, 'reg_lambda': 0.0002774029356485457, '

Best trial:
  Value:  7.5550065840922205
  Params: 
    n_estimators: 1722
    subsample: 0.6175976333373286
    max_depth: 40
    subsample_freq: 15
    colsample_bytree: 0.9489396667169212
    tweedie_variance_power: 1.6299154067174042
    learning_rate: 0.002384276013692625
    num_leaves: 147
    min_child_samples: 5
    reg_alpha: 0.39828245466883533
    reg_lambda: 0.4170737930997784
    min_split_gain: 2.2370453825162082e-07
    min_child_weight: 0.005066700375674121
    tpi_ngb: 60


[I 2025-01-14 16:57:02,332] Trial 0 finished with value: 16.478740248807934 and parameters: {'n_estimators': 1565, 'subsample': 0.6726339318276301, 'max_depth': 49, 'subsample_freq': 21, 'colsample_bytree': 0.9242920259037555, 'tweedie_variance_power': 1.7468183668410033, 'learning_rate': 0.0039978697989152216, 'num_leaves': 840, 'min_child_samples': 12, 'reg_alpha': 7.447014812187097, 'reg_lambda': 0.6856271428082801, 'min_split_gain': 3.4411562447696435e-08, 'min_child_weight': 0.0006303247905061642, 'tpi_ngb': 120}. Best is trial 0 with value: 16.478740248807934.
[I 2025-01-14 16:57:06,148] Trial 1 finished with value: 22.04028146966968 and parameters: {'n_estimators': 1463, 'subsample': 0.9860858949737903, 'max_depth': 37, 'subsample_freq': 19, 'colsample_bytree': 0.7331910776810642, 'tweedie_variance_power': 1.1391948272302692, 'learning_rate': 0.00025921059774145015, 'num_leaves': 670, 'min_child_samples': 17, 'reg_alpha': 0.33350261541287984, 'reg_lambda': 0.6724017915023703, 'm

Best trial:
  Value:  15.872265280818075
  Params: 
    n_estimators: 1469
    subsample: 0.8717509704402022
    max_depth: 32
    subsample_freq: 3
    colsample_bytree: 0.9696311630143675
    tweedie_variance_power: 1.8350705011485644
    learning_rate: 0.0022758028631871714
    num_leaves: 30
    min_child_samples: 39
    reg_alpha: 0.017323125640874692
    reg_lambda: 0.274146084223165
    min_split_gain: 3.7877281001286076e-08
    min_child_weight: 0.00010414568062456292
    tpi_ngb: 240
