In [None]:
import numpy as np
from helper import log_multi_target_run, transform_targets, compute_metrics
from tiny_mlflow import log_multi_target_run_local
import xgboost as xgb
import gc
import cudf
import cupy as cp


In [None]:
# Columns to predict, and non-feature columns that must not reach the model.
TARGETS = ["n2o", "no3", "yield", "soc"]
DROP = ['id']

# Read the training table as a cuDF DataFrame.
gdf = cudf.read_parquet("../data_processing/data/df_WIWH_training.parquet")

# Split into targets and features (features = everything that is neither a
# target nor listed in DROP).
y_train = gdf[TARGETS]
X_train = gdf.drop(columns=[*TARGETS, *DROP])

# The combined frame is no longer needed: drop the name and run a GC pass.
del gdf
gc.collect()


0

In [3]:
# Show the feature columns that remain after dropping the targets and DROP columns.
X_train.columns

Index(['soil', 'climate', 'cropping_systems', 'crop_rotation', 'n_synth_type',
       'n_org_type', 'n_org_replication', 'n_synth_replication', 'irrigation',
       'manu_depth', 'n_org_amount', 'n_synthamount', 'fert_amount_1',
       'fert_amount_2', 'fert_amount_3', 'manu_amount_1', 'manu_amount_2',
       'manu_amount_3', 'prec_days', 'total_nitrogen',
       'total_precipitation_year', 'total_average_temperature_year',
       'total_precipitation_growing_season',
       'total_average_temperature_growing_season', 'total_precipitation_autum',
       'total_average_temperature_autum', 'total_precipitation_winter',
       'total_average_temperature_winter', 'total_precipitation_spring',
       'total_average_temperature_spring', 'bd', 'corg', 'norg', 'sand',
       'silt', 'clay', 'ph', 'sks', 'wcmax', 'wcmin',
       'total_precipitation_3_after_fert_1',
       'total_precipitation_3_after_fert_2',
       'total_precipitation_3_after_fert_3',
       'total_precipitation_3_after_manu

In [4]:
def train_xgb_models(
    X_train_cudf,
    y_train_cudf,
    log_to_mlflow=False,
    targets=None,
    params=None,
    num_boost_round=10_000,
    model_name="xgboost_v1",
):
    """Train one XGBoost regressor per target and log the run.

    Every target is fitted on its transformed values (as returned by
    ``transform_targets``) against the same feature matrix and with the same
    hyper-parameters.

    NOTE: the metrics are computed from predictions on the training data
    itself (this function makes no hold-out split), so they describe goodness
    of fit on the training set, not generalisation performance.

    Parameters
    ----------
    X_train_cudf
        Feature frame. Must provide ``.to_numpy()`` and ``.columns``.
    y_train_cudf
        Target frame; passed unchanged to ``transform_targets``.
    log_to_mlflow : bool, default False
        If true the run is logged with ``log_multi_target_run``; otherwise
        with ``log_multi_target_run_local`` under ``base_dir="experiments"``.
    targets : iterable of str, optional
        Names of the targets to train. Defaults to the module-level
        ``TARGETS`` list, which is the original behaviour.
    params : dict, optional
        Overrides merged on top of the built-in default hyper-parameters.
        The caller's dict is not modified.
    num_boost_round : int, default 10_000
        Number of boosting rounds passed to ``xgb.train``.
    model_name : str, default "xgboost_v1"
        Label forwarded to ``compute_metrics``.

    Returns
    -------
    tuple
        ``(models, metrics, transformers)`` where ``models`` maps target name
        to the trained booster, ``metrics`` maps target name to the first
        element returned by ``compute_metrics``, and ``transformers`` is the
        third value returned by ``transform_targets``.
    """
    target_names = list(TARGETS if targets is None else targets)

    # features as NumPy on CPU
    X_np = X_train_cudf.to_numpy().astype("float32")
    feature_names = list(X_train_cudf.columns)

    # transform each target (log1p for n2o/no3/yield, Yeo Johnson for soc)
    y_orig_np, y_trans_np, transformers = transform_targets(y_train_cudf)

    models = {}   # target -> Booster
    metrics = {}  # target -> dict(rmse, mae, r2)

    # base params for all targets; entries in `params` take precedence
    train_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",   # with device='cuda' this uses GPU hist
        "device": "cuda",
        "sampling_method": "gradient_based",
        "max_bin": 256,
        "max_depth": 9,
        "min_child_weight": 48,
        "subsample": 0.8,
        "colsample_bytree": 0.6,
        "learning_rate": 0.013556,
        "gamma": 1.074052,
        "reg_alpha": 0.151021,
        "reg_lambda": 11.097351,
        # Row/column subsampling is stochastic; spell out the seed instead of
        # relying on the library default (0 is XGBoost's documented default,
        # so this does not change results).
        "seed": 0,
    }
    if params:
        train_params.update(params)

    for target in target_names:
        print(f"Training XGBoost for target '{target}'...")

        y_trans = y_trans_np[target].astype("float32")

        # DMatrix lives on CPU, training still runs on GPU
        dtrain = xgb.DMatrix(
            X_np,
            label=y_trans,
            feature_names=feature_names,
        )

        booster = xgb.train(
            params=train_params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
        )

        # predictions in transformed space (on the training data)
        preds_trans = booster.predict(dtrain)

        # metrics in original scale
        metrics[target], _ = compute_metrics(
            target,
            preds_trans,
            y_orig_np[target],
            transformers[target],
            model_name,
        )

        models[target] = booster

        print(f"Done training XGBoost for target '{target}'")

    # Both loggers receive the same description of the run.
    log_kwargs = dict(
        model_family_name="xgboost",
        models=models,
        metrics=metrics,
        feature_names=feature_names,
        transformers=transformers,
        experiment_name="multi_target_xgb",
        run_name="xgb_train_full",
    )

    if log_to_mlflow:
        log_multi_target_run(**log_kwargs)
    else:
        # Local JSON logger
        log_multi_target_run_local(**log_kwargs, base_dir="experiments")

    return models, metrics, transformers


In [5]:
%%time

xgb_models, xgb_metrics, xgb_transformers = train_xgb_models(X_train, y_train, False)
print("XGB metrics:", xgb_metrics)

Training XGBoost for target 'n2o'...
Done training XGBoost for target 'n2o'
Training XGBoost for target 'no3'...
Done training XGBoost for target 'no3'
Training XGBoost for target 'yield'...
Done training XGBoost for target 'yield'
Training XGBoost for target 'soc'...
Done training XGBoost for target 'soc'
XGB metrics: {'n2o': {'rmse': 0.49866217, 'mae': 0.33381912112236023, 'r2': 0.8050349950790405}, 'no3': {'rmse': 19.224316, 'mae': 9.476849555969238, 'r2': 0.8359580039978027}, 'yield': {'rmse': 626.6981, 'mae': 437.010009765625, 'r2': 0.9346851110458374}, 'soc': {'rmse': 288.7198, 'mae': 210.98281860351562, 'r2': 0.8637679815292358}}
CPU times: user 41min 50s, sys: 1min 27s, total: 43min 18s
Wall time: 45min 1s
