In [1]:
import numpy as np
import lightgbm as lgb
from helper import log_multi_target_run, transform_targets, compute_metrics
from tiny_mlflow import log_multi_target_run_local
import gc
import cudf
import cupy as cp
import mlflow
# Point the MLflow client at a tracking server on localhost, port 5000.
# NOTE(review): presumably only needed when train_lgb_models is called with
# log_to_mlflow=True (via helper.log_multi_target_run) — confirm in helper.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

In [2]:
# Target columns to predict, and non-feature columns to discard.
TARGETS = ["n2o", "no3", "yield", "soc"]
DROP = ['id']

# Read the processed dataset into a cuDF frame.
full_gdf = cudf.read_parquet("data/final_processed_data_201125.parquet")

# Split into targets (y) and features (X = everything except targets and DROP).
y_train = full_gdf[TARGETS]
X_train = full_gdf.drop(columns=[*TARGETS, *DROP])

# Drop the reference to the combined frame and ask Python to collect garbage
# now that only the X/y splits are needed.
del full_gdf
gc.collect()


0

In [5]:
# Display the feature column names of X_train.
# NOTE(review): the saved output for this cell still lists 'id', although the
# loading cell drops TARGETS + DROP (DROP = ['id']), and this cell's execution
# count (5) is out of order — the output looks stale; re-check after
# Restart Kernel -> Run All.
X_train.columns

Index(['soil', 'climate', 'cropping_systems', 'crop_rotation', 'n_synth_type',
       'n_org_type', 'n_org_replication', 'n_synth_replication', 'irrigation',
       'manu_depth', 'n_org_amount', 'n_synthamount', 'id', 'fert_amount_1',
       'fert_amount_2', 'fert_amount_3', 'manu_amount_1', 'manu_amount_2',
       'manu_amount_3', 'prec_days', 'total_nitrogen',
       'total_precipitation_year', 'total_average_temperature_year',
       'total_precipitation_growing_season',
       'total_average_temperature_growing_season', 'total_precipitation_autum',
       'total_average_temperature_autum', 'total_precipitation_winter',
       'total_average_temperature_winter', 'total_precipitation_spring',
       'total_average_temperature_spring', 'bd', 'corg', 'norg', 'sand',
       'silt', 'clay', 'ph', 'sks', 'wcmax', 'wcmin',
       'total_precipitation_3_after_fert_1',
       'total_precipitation_3_after_fert_2',
       'total_precipitation_3_after_fert_3',
       'total_precipitation_3_afte

In [3]:
def train_lgb_models(X_train_cudf, y_train_cudf, log_to_mlflow=False, device_type="gpu"):
    """Train one LightGBM regressor per target in ``TARGETS`` and log the run.

    Parameters
    ----------
    X_train_cudf : cudf.DataFrame
        Feature frame. It is converted to pandas before fitting; the caller's
        frame is not modified.
    y_train_cudf : cudf.DataFrame
        Target frame, passed as-is to ``helper.transform_targets``.
        Presumably it must contain every name in ``TARGETS`` — confirm in helper.
    log_to_mlflow : bool, default False
        If True, log through ``log_multi_target_run``; otherwise log through
        ``log_multi_target_run_local`` with ``base_dir="experiments"``.
    device_type : str, default "gpu"
        Forwarded to ``lgb.LGBMRegressor(device_type=...)``. Pass ``"cpu"``
        when no GPU is available.

    Returns
    -------
    tuple
        ``(models, metrics, transformers)``: ``models`` and ``metrics`` are
        dicts keyed by target name; ``transformers`` is whatever
        ``transform_targets`` returned as its third element.

    Notes
    -----
    Metrics are computed from predictions on the same data the model was
    fitted on (in-sample), so they do not estimate generalization error.
    """
    # LightGBM works nicely with pandas DataFrame
    X_pd = X_train_cudf.to_pandas()
    feature_names = list(X_train_cudf.columns)

    # Detect categorical features by dtype (not by name).
    # LightGBM rejects object-dtype pandas columns at fit time, so cast them to
    # "category" first. X_pd is the host-side frame created above, so casting
    # in place does not touch the caller's cuDF frame.
    # If you encoded categoricals to int codes already, you may want to
    # build this from your known cat_cols instead of dtypes.
    for col in list(X_pd.select_dtypes(include=["object"]).columns):
        X_pd[col] = X_pd[col].astype("category")
    cat_features = list(X_pd.select_dtypes(include=["category"]).columns)

    # Transform targets. Per the original author's note: log1p for
    # n2o/no3/yield, Yeo-Johnson for soc (implemented in helper.transform_targets).
    y_orig_np, y_trans_np, transformers = transform_targets(y_train_cudf)

    models = {}
    metrics = {}
    model_name = "lightgbm_v1"

    for target in TARGETS:
        print(f"Training LightGBM for target '{target}'...")

        y_trans = y_trans_np[target]

        model = lgb.LGBMRegressor(
            objective="regression",
            metric="rmse",
            boosting_type="gbdt",
            n_estimators=5000,
            learning_rate=0.01,
            num_leaves=256,
            max_depth=-1,
            subsample=0.8,
            # Row subsampling is only active when subsample_freq > 0; with the
            # default of 0, subsample=0.8 is silently ignored.
            subsample_freq=1,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            reg_alpha=0.0,
            random_state=42,
            device_type=device_type,
        )

        model.fit(
            X_pd,
            y_trans,
            categorical_feature=cat_features,
        )

        # In-sample predictions, still on the transformed target scale.
        preds_trans = model.predict(X_pd)

        # compute_metrics returns a pair; only the first element is kept.
        metrics[target], _ = compute_metrics(
            target,
            preds_trans,
            y_orig_np[target],
            transformers[target],
            model_name,
        )

        models[target] = model

        print(f"Done training LightGBM for target '{target}'")

    # Arguments shared by both logging backends.
    log_kwargs = dict(
        model_family_name="lightgbm",
        models=models,
        metrics=metrics,
        feature_names=feature_names,
        transformers=transformers,
        experiment_name="multi_target_lightgbm",
        run_name="lightgbm_train_full",
    )

    if log_to_mlflow:
        log_multi_target_run(**log_kwargs)
    else:
        # Local JSON logger
        log_multi_target_run_local(**log_kwargs, base_dir="experiments")

    return models, metrics, transformers


In [4]:
# Fit the per-target models on the full training frame, using the local
# logger rather than MLflow.
lgb_models, lgb_metrics, lgb_transformers = train_lgb_models(
    X_train,
    y_train,
    log_to_mlflow=False,
)

print("LightGBM metrics:", lgb_metrics)

Training LightGBM for target 'n2o'...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 11172
[LightGBM] [Info] Number of data points in the train set: 45265130, number of used features: 62
[LightGBM] [Info] Using GPU Device: NVIDIA H100 80GB HBM3, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 52 dense feature groups (2244.75 MB) transferred to GPU in 0.773909 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.877517
Done training LightGBM for target 'n2o'
Training LightGBM for target 'no3'...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 11172
[LightGBM] [Info] Number of data points in the train set: 45265130, number of used features: 62
[LightGBM] [Info] Using GPU Device: NVIDIA H100 80GB HBM3, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Ker