In [None]:
import numpy as np
from helper import log_multi_target_run, transform_targets, compute_metrics
from tiny_mlflow import log_multi_target_run_local
from catboost import CatBoostRegressor
import gc
import cudf
import cupy as cp

In [None]:
# Columns that are never used as model inputs: the row identifier and the
# four regression targets.
DROP = ["id"]
TARGETS = ["n2o", "no3", "yield", "soc"]

# Load the training table straight into a cuDF frame.
train_gdf = cudf.read_parquet("../data_processing/data/df_WIWH_training.parquet")

# Split into targets and features; everything that is neither a target nor
# listed in DROP is treated as a feature.
y_train = train_gdf[TARGETS]
X_train = train_gdf.drop(columns=[*TARGETS, *DROP])

# The combined frame is no longer needed once X/y have been derived from it;
# drop the reference and run a collection pass.
del train_gdf
gc.collect()


0

In [5]:
def train_cat_models(X_train_cudf, y_train_cudf, log_to_mlflow=False):
    """Fit one GPU CatBoost regressor per entry in ``TARGETS`` and log the run.

    Parameters
    ----------
    X_train_cudf : cudf.DataFrame
        Feature frame. Columns whose dtype is ``object`` or ``category`` are
        passed to CatBoost as categorical features; all other columns are
        cast to float32.
    y_train_cudf : cudf.DataFrame
        Target frame handed unchanged to ``transform_targets``.
        NOTE(review): presumably it must contain every name in the
        module-level ``TARGETS`` list - confirm against ``transform_targets``.
    log_to_mlflow : bool, default False
        When true the run is logged through ``log_multi_target_run``;
        otherwise through ``log_multi_target_run_local`` with
        ``base_dir="experiments"``.

    Returns
    -------
    tuple
        ``(models, metrics, transformers)`` where ``models`` and ``metrics``
        are dicts keyed by target name and ``transformers`` is whatever
        ``transform_targets`` returned.

    Notes
    -----
    The metrics are computed from predictions on the same rows the models
    were fitted on, so they are in-sample scores, not a generalisation
    estimate.
    """
    feature_names = list(X_train_cudf.columns)

    # Positional indices of the columns CatBoost must treat as categorical.
    # This only finds columns that are still dtype object/category; columns
    # already encoded to integer codes are not detected here.
    cat_features = [
        i for i, col in enumerate(feature_names)
        if X_train_cudf[col].dtype == "object"
        or X_train_cudf[col].dtype.name == "category"
    ]

    if cat_features:
        # A float32 matrix cannot carry categorical values (the cast fails on
        # strings, and CatBoost rejects float-valued categorical features).
        # Hand CatBoost a pandas frame instead: categoricals as strings,
        # everything else as float32.
        cat_positions = set(cat_features)
        dtype_map = {
            col: (str if i in cat_positions else "float32")
            for i, col in enumerate(feature_names)
        }
        X_fit = X_train_cudf.to_pandas().astype(dtype_map)
    else:
        # Purely numeric input: a plain float32 numpy matrix is sufficient.
        X_fit = X_train_cudf.to_numpy().astype("float32")

    # Original author's note: log1p for n2o/no3/yield, Yeo-Johnson for soc.
    # The transform itself lives in the helper module.
    y_orig_np, y_trans_np, transformers = transform_targets(y_train_cudf)

    models = {}
    metrics = {}
    model_name = "catboost_v1"

    # Hyper-parameters are identical for every target, so define them once.
    # A fresh estimator is still constructed per target inside the loop.
    cat_params = dict(
        loss_function="RMSE",
        task_type="GPU",
        devices="0",
        iterations=2000,
        depth=6,
        learning_rate=0.013556,
        border_count=64,
        l2_leaf_reg=3,
        grow_policy="Lossguide",
        random_seed=42,
        verbose=False,
    )

    for target in TARGETS:
        print(f"Training CatBoost for target '{target}'...")

        model = CatBoostRegressor(**cat_params)
        model.fit(
            X_fit,
            y_trans_np[target],
            cat_features=cat_features,
            verbose=False,
        )

        # Predictions are in the transformed target space; compute_metrics
        # receives the matching transformer together with the original-scale
        # targets. Only the first element of its return pair is kept.
        preds_trans = model.predict(X_fit)
        metrics[target], _ = compute_metrics(
            target,
            preds_trans,
            y_orig_np[target],
            transformers[target],
            model_name,
        )

        models[target] = model

        print(f"Done training CatBoost for target '{target}'")

    # Both loggers take the same description of the run.
    run_info = dict(
        model_family_name="catboost",
        models=models,
        metrics=metrics,
        feature_names=feature_names,
        transformers=transformers,
        experiment_name="multi_target_catboost",
        run_name="catboost_train_full",
    )

    if log_to_mlflow:
        log_multi_target_run(**run_info)
    else:
        # Local JSON logger
        log_multi_target_run_local(base_dir="experiments", **run_info)

    return models, metrics, transformers


In [6]:
%%time

cat_models, cat_metrics, cat_transformers = train_cat_models(X_train, y_train, False)
print("CatBoost metrics:", cat_metrics)

Training CatBoost for target 'n2o'...
Done training CatBoost for target 'n2o'
Training CatBoost for target 'no3'...
Done training CatBoost for target 'no3'
Training CatBoost for target 'yield'...
Done training CatBoost for target 'yield'
Training CatBoost for target 'soc'...
Done training CatBoost for target 'soc'
CatBoost metrics: {'n2o': {'rmse': 0.58609118108803, 'mae': 0.3978250778979991, 'r2': 0.7306765789548921}, 'no3': {'rmse': 23.826889466840218, 'mae': 12.19853193655124, 'r2': 0.7480072545621734}, 'yield': {'rmse': 723.8702331099971, 'mae': 517.7299606485384, 'r2': 0.9128602239660599}, 'soc': {'rmse': 359.9414931873278, 'mae': 265.01387701756636, 'r2': 0.7882664145429612}}
CPU times: user 1h 26min 44s, sys: 3min 5s, total: 1h 29min 49s
Wall time: 7min 42s
