In [None]:
from pathlib import Path

import lightgbm as lgb
import pandas as pd
from ml_trainer.tabular.models.catboost import CatBoostModel
from ml_trainer.tabular.models.lightgbm import LightGBMModel
from ml_trainer.tabular.models.lr import LinearRegressionModel
from ml_trainer.tabular.models.xgboost import XGBoostModel
from ml_trainer.tabular.trainer import Trainer
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, train_test_split

In [None]:
# Output directory handed to the Trainer below; trainer.pkl is later read back from here.
OUT_DIR = Path("../data", "output", "binary_classification")

### Load Data


In [None]:
# Load the breast-cancer dataset once and reuse the returned bunch for the
# feature names, the feature matrix and the target vector.
dataset = load_breast_cancer()
feature_names = dataset["feature_names"]

# One frame holding every feature column plus a trailing "target" column.
raw_df = pd.concat(
    [
        pd.DataFrame(dataset["data"], columns=feature_names),
        pd.DataFrame(dataset["target"], columns=["target"]),
    ],
    axis=1,
)

# Fixed random_state keeps the 80/20 split reproducible across runs.
train_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=8823)

# Rebind instead of mutating in place so re-running the cell is idempotent and
# the split frames get a clean 0..n-1 index.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

### Train


In [None]:
# Configure one Trainer that cross-validates four estimators on the same
# feature set and mean-ensembles them.
trainer = Trainer(
    estimators=[
        LinearRegressionModel(feature_names=feature_names, estimator_name="logistic_regression"),
        LightGBMModel(
            feature_names=feature_names,
            estimator_name="lightgbm",
            params={"objective": "binary"},
            fit_params={
                # Use LightGBM's public callback factories rather than the
                # private `lgb.callback._*Callback` classes, which are
                # implementation details and not available in every release.
                "callbacks": [
                    lgb.early_stopping(
                        stopping_rounds=10,
                        verbose=True,
                    ),
                    lgb.log_evaluation(
                        period=10,
                        show_stdv=True,
                    ),
                ]
            },
            use_cache=True,  # use cache for faster training
        ),
        CatBoostModel(
            feature_names=feature_names,
            estimator_name="catboostclassifier",
            params={"loss_function": "Logloss", "early_stopping_rounds": 10},
            use_cache=True,
        ),
        XGBoostModel(
            feature_names=feature_names,
            estimator_name="xgboost",
            params={
                "objective": "binary:logistic",
                "early_stopping_rounds": 10,
            },
            use_cache=True,
        ),
    ],
    out_dir=OUT_DIR,
    split_type=KFold,
    n_splits=4,
    seed=8823,
    task_type="binary",
    ensemble=True,  # mean ensemble
)

# NOTE(review): train_df still contains the "target" column; each estimator is
# given feature_names, presumably to select its inputs -- confirm in Trainer.
oof_preds = trainer.train_cv(X_train=train_df, y_train=train_df["target"])
trainer.scores_df

### Importance


In [None]:
# Ask the trainer for its feature-importance plot(s) and keep the returned value.
# NOTE(review): the return type is not visible here -- confirm against Trainer.
importances = trainer.make_plot_feature_importances()

In [None]:
# Confusion matrix against the training labels with a 0.5 decision threshold.
# NOTE(review): presumably computed from the out-of-fold predictions -- confirm.
trainer.make_plot_confusion_matrix(
    y=train_df["target"],
    out_dir=None,
    palette="GnBu",
    threshold=0.5,
)

### Predict


In [None]:
# predict_cv returns a dict; wrap it in a DataFrame right away so the name
# `fold_means` only ever refers to the tabular form.
fold_means = pd.DataFrame(trainer.predict_cv(X=test_df))
fold_means

### Save & Load Trainer


In [None]:
# Persist the fitted trainer, then restore it from OUT_DIR / "trainer.pkl".
# NOTE(review): assumes save() writes "trainer.pkl" under the out_dir given at
# construction -- confirm. The .pkl suffix suggests pickle, so only load
# trainer files from trusted sources.
trainer.save()  # write the trainer to disk
trainer_new = Trainer.load(OUT_DIR / "trainer.pkl")  # rebuild a trainer from the saved file

In [None]:
# Same prediction step as before, but using the trainer restored from disk.
fold_means_new = pd.DataFrame(trainer_new.predict_cv(X=test_df))
fold_means_new

In [None]:
# Check that the reloaded trainer reproduces the original predictions exactly.
# `all(df_a == df_b)` would iterate over the column labels of the boolean frame
# (always truthy here) and never inspect the values; DataFrame.equals compares
# shape, labels and every element.
fold_means.equals(fold_means_new)