In [None]:
from pathlib import Path

import pandas as pd
from catboost.datasets import rotten_tomatoes
from ml_trainer.tabular.models.catboost import CatBoostModel
from ml_trainer.tabular.trainer import Trainer
from sklearn.model_selection import KFold

In [None]:
# Output directory for this example, given relative to the notebook's working directory.
OUT_DIR = Path("../data", "output", "multiclass_classification")

### Load Data


In [None]:
# Load both Rotten Tomatoes splits and subsample each to 1000 rows with a
# fixed seed (reduces the number of records); the index is reset afterwards.
train_df, test_df = (
    split_df.sample(1000, random_state=8823).reset_index(drop=True)
    for split_df in rotten_tomatoes()
)

In [None]:
# Column roles: prediction target, categorical features, and the full list of
# features handed to the model.
target_col = "rating_10"
cat_feature_names = ["rating_MPAA", "studio", "fresh"]
feature_names = ["runtime", "fresh", "date_int", "rating_MPAA", "studio"]

# Missing-value handling: in both splits, replace missing entries of the
# categorical columns with the literal string "missing".
for split_df in (train_df, test_df):
    split_df.loc[:, cat_feature_names] = split_df[cat_feature_names].fillna("missing")

### Train


In [None]:
# One CatBoost multiclass model per random seed; the models differ only in
# `random_state`.
seed_models = [
    CatBoostModel(
        feature_names=feature_names,
        estimator_name="catboostclassifier",
        params=dict(
            loss_function="MultiClass",
            early_stopping_rounds=10,
            random_state=seed,
            cat_features=cat_feature_names,
        ),
        use_cache=False,
    )
    for seed in range(2)  # seed average
]

# Cross-validation setup: KFold with 4 splits and a fixed seed.
trainer = Trainer(
    estimators=seed_models,
    out_dir=OUT_DIR,
    split_type=KFold,
    n_splits=4,
    seed=8823,
    task_type="multiclass",
    ensemble=True,  # mean ensemble
)

# Run cross-validated training, keep the value returned by train_cv, and then
# display the trainer's score table as the cell output.
oof_preds = trainer.train_cv(X_train=train_df, y_train=train_df[target_col])
trainer.scores_df

### Importance


In [None]:
# Call the trainer's feature-importance plotting helper and keep its return
# value (presumably the importance values behind the plot — confirm against Trainer).
importances = trainer.make_plot_feature_importances()

In [None]:
# Call the trainer's confusion-matrix plotting helper with the true training labels.
# NOTE(review): out_dir=None presumably means the figure is not written to disk — confirm against Trainer.
trainer.make_plot_confusion_matrix(y=train_df[target_col], out_dir=None, palette="GnBu")

### Predict


In [None]:
# Predict on the test split and wrap the returned dict in a DataFrame in one
# step, so `fold_means` is only ever bound to a DataFrame.
fold_means = pd.DataFrame(trainer.predict_cv(X=test_df))  # to DataFrame from dict
fold_means

### Save & Load Trainer


In [None]:
# Path the trainer is reloaded from below.
# NOTE(review): assumes trainer.save() writes to this location — confirm against Trainer.
trainer_pkl_path = OUT_DIR / "trainer.pkl"

trainer.save()  # save the trainer
trainer_new = Trainer.load(trainer_pkl_path)  # load the trainer

In [None]:
# Repeat the test-set prediction with the reloaded trainer, again wrapping the
# returned dict in a DataFrame in one step.
fold_means_new = pd.DataFrame(trainer_new.predict_cv(X=test_df))
fold_means_new

In [None]:
# Check that the reloaded trainer reproduces the original trainer's predictions.
#
# The builtin all() applied to `fold_means == fold_means_new` iterates over the
# column labels of the boolean comparison frame, not over its values, so it
# only reports whether every label is truthy and cannot detect differing
# predictions. DataFrame.equals compares shape and every element (NaNs in the
# same position count as equal) and returns a single bool.
fold_means.equals(fold_means_new)