In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, StratifiedKFold

from catboost import CatBoostClassifier, Pool, metrics, cv

In [2]:
# Name of the engineered feature set; it selects which feature files are read
# here and tags the prediction files written at the end of the notebook.
feat = "feat00"
train = pl.read_csv(f"feat/feat_train_{feat}.csv")
test = pl.read_csv(f"feat/feat_test_{feat}.csv")
# The first column of the raw training file has an empty header; name it "idx".
train_origin = pl.read_csv("data/train.csv").rename({"": "idx"})

# Target variable
col_target = "health"

# Explanatory variables: every column of the test features except the row id
cols_exp = [name for name in test.columns if name != "idx"]

# Categorical features: integer columns that are treated as categories, plus
# every string column of the raw training data except the "created_at" column.
cols_cat_int = ["boro_ct", "cb_num"]
cols_cat = [
    *(name for name in train_origin.select(pl.col(pl.Utf8)).columns if name != "created_at"),
    *cols_cat_int,
]

# Fill missing categorical values with -1 (CatBoost does not accept missing
# values in categorical features).
# NOTE(review): the fill value is an integer literal; confirm it is actually
# applied to string-typed categorical columns as intended.
train, test = (
    frame.with_columns(frame[cols_cat].fill_null(-1))
    for frame in (train, test)
)

### CatBoost

In [3]:
import hyperopt
from numpy.random import RandomState

def tune_catboost_params(train, cols_exp, col_target, cols_cat):
    """Search CatBoost hyperparameters with hyperopt, scored by cross-validated loss.

    Each trial runs CatBoost's ``cv`` on the full training data with the
    sampled hyperparameters and reports the minimum of the
    ``test-MultiClass-mean`` column as the loss to minimise. The fold
    configuration is not set here, so CatBoost's ``cv`` defaults apply.

    Args:
        train: Training table; must support ``train[cols].to_numpy()``.
        cols_exp: Names of the explanatory columns, in model input order.
        col_target: Name of the target column.
        cols_cat: Names of the columns to treat as categorical features.

    Returns:
        The value returned by ``hyperopt.fmin`` after 50 evaluations of the
        TPE algorithm over the search space defined below.
    """
    x = train[cols_exp].to_numpy()
    y = train[col_target].to_numpy()

    # Positions of the categorical features within cols_exp (CatBoost takes indices).
    cols_cat_idxs = [i for i, c in enumerate(cols_exp) if c in cols_cat]

    params_space = {'learning_rate': hyperopt.hp.uniform('learning_rate', 0.1, 0.5),
                    'l2_leaf_reg': hyperopt.hp.randint('l2_leaf_reg',1,10),
                    'depth': hyperopt.hp.randint('depth',4,10), 
                    'bagging_temperature': hyperopt.hp.uniform('bagging_temperature',0.0,1.0), 
                    'random_strength': hyperopt.hp.uniform('random_strength',1, 20)
                    }
    trials = hyperopt.Trials()

    # Settings shared by every trial; they override sampled values on a key clash.
    params_fixed = {'classes_count': 3,
                    'loss_function': "MultiClass",
                    'od_type': 'Iter',  # early stopping
                    'od_wait': 50,  # early stopping
                    'logging_level': 'Silent',
                    'use_best_model': True
                    }

    # Objective function handed to hyperopt.
    def hyperopt_objective(params):
        # Merge into a fresh dict so the sample supplied by hyperopt is not
        # mutated. No classifier is built here: only cv() consumes the params.
        cv_params = {**params, **params_fixed}

        cv_data = cv(
            Pool(x, y, cat_features=cols_cat_idxs),
            cv_params,
            logging_level='Silent',
        )
        min_loss = np.min(cv_data['test-MultiClass-mean'])

        return min_loss  # hyperopt minimises this value

    # Run the hyperparameter search.
    params_tuned = hyperopt.fmin(
        hyperopt_objective,
        space=params_space,
        algo=hyperopt.tpe.suggest,
        max_evals=50,
        trials=trials,
    )

    return params_tuned

In [4]:
def train_catboost(train, cols_exp, col_target, cols_cat, params=None):
    """Fit one CatBoost multiclass model per stratified fold and collect OOF predictions.

    The data is split with a shuffled 5-fold ``StratifiedKFold``
    (``random_state=0``). Each fold's model is fitted on the training part,
    using the validation part as ``eval_set``, and then predicts class
    probabilities for that validation part.

    Args:
        train: Training table; must support ``train[cols].to_numpy()``.
        cols_exp: Names of the explanatory columns, in model input order.
        col_target: Name of the target column.
        cols_cat: Names of the columns to treat as categorical features.
        params: Optional dict of extra ``CatBoostClassifier`` keyword
            arguments. It is not modified. The fixed settings defined in this
            function take precedence when a key appears in both.

    Returns:
        A tuple ``(clf_lst, oof_pred)``: the fitted classifiers in fold order,
        and the out-of-fold ``predict_proba`` outputs reordered so that row
        ``i`` belongs to row ``i`` of ``train``.
    """
    # Positions of the categorical features within cols_exp (CatBoost takes indices).
    cols_cat_idxs = [i for i, c in enumerate(cols_exp) if c in cols_cat]

    # Build a fresh dict instead of updating ``params`` in place, so the dict
    # owned by the caller does not silently gain these keys.
    params = {
        **(params or {}),
        'classes_count': 3,
        'loss_function': "MultiClass",
        'od_type': 'Iter',  # early stopping
        'od_wait': 50,  # early stopping
        'logging_level': 'Silent',
        'use_best_model': True,
    }

    x = train[cols_exp].to_numpy()
    y = train[col_target].to_numpy()

    # Stratified K-fold
    skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    y_valid_pred_lst = []
    idx_valid_lst = []
    clf_lst = []

    # Cross-validation
    for fold, (idx_train, idx_valid) in enumerate(skf.split(x, y)):
        print("fold", fold)
        x_train = x[idx_train, :]
        x_valid = x[idx_valid, :]
        y_train = y[idx_train]
        y_valid = y[idx_valid]

        # Fit this fold's model; plot=True renders CatBoost's training widget.
        clf = CatBoostClassifier(**params)
        clf.fit(
            x_train, y_train,
            cat_features=cols_cat_idxs,
            eval_set=(x_valid, y_valid),
            plot=True
        )

        # Out-of-fold predictions for this fold's validation rows
        y_valid_pred = clf.predict_proba(x_valid)
        y_valid_pred_lst.append(y_valid_pred)
        idx_valid_lst.append(idx_valid)
        clf_lst.append(clf)

    # The folds were appended in fold order; sorting by the original row
    # indices puts the stacked predictions back into the row order of ``train``.
    idx_valid = np.hstack(idx_valid_lst)
    y_valid_pred = np.vstack(y_valid_pred_lst)
    oof_pred = y_valid_pred[np.argsort(idx_valid)]

    return clf_lst, oof_pred

In [5]:
def predict_test(x_test, clf_lst):
    """Average the class probabilities predicted by every model in ``clf_lst``.

    Args:
        x_test: Feature matrix passed unchanged to each model's ``predict_proba``.
        clf_lst: Iterable of fitted models exposing ``predict_proba``.

    Returns:
        The element-wise mean of the per-model ``predict_proba`` outputs.
    """
    fold_probas = [model.predict_proba(x_test) for model in clf_lst]
    return np.mean(fold_probas, axis=0)

In [6]:
# CatBoost hyperparameters, hard-coded here (presumably the result of an
# earlier hyperopt run - confirm). To repeat the search, run instead:
#   params_tuned = tune_catboost_params(train, cols_exp, col_target, cols_cat)
params_tuned = {
    'bagging_temperature': 0.4652651687152352,
    'depth': 4,
    'l2_leaf_reg': 6,
    'learning_rate': 0.1143642151168838,
    'random_strength': 8.147531052232143,
}

# Fit one CatBoost model per CV fold and collect the out-of-fold predictions.
clf_lst, oof_pred = train_catboost(
    train, cols_exp, col_target, cols_cat, params=params_tuned
)

# Predict the test set with the ensemble of fold models.
y_test_pred = predict_test(test[cols_exp].to_numpy(), clf_lst)

# Wrap both prediction arrays in DataFrames with one named column per class.
oof_pred_df, test_pred_df = (
    pl.DataFrame(pred, schema=[f"health_is_{h}" for h in range(3)])
    for pred in (oof_pred, y_test_pred)
)

fold 0


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

fold 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

fold 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

fold 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

fold 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [7]:
# Save the out-of-fold and test-set predictions as CSV files; the file names
# are tagged with the feature-set name `feat`.
oof_pred_df.write_csv(f"pred/oof_pred_catboost_{feat}.csv")
test_pred_df.write_csv(f"pred/test_pred_catboost_{feat}.csv")