In [1]:
import os
import time

import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
# from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from optuna import integration, logging

In [2]:
# Load the pre-computed feature tables for the training and test sets.
train = pl.read_csv("feat/feat_train_multiclass.csv")
test = pl.read_csv("feat/feat_test.csv")

# Explanatory (feature) columns: every column of the test table except "idx".
cols_exp = list(filter(lambda name: name != "idx", test.columns))

### LGBM

In [3]:
def tune_lgbm_params(train, cols_exp, col_target):
    """Tune LightGBM hyper-parameters with Optuna's ``LightGBMTunerCV``.

    Parameters
    ----------
    train : table supporting ``train[cols]`` / ``train[col]`` with ``.to_numpy()``
        Training data (a polars DataFrame in this notebook).
    cols_exp : list of str
        Names of the explanatory (feature) columns.
    col_target : str
        Name of the target column; the objective is fixed to 3 classes.

    Returns
    -------
    dict
        ``tuner.best_params`` as reported by the tuner after 5-fold
        stratified cross-validation with 100 boosting rounds per trial.
    """
    # Lower Optuna's verbosity *before* the tuner is built: the
    # "A new study created in memory" INFO line is emitted during
    # construction, so setting the level afterwards is too late to hide it.
    logging.set_verbosity(logging.WARNING)

    params = {
        'objective': 'multiclass',
        'num_class': 3,
        "metric": "multi_logloss",
        "random_seed": 0,
        'verbose': -1
    }

    x = train[cols_exp].to_numpy()
    y = train[col_target].to_numpy()

    # dataset
    train_set = integration.lightgbm.Dataset(x, y)

    # Unshuffled stratified folds. `random_state` must not be passed together
    # with shuffle=False: scikit-learn >= 0.24 raises ValueError for that
    # combination (older versions ignored the seed), so it is dropped here.
    skf = StratifiedKFold(n_splits=5, shuffle=False)

    # tuning with optuna
    tuner = integration.lightgbm.LightGBMTunerCV(params=params,
                                                train_set=train_set,
                                                num_boost_round=100,
                                                folds=list(skf.split(x, y)))
    tuner.run()

    params_tuned = tuner.best_params
    return params_tuned

In [4]:
def train_lgbm(train, cols_exp, col_target, params=None):
    """Train one LightGBM multiclass classifier per stratified CV fold.

    Parameters
    ----------
    train : table supporting ``train[cols]`` / ``train[col]`` with ``.to_numpy()``
        Training data (a polars DataFrame in this notebook).
    cols_exp : list of str
        Names of the explanatory (feature) columns.
    col_target : str
        Name of the target column.
    params : dict, optional
        Extra ``LGBMClassifier`` keyword arguments (e.g. tuned parameters).
        The dict is not modified. ``objective``, ``num_class``,
        ``n_estimators`` and ``metric`` are always overridden by the fixed
        values below.

    Returns
    -------
    clf_lst : list
        The fitted classifiers, one per fold, in fold order.
    oof_pred : numpy.ndarray
        Out-of-fold ``predict_proba`` output; row ``i`` is the prediction for
        row ``i`` of ``train``, made by the model that did not see that row.
    """
    params_fixed = {
        'objective': 'multiclass',
        'num_class': 3,
        "n_estimators": 10000,
        "metric": "multi_logloss",
    }
    # Build a new dict instead of `params |= ...` so the caller's dict
    # (e.g. the tuned parameters) is not mutated as a side effect.
    params = {**(params or {}), **params_fixed}

    x = train[cols_exp].to_numpy()
    y = train[col_target].to_numpy()

    # # down sampling
    # sampler = RandomUnderSampler(random_state=42)
    # x, y = sampler.fit_resample(x, y)

    # Stratified K-fold, unshuffled. `random_state` is omitted because
    # scikit-learn >= 0.24 raises ValueError when it is combined with
    # shuffle=False (older versions ignored it).
    skf = StratifiedKFold(n_splits=5, shuffle=False)
    y_valid_pred_lst = []
    idx_valid_lst = []
    clf_lst = []

    # cross validation
    for fold, (idx_train, idx_valid) in enumerate(skf.split(x, y)):
        print("fold", fold)
        x_train = x[idx_train, :]
        x_valid = x[idx_valid, :]
        y_train = y[idx_train]
        y_valid = y[idx_valid]

        # lightgbm modeling: n_estimators is only an upper bound, training
        # stops once the evaluation metric has not improved for 50 rounds.
        clf = lgbm.LGBMClassifier(**params)
        clf.fit(x_train, y_train,
                eval_set=[(x_train, y_train), (x_valid, y_valid)],
                callbacks=[
                    lgbm.early_stopping(stopping_rounds=50),
                    lgbm.log_evaluation(period=10000),
                ])

        # out-of-fold predictions for this fold's validation rows
        y_valid_pred = clf.predict_proba(x_valid)
        y_valid_pred_lst.append(y_valid_pred)
        idx_valid_lst.append(idx_valid)
        clf_lst.append(clf)

    idx_valid = np.hstack(idx_valid_lst)
    y_valid_pred = np.vstack(y_valid_pred_lst)

    # Row j of y_valid_pred belongs to original row idx_valid[j], so the
    # predictions must be *scattered* back to those positions. Indexing with
    # y_valid_pred[idx_valid] would apply the permutation a second time
    # instead of inverting it and leave the rows misaligned with `train`.
    oof_pred = np.empty_like(y_valid_pred)
    oof_pred[idx_valid] = y_valid_pred

    return clf_lst, oof_pred

In [5]:
def predict_test(test, cols_exp, clf_lst):
    """Ensemble the fold models on the test set by averaging probabilities.

    Parameters
    ----------
    test : table supporting ``test[cols]`` with ``.to_numpy()``
        Rows to score.
    cols_exp : list of str
        Names of the explanatory (feature) columns passed to the models.
    clf_lst : list
        Fitted models exposing ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all models of their ``predict_proba`` output.
    """
    features = test[cols_exp].to_numpy()
    # One probability array per model; they are averaged along the model axis.
    fold_probas = [model.predict_proba(features) for model in clf_lst]
    return np.mean(fold_probas, axis=0)

In [6]:
col_target = "health"
print("col_target =", col_target, "-"*50)

# 1) hyper-parameter search with Optuna
params_tuned = tune_lgbm_params(train, cols_exp, col_target)

# 2) cross-validated training: one LightGBM model per fold + out-of-fold predictions
clf_lst, oof_pred = train_lgbm(train, cols_exp, col_target, params=params_tuned)

# 3) test-set prediction, averaged over the fold models
y_test_pred = predict_test(test, cols_exp, clf_lst)

# 4) wrap both probability matrices in frames with one named column per class
oof_pred_df, test_pred_df = (
    pl.DataFrame(proba, schema=[f"health_is_{h}" for h in range(3)])
    for proba in (oof_pred, y_test_pred)
)

[I 2023-12-29 15:09:48,583] A new study created in memory with name: no-name-f77ba559-a0f1-430f-9b3a-b0b219063218


col_target = health --------------------------------------------------


feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

feature_fraction, val_score: 0.615968: 100%|##########| 7/7 [00:09<00:00,  1.34s/it]
num_leaves, val_score: 0.606078: 100%|##########| 20/20 [00:41<00:00,  2.07s/it]
bagging, val_score: 0.605851: 100%|##########| 10/10 [00:04<00:00,  2.17it/s]
feature_fraction_stage2, val_score: 0.605851: 100%|##########| 3/3 [00:01<00:00,  2.13it/s]
regularization_factors, val_score: 0.605851: 100%|##########| 20/20 [00:09<00:00,  2.15it/s]
min_child_samples, val_score: 0.605662: 100%|##########| 5/5 [00:02<00:00,  2.20it/s]


fold 0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[252]	training's multi_logloss: 0.596847	valid_1's multi_logloss: 0.604313
fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	training's multi_logloss: 0.601637	valid_1's multi_logloss: 0.605568
fold 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[102]	training's multi_logloss: 0.600463	valid_1's multi_logloss: 0.606178
fold 3
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[375]	training's multi_logloss: 0.595443	valid_1's multi_logloss: 0.603172
fold 4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[160]	training's multi_logloss: 0.598468	valid_1's multi_logloss: 0.605868


In [7]:
# Save the out-of-fold and test-set class probabilities as CSV.
# The output directory is created first: without it the writes fail on a
# fresh checkout, and only after the slow tuning/training cells have run.
os.makedirs("pred", exist_ok=True)
oof_pred_df.write_csv("pred/oof_pred_df_multiclass.csv")
test_pred_df.write_csv("pred/test_pred_df_multiclass.csv")