In [1]:
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

import config
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lightgbm import LGBMClassifier, LGBMRegressor
from metric import macro_auc_score, score
from scipy.optimize import minimize
from seed import seed_everything
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, TimeSeriesSplit, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from xgboost import XGBClassifier, XGBRegressor

warnings.filterwarnings("ignore")


In [2]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    DRY_RUN = False
    EXP_NAME = config.EXP_NAME
    AUTHOR = "marumarukun"
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    MODEL_PATH = config.OUTPUT_DIR / "models"  # モデル作成・実験時はこちらを使用(Notebookではこちらを使用)
    # MODEL_PATH = config.ARTIFACT_EXP_DIR(config.EXP_NAME) / "models"  # 提出時はこちらを使用(pyではこちらを使用)
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    METHOD_WEIGHT_DICT = {"lightgbm": 0.2, "xgboost": 0.2, "catboost": 0.6}
    SEED = 319
    n_folds = 4
    target_col_list = ["チョコレート", "ビール", "ヘアケア", "米（5㎏以下）"]
    num_boost_round = 50 if DRY_RUN else 1000000
    early_stopping_round = 10 if DRY_RUN else 100  # 10÷lrで設定
    verbose = 500

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
    regression_lgb_params = {
        "objective": "regression",
        # "metric": "mae",
        "learning_rate": 0.1,
        "max_depth": 5,
        "min_child_weight": 1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "subsample_freq": 1,
        "seed": SEED,
        "device": "cpu",  # cpu/gpu/cuda
        "verbosity": -1,
    }
    # https://xgboost.readthedocs.io/en/stable/parameter.html
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
    regression_xgb_params = {
        "objective": "reg:squarederror",
        # "eval_metric": "mae",
        "learning_rate": 0.1,
        "max_depth": 5,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "min_child_weight": 1,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "gpu",  # cpu/gpu/cuda
    }
    # https://catboost.ai/docs/en/references/training-parameters/
    # https://catboost.ai/docs/en/concepts/python-reference_catboostregressor
    # https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier
    regression_cat_params = {
        "loss_function": "RMSE",
        "learning_rate": 0.1,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "GPU",  # CPU/GPU
    }

    # 分類問題用のLightGBMパラメータ
    classification_lgb_params = {
        "objective": "binary",
        "metric": "auc",
        "learning_rate": 0.1,
        "max_depth": 5,
        "min_child_weight": 1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "subsample_freq": 1,
        "seed": SEED,
        "device": "cpu",
        "verbosity": -1,
    }

    # 分類問題用のXGBoostパラメータ
    classification_xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.1,
        "max_depth": 5,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "min_child_weight": 1,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "cpu",
    }

    # 分類問題用のCatBoostパラメータ
    classification_cat_params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "learning_rate": 0.1,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "CPU",
    }


In [3]:
# ====================================================
# Seed everything
# ====================================================
seed_everything(CFG.SEED)


In [4]:
# ====================================================
# Read data
# ====================================================
train_pl = pl.read_csv(CFG.DATA_PATH / "train_session.csv", try_parse_dates=True)
test_pl = pl.read_csv(CFG.DATA_PATH / "test_session.csv", try_parse_dates=True)
target_pl = pl.read_csv(CFG.DATA_PATH / "train_target.csv", try_parse_dates=True)
# make index column
# train = train.with_row_index()
# test = test.with_row_index()

# merge target
train_pl = pl.concat([train_pl, target_pl], how="horizontal")
train_pl.head()


session_id,売上日,時刻,店舗名,年代,性別,顧客CD,チョコレート,ビール,ヘアケア,米（5㎏以下）
str,date,i64,str,str,str,str,i64,i64,i64,i64
"""3VrcoHzNbhVjwWry8bprTk""",2024-07-01,0,"""つくば""","""20代""","""不明""","""9545f047bcc38513f1bf9d420b615b…",0,0,0,0
"""2BQUMmbDbCffCzmyCmJWXM""",2024-07-01,0,"""つくば""","""20代""","""不明""","""a8caaca05647788262c55ae79b3c6c…",0,0,0,0
"""Bbi8hQdtC3CCJxULJYCE9F""",2024-07-01,0,"""つくば""","""20代""","""女性""","""33c89af908f24a3189312289978d1c…",0,0,0,0
"""7zecHQC6svwsdprayEJCfR""",2024-07-01,0,"""つくば""","""20代""","""女性""","""395e3a5c2d0c006e619e8196e310e7…",0,0,0,0
"""Dr2ky5Hdr8odb2uJQth8Hc""",2024-07-01,0,"""つくば""","""20代""","""女性""","""ed59f7160f6b18c34a510d2ea179e3…",0,0,0,0


In [5]:
# ====================================================
# Make fold column
# ====================================================

# TimeSeriesSplit
fold_array = np.zeros(train_pl.height)
tss = TimeSeriesSplit(n_splits=CFG.n_folds)
for fold, (_, valid_index) in enumerate(tss.split(train_pl), start=1):
    fold_array[valid_index] = fold

train_pl = train_pl.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))

train_pl.sample(10)


session_id,売上日,時刻,店舗名,年代,性別,顧客CD,チョコレート,ビール,ヘアケア,米（5㎏以下）,fold
str,date,i64,str,str,str,str,i64,i64,i64,i64,i8
"""f2DhD7Vo4H2vPBNzNqMDGh""",2024-09-20,2,"""つくば""","""40代""","""男性""","""cf0abf0216a0baa1ed5b33d973b652…",1,0,0,0,3
"""AoqrthqpiwkP7SK3MSgsjH""",2024-10-11,16,"""益浦店""","""30代""","""女性""","""b5cabf2ef128dc0ff4e87b4ad8e5da…",0,0,0,0,4
"""5Y2hmDpWRaBJjKRUkefYQs""",2024-10-10,11,"""福岡空""","""40代""","""男性""","""c45d0297b2ed512504d770948141b0…",0,0,0,0,4
"""DUmDLMA729fTUL7h3msNzK""",2024-09-02,14,"""新宮店""","""70代""","""女性""","""bdf555f038e797fe0fc93472ed0bfe…",0,0,0,0,2
"""J6N32AZgrTb4NjUpUHwX8n""",2024-09-05,3,"""福岡空""","""60代""","""男性""","""a11d10f12cfa0d480c47e2c735ca50…",0,0,0,0,2
"""gj3kJiixvGKH4P7pikBuuT""",2024-08-02,15,"""つくば""","""60代""","""女性""","""932c3902d7fbc6539ad2c9e73634ed…",0,0,0,0,1
"""aeZJT4J84GsvTut7oX5zUo""",2024-09-27,13,"""新宮店""","""40代""","""女性""","""9862a699e1f912617d876fbff7e348…",1,0,0,0,3
"""DS3GrDmQRTdQLAUvWrQn5c""",2024-10-29,17,"""福岡空""","""20代""","""男性""","""e9e9495396c1c207d94a5c86ddba72…",1,0,0,0,4
"""C9TXrmuAct3PgGMWfeYFLX""",2024-07-15,14,"""新宮店""","""80代以上""","""女性""","""cf977398470ed0e3623159201d5fc7…",0,0,0,0,0
"""abYAaqCKhf5QBkxUUyHSx7""",2024-10-31,8,"""つくば""","""10代以下""","""男性""","""20c219bace44d96e4c6d9619f7ccc7…",0,0,0,0,4


In [6]:
# ====================================================
# Define columns and Label Encode categorical columns
# ====================================================
train = train_pl.to_pandas()
test = test_pl.to_pandas()

RMV = [
    "session_id",
    "売上日",
    "チョコレート",
    "ビール",
    "ヘアケア",
    "米（5㎏以下）",
    "fold",
]
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")


There are 5 FEATURES: ['時刻', '店舗名', '年代', '性別', '顧客CD']


In [7]:
CATS = []
for c in FEATURES:
    if train[c].dtype == "object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")


In these features, there are 4 CATEGORICAL FEATURES: ['店舗名', '年代', '性別', '顧客CD']


In [8]:
combined = pd.concat([train, test], axis=0, ignore_index=True)
# print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for c in FEATURES:
    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ", end="")
        combined[c], _ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")

    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype == "float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype == "int64":
            combined[c] = combined[c].astype("int32")

train = combined.iloc[: len(train)].copy()
test = combined.iloc[len(train) :].reset_index(drop=True).copy()


We LABEL ENCODE the CATEGORICAL FEATURES: 店舗名, 年代, 性別, 顧客CD, 

In [9]:
# ====================================================
# CV用関数
# ====================================================
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    model = LGBMClassifier(
        **CFG.classification_lgb_params,
        n_estimators=CFG.num_boost_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        categorical_feature=categorical_features,
        callbacks=[
            lgb.early_stopping(stopping_rounds=CFG.early_stopping_round),
            lgb.log_evaluation(CFG.verbose),
        ],
    )
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]  # 陽性クラスの確率を取得
    return model, valid_pred


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    model = XGBClassifier(
        **CFG.classification_xgb_params,
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=CFG.verbose,
    )
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]  # 陽性クラスの確率を取得
    return model, valid_pred


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    cat_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    cat_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    model = CatBoostClassifier(**CFG.classification_cat_params)
    model.fit(
        cat_train,
        eval_set=[cat_valid],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    # Predict validation
    valid_pred = model.predict_proba(x_valid)[:, 1]  # 陽性クラスの確率を取得
    return model, valid_pred


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, target_col_list: list, features: list, categorical_features: list
):
    # Create a numpy array to store out of folds predictions
    oof_predictions_df = pd.DataFrame(np.zeros((len(train_df), len(target_col_list))), columns=target_col_list)

    # ベストイテレーション数を記録する辞書を追加
    best_iterations_dict = {target_col: [] for target_col in target_col_list}

    for target_col in target_col_list:
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            print("-" * 50)
            print(f"{method} training fold {fold+1} {target_col}")
            # 時系列クロスバリデーション
            # 訓練データ: foldが現在のfold以下のデータ
            # 検証データ: foldが現在のfold+1のデータ
            x_train = train_df[train_df["fold"] <= fold][features]
            y_train = train_df[train_df["fold"] <= fold][target_col]
            x_valid = train_df[train_df["fold"] == fold + 1][features]
            y_valid = train_df[train_df["fold"] == fold + 1][target_col]
            if method == "lightgbm":
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)
                # ベストイテレーション数を記録
                best_iterations_dict[target_col].append(model.best_iteration_)
            elif method == "xgboost":
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
                # ベストイテレーション数を記録
                best_iterations_dict[target_col].append(model.best_iteration)
            elif method == "catboost":
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)
                # ベストイテレーション数を記録
                best_iterations_dict[target_col].append(model.get_best_iteration())
            else:
                raise ValueError(f"Unknown method: {method}")

            # Save best model
            save_model_path = (
                CFG.MODEL_PATH / f"{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
            )
            save_model_path.parent.mkdir(parents=True, exist_ok=True)
            pickle.dump(model, open(save_model_path, "wb"))
            # Add to out of folds array
            oof_predictions[train_df["fold"] == fold + 1] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        oof_predictions_df[target_col] = oof_predictions

        # Compute out of folds metric
        # fold=0のデータを除外して評価する
        valid_indices = train_df["fold"] != 0
        m = score(
            train_df.loc[valid_indices, target_col].copy(), oof_predictions_df.loc[valid_indices, target_col].copy()
        )
        print("=" * 50)
        print(f"{method} our out of folds CV score is {m}")
        print("=" * 50)

        # 各ターゲットのベストイテレーション数の平均を計算して表示
        # avg_iterations = int(np.mean(best_iterations_dict[target_col]))
        print(f"{target_col} のベストイテレーション数: {best_iterations_dict[target_col]}")

    oof_predictions_df.to_csv(CFG.OUTPUT_DIR / f"oof_{method}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv", index=False)

    # Macro AUCを計算
    macro_auc = macro_auc_score(
        train_df.loc[valid_indices, target_col_list].values,
        oof_predictions_df.loc[valid_indices, target_col_list].values,
    )
    print("=" * 50)
    print("=" * 50)
    print(f"{method} Macro AUC: {macro_auc}")
    print("=" * 50)
    print("=" * 50)

    # ベストイテレーション数の辞書も返す
    return oof_predictions_df, best_iterations_dict


In [10]:
# ====================================================
# CV
# ====================================================
oof_dict = {}
best_iterations_dict_all = {}

# クロスバリデーションを実行し、ベストイテレーション数を取得
for method in CFG.METHOD_LIST:
    oof_df, best_iterations_dict = gradient_boosting_model_cv_training(
        method, train, CFG.target_col_list, FEATURES, CATS
    )
    oof_dict[method] = oof_df
    best_iterations_dict_all[method] = best_iterations_dict


--------------------------------------------------
lightgbm training fold 1 チョコレート
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.626783
Early stopping, best iteration is:
[728]	valid_0's auc: 0.627679
--------------------------------------------------
lightgbm training fold 2 チョコレート
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.651622
[1000]	valid_0's auc: 0.653182
Early stopping, best iteration is:
[1028]	valid_0's auc: 0.653215
--------------------------------------------------
lightgbm training fold 3 チョコレート
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.6646
[1000]	valid_0's auc: 0.667561
[1500]	valid_0's auc: 0.668079
Early stopping, best iteration is:
[1439]	valid_0's auc: 0.668241
--------------------------------------------------
lightgbm training fold 4 チョコレート
Training until validation scores don't improve for 100 rounds
[500]	valid_0's auc: 0.654006
[1000]	valid

In [11]:
# ====================================================
# 各メソッドの予測結果を加重平均して最終的なMacro AUCを計算する
# ====================================================
# 単純平均の予測結果を格納するDataFrame
ensemble_oof_df = pd.DataFrame(np.zeros((len(train), len(CFG.target_col_list))), columns=CFG.target_col_list)

# 各メソッドの予測結果を重みづけして合計
print(CFG.METHOD_WEIGHT_DICT)
for method, weight in CFG.METHOD_WEIGHT_DICT.items():
    if method in oof_dict:
        ensemble_oof_df += oof_dict[method] * weight

# fold=0のデータを除外して評価する
valid_indices = train["fold"] != 0

# 各ターゲットごとのスコアを計算
for target_col in CFG.target_col_list:
    m = score(train.loc[valid_indices, target_col].copy(), ensemble_oof_df.loc[valid_indices, target_col].copy())
    print(f"Ensemble score for {target_col}: {m}")

# Macro AUCを計算
ensemble_macro_auc = macro_auc_score(
    train.loc[valid_indices, CFG.target_col_list].values,
    ensemble_oof_df.loc[valid_indices, CFG.target_col_list].values,
)
print("=" * 50)
print(f"Ensemble Macro AUC: {ensemble_macro_auc}")
print("=" * 50)


{'lightgbm': 0.2, 'xgboost': 0.2, 'catboost': 0.6}
Ensemble score for チョコレート: 0.7106783916443025
Ensemble score for ビール: 0.8298428218805837
Ensemble score for ヘアケア: 0.6836120357183179
Ensemble score for 米（5㎏以下）: 0.6797397793242785
Ensemble Macro AUC: 0.7259682571418706


In [12]:
# ====================================================
# 最終モデル学習関数（全データ使用）
# ====================================================
def train_final_models(
    method: str,
    train_df: pd.DataFrame,
    target_col_list: list,
    features: list,
    categorical_features: list,
    best_iterations_dict: dict,
):
    print("=" * 50)
    print(f"全データを使用して{method}の最終モデルを学習します")
    print("=" * 50)

    for target_col in target_col_list:
        # 各ターゲットのベストイテレーション数の最大値を計算
        iterations = best_iterations_dict[target_col]
        max_iterations = max(iterations)

        print(f"{method} 最終モデル学習 {target_col}, 使用イテレーション数: {max_iterations}")

        # 全データを使用
        x_all = train_df[features]
        y_all = train_df[target_col]

        if method == "lightgbm":
            final_model = LGBMClassifier(
                **CFG.classification_lgb_params,
                n_estimators=max_iterations,
            )
            final_model.fit(
                x_all,
                y_all,
                categorical_feature=categorical_features,
            )

        elif method == "xgboost":
            final_model = XGBClassifier(
                **CFG.classification_xgb_params,
                n_estimators=max_iterations,
            )
            final_model.fit(
                x_all,
                y_all,
                verbose=CFG.verbose,
            )

        elif method == "catboost":
            cat_all = Pool(data=x_all, label=y_all, cat_features=categorical_features)
            final_model = CatBoostClassifier(
                **{k: v for k, v in CFG.classification_cat_params.items() if k != "iterations"},
                iterations=max_iterations,  # 固定イテレーション数
            )
            final_model.fit(
                cat_all,
                verbose=CFG.verbose,
            )

        # 最終モデルを保存
        save_model_path = CFG.MODEL_PATH / f"{method}_{target_col}_final_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        save_model_path.parent.mkdir(parents=True, exist_ok=True)
        pickle.dump(final_model, open(save_model_path, "wb"))

        del x_all, y_all, final_model
        gc.collect()

    print("=" * 50)
    print("全ての最終モデルの学習が完了しました")
    print("=" * 50)


In [13]:
# ====================================================
# 最終モデルの学習（全データ使用）
# ====================================================
for method in CFG.METHOD_LIST:
    train_final_models(method, train, CFG.target_col_list, FEATURES, CATS, best_iterations_dict_all[method])


全データを使用してlightgbmの最終モデルを学習します
lightgbm 最終モデル学習 チョコレート, 使用イテレーション数: 1439
lightgbm 最終モデル学習 ビール, 使用イテレーション数: 3917
lightgbm 最終モデル学習 ヘアケア, 使用イテレーション数: 608
lightgbm 最終モデル学習 米（5㎏以下）, 使用イテレーション数: 605
全ての最終モデルの学習が完了しました
全データを使用してxgboostの最終モデルを学習します
xgboost 最終モデル学習 チョコレート, 使用イテレーション数: 1230
xgboost 最終モデル学習 ビール, 使用イテレーション数: 1250
xgboost 最終モデル学習 ヘアケア, 使用イテレーション数: 469
xgboost 最終モデル学習 米（5㎏以下）, 使用イテレーション数: 221
全ての最終モデルの学習が完了しました
全データを使用してcatboostの最終モデルを学習します
catboost 最終モデル学習 チョコレート, 使用イテレーション数: 521
0:	total: 583ms	remaining: 5m 3s
500:	total: 3m 52s	remaining: 9.27s
520:	total: 4m	remaining: 0us
catboost 最終モデル学習 ビール, 使用イテレーション数: 1419
0:	total: 632ms	remaining: 14m 56s
500:	total: 3m 54s	remaining: 7m 9s
1000:	total: 7m 38s	remaining: 3m 11s
1418:	total: 10m 41s	remaining: 0us
catboost 最終モデル学習 ヘアケア, 使用イテレーション数: 389
0:	total: 655ms	remaining: 4m 14s
388:	total: 3m 8s	remaining: 0us
catboost 最終モデル学習 米（5㎏以下）, 使用イテレーション数: 629
0:	total: 634ms	remaining: 6m 38s
500:	total: 3m 58s	remaining: 1m
628:	total: 4m

In [14]:
# ====================================================
# Inference functions
# ====================================================
# 最終モデルを使用した推論関数
def final_model_inference(method: str, x_test: pd.DataFrame, target_col: str):
    model_path = CFG.MODEL_PATH / f"{method}_{target_col}_final_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
    model = pickle.load(open(model_path, "rb"))
    # 確率予測
    pred = model.predict_proba(x_test)[:, 1]
    return pred


def final_predicting(input_df: pd.DataFrame, features: list):
    output_df = input_df.copy()
    for target_col in CFG.target_col_list:
        output_df[target_col] = 0
        for method in CFG.METHOD_LIST:
            pred = final_model_inference(method, input_df[features], target_col)
            output_df[f"{method}_pred_{target_col}"] = pred
            output_df[target_col] += CFG.METHOD_WEIGHT_DICT[method] * pred
    return output_df


In [15]:
# ====================================================
# Inference
# ====================================================
# 最終モデルで推論
final_output_df = final_predicting(test, FEATURES)

# 結果の表示
# display(final_output_df)

# 提出ファイルの作成
submission = final_output_df[CFG.target_col_list].copy()
display(submission)
submission.to_csv(CFG.OUTPUT_DIR / f"submission_final_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv", index=False)


Unnamed: 0,チョコレート,ビール,ヘアケア,米（5㎏以下）
0,0.126088,0.029224,0.071926,0.040981
1,0.027013,0.058504,0.011785,0.018648
2,0.020743,0.024303,0.009030,0.065402
3,0.265363,0.255535,0.222480,0.211521
4,0.036254,0.205713,0.011236,0.013572
...,...,...,...,...
121410,0.030003,0.023393,0.025809,0.046917
121411,0.065517,0.056550,0.022652,0.014668
121412,0.104002,0.057540,0.031139,0.011151
121413,0.064558,0.051147,0.034131,0.012917
