kaggleなどでよく使用される勾配ブースティングモデル3種を試すことが可能なnotebookです．

愚直に各目的変数を予測するモデルを学習させています．

Preprocessingという関数の中で特徴量作成を始めとした工夫を凝らすことが，コンペでスコアを上げていくための方法の一つとなります．

細かいハイパーパラメータの設定などは，atmaのgotoさんが記事にまとめてくれているので，こちらを自身はいつも参考にしております．
[勾配ブースティングで大事なパラメータの気持ち](https://nykergoto.hatenablog.jp/entry/2019/03/29/%E5%8B%BE%E9%85%8D%E3%83%96%E3%83%BC%E3%82%B9%E3%83%86%E3%82%A3%E3%83%B3%E3%82%B0%E3%81%A7%E5%A4%A7%E4%BA%8B%E3%81%AA%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF%E3%81%AE%E6%B0%97%E6%8C%81%E3%81%A1)

サブ数に上限があるatmaCupでは，手元の実験によってLBスコアや最終的なスコアを見積もることが重要になります．

今回のコンペでは，学習データとテストデータが日付で区切られているとのことで，両者の間にシーンID（IDというカラムをアンダーバーで分割した前半部分）の重複がありません．そのため，自身は検証用の分割もシーンID単位（GroupKFold）で行っております．

テーブルデータを使う場合だけであれば，ランダム分割でも問題ないような気もしておりますが，画像を使う場合は学習と評価に同じシーンIDがあると，過学習によりCVとLBの相関が取れなくなってくる可能性があるので，上記の分割がいいのではないかなと思っております．

どこまで時間をかけることができるかわかりませんが，EDAを行い，特徴量の作成や画像データの活用などに着手していくことができたらと思っております．

このnotebookが何かの役に立てば幸いです！

In [1]:
import sys

# ====================================================
# Library
# ====================================================
import os
import gc
import warnings

warnings.filterwarnings("ignore")
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier


In [2]:
# Create the output directories for OOF predictions, pickled models and submissions.
# These names match CFG.OOF_DATA_PATH / CFG.MODEL_DATA_PATH / CFG.SUB_DATA_PATH.
!mkdir oof
!mkdir models
!mkdir submission


In [3]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Experiment-wide settings: paths, CV setup, targets and booster hyper-parameters."""

    # --- Experiment identity (VER / AUTHOR / SEED appear in artifact file names) ---
    VER = 1
    AUTHOR = "takaito"
    COMPETITION = "atmaCup18"
    SEED = 42

    # --- Input / output locations ---
    DATA_PATH = Path("/kaggle/input/atmacup18-dataset/content/atmaCup18_dataset")  # adjust to your environment
    OOF_DATA_PATH = Path("oof")
    MODEL_DATA_PATH = Path("models")
    SUB_DATA_PATH = Path("submission")

    # --- Models and cross-validation ---
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    n_folds = 2
    group_col = "cv_ID"

    # Targets x/y/z for indices 0..5, ordered x_0, y_0, z_0, x_1, ...
    target_col_list = [f"{axis}_{step}" for step in range(6) for axis in "xyz"]

    # --- Evaluation ---
    metric = "MAE"
    metric_maximize_flag = False

    # --- Shared boosting schedule ---
    num_boost_round = 2000
    early_stopping_round = 50
    verbose = 250

    # --- Per-library hyper-parameters (all optimise absolute error) ---
    regression_lgb_params = dict(
        objective="regression_l1",
        metric="mae",
        learning_rate=0.1,
        num_leaves=31,
        seed=SEED,
    )
    regression_xgb_params = dict(
        objective="reg:absoluteerror",
        eval_metric="mae",
        learning_rate=0.1,
        max_depth=6,
        random_state=SEED,
    )
    regression_cat_params = dict(
        loss_function="MAE",
        learning_rate=0.1,
        iterations=num_boost_round,
        depth=7,
        random_seed=SEED,
    )

    # Blend weights applied to each model's predictions.
    model_weight_dict = dict(lightgbm=0.40, xgboost=0.30, catboost=0.30)


In [4]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed`` and
            exported (as a string) in the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # export presumably only influences processes launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Apply the configured seed once, before any data is loaded or any model is trained.
seed_everything(CFG.SEED)


In [5]:
# Load the tabular train / test feature files from the configured data directory.
train_df, test_df = (
    pd.read_csv(CFG.DATA_PATH / csv_name) for csv_name in ("train_features.csv", "test_features.csv")
)
# Submission skeleton: a one-column frame holding the test IDs in file order.
submit_df = test_df["ID"].to_frame()


In [6]:
def category2id(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``input_df`` with ``gearShifter`` encoded as int8 ids.

    The mapping is ``drive -> 0``, ``park -> 1``, ``neutral -> 2``, ``reverse -> 3``.

    Args:
        input_df: Frame containing a ``gearShifter`` column.

    Returns:
        A new DataFrame; ``input_df`` itself is not modified.

    Raises:
        ValueError: If ``gearShifter`` contains a value that is not in the
            mapping (missing values included), because it cannot be encoded.
    """
    gear_shifter_ids = {"drive": 0, "park": 1, "neutral": 2, "reverse": 3}
    output_df = input_df.copy()
    encoded = input_df["gearShifter"].map(gear_shifter_ids)

    # Unmapped values become NaN; report them by name instead of failing inside
    # the integer cast with an opaque "cannot convert NaN" message.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = sorted(input_df.loc[unmapped, "gearShifter"].astype(str).unique())
        raise ValueError(f"Unknown gearShifter value(s): {unknown_values}")

    output_df["gearShifter"] = encoded.astype(np.int8)
    return output_df


def make_features(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``input_df`` with the base engineered columns added.

    Adds ``cv_ID`` and ``Time`` by splitting ``ID`` on ``"_"`` and encodes
    ``gearShifter`` via ``category2id``.

    Args:
        input_df: Frame containing ``ID`` and ``gearShifter`` columns.

    Returns:
        A new DataFrame; ``input_df`` itself is not modified.
    """
    output_df = input_df.copy()
    # The two-column assignment requires every ID to split into exactly two parts,
    # i.e. to contain a single underscore.
    output_df[["cv_ID", "Time"]] = input_df["ID"].str.split("_", expand=True)
    output_df = category2id(output_df)
    return output_df


def Preprocessing(input_df: pd.DataFrame) -> pd.DataFrame:
    """Return a preprocessed copy of ``input_df``.

    This is the extension point for feature engineering: add further feature
    creation steps here. It currently delegates to ``make_features`` only.

    Args:
        input_df: Raw feature frame (train or test).

    Returns:
        A new DataFrame; ``input_df`` itself is not modified.
    """
    output_df = input_df.copy()
    output_df = make_features(output_df)
    return output_df


In [7]:
# Run the same preprocessing over both the training and the test split.
train_df, test_df = Preprocessing(train_df), Preprocessing(test_df)


In [8]:
# Split the usable columns into numerical and categorical model inputs.
# Identifier / bookkeeping columns are never used as features.
excluded_columns = {"ID", "cv_ID", "Time", "cv_flag"}
numerical_features = []
categorical_features = []
for col in test_df.columns:
    if col in excluded_columns:
        continue
    unique_count = train_df[col].nunique()
    if unique_count == 1:
        # Constant in the training data: skip it.
        continue
    if unique_count < 10:
        # Low-cardinality columns are treated as categorical. Build ONE dtype from
        # the values of both splits so a given value gets the same category code in
        # train and test; casting each split independently can yield different
        # codes when the splits do not contain exactly the same set of values.
        shared_dtype = pd.CategoricalDtype(
            pd.concat([train_df[col], test_df[col]], ignore_index=True).astype("category").cat.categories
        )
        train_df[col] = train_df[col].astype(shared_dtype)
        test_df[col] = test_df[col].astype(shared_dtype)
        categorical_features.append(col)
    else:
        numerical_features.append(col)
features = numerical_features + categorical_features


In [9]:
# Give every training row a 1-based validation fold id. GroupKFold keeps all rows
# that share a CFG.group_col value inside the same fold.
kfold = GroupKFold(n_splits=CFG.n_folds)
oof_fold = np.zeros(len(train_df))
fold_splits = kfold.split(train_df, train_df[CFG.target_col_list[0]], train_df[CFG.group_col])
for fold_number, (_, holdout_rows) in enumerate(fold_splits, start=1):
    oof_fold[holdout_rows] = fold_number
train_df["cv_flag"] = oof_fold.astype(np.int8)


In [10]:
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit one LightGBM model on a train/validation split.

    Args:
        x_train: Training feature frame.
        y_train: Training target values.
        x_valid: Validation feature frame.
        y_valid: Validation target values.
        features: Unused here; kept so all trainers share one signature.
        categorical_features: Column names passed as ``categorical_feature``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for ``x_valid``.
    """
    dataset_options = {"categorical_feature": categorical_features}
    train_set = lgb.Dataset(x_train, y_train, **dataset_options)
    valid_set = lgb.Dataset(x_valid, y_valid, **dataset_options)

    # Early stopping and periodic logging both follow the shared CFG schedule.
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose),
        lgb.log_evaluation(CFG.verbose),
    ]
    booster = lgb.train(
        params=CFG.regression_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[train_set, valid_set],
        callbacks=training_callbacks,
    )
    return booster, booster.predict(x_valid)


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit one XGBoost model on a train/validation split.

    Args:
        x_train: Training feature frame.
        y_train: Training target values.
        x_valid: Validation feature frame.
        y_valid: Validation target values.
        features: Unused here; kept so all trainers share one signature.
        categorical_features: Unused here; categorical columns are taken from
            the frames' dtypes via ``enable_categorical=True``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for the validation rows.
    """
    xgb_train = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)
    model = xgb.train(
        CFG.regression_xgb_params,
        dtrain=xgb_train,
        num_boost_round=CFG.num_boost_round,
        evals=[(xgb_train, "train"), (xgb_valid, "eval")],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose_eval=CFG.verbose,
    )
    # Pin the prediction to the best early-stopping round explicitly. Whether
    # Booster.predict does this by default depends on the installed XGBoost
    # version, and the returned booster also holds the rounds trained after the best one.
    best_iteration = getattr(model, "best_iteration", None)
    predict_options = {} if best_iteration is None else {"iteration_range": (0, best_iteration + 1)}
    # Reuse the validation DMatrix instead of building a second, identical one.
    valid_pred = model.predict(xgb_valid, **predict_options)
    return model, valid_pred


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    features: list,
    categorical_features: list,
):
    """Fit one CatBoost regressor on a train/validation split.

    Args:
        x_train: Training feature frame.
        y_train: Training target values.
        x_valid: Validation feature frame.
        y_valid: Validation target values.
        features: Unused here; kept so all trainers share one signature.
        categorical_features: Column names passed to ``Pool`` as ``cat_features``.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions
        for ``x_valid``.
    """
    pool_options = {"cat_features": categorical_features}
    train_pool = Pool(data=x_train, label=y_train, **pool_options)
    valid_pool = Pool(data=x_valid, label=y_valid, **pool_options)

    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    # use_best_model=True is requested together with early stopping on the validation pool.
    regressor.fit(
        train_pool,
        eval_set=[valid_pool],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    return regressor, regressor.predict(x_valid)


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, features: list, categorical_features: list
):
    """Train one model per (target, fold) and store models and OOF predictions.

    For every target in ``CFG.target_col_list`` and every fold, a model of the
    requested family is trained on the rows outside the fold and used to
    predict the rows inside it. Each model is pickled under
    ``CFG.MODEL_DATA_PATH``; the out-of-fold predictions are written as a CSV
    under ``CFG.OOF_DATA_PATH`` and the overall MAE is printed.

    Args:
        method: ``"lightgbm"``, ``"xgboost"`` or ``"catboost"``.
        train_df: Training frame containing ``features``, the target columns,
            ``ID`` and the 1-based fold column ``cv_flag``.
        features: Feature column names passed to the model.
        categorical_features: Categorical feature column names.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail fast with a clear message; previously an unknown method fell through
    # the if-chain and surfaced later as an unbound-variable error.
    if method not in ("lightgbm", "xgboost", "catboost"):
        raise ValueError(f"Unsupported method: {method!r}")
    training_functions = {
        "lightgbm": lightgbm_training,
        "xgboost": xgboost_training,
        "catboost": catboost_training,
    }
    train_one_fold = training_functions[method]

    fold_ids = train_df["cv_flag"].to_numpy()
    # Frame collecting the out-of-fold predictions of every target
    oof_predictions_df = pd.DataFrame(np.zeros((len(train_df), len(CFG.target_col_list))), columns=CFG.target_col_list)
    for target_col in CFG.target_col_list:
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            print("-" * 50)
            print(f"{method} training fold {fold+1} {target_col}")
            # Compute the validation mask once and reuse it for every slice.
            valid_mask = fold_ids == fold + 1
            x_train = train_df.loc[~valid_mask, features]
            y_train = train_df.loc[~valid_mask, target_col]
            x_valid = train_df.loc[valid_mask, features]
            y_valid = train_df.loc[valid_mask, target_col]
            model, valid_pred = train_one_fold(x_train, y_train, x_valid, y_valid, features, categorical_features)

            # Save the model; the context manager guarantees the file is closed.
            model_path = (
                CFG.MODEL_DATA_PATH
                / f"{CFG.AUTHOR}_{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.VER}.pkl"
            )
            with open(model_path, "wb") as model_file:
                pickle.dump(model, model_file)

            # Store this fold's predictions at the validation rows
            oof_predictions[valid_mask] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()
        oof_predictions_df[target_col] = oof_predictions

    # Out-of-fold metric: mean absolute error over all rows and all targets
    abs_diff = np.abs(train_df[CFG.target_col_list].values - oof_predictions_df[CFG.target_col_list].values)
    score = np.mean(abs_diff.reshape(-1))
    print(f"{method} our out of folds CV MAE is {score}")
    # Persist the out-of-fold predictions together with the row IDs
    oof_predictions_df["ID"] = train_df["ID"].values
    oof_predictions_df.to_csv(
        CFG.OOF_DATA_PATH / f"{CFG.AUTHOR}_oof_{method}_seed{CFG.SEED}_ver{CFG.VER}.csv", index=False
    )


In [None]:
# Train every configured model family in turn; each call pickles its per-fold
# models and writes that family's out-of-fold predictions to a CSV.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train_df, features, categorical_features)


In [12]:
# Blend the per-model out-of-fold predictions with the configured weights and
# score the blend against the training targets.
oof_predictions_df = pd.DataFrame(train_df["ID"])
oof_predictions_df[CFG.target_col_list] = 0.0
for method in CFG.METHOD_LIST:
    method_oof_path = CFG.OOF_DATA_PATH / f"{CFG.AUTHOR}_oof_{method}_seed{CFG.SEED}_ver{CFG.VER}.csv"
    method_oof = pd.read_csv(method_oof_path)
    # The OOF files were written in train_df row order, so values are added positionally.
    oof_predictions_df[CFG.target_col_list] += (
        CFG.model_weight_dict[method] * method_oof[CFG.target_col_list].values
    )
# Mean absolute error over all rows and all targets
abs_diff = np.abs(train_df[CFG.target_col_list].values - oof_predictions_df[CFG.target_col_list].values)
score = np.mean(abs_diff.reshape(-1))
# This is the score of the weighted blend, so label it as such instead of with the
# leftover loop variable (which named the last single model).
print(f"ensemble our out of folds CV MAE is {score}")


catboost our out of folds CV MAE is 0.23380061044861875


In [13]:
def lightgbm_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the per-fold LightGBM models for one target.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name used to locate the pickled fold models.

    Returns:
        Array of length ``len(x_test)`` holding the mean prediction over
        ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = (
            CFG.MODEL_DATA_PATH
            / f"{CFG.AUTHOR}_lightgbm_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.VER}.pkl"
        )
        # NOTE: pickle must only be used on model files produced by this
        # notebook's own training step. The context manager closes the file.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def xgboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the per-fold XGBoost models for one target.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name used to locate the pickled fold models.

    Returns:
        Array of length ``len(x_test)`` holding the mean prediction over
        ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    # The test matrix does not depend on the fold: build it once.
    xgb_test = xgb.DMatrix(x_test, enable_categorical=True)
    for fold in range(CFG.n_folds):
        model_path = (
            CFG.MODEL_DATA_PATH
            / f"{CFG.AUTHOR}_xgboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.VER}.pkl"
        )
        # NOTE: pickle must only be used on model files produced by this
        # notebook's own training step. The context manager closes the file.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Use the best early-stopping round when the booster recorded one; the
        # default of Booster.predict depends on the installed XGBoost version.
        best_iteration = getattr(model, "best_iteration", None)
        predict_options = {} if best_iteration is None else {"iteration_range": (0, best_iteration + 1)}
        test_pred += model.predict(xgb_test, **predict_options)
    return test_pred / CFG.n_folds


def catboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the per-fold CatBoost models for one target.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name used to locate the pickled fold models.

    Returns:
        Array of length ``len(x_test)`` holding the mean prediction over
        ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = (
            CFG.MODEL_DATA_PATH
            / f"{CFG.AUTHOR}_catboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.VER}.pkl"
        )
        # NOTE: pickle must only be used on model files produced by this
        # notebook's own training step. The context manager closes the file.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, target_col: str):
    """Predict one target with the fold models of the requested model family.

    Args:
        method: ``"lightgbm"``, ``"xgboost"`` or ``"catboost"``.
        test_df: Frame containing at least the ``features`` columns.
        features: Feature column names passed to the models.
        target_col: Target whose fold models are loaded.

    Returns:
        The fold-averaged prediction array returned by the family's
        inference function.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    x_test = test_df[features]
    if method == "lightgbm":
        return lightgbm_inference(x_test, target_col)
    if method == "xgboost":
        return xgboost_inference(x_test, target_col)
    if method == "catboost":
        return catboost_inference(x_test, target_col)
    # Previously an unknown method fell through and failed on an unbound variable.
    raise ValueError(f"Unsupported method: {method!r}")


def Predicting(input_df: pd.DataFrame, features: list):
    """Add per-model and weighted-blend predictions for every target.

    For each target in ``CFG.target_col_list`` the returned frame gains one
    ``{method}_pred_{target}`` column per model family and a ``{target}``
    column holding the sum of those predictions weighted by
    ``CFG.model_weight_dict``.

    Args:
        input_df: Frame containing the ``features`` columns.
        features: Feature column names passed to the models.

    Returns:
        A copy of ``input_df`` with the prediction columns appended.
    """
    result_df = input_df.copy()
    for target_name in CFG.target_col_list:
        # Start the blend at zero, then accumulate each model's weighted share.
        result_df[target_name] = 0
        for model_name in CFG.METHOD_LIST:
            prediction_column = f"{model_name}_pred_{target_name}"
            result_df[prediction_column] = gradient_boosting_model_inference(
                model_name, input_df, features, target_name
            )
            weighted_share = CFG.model_weight_dict[model_name] * result_df[prediction_column]
            result_df[target_name] = result_df[target_name] + weighted_share
    return result_df


In [14]:
# Predict every target for the test rows; the result carries both the per-model
# prediction columns and the weighted-blend target columns.
test_df = Predicting(test_df, features)


In [15]:
# Attach the blended predictions to the ID skeleton with a left join on "ID".
submission_columns = ["ID", *CFG.target_col_list]
submit_df = submit_df.merge(test_df[submission_columns], how="left", on="ID")
# Only the target columns are written: no ID column and no index.
submit_df.loc[:, CFG.target_col_list].to_csv(CFG.SUB_DATA_PATH / "submit.csv", index=False)
