In [16]:
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

import config  # edit config.py as needed
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from metric import score  # edit metric.py as needed
from seed import seed_everything  # edit seed.py as needed
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")


In [2]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central experiment settings read by every cell of this notebook.

    Project-specific values (experiment name, user name, dataset and output
    locations) come from the local ``config`` module; everything else is a
    literal defined here.
    """

    # --- identity and paths (taken from config.py) ---
    EXP_NAME = config.EXP_NAME
    AUTHOR = config.KAGGLE_USERNAME
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    # OOF_DATA_PATH = config.OUTPUT_DIR / "oof"
    MODEL_PATH = config.OUTPUT_DIR / "models"
    # Alias of MODEL_PATH under the `*_DATA_PATH` naming used by the
    # commented-out OOF/SUB entries, so that either spelling resolves.
    MODEL_DATA_PATH = MODEL_PATH
    # SUB_DATA_PATH = config.OUTPUT_DIR / "submission"

    # --- cross-validation setup ---
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    SEED = 42
    n_folds = 5
    target_col_list = ["target"]
    group_col = "race_group"  # Required for GroupKFold (edit as needed)
    stratified_col = "race_group"  # Required for StratifiedKFold (edit as needed)
    # metric = "MAE"
    # metric_maximize_flag = False

    # --- boosting loop controls shared by all three libraries ---
    num_boost_round = 10000
    early_stopping_round = 100
    verbose = 250  # passed to each library's logging/verbosity argument

    # --- per-library hyper-parameters (all optimise absolute error) ---
    regression_lgb_params = {
        "objective": "regression_l1",
        "metric": "mae",
        "learning_rate": 0.1,
        "num_leaves": 31,
        "seed": SEED,
    }
    regression_xgb_params = {
        "objective": "reg:absoluteerror",
        "eval_metric": "mae",
        "learning_rate": 0.1,
        "max_depth": 6,
        "random_state": SEED,
    }

    regression_cat_params = {
        "loss_function": "MAE",
        "learning_rate": 0.1,
        "iterations": num_boost_round,
        "depth": 7,
        "random_seed": SEED,
    }

    # Per-method blending weights (not consumed by any cell visible in this notebook yet).
    model_weight_dict = {"lightgbm": 0.40, "xgboost": 0.30, "catboost": 0.30}


In [3]:
# ====================================================
# Seed everything
# ====================================================
# Apply CFG.SEED through the project-local helper imported from seed.py
# (exactly which RNGs it seeds is defined in that module).
seed_everything(CFG.SEED)


In [4]:
# ====================================================
# Read data
# ====================================================
# Load the competition tables with polars; train/test get a leading row-index column right away.
train = pl.read_csv(CFG.DATA_PATH / "train.csv", try_parse_dates=True).with_row_index()
test = pl.read_csv(CFG.DATA_PATH / "test.csv", try_parse_dates=True).with_row_index()
sample_submission = pl.read_csv(CFG.DATA_PATH / "sample_submission.csv")


In [6]:
# ====================================================
# Preprocess (write preprocessing and feature engineering here)
# ====================================================
def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    """Return a preprocessed copy of ``df``.

    Currently a placeholder: the input frame is cloned and handed back
    unchanged, leaving the caller's frame untouched.
    """
    return df.clone()


In [7]:
# Run the same preprocessing over both splits.
train, test = preprocess(train), preprocess(test)


In [13]:
# ====================================================
# Make fold column
# ====================================================
# Assign each training row a 1-based fold id, stratified on CFG.stratified_col.
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
fold_array = np.zeros(len(train))
for fold_no, (_, holdout_rows) in enumerate(skf.split(train, train[CFG.stratified_col]), start=1):
    # Only the validation indices of each split are needed to label the rows.
    fold_array[holdout_rows] = fold_no
train = train.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))


In [55]:
# ====================================================
# Set categorical columns etc. (pandas operation from here)
# ====================================================
# Hand both frames over to pandas; the training code below works on pandas objects.
train, test = train.to_pandas(), test.to_pandas()


In [56]:
# ====================================================
# Training functions
# ====================================================
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    # features: list,
    categorical_features: list,
):
    """Fit a LightGBM booster on one fold and predict that fold's validation rows.

    Args:
        x_train: Feature frame of the training part of the fold.
        y_train: Target values aligned with ``x_train``.
        x_valid: Feature frame of the validation part of the fold.
        y_valid: Target values aligned with ``x_valid``.
        categorical_features: Column names passed to ``lgb.Dataset`` as
            ``categorical_feature``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for ``x_valid``.
    """
    train_set, valid_set = (
        lgb.Dataset(frame, label, categorical_feature=categorical_features)
        for frame, label in ((x_train, y_train), (x_valid, y_valid))
    )
    # Early stopping and periodic logging are both driven by CFG.
    stop_callback = lgb.early_stopping(stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose)
    log_callback = lgb.log_evaluation(CFG.verbose)
    booster = lgb.train(
        params=CFG.regression_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[train_set, valid_set],
        callbacks=[stop_callback, log_callback],
    )
    return booster, booster.predict(x_valid)


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    # features: list,
    # categorical_features: list,
):
    """Fit an XGBoost booster on one fold and predict that fold's validation rows.

    Categorical handling is delegated to XGBoost via ``enable_categorical=True``,
    so no explicit list of categorical columns is taken.

    Args:
        x_train: Feature frame of the training part of the fold.
        y_train: Target values aligned with ``x_train``.
        x_valid: Feature frame of the validation part of the fold.
        y_valid: Target values aligned with ``x_valid``.

    Returns:
        Tuple ``(model, valid_pred)``: the trained booster and its predictions
        for ``x_valid``, restricted to the best early-stopping round when one
        was recorded.
    """
    xgb_train = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)
    model = xgb.train(
        CFG.regression_xgb_params,
        dtrain=xgb_train,
        num_boost_round=CFG.num_boost_round,
        evals=[(xgb_train, "train"), (xgb_valid, "eval")],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose_eval=CFG.verbose,
    )
    # xgb.train hands back the booster from the LAST round, not the best one.
    # Limit prediction to the best round so the out-of-fold score reflects the
    # early-stopped model, like the CatBoost trainer (use_best_model=True).
    predict_kwargs = {}
    best_iteration = getattr(model, "best_iteration", None)  # absent if early stopping never ran
    if best_iteration is not None:
        predict_kwargs["iteration_range"] = (0, int(best_iteration) + 1)
    # Predict validation (reuse the DMatrix built above instead of constructing a second one)
    valid_pred = model.predict(xgb_valid, **predict_kwargs)
    return model, valid_pred


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    # features: list,
    categorical_features: list,
):
    """Fit a CatBoost regressor on one fold and predict that fold's validation rows.

    Args:
        x_train: Feature frame of the training part of the fold.
        y_train: Target values aligned with ``x_train``.
        x_valid: Feature frame of the validation part of the fold.
        y_valid: Target values aligned with ``x_valid``.
        categorical_features: Column names passed to ``Pool`` as ``cat_features``.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions
        for ``x_valid``.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    # The validation pool drives early stopping; the best iteration is kept.
    fit_options = {
        "eval_set": [valid_pool],
        "early_stopping_rounds": CFG.early_stopping_round,
        "verbose": CFG.verbose,
        "use_best_model": True,
    }
    regressor.fit(train_pool, **fit_options)
    return regressor, regressor.predict(x_valid)


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, features: list, categorical_features: list
):
    """Run K-fold training for one boosting library and persist models and OOF predictions.

    For every target in ``CFG.target_col_list`` and every fold ``1..CFG.n_folds``
    (read from the ``fold`` column of ``train_df``), a model is trained on the
    other folds, pickled under ``CFG.MODEL_PATH`` and used to fill the
    out-of-fold (OOF) predictions of the held-out rows. The OOF frame is then
    scored with ``score`` from metric.py and written as CSV to ``CFG.OUTPUT_DIR``.

    Args:
        method: One of ``"lightgbm"``, ``"xgboost"`` or ``"catboost"``.
        train_df: Training data containing ``features``, the target column(s),
            a ``fold`` column and an ``ID`` column.
        features: Column names used as model inputs.
        categorical_features: Categorical column names (forwarded to the
            LightGBM and CatBoost trainers; the XGBoost trainer does not take it).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ("lightgbm", "xgboost", "catboost"):
        # Fail before any training instead of hitting an undefined `model` later.
        raise ValueError(f"Unsupported method: {method!r} (expected 'lightgbm', 'xgboost' or 'catboost')")

    # CFG defines the model directory as MODEL_PATH; create it so the dump cannot
    # fail on a missing folder.
    model_dir = Path(CFG.MODEL_PATH)
    model_dir.mkdir(parents=True, exist_ok=True)

    # Fold labels as a plain array: reused for every mask below.
    fold_ids = train_df["fold"].to_numpy()

    # Create a frame to store out of folds predictions (one column per target)
    oof_predictions_df = pd.DataFrame(np.zeros((len(train_df), len(CFG.target_col_list))), columns=CFG.target_col_list)
    for target_col in CFG.target_col_list:
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            print("-" * 50)
            print(f"{method} training fold {fold+1} {target_col}")
            valid_mask = fold_ids == fold + 1
            x_train = train_df.loc[~valid_mask, features]
            y_train = train_df.loc[~valid_mask, target_col]
            x_valid = train_df.loc[valid_mask, features]
            y_valid = train_df.loc[valid_mask, target_col]
            if method == "lightgbm":
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)
            elif method == "xgboost":
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
            else:  # "catboost" (method was validated above)
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)

            # Save best model (context manager guarantees the file handle is closed)
            model_file = (
                model_dir / f"{CFG.AUTHOR}_{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
            )
            with open(model_file, "wb") as model_fh:
                pickle.dump(model, model_fh)

            # Add to out of folds array
            oof_predictions[valid_mask] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()
        oof_predictions_df[target_col] = oof_predictions

    # Compute out of folds metric
    # NOTE(review): `score` comes from metric.py, which is not visible here. At this point
    # oof_predictions_df has no "ID" column yet - confirm the metric does not require one
    # and does not mutate train_df (the "ID" column is read again right below).
    m = score(train_df, oof_predictions_df, "ID")
    print(f"{method} our out of folds CV score is {m}")

    # Attach the row identifiers and store the out of folds predictions
    oof_predictions_df["ID"] = train_df["ID"].values
    oof_predictions_df.to_csv(
        CFG.OUTPUT_DIR / f"{CFG.AUTHOR}_oof_{method}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv", index=False
    )


In [None]:
# TODO: define `features` and `categorical_features`
# TODO: run the steps below


In [None]:
# ====================================================
# Training
# ====================================================
# Run the cross-validated training routine once per library listed in CFG.METHOD_LIST.
# NOTE(review): `features` and `categorical_features` are not defined in any cell shown
# in this notebook; they must be set in an earlier cell before this loop can run.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train, features, categorical_features)


In [None]:
# ====================================================
# Inference functions
# ====================================================


In [None]:
# ====================================================
# Inference
# ====================================================
