In [1]:
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

import config
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lightgbm import LGBMClassifier, LGBMRegressor
from metric import score
from scipy.optimize import minimize
from seed import seed_everything
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from xgboost import XGBClassifier, XGBRegressor

warnings.filterwarnings("ignore")


In [2]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Central configuration for this experiment notebook.

    Collects the paths taken from the project-local ``config`` module, the
    gradient-boosting methods to train and their blend weights, the
    cross-validation settings, and the hyperparameter dicts that are unpacked
    into each library's regressor.
    """

    # When True, a reduced setup is used below (fewer folds, fewer boosting rounds, shorter early-stopping patience).
    DRY_RUN = True
    EXP_NAME = config.EXP_NAME
    AUTHOR = "marumarukun"
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    MODEL_PATH = config.OUTPUT_DIR / "models"  # Use this while building models / experimenting (use this one in the notebook)
    # MODEL_PATH = config.ARTIFACT_EXP_DIR(config.EXP_NAME) / "models"  # Use this for submission (use this one in the .py script)
    # Methods trained in turn; each name must match a branch in gradient_boosting_model_cv_training.
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    # Weight applied to each method's test prediction when predicting() blends them.
    METHOD_WEIGHT_DICT = {"lightgbm": 0.7, "xgboost": 0.2, "catboost": 0.1}
    SEED = 319
    n_folds = 2 if DRY_RUN else 5
    # One set of fold models is trained per entry in this list.
    target_col_list = ["target"]
    # group_col = "category1"  # Required for GroupKFold (edit as needed)
    stratified_col = "target"  # Required for StratifiedKFold (edit as needed)
    num_boost_round = 100 if DRY_RUN else 1000000
    early_stopping_round = 10 if DRY_RUN else 100  # Set as 10 / learning_rate
    # Logging period handed to each library's fit call / logging callback.
    verbose = 500

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
    # Unpacked into LGBMRegressor(...) by lightgbm_training.
    regression_lgb_params = {
        "objective": "regression",
        # "metric": "mae",
        "learning_rate": 0.1,
        "max_depth": 5,
        "min_child_weight": 1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "subsample_freq": 1,
        "seed": SEED,
        "device": "cpu",  # cpu/gpu/cuda
    }
    # https://xgboost.readthedocs.io/en/stable/parameter.html
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
    # Unpacked into XGBRegressor(...) by xgboost_training.
    regression_xgb_params = {
        "objective": "reg:squarederror",
        # "eval_metric": "mae",
        "learning_rate": 0.1,
        "max_depth": 5,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "min_child_weight": 1,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "cpu",  # cpu/gpu/cuda
    }
    # https://catboost.ai/docs/en/references/training-parameters/
    # https://catboost.ai/docs/en/concepts/python-reference_catboostregressor
    # https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier
    # Unpacked into CatBoostRegressor(...) by catboost_training; the round budget is set here via "iterations".
    regression_cat_params = {
        "loss_function": "RMSE",
        "learning_rate": 0.1,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "CPU",  # CPU/GPU
    }


In [3]:
# ====================================================
# Seed everything
# ====================================================
# seed_everything is imported from the project-local `seed` module; presumably it seeds
# the RNGs of the libraries used here — TODO confirm which libraries it covers.
seed_everything(CFG.SEED)


In [4]:
# ====================================================
# Read data
# ====================================================
# Load the demo train/test CSVs as polars DataFrames; try_parse_dates asks polars to
# attempt parsing date-like columns while reading.
train_pl = pl.read_csv(CFG.DATA_PATH / "train_demo.csv", try_parse_dates=True)
test_pl = pl.read_csv(CFG.DATA_PATH / "test_demo.csv", try_parse_dates=True)
# Optional: add a row-index column (disabled)
# train_pl = train_pl.with_row_index()
# test_pl = test_pl.with_row_index()


In [5]:
# ====================================================
# Make fold column
# ====================================================
# StratifiedKFold: every row receives the 1-based id of the fold in which it is used
# for validation. The ids are held in an integer array so they never pass through a
# float representation before being stored as Int8.
fold_array = np.zeros(train_pl.height, dtype=np.int64)
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
# The unused training indices get a real name instead of `_`: binding `_` at notebook
# top level stops IPython from updating its last-output variable.
for fold, (train_idx, val_idx) in enumerate(skf.split(train_pl, train_pl[CFG.stratified_col]), start=1):
    fold_array[val_idx] = fold
train_pl = train_pl.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))

# KFold (alternative, disabled)
# fold_array = np.zeros(train_pl.height, dtype=np.int64)
# kf = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
# for fold, (train_idx, val_idx) in enumerate(kf.split(train_pl), start=1):
#     fold_array[val_idx] = fold
# train_pl = train_pl.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))


In [6]:
# ====================================================
# Define columns and Label Encode categorical columns
# ====================================================
# The modelling cells below work on pandas frames, so convert both polars frames here.
train, test = train_pl.to_pandas(), test_pl.to_pandas()

# Columns listed in RMV are excluded from the model inputs; every other train column is a feature.
RMV = ["id", "fold", "target"]
FEATURES = [col for col in train.columns if col not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")


There are 3 FEATURES: ['category1', 'feature1', 'feature2']


In [7]:
# Features stored with pandas "object" dtype in train are treated as categorical.
CATS = [c for c in FEATURES if train[c].dtype == "object"]
# Give missing values an explicit "NAN" level in both frames so they are encoded like any other category.
for c in CATS:
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")


In these features, there are 1 CATEGORICAL FEATURES: ['category1']


In [8]:
# Stack train and test so each categorical column is encoded with one shared code table.
n_train = len(train)  # split point, captured before `train` is reassigned at the end of this cell
combined = pd.concat([train, test], axis=0, ignore_index=True)
# print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for c in FEATURES:
    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ", end="")
        # Keep only the integer codes. Indexing the result (rather than unpacking into `_`)
        # avoids binding `_` at notebook top level, which would stop IPython from updating
        # its last-output variable.
        combined[c] = combined[c].factorize()[0]
        # factorize marks missing values with -1; shifting by the minimum keeps all codes
        # non-negative. Missing values were already filled with "NAN" when CATS was built,
        # so this is normally a no-op.
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")

    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype == "float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype == "int64":
            combined[c] = combined[c].astype("int32")

# Split back into the original train rows and the test rows (test gets a fresh 0-based index).
train = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True).copy()


We LABEL ENCODE the CATEGORICAL FEATURES: category1, 

In [9]:
# ====================================================
# Training functions
# ====================================================
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit a LightGBM regressor on one fold and predict its validation split.

    The regressor is built from ``CFG.regression_lgb_params`` with
    ``CFG.num_boost_round`` estimators. The validation split is passed as the
    only evaluation set, together with an early-stopping callback
    (``CFG.early_stopping_round``) and a logging callback (``CFG.verbose``).

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features.
        y_valid: Validation target.
        categorical_features: Feature names forwarded as ``categorical_feature``.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions for ``x_valid``.
    """
    fit_callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round),
        lgb.log_evaluation(CFG.verbose),
    ]
    regressor = LGBMRegressor(n_estimators=CFG.num_boost_round, **CFG.regression_lgb_params)
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        categorical_feature=categorical_features,
        callbacks=fit_callbacks,
    )
    # Validation predictions become this fold's out-of-fold values.
    return regressor, regressor.predict(x_valid)


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Fit an XGBoost regressor on one fold and predict its validation split.

    The regressor is built from ``CFG.regression_xgb_params`` with
    ``CFG.num_boost_round`` estimators and ``CFG.early_stopping_round`` as the
    early-stopping patience. The validation split is the only evaluation set.
    No categorical feature list is passed to this function.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features.
        y_valid: Validation target.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions for ``x_valid``.
    """
    regressor = XGBRegressor(
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
        **CFG.regression_xgb_params,
    )
    validation_sets = [(x_valid, y_valid)]
    regressor.fit(x_train, y_train, eval_set=validation_sets, verbose=CFG.verbose)
    # Validation predictions become this fold's out-of-fold values.
    return regressor, regressor.predict(x_valid)


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit a CatBoost regressor on one fold and predict its validation split.

    Both splits are wrapped in ``Pool`` objects that carry the categorical
    feature names. The regressor is built from ``CFG.regression_cat_params``
    and fitted with the validation pool as evaluation set, using
    ``CFG.early_stopping_round`` and ``use_best_model=True``.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features.
        y_valid: Validation target.
        categorical_features: Feature names declared as ``cat_features`` in both pools.

    Returns:
        Tuple ``(model, valid_pred)``: the fitted regressor and its predictions for ``x_valid``.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    fit_options = {
        "early_stopping_rounds": CFG.early_stopping_round,
        "verbose": CFG.verbose,
        "use_best_model": True,
    }
    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    regressor.fit(train_pool, eval_set=[valid_pool], **fit_options)
    # Prediction is made on the raw validation frame, not on the pool.
    return regressor, regressor.predict(x_valid)


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, target_col_list: list, features: list, categorical_features: list
):
    """Train one model per fold for every target and report the out-of-fold score.

    For each target column and each fold id ``1..CFG.n_folds`` (read from the
    ``"fold"`` column of ``train_df``), the rows of that fold are held out for
    validation and all other rows are used for fitting. Every fitted model is
    pickled under ``CFG.MODEL_PATH`` and its validation predictions are written
    into the out-of-fold array. Once all folds of a target are done, ``score``
    is evaluated on the full out-of-fold column and printed. At the end the
    out-of-fold predictions of all targets are written to a CSV in
    ``CFG.OUTPUT_DIR``.

    Args:
        method: One of "lightgbm", "xgboost" or "catboost".
        train_df: Training frame holding ``features``, the target columns and a ``"fold"`` column.
        target_col_list: Target column names; one set of fold models is trained per target.
        features: Feature column names used as model input.
        categorical_features: Categorical feature names (forwarded to LightGBM and CatBoost only).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Out-of-fold predictions, one column per target. Sharing train_df's index keeps the
    # frame aligned with the target columns if `score` aligns its inputs by label.
    oof_predictions_df = pd.DataFrame(
        np.zeros((len(train_df), len(target_col_list))), columns=target_col_list, index=train_df.index
    )

    for target_col in target_col_list:
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            print("-" * 50)
            print(f"{method} training fold {fold+1} {target_col}")
            # Compute the validation mask once per fold and reuse it for every split and
            # for writing the predictions back.
            valid_mask = (train_df["fold"] == fold + 1).to_numpy()
            train_mask = ~valid_mask
            x_train = train_df.loc[train_mask, features]
            y_train = train_df.loc[train_mask, target_col]
            x_valid = train_df.loc[valid_mask, features]
            y_valid = train_df.loc[valid_mask, target_col]
            if method == "lightgbm":
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)
            elif method == "xgboost":
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
            elif method == "catboost":
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)
            else:
                raise ValueError(f"Unknown method: {method}")

            # Save best model
            save_model_path = (
                CFG.MODEL_PATH / f"{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
            )
            save_model_path.parent.mkdir(parents=True, exist_ok=True)
            # Context manager guarantees the file is flushed and closed even if pickling fails.
            with open(save_model_path, "wb") as model_file:
                pickle.dump(model, model_file)
            # Add to out of folds array
            oof_predictions[valid_mask] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        oof_predictions_df[target_col] = oof_predictions

        # Compute out of folds metric
        m = score(train_df[target_col].copy(), oof_predictions_df[target_col].copy())
        print("=" * 50)
        print(f"{method} our out of folds CV score is {m}")
        print("=" * 50)

    oof_predictions_df.to_csv(CFG.OUTPUT_DIR / f"oof_{method}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv", index=False)


In [10]:
# ====================================================
# Training
# ====================================================
# Run the fold-wise CV training once per configured method. Each call pickles its fold
# models under CFG.MODEL_PATH and writes that method's out-of-fold predictions to a CSV
# in CFG.OUTPUT_DIR.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train, CFG.target_col_list, FEATURES, CATS)


--------------------------------------------------
lightgbm training fold 1 target
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.133003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 500, number of used features: 3
[LightGBM] [Info] Start training from score 4.494000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	valid_0's l2: 7.81797
--------------------------------------------------
lightgbm training fold 2 target
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 500, number of used features: 3
[LightGBM] [Info] Start training from score 4.494000
Training until validation scores do

In [11]:
# ====================================================
# Inference functions
# ====================================================
def model_inference(method: str, x_test: pd.DataFrame, target_col: str):
    """Average the predictions of all saved fold models of one method and target.

    Loads the pickled model of each fold ``1..CFG.n_folds`` from
    ``CFG.MODEL_PATH`` (same file naming as used when the models were saved),
    predicts ``x_test`` with it and returns the mean over folds.

    Args:
        method: Method name used in the model file names.
        x_test: Feature frame to predict.
        target_col: Target name used in the model file names.

    Returns:
        Array of length ``len(x_test)`` with the fold-averaged predictions.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # NOTE: unpickling can execute arbitrary code; only load model files produced by
        # this project's own training step.
        # The context manager closes the file handle as soon as the model is loaded.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, target_col: str):
    """Select the feature columns of ``test_df`` and return the fold-averaged prediction.

    Args:
        method: Method name forwarded to ``model_inference``.
        test_df: Frame containing at least the columns in ``features``.
        features: Feature column names used as model input.
        target_col: Target name forwarded to ``model_inference``.

    Returns:
        Whatever ``model_inference`` returns for the selected feature columns.
    """
    return model_inference(method, test_df[features], target_col)


def predicting(input_df: pd.DataFrame, features: list):
    """Predict every configured target with every configured method and blend them.

    Works on a copy of ``input_df``. For each target in ``CFG.target_col_list``
    the target column is reset to 0, then for each method in
    ``CFG.METHOD_LIST`` a ``"<method>_pred_<target>"`` column is added and its
    values, multiplied by ``CFG.METHOD_WEIGHT_DICT[method]``, are accumulated
    into the target column.

    Args:
        input_df: Frame containing at least the columns in ``features``.
        features: Feature column names used as model input.

    Returns:
        Copy of ``input_df`` with the per-method prediction columns and the blended target columns.
    """
    blended = input_df.copy()
    for target in CFG.target_col_list:
        # Start the weighted sum from zero; any existing values in this column are discarded.
        blended[target] = 0
        for model_name in CFG.METHOD_LIST:
            pred_column = f"{model_name}_pred_{target}"
            blended[pred_column] = gradient_boosting_model_inference(model_name, input_df, features, target)
            blended[target] += CFG.METHOD_WEIGHT_DICT[model_name] * blended[pred_column]
    return blended


In [14]:
# ====================================================
# Inference
# ====================================================
# Predict the test set with every saved fold model and blend the methods using
# CFG.METHOD_WEIGHT_DICT.
output_df = predicting(test, FEATURES)

# `test` still carries the "target" and "fold" columns inherited from the train/test
# concat: "target" is overwritten with the blended prediction, "fold" is left as it was.
display(output_df)


Unnamed: 0,id,category1,feature1,feature2,target,fold,lightgbm_pred_target,xgboost_pred_target,catboost_pred_target
0,1,2,0.792534,0.720213,4.388481,,4.268106,4.120021,4.538473
1,2,1,0.270750,0.338985,4.436763,,4.554857,4.401791,4.349283
2,3,2,0.232813,0.533922,4.539075,,4.490044,4.436782,4.598759
3,4,0,0.784797,0.178037,4.344282,,4.243482,4.561685,4.381441
4,5,2,0.688592,0.487690,4.283289,,4.025682,4.422742,4.461484
...,...,...,...,...,...,...,...,...,...
995,996,0,0.376355,0.557178,4.632432,,4.683666,4.750127,4.567906
996,997,0,0.264158,0.270880,4.494678,,4.620548,4.437678,4.405381
997,998,1,0.488203,0.628793,4.413378,,4.337676,4.385800,4.479455
998,999,2,0.851246,0.073835,4.305156,,4.114915,4.407165,4.436948
