In [5]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
import sys
from pathlib import Path
import pickle

# Resolve the directory that holds the helper scripts to an absolute path,
# then keep its string form because sys.path stores plain strings.
scripts_path = Path('../scripts').resolve()
script_dir = str(scripts_path)

# Put the script directory at the front of sys.path, but only if it is not
# already listed, so re-running this cell does not add duplicate entries.
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

from preprocess import reduce_mem_usage, feature_engineering

In [2]:
# Open each parquet file lazily, pass the lazy frame through the project helper
# reduce_mem_usage, and materialize the result with .collect().
# NOTE(review): reduce_mem_usage is defined in ../scripts/preprocess.py; its name
# suggests it shrinks column dtypes — confirm against that module.
train_lazy = pl.scan_parquet("../data/train.parquet")
train = reduce_mem_usage(train_lazy).collect()

test_lazy = pl.scan_parquet("../data/test.parquet")
test = reduce_mem_usage(test_lazy).collect()

In [6]:
# Convert each polars frame to pandas exactly once; every to_pandas() call
# builds a fresh copy of the whole frame.
train_df = train.to_pandas()
test_df = test.to_pandas()

# Features are every column except the first one; the target is "Duration".
# NOTE(review): this presumes "Duration" is column 0 of both frames — confirm.
X, y = train_df.iloc[:, 1:], train_df["Duration"]
X_test, y_test = test_df.iloc[:, 1:], test_df["Duration"]

feature_names = list(X.columns)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# XGBoost parameters.
# 'booster' and 'verbosity' are the XGBoost spellings; the previous keys
# ('boosting_type', 'vervose') were not recognized by XGBoost, so the
# verbosity setting was never applied.
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.01,
    'gamma': 0,
    'max_depth': 7,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbosity': 0
}

# Make sure the output directory exists before any training starts, so a
# missing folder cannot fail the run after a fold has already been trained.
model_dir = Path("./models/baseline_xgb")
model_dir.mkdir(parents=True, exist_ok=True)

# The hold-out matrix is the same for every fold, so build it once.
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=list(X_test.columns))

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    # KFold.split yields positional indices, hence .iloc rather than .loc.
    dtrain = xgb.DMatrix(X.iloc[train_index], label=y.iloc[train_index], feature_names=feature_names)
    dvalid = xgb.DMatrix(X.iloc[val_index], label=y.iloc[val_index], feature_names=feature_names)

    print(f"KFold: {i+1} Start")
    # Container that xgb.train fills with the per-round metric history.
    evaluation_results = {}
    # Early stopping monitors the last entry of this list ('eval').
    evals = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(params,
                      dtrain,
                      num_boost_round=2000,
                      evals=evals,
                      evals_result=evaluation_results,
                      early_stopping_rounds=50,
                      verbose_eval=50
                     )
    best_iteration = model.best_iteration

    # Predict on the hold-out set using trees up to and including the best round.
    y_pred = model.predict(dtest, iteration_range=(0, best_iteration + 1))

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")

    # NOTE(review): pickled boosters are tied to the xgboost/Python versions that
    # wrote them; the format is kept here so existing loaders keep working.
    with open(model_dir / f"model_{i+1}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
[0]	train-rmse:24.87463	eval-rmse:24.85891
[50]	train-rmse:18.70457	eval-rmse:18.69341
[100]	train-rmse:15.75924	eval-rmse:15.75382
[150]	train-rmse:14.41965	eval-rmse:14.41906
[200]	train-rmse:13.75879	eval-rmse:13.76129
[250]	train-rmse:13.33389	eval-rmse:13.33970
[300]	train-rmse:12.87086	eval-rmse:12.87951
[350]	train-rmse:12.23884	eval-rmse:12.24889
[400]	train-rmse:11.64814	eval-rmse:11.65883
[450]	train-rmse:11.05805	eval-rmse:11.07023
[500]	train-rmse:10.58793	eval-rmse:10.60286
[550]	train-rmse:10.20794	eval-rmse:10.22600
[600]	train-rmse:9.86356	eval-rmse:9.88444
[650]	train-rmse:9.58701	eval-rmse:9.60932
[700]	train-rmse:9.33846	eval-rmse:9.36185
[750]	train-rmse:9.11800	eval-rmse:9.14278
[800]	train-rmse:8.90764	eval-rmse:8.93412
[850]	train-rmse:8.71981	eval-rmse:8.74836
[900]	train-rmse:8.53488	eval-rmse:8.56486
[950]	train-rmse:8.36254	eval-rmse:8.39423
[1000]	train-rmse:8.20273	eval-rmse:8.23641
[1050]	train-rmse:8.05729	eval-rmse:8.09302
[1100]	train-rms