In [1]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')
import pickle

import lightgbm as lgb
import sys
from pathlib import Path

# Resolve the sibling ``scripts`` folder (relative to the current working
# directory) into an absolute path string.
script_dir = str(Path('../scripts').resolve())

# Register that folder at the front of the import search path exactly once,
# so re-running this cell never stacks duplicate entries.
if sys.path.count(script_dir) == 0:
    sys.path[:0] = [script_dir]

from preprocess import reduce_mem_usage, feature_engineering

In [5]:
def _load_as_pandas(parquet_path):
    """Scan one parquet file lazily, run it through ``reduce_mem_usage``,
    then collect the result and convert it to a pandas DataFrame.

    NOTE(review): ``reduce_mem_usage`` comes from the project's ``preprocess``
    module; presumably it downcasts column dtypes - confirm against its source.
    """
    lazy_frame = pl.scan_parquet(parquet_path)
    return reduce_mem_usage(lazy_frame).collect().to_pandas()


train = _load_as_pandas("../data/train.parquet")
test = _load_as_pandas("../data/test.parquet")

In [6]:
# Data preparation: model inputs are every column except the first one, and
# the regression target is the "Duration" column.
# NOTE(review): this assumes "Duration" is not among columns 1..N (otherwise
# the target would leak into the features) - confirm against the parquet schema.
X = train.iloc[:, 1:]
y = train["Duration"]

# The held-out frame is split exactly the same way.
X_test = test.iloc[:, 1:]
y_test = test["Duration"]

# LightGBM hyper-parameters shared by every fold.
params = dict(
    boosting_type='gbdt',       # 'gbdt' is LightGBM's ordinary gradient boosting
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    lambda_l2=0.2,
    max_depth=7,
    num_leaves=64,              # LightGBM-specific: number of leaves per tree
    min_child_weight=1,
    subsample=1,
    colsample_bytree=1,
    verbose=-1,                 # verbosity level; accepted settings are -1, 0, 1
)

# K-fold configuration: five shuffled folds with a fixed seed so the split is
# reproducible between runs.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Unbind the raw frames. From here on the names `train` and `test` no longer
# exist, so any cell that reads them needs the data to be reloaded first.
del train
del test


In [7]:
# Columns LightGBM should treat as categorical rather than numeric.
categorical_feature = ["PDweek", "DDweek"]

# One hold-out RMSE per fold model, each measured on (X_test, y_test).
rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets. The categorical columns are declared on the
    # training Dataset instead of on lgb.train(); the validation Dataset is
    # tied to it through `reference`.
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_feature)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {i+1} Start")

    # Train the model.
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
    # from lgb.train() in LightGBM 4.0 (lgb.log_evaluation needs >= 3.3).
    # The callbacks are created fresh for every fold so each fold keeps its
    # own early-stopping bookkeeping.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Inference on the held-out test frame, using the best iteration recorded
    # during training.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")
    # with open(f"./models/baseline_lgbm/model_5000_{i+1}.pickle", mode='wb') as f:
    #     pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 15.8818	valid_1's rmse: 15.8751
[200]	training's rmse: 13.8626	valid_1's rmse: 13.8627
[300]	training's rmse: 12.9638	valid_1's rmse: 12.9668
[400]	training's rmse: 11.9402	valid_1's rmse: 11.9447
[500]	training's rmse: 10.9852	valid_1's rmse: 10.9928
[600]	training's rmse: 10.2922	valid_1's rmse: 10.3043
[700]	training's rmse: 9.78369	valid_1's rmse: 9.79853
[800]	training's rmse: 9.35469	valid_1's rmse: 9.37143
[900]	training's rmse: 8.98046	valid_1's rmse: 9.00001
[1000]	training's rmse: 8.65194	valid_1's rmse: 8.67296
[1100]	training's rmse: 8.36043	valid_1's rmse: 8.38367
[1200]	training's rmse: 8.08727	valid_1's rmse: 8.1126
[1300]	training's rmse: 7.82964	valid_1's rmse: 7.8575
[1400]	training's rmse: 7.60144	valid_1's rmse: 7.6313
[1500]	training's rmse: 7.38657	valid_1's rmse: 7.41842
[1600]	training's rmse: 7.20586	valid_1's rmse: 7.23942
[1700]	training's rmse: 7.03489	valid_1'

KeyboardInterrupt: 