In [1]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')
import pickle

import lightgbm as lgb
import sys
from pathlib import Path

# Absolute path of the sibling `scripts` directory (resolved against the
# current working directory).
script_dir = str(Path('..', 'scripts').resolve())

# Prepend it to sys.path exactly once so the local `preprocess` module can be
# imported; re-running the cell does not add a duplicate entry.
if sys.path.count(script_dir) == 0:
    sys.path.insert(0, script_dir)

from preprocess import reduce_mem_usage, feature_engineering

In [4]:
def _load_as_pandas(parquet_path):
    """Lazily scan a parquet file, pass it through the project's
    `reduce_mem_usage` helper, then materialise it as a pandas DataFrame."""
    lazy_frame = pl.scan_parquet(parquet_path)
    reduced = reduce_mem_usage(lazy_frame)
    return reduced.collect().to_pandas()


train = _load_as_pandas("../data/train.parquet")
test = _load_as_pandas("../data/test.parquet")

In [8]:

# Prepare the data.
# The first column is dropped by position (presumably an ID or the target
# itself -- TODO confirm against the parquet schema). The target is then also
# dropped by name so that it can never end up among the features; this is a
# no-op when the first column already was "Duration".
X = train.iloc[:, 1:].drop(columns=["Duration"], errors="ignore")
y = train["Duration"]
X_test = test.iloc[:, 1:].drop(columns=["Duration"], errors="ignore")
y_test = test["Duration"]

# LightGBM parameters
params = {
    'boosting_type': 'gbdt',  # 'gbdt' is LightGBM's standard gradient boosting
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'lambda_l2': 0.2,
    'max_depth': 7,
    'num_leaves': 64,  # LightGBM-specific: maximum number of leaves per tree
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbose': -1  # silence LightGBM's own logging
}

# Make sure the output directory exists before spending time on training;
# otherwise the first pickle dump fails only after a full fold has been fitted.
model_dir = Path("./models/baseline_lgbm")
model_dir.mkdir(parents=True, exist_ok=True)

# KFold configuration
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets; the validation set reuses the training
    # set's binning via `reference`.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {i+1} Start")

    # Train the model.
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from `lgb.train` in LightGBM 4.0.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Inference.
    # NOTE: the per-fold score is measured on the hold-out test set, not on
    # this fold's validation split (which is only used for early stopping).
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")

    # Persist this fold's booster.
    with open(model_dir / f"model_5000_{i+1}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 15.8819	valid_1's rmse: 15.8756
[200]	training's rmse: 13.8653	valid_1's rmse: 13.8658
[300]	training's rmse: 12.9781	valid_1's rmse: 12.9811
[400]	training's rmse: 11.9554	valid_1's rmse: 11.96
[500]	training's rmse: 10.9836	valid_1's rmse: 10.9924
[600]	training's rmse: 10.2798	valid_1's rmse: 10.2932
[700]	training's rmse: 9.77334	valid_1's rmse: 9.78912
[800]	training's rmse: 9.3604	valid_1's rmse: 9.37803
[900]	training's rmse: 8.98296	valid_1's rmse: 9.00301
[1000]	training's rmse: 8.62854	valid_1's rmse: 8.65046
[1100]	training's rmse: 8.31561	valid_1's rmse: 8.33955
[1200]	training's rmse: 8.03607	valid_1's rmse: 8.06162
[1300]	training's rmse: 7.79363	valid_1's rmse: 7.82111
[1400]	training's rmse: 7.58443	valid_1's rmse: 7.61395
[1500]	training's rmse: 7.37304	valid_1's rmse: 7.40455
[1600]	training's rmse: 7.18117	valid_1's rmse: 7.21403
[1700]	training's rmse: 6.98822	valid_1'

In [10]:
# Prepare the data.
# The first column is dropped by position (presumably an ID or the target
# itself -- TODO confirm against the parquet schema). The target is then also
# dropped by name so that it can never end up among the features; this is a
# no-op when the first column already was "Duration".
X = train.iloc[:, 1:].drop(columns=["Duration"], errors="ignore")
y = train["Duration"]
X_test = test.iloc[:, 1:].drop(columns=["Duration"], errors="ignore")
y_test = test["Duration"]

# LightGBM parameters (weaker L2 regularisation than the 5000-round run)
params = {
    'boosting_type': 'gbdt',  # 'gbdt' is LightGBM's standard gradient boosting
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'lambda_l2': 0.1,
    'max_depth': 7,
    'num_leaves': 64,  # LightGBM-specific: maximum number of leaves per tree
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbose': -1  # silence LightGBM's own logging
}

# Make sure the output directory exists before spending time on training;
# otherwise the first pickle dump fails only after a full fold has been fitted.
model_dir = Path("./models/baseline_lgbm")
model_dir.mkdir(parents=True, exist_ok=True)

# KFold configuration
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets; the validation set reuses the training
    # set's binning via `reference`.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {i+1} Start")

    # Train the model.
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from `lgb.train` in LightGBM 4.0.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )

    # Inference.
    # NOTE: the per-fold score is measured on the hold-out test set, not on
    # this fold's validation split (which is only used for early stopping).
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")

    # Persist this fold's booster.
    with open(model_dir / f"model_2000_{i+1}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 18.8383	valid_1's rmse: 18.8267
[100]	training's rmse: 15.8812	valid_1's rmse: 15.8747
[150]	training's rmse: 14.5308	valid_1's rmse: 14.5283
[200]	training's rmse: 13.8641	valid_1's rmse: 13.864
[250]	training's rmse: 13.4032	valid_1's rmse: 13.404
[300]	training's rmse: 12.9578	valid_1's rmse: 12.96
[350]	training's rmse: 12.3919	valid_1's rmse: 12.3952
[400]	training's rmse: 11.8569	valid_1's rmse: 11.8616
[450]	training's rmse: 11.4214	valid_1's rmse: 11.4263
[500]	training's rmse: 10.9579	valid_1's rmse: 10.965
[550]	training's rmse: 10.5721	valid_1's rmse: 10.5815
[600]	training's rmse: 10.2697	valid_1's rmse: 10.2809
[650]	training's rmse: 10.0134	valid_1's rmse: 10.0252
[700]	training's rmse: 9.79374	valid_1's rmse: 9.80619
[750]	training's rmse: 9.58054	valid_1's rmse: 9.5938
[800]	training's rmse: 9.3625	valid_1's rmse: 9.3777
[850]	training's rmse: 9.15454	valid_1's rmse: 9.1708