In [2]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')
import pickle

import lightgbm as lgb
import sys
from pathlib import Path

# Build an absolute path to the helper-script directory.
# NOTE: '../scripts' is resolved against the current working directory of the
# kernel, not against the location of this notebook file.
script_dir = str(Path('../scripts').resolve())

# Put that directory at the front of sys.path (only if it is not already there)
# so that modules living in it can be imported by the statements that follow.
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

from preprocess import reduce_mem_usage, feature_engineering, feature_agg_temp


In [3]:
# Load each split: build a lazy parquet scan, pass it through the project helper
# reduce_mem_usage (implementation not visible here), then materialize with collect().
train_scan = pl.scan_parquet("../data/train.parquet")
train = reduce_mem_usage(train_scan).collect()

test_scan = pl.scan_parquet("../data/test.parquet")
test = reduce_mem_usage(test_scan).collect()

In [4]:
# Run the project's feature_engineering step on each split and convert the result
# to pandas via to_pandas(). The names `train` / `test` are rebound to the converted
# frames, so the frames produced by the loading cell are no longer referenced here.
# NOTE(review): because of that rebinding, re-running this cell on its own feeds the
# already-converted frames back into feature_engineering — re-run the loading cell first.
train = feature_engineering(train).to_pandas()
test = feature_engineering(test).to_pandas()

In [5]:
# feature_agg_temp is a project helper imported from `preprocess`; it receives both
# splits and returns a (train, test) pair that replaces the current frames.
# NOTE(review): presumably it adds temperature-based aggregate features computed
# across both splits — confirm against preprocess.py.
train, test = feature_agg_temp(train, test)

In [6]:
# Prepare targets and feature matrices.
# Features are every column except the first one (iloc[:, 1:]); the target is "Duration".
# NOTE(review): this presumably relies on "Duration" being column 0 — confirm,
# otherwise the target would also be present in X / X_test.
y = train["Duration"]
X = train.iloc[:, 1:]
y_test = test["Duration"]
X_test = test.iloc[:, 1:]

# K-fold splitter: 5 shuffled folds with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# LightGBM training parameters.
params = {
    'boosting_type': 'gbdt',  # 'gbdt' selects ordinary gradient boosting in LightGBM
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'lambda_l2': 0.2,
    'max_depth': 7,
    'num_leaves': 64,  # LightGBM-specific: number of leaves per tree
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbose': -1,  # verbosity level; the original note lists -1, 0 and 1 as the settings
}

# Drop the names of the full frames; only X / y / X_test / y_test are used from here on.
del train
del test


In [7]:
# Validation run with Usage_time added.
#
# For every KFold split a LightGBM model is trained on the training part, early-stopped
# on the held-out part of the split, and then scored on the separate labelled test set
# (X_test / y_test). Each fitted model is pickled, and the per-fold test RMSEs are
# summarised at the end as mean ± standard deviation.

# Create the output directory before training so that saving a model cannot fail
# on a missing folder after a long training run.
model_dir = Path("./models/exp_lgbm")
model_dir.mkdir(parents=True, exist_ok=True)

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    fold = i + 1  # 1-based fold number used in log messages and file names
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets; the validation set is created with
    # reference=lgb_train so that it is aligned with the training dataset.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {fold} Start")

    # Train the model.
    # Early stopping and periodic logging are passed as callbacks: the former
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgb.train
    # were removed in LightGBM 4.0 and raise a TypeError there.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=2000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict on the test set using the best iteration found by early stopping.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {fold} ==> RMSE: {score:.4f}")

    # Persist the fold's model.
    with open(model_dir / f"model_temp_2000_{fold}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 9.32648	valid_1's rmse: 9.33321
[200]	training's rmse: 3.79051	valid_1's rmse: 3.80118
[300]	training's rmse: 2.11721	valid_1's rmse: 2.13766
[400]	training's rmse: 1.74131	valid_1's rmse: 1.76813
[500]	training's rmse: 1.64635	valid_1's rmse: 1.67777
[600]	training's rmse: 1.60158	valid_1's rmse: 1.63676
[700]	training's rmse: 1.57087	valid_1's rmse: 1.61159
[800]	training's rmse: 1.54764	valid_1's rmse: 1.59343
[900]	training's rmse: 1.52891	valid_1's rmse: 1.57744
[1000]	training's rmse: 1.51603	valid_1's rmse: 1.56702
[1100]	training's rmse: 1.5062	valid_1's rmse: 1.55984
[1200]	training's rmse: 1.49518	valid_1's rmse: 1.55141
[1300]	training's rmse: 1.48445	valid_1's rmse: 1.54287
[1400]	training's rmse: 1.4748	valid_1's rmse: 1.53541
[1500]	training's rmse: 1.46668	valid_1's rmse: 1.529
[1600]	training's rmse: 1.45799	valid_1's rmse: 1.52266
[1700]	training's rmse: 1.44992	valid_1's

## Note
- exp2: Average RMSE: 1.5055 ± 0.0020
- 精度は0.01くらい改善、バラつきが大きくなった
- 位置情報の特徴量を加えてみる