In [1]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')
import pickle

import lightgbm as lgb
import sys
from pathlib import Path

# Resolve the helper-script directory to an absolute path.
script_dir = str(Path('../scripts').resolve())

# Prepend it to sys.path only when it is not listed yet, so re-running this
# cell never adds a duplicate entry.
if sys.path.count(script_dir) == 0:
    sys.path[:0] = [script_dir]

from preprocess import reduce_mem_usage, feature_engineering, feature_agg_temp


In [2]:
# Scan each parquet split, pass the scan through the project helper
# reduce_mem_usage, and collect the result. The train split is loaded
# before the test split.
train, test = (
    reduce_mem_usage(pl.scan_parquet(parquet_path)).collect()
    for parquet_path in ("../data/train.parquet", "../data/test.parquet")
)

In [3]:
# Run the project's feature_engineering step on each frame and convert the
# result to pandas for the rest of the notebook.
# NOTE: `train` / `test` are rebound to pandas objects here (they were
# presumably polars frames, given the pl.scan_parquet(...).collect() load), so
# re-running this cell on its own would feed pandas frames into
# feature_engineering. Re-run the loading cell first.
train = feature_engineering(train).to_pandas()
test = feature_engineering(test).to_pandas()

In [4]:
# Rebind train/test to the pair returned by the project helper feature_agg_temp.
# NOTE(review): the head() preview shows Temp_mean / Temp_min / Temp_max and
# Duratioin_mean / Duration_meadian columns (spelled that way in the data) --
# presumably added by this call; confirm in preprocess.py. If the Duration
# aggregates are computed with test rows included, they would leak the target.
train, test = feature_agg_temp(train, test)

In [5]:
# Preview the first rows of the engineered training frame.
train.head()

Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,...,Snow,GroundTemp,Dust,Usage_time,Velocity,Temp_mean,Temp_min,Temp_max,Duratioin_mean,Duration_meadian
0,28,8480,37.530167,127.007439,37.535221,127.068398,5.404561,4,14,19,...,0.0,9.9,13.0,28,302.857143,16.548405,2.5,22.9,43.0,51.0
1,51,7710,37.530167,127.007439,37.535221,127.068398,5.404561,9,12,18,...,0.0,28.5,17.0,51,151.176471,24.362245,18.3,28.8,43.0,51.0
2,59,8020,37.530167,127.007439,37.535221,127.068398,5.404561,6,30,22,...,0.0,22.4,0.0,60,133.666667,22.198692,19.0,27.4,43.0,51.0
3,53,10490,37.530167,127.007439,37.535221,127.068398,5.404561,7,25,22,...,0.0,28.1,27.0,55,190.727273,27.672861,20.4,32.5,43.0,51.0
4,24,7230,37.530167,127.007439,37.535221,127.068398,5.404561,11,30,20,...,0.0,-0.4,50.0,26,278.076923,8.890479,2.6,13.9,43.0,51.0


In [6]:
# Prepare the feature matrices and targets.
# The target is removed from the features by name rather than by position:
# iloc[:, 1:] only excludes "Duration" while it happens to be the first column
# and would silently leak the target into X if the column order ever changed.
# NOTE(review): "Usage_time" remains in X, and in the head() preview it tracks
# "Duration" almost one-to-one (28/28, 51/51, 60/59, 55/53, 26/24) -- confirm
# it is really available at prediction time.
TARGET = "Duration"
X, y = train.drop(columns=TARGET), train[TARGET]
X_test = test.drop(columns=TARGET)
y_test = test[TARGET]

# LightGBM training parameters.
# NOTE(review): min_child_weight, subsample and colsample_bytree look like
# XGBoost-style spellings; LightGBM is expected to accept them as aliases --
# confirm against the LightGBM parameter reference.
params = dict(
    boosting_type='gbdt',  # in LightGBM, 'gbdt' denotes ordinary gradient boosting
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    lambda_l2=0.2,
    max_depth=7,
    num_leaves=64,  # LightGBM-specific parameter: number of leaves per tree
    min_child_weight=1,
    subsample=1,
    colsample_bytree=1,
    verbose=-1,  # verbose can be set to -1, 0 or 1
)

# K-fold configuration: 4 shuffled folds with a fixed random_state.
kf = KFold(random_state=42, shuffle=True, n_splits=4)

# Drop the names of the full frames; the training loop in this notebook only
# uses X, y, X_test and y_test. After this, the cell cannot be re-run without
# re-running the cells that create `train` and `test`.
del train
del test


In [7]:
# Validation with Usage_time included as a feature.
# Each fold trains on its training split, early-stops on its validation split,
# is scored on the held-out test set, and is pickled to disk.

# Create the output directory up front so a missing folder cannot make the
# pickle step fail after a fold has already finished training.
model_dir = Path("./models/exp_lgbm")
model_dir.mkdir(parents=True, exist_ok=True)

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets; the validation set is created with
    # reference=lgb_train.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {i+1} Start")

    # Train the model. Early stopping and periodic logging are passed as
    # callbacks: the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0, whereas these callbacks
    # are available from LightGBM 3.3 onwards.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict on the test set using the best iteration recorded on the model.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")
    with open(model_dir / f"model_5000_{i+1}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 9.33043	valid_1's rmse: 9.32229
[200]	training's rmse: 3.78801	valid_1's rmse: 3.78736
[300]	training's rmse: 2.104	valid_1's rmse: 2.1109
[400]	training's rmse: 1.72328	valid_1's rmse: 1.737
[500]	training's rmse: 1.63153	valid_1's rmse: 1.64985
[600]	training's rmse: 1.58636	valid_1's rmse: 1.60923
[700]	training's rmse: 1.55004	valid_1's rmse: 1.57952
[800]	training's rmse: 1.52055	valid_1's rmse: 1.5549
[900]	training's rmse: 1.50231	valid_1's rmse: 1.54038
[1000]	training's rmse: 1.48905	valid_1's rmse: 1.53099
[1100]	training's rmse: 1.47799	valid_1's rmse: 1.52325
[1200]	training's rmse: 1.46797	valid_1's rmse: 1.51651
[1300]	training's rmse: 1.45669	valid_1's rmse: 1.50825
[1400]	training's rmse: 1.44507	valid_1's rmse: 1.4993
[1500]	training's rmse: 1.43534	valid_1's rmse: 1.49183
[1600]	training's rmse: 1.42673	valid_1's rmse: 1.48594
[1700]	training's rmse: 1.41958	valid_1's rm