In [1]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')
import pickle

import lightgbm as lgb
import sys
from pathlib import Path

# Absolute path of the directory that holds the project's helper scripts
# (the relative path is resolved against the current working directory).
script_dir = str(Path('../scripts').resolve())

# Prepend that directory to the module search path, but only when it is not
# already listed, so re-running this cell does not add duplicate entries.
if sys.path.count(script_dir) == 0:
    sys.path.insert(0, script_dir)

from preprocess import reduce_mem_usage, feature_engineering, feature_agg_temp


In [2]:
# Load the train and test splits the same way: scan the parquet file, pass the
# scanned frame through the project helper reduce_mem_usage, then collect().
# NOTE(review): reduce_mem_usage presumably downcasts column dtypes — confirm in scripts/preprocess.
train, test = (
    reduce_mem_usage(pl.scan_parquet(path)).collect()
    for path in ("../data/train.parquet", "../data/test.parquet")
)

In [3]:
# Run the project's feature_engineering helper on each split and convert the
# result to pandas; the names train/test are rebound to the converted frames.
train, test = (
    feature_engineering(frame).to_pandas()
    for frame in (train, test)
)

In [4]:
# Pass both frames through the project helper feature_agg_temp and rebind
# train/test to the frames it returns.
# NOTE(review): presumably this adds the Temp_max / Temp_min / Temp_mean
# aggregate columns seen in the preview — confirm in scripts/preprocess.
train, test = feature_agg_temp(train, test)

In [5]:
# Show the first rows of the training frame to check the columns produced by
# the preprocessing cells.
train.head()

Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,...,Humid,Solar,Snow,GroundTemp,Dust,Usage_time,Velocity,Temp_max,Temp_min,Temp_mean
0,28,8480,37.530167,127.007439,37.535221,127.068398,5.404561,4,14,19,...,76.0,0.02,0.0,9.9,13.0,28,302.857143,22.9,2.5,16.548405
1,11,2130,37.544582,127.044609,37.54258,127.063309,1.663655,4,17,19,...,31.0,0.25,0.0,13.2,91.0,11,193.636364,22.9,2.5,16.548405
2,93,10240,37.557892,126.93808,37.547733,126.931763,1.259433,4,1,19,...,79.0,0.03,0.0,15.9,74.0,94,108.93617,22.9,2.5,16.548405
3,18,3320,37.526066,126.925537,37.543579,126.951324,2.993715,4,29,19,...,38.0,0.16,0.0,17.4,52.0,19,174.736842,22.9,2.5,16.548405
4,113,8220,37.471539,127.050591,37.505581,127.024277,4.440585,4,12,19,...,14.0,0.25,0.0,11.9,28.0,123,66.829268,22.9,2.5,16.548405


In [6]:
# Data preparation.
# Split target and features by column NAME instead of by position: the previous
# `iloc[:, 1:]` only excluded the target while "Duration" happened to be the
# first column, and would silently leak it into the features if the column
# order produced by preprocessing ever changed.
TARGET = "Duration"
X, y = train.drop(columns=[TARGET]), train[TARGET]
X_test = test.drop(columns=[TARGET])
y_test = test[TARGET]

# LightGBM parameters
params = {
    'boosting_type': 'gbdt', # in LightGBM, 'gbdt' denotes ordinary gradient boosting
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'lambda_l2': 0.2,
    'max_depth': 7,
    'num_leaves': 64, # LightGBM-specific parameter: number of leaves per tree
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'verbose': -1 # verbosity is set with -1, 0 or 1
}

# KFold configuration: 5 shuffled splits with a fixed seed so folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Drop the names of the full frames; from here on only X / y / X_test / y_test are used.
del train, test


In [8]:
# Validation run with Usage_time added to the features.
#
# For every KFold split a LightGBM model is trained on the training part, with
# the validation part used only for early stopping. NOTE: the per-fold RMSE is
# computed on the fixed hold-out set (X_test / y_test), not on the validation
# fold, so the reported mean/std describe fold-to-fold variation on that set.

# Make sure the output folder exists; otherwise the pickle dump below raises
# FileNotFoundError after the first fold has already finished training.
model_dir = Path("./models/exp_lgbm")
model_dir.mkdir(parents=True, exist_ok=True)

rmse = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build the LightGBM datasets for this fold.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    print(f"KFold: {i+1} Start")

    # Train the model. Early stopping and periodic logging are passed as
    # callbacks: the `early_stopping_rounds=` / `verbose_eval=` keyword
    # arguments were removed from lgb.train in LightGBM 4.0.
    model = lgb.train(params,
                      lgb_train,
                      num_boost_round=2000,
                      valid_sets=[lgb_train, lgb_val],
                      callbacks=[
                          lgb.early_stopping(stopping_rounds=50),
                          lgb.log_evaluation(period=100),
                      ]
                     )

    # Inference on the hold-out set, using the best iteration found by early stopping.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    score = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse.append(score)
    print(f"KFold: {i+1} ==> RMSE: {score:.4f}")
    # Persist this fold's model.
    with open(f"./models/exp_lgbm/model_velocity_2000_{i+1}.pickle", mode='wb') as f:
        pickle.dump(model, f)

print(f"Average RMSE: {np.mean(rmse):.4f} ± {np.std(rmse):.4f}")

KFold: 1 Start
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 9.3303	valid_1's rmse: 9.33753
[200]	training's rmse: 3.79474	valid_1's rmse: 3.80598
[300]	training's rmse: 2.12496	valid_1's rmse: 2.1464
[400]	training's rmse: 1.7499	valid_1's rmse: 1.77836
[500]	training's rmse: 1.65585	valid_1's rmse: 1.68794
[600]	training's rmse: 1.61235	valid_1's rmse: 1.64813
[700]	training's rmse: 1.5747	valid_1's rmse: 1.61695
[800]	training's rmse: 1.5466	valid_1's rmse: 1.59322
[900]	training's rmse: 1.52592	valid_1's rmse: 1.5757
[1000]	training's rmse: 1.51056	valid_1's rmse: 1.56402
[1100]	training's rmse: 1.49757	valid_1's rmse: 1.55374
[1200]	training's rmse: 1.48599	valid_1's rmse: 1.5454
[1300]	training's rmse: 1.4753	valid_1's rmse: 1.53733
[1400]	training's rmse: 1.4608	valid_1's rmse: 1.5252
[1500]	training's rmse: 1.44935	valid_1's rmse: 1.51642
[1600]	training's rmse: 1.43888	valid_1's rmse: 1.50766
[1700]	training's rmse: 1.43181	valid_1's rmse:

## Note
- exp2: Average RMSE: 1.5055 ± 0.0020
- 精度は0.01くらい改善、バラつきが大きくなった
- 位置情報の特徴量を加えてみる