In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
import utils

In [3]:
class CFG3:
    """Settings for the LightGBM run: data source, CV, model, dates, features."""

    DEBUG = True
    note_num = "StackingOpt/LGBM003/CFG3"
    load_data_kind = "load_data2"
    n_splits = 5  # number of chunks the training data is split into for cross-validation
    seed = 42

    # model
    # NOTE(review): num_boost_round=3 is smaller than early_stopping_rounds=100,
    # so early stopping cannot trigger — looks like a debug value, confirm.
    lgb_params = dict(
        boosting_type='gbdt',
        objective='regression',
        metric='mae',
        num_boost_round=3,
        early_stopping_rounds=100,
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=5,
        device_type='cpu',  # changed to 'cpu' here
        seed=42,  # same value as `seed` above
    )

    # dates (YYYYmmddHHMM strings)
    train_start_date = "201406010000"
    train_end_date = "201407010000"
    test_start_date = "201407010000"
    test_end_date = "201408010000"

    # features
    flo_unique_dir = 'H:/study/output/StackingOpt/EDA006/'

    features = [
        'two_weeks_max',
        'id',
        'month_sin', 'month_cos',
        'day_sin', 'day_cos',
        'hour_sin', 'hour_cos',
        'year_sin', 'year_cos',
        'prev_30m_generation',
        'id_lat', 'id_lng',
        'id_lat_mesh', 'id_lng_mesh',
        'pvrate',
        'observed_max2',
    ]

    use_flo_unique_features = False  # optical-flow predicted values per unique_id

    target = 'nv2'

    # columns written to the OOF output
    saved_cols = [
        "datetime",
        "id",
        "fold",
        "observed_max",
        "generation",
        target,
        "pred",
    ]


cfg = CFG3


In [4]:
# Output directory: DEBUG runs are written under a separate DEBUG/ subtree.
if cfg.DEBUG:
    OUTPUT_DIR = f'E:/study/output/DEBUG/{cfg.note_num}/'
else:
    OUTPUT_DIR = f'E:/study/output/{cfg.note_num}/'

# exist_ok=True replaces the check-then-create pattern and is a no-op on re-runs.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE(review): cfg.seed is not forwarded here — confirm utils.set_seeds()
# defaults to the intended seed.
utils.set_seeds()

# Time-series split setup: the training dates are cut into cfg.n_splits
# consecutive chunks; each chunk is used once as the validation fold.
# NOTE(review): cfg.train_end_date equals cfg.test_start_date; if
# create_time_series_data includes its end point, that timestamp belongs to
# both the last validation fold and the test set — confirm.
train_date_list = utils.create_time_series_data(cfg.train_start_date, cfg.train_end_date)
train_date_list_split = np.array_split(train_date_list, cfg.n_splits)
n_folds = len(train_date_list_split)

test_dates = utils.create_time_series_data(cfg.test_start_date, cfg.test_end_date)

# Load the dataset; fail loudly on an unknown loader name instead of hitting
# a NameError on `df` further down.
if cfg.load_data_kind == "load_data1":
    df, unique_id = utils.load_data1()
elif cfg.load_data_kind == "load_data2":
    df, unique_id = utils.load_data2()
else:
    raise ValueError(f"Unknown cfg.load_data_kind: {cfg.load_data_kind!r}")

# Row masks that do not depend on the fold are computed once.
test_mask = df.datetime.isin(test_dates)
train_all_mask = df.datetime.isin(train_date_list)
feature_cols = cfg.features + ["datetime"]

# For building the OOF predictions. "pred" starts as float so that assigning
# float predictions later does not upcast an int column.
df["pred"] = 0.0
df.loc[test_mask, "fold"] = "test"


def _build_features(row_mask, dates):
    """Return the model input frame for the rows selected by `row_mask`.

    Selects cfg.features (plus "datetime" as a merge key), optionally
    left-joins the flo-unique predictions for `dates` on "datetime" when
    cfg.use_flo_unique_features is set, and finally drops "datetime".
    """
    X = df.loc[row_mask, feature_cols]
    if cfg.use_flo_unique_features:
        flo = utils.get_unique_pred_interpolated(cfg.flo_unique_dir, dates, unique_id)
        flo = pd.DataFrame(flo, columns=[f"{i}_flo_unique" for i in range(flo.shape[1])])
        flo["datetime"] = pd.to_datetime(dates, format="%Y%m%d%H%M")
        X = X.merge(flo, on=["datetime"], how="left")
    # Non-inplace drop: X may be a slice of df, so avoid mutating it in place.
    return X.drop(columns="datetime")


# The test features are identical for every fold, so build them once.
X_test = _build_features(test_mask, test_dates)

for fold, valid_dates in enumerate(train_date_list_split):
    print(f"\nFold {fold + 1}")
    # Every chunk except the current one forms the training dates.
    train_dates = np.concatenate(train_date_list_split[:fold] + train_date_list_split[fold+1:])
    train_mask = df.datetime.isin(train_dates)
    valid_mask = df.datetime.isin(valid_dates)

    X_train, y_train = _build_features(train_mask, train_dates), df.loc[train_mask, cfg.target]
    X_valid, y_valid = _build_features(valid_mask, valid_dates), df.loc[valid_mask, cfg.target]
    df.loc[valid_mask, "fold"] = fold

    # Train LightGBM model
    model = utils.train_lgbm(X_train, y_train, X_valid, y_valid, cfg.lgb_params)
    # OUTPUT_DIR already ends with "/", so join without adding a second slash.
    save_path = os.path.join(OUTPUT_DIR, f"lgbm_fold{fold}.txt")
    model.save_model(save_path)

    # Evaluate model
    valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
    mse = utils.compute_mse(y_valid, valid_preds)
    mae = utils.compute_mae(y_valid, valid_preds)
    print(f"Fold {fold + 1} MSE: {mse}, MAE: {mae}")

    # Make predictions for the test set
    test_preds = model.predict(X_test, num_iteration=model.best_iteration)

    # Validation rows get this fold's prediction; test rows accumulate the
    # predictions of all fold models and are averaged after the loop.
    df.loc[valid_mask, "pred"] = valid_preds
    df.loc[test_mask, "pred"] += test_preds

df.loc[test_mask, "pred"] /= n_folds
# OR-ing the masks selects the same rows as isin(train + test dates) without
# relying on `+` being list concatenation.
df.loc[train_all_mask | test_mask, cfg.saved_cols].to_csv(OUTPUT_DIR+"oof.csv", index=False)

oof_true, oof_pred = df.loc[train_all_mask, cfg.target], df.loc[train_all_mask, "pred"]
test_true, test_pred = df.loc[test_mask, cfg.target], df.loc[test_mask, "pred"]

oof_mse = utils.compute_mse(oof_true, oof_pred)
test_mse = utils.compute_mse(test_true, test_pred)

oof_mae = utils.compute_mae(oof_true, oof_pred)
test_mae = utils.compute_mae(test_true, test_pred)

print('-'*40)
print(f"Overall Out-of-Fold RMSE: {np.sqrt(oof_mse):.4f}")
print(f"Overall Out-of-Fold MAE: {oof_mae:.4f}")
print()
print(f"Overall Test RMSE: {np.sqrt(test_mse):.4f}")
print(f"Overall Test MAE: {test_mae:.4f}")
print('-'*40)


Fold 1




[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000070
[LightGBM] [Debug] init for col-wise cost 0.000272 seconds, init for row-wise cost 0.009035 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2463
[LightGBM] [Info] Number of data points in the train set: 215280, number of used features: 15
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 0.420561
[LightGBM] [Debug] Re-bagging, using 172076 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's l1: 0.230031
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[2]	valid_0's l1: 0.221126
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[3]	valid_0's l1: 0.212702
Did not meet early stopping. Best ite



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.002860
[LightGBM] [Debug] init for col-wise cost 0.000003 seconds, init for row-wise cost 0.008441 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2461
[LightGBM] [Info] Number of data points in the train set: 215280, number of used features: 15
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 0.423102
[LightGBM] [Debug] Re-bagging, using 172076 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's l1: 0.212633
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[2]	valid_0's l1: 0.204409
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[3]	valid_0's l1: 0.196688
Did not meet early stopping. Best ite



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.002872
[LightGBM] [Debug] init for col-wise cost 0.000005 seconds, init for row-wise cost 0.008165 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2461
[LightGBM] [Info] Number of data points in the train set: 215280, number of used features: 15
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 0.410337
[LightGBM] [Debug] Re-bagging, using 172076 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's l1: 0.214725
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[2]	valid_0's l1: 0.206205
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[3]	valid_0's l1: 0.198258
Did not meet early stopping. Best ite



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.002876
[LightGBM] [Debug] init for col-wise cost 0.000004 seconds, init for row-wise cost 0.008830 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2461
[LightGBM] [Info] Number of data points in the train set: 215280, number of used features: 15
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 0.405722
[LightGBM] [Debug] Re-bagging, using 172076 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's l1: 0.227515
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[2]	valid_0's l1: 0.218433
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 6
[3]	valid_0's l1: 0.209887
Did not meet early stopping. Best ite



[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.002855
[LightGBM] [Debug] init for col-wise cost 0.000003 seconds, init for row-wise cost 0.008124 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2461
[LightGBM] [Info] Number of data points in the train set: 215280, number of used features: 15
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 0.413141
[LightGBM] [Debug] Re-bagging, using 172076 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's l1: 0.216335
Training until validation scores don't improve for 100 rounds
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[2]	valid_0's l1: 0.207815
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[3]	valid_0's l1: 0.199836
Did not meet early stopping. Best ite

In [5]:
# Reload the saved OOF/test predictions from disk.
oof = pd.read_csv(f"{OUTPUT_DIR}oof.csv")

In [6]:
# Preview the first five rows of the reloaded predictions.
oof.head(5)

Unnamed: 0,datetime,id,fold,observed_max,generation,nv2,pred
0,2014-06-01 07:00:00,564.0,0,5150.5454,3071.7414,0.594653,0.459352
1,2014-06-01 07:30:00,564.0,0,5150.5454,3577.721,0.692605,0.459352
2,2014-06-01 08:00:00,564.0,0,5150.5454,4000.3258,0.774416,0.459352
3,2014-06-01 08:30:00,564.0,0,5150.5454,4299.936,0.832418,0.459352
4,2014-06-01 09:00:00,564.0,0,5150.5454,4500.6054,0.871265,0.459352


In [7]:
# Rows whose fold label is "test" (the held-out period).
oof.loc[oof["fold"] == "test"]

Unnamed: 0,datetime,id,fold,observed_max,generation,nv2,pred
690,2014-07-01 07:00:00,5.640000e+02,test,5150.5454,1845.2872,0.357226,0.396203
691,2014-07-01 07:30:00,5.640000e+02,test,5150.5454,2830.1376,0.547882,0.417653
692,2014-07-01 08:00:00,5.640000e+02,test,5150.5454,1342.0332,0.259802,0.449809
693,2014-07-01 08:30:00,5.640000e+02,test,5150.5454,2931.8792,0.567578,0.408370
694,2014-07-01 09:00:00,5.640000e+02,test,5150.5454,3960.7984,0.766764,0.453815
...,...,...,...,...,...,...,...
547165,2014-07-31 16:00:00,6.950000e+09,test,18880.4616,3977.2572,0.205877,0.409674
547166,2014-07-31 16:30:00,6.950000e+09,test,18880.4616,5679.7408,0.294004,0.383995
547167,2014-07-31 17:00:00,6.950000e+09,test,18880.4616,4034.7046,0.208851,0.403061
547168,2014-07-31 17:30:00,6.950000e+09,test,18880.4616,2863.2240,0.148211,0.377709
