# LGBM001　ベースラインモデル
- オプティカルフローの予測値を使いたい。
- フローの速さを用いたい。
- 過去の発電量をid指定で用いたい。
- Interpolated_Dataset(Dataset)などをutils.pyにまとめたい。

- observed_maxを修正する。

In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
import utils

class CFG:
    DEBUG = False
    note_num = "StackingOpt/LGBM002"
    n_splits = 10  #データの分割
    seed = 42
    
    #model
    lgb_params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'num_boost_round':10000,
                'early_stopping_rounds':100,
                'num_leaves': 31,
                'learning_rate': 0.05,
                'feature_fraction': 0.9,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'verbose': 0,
                'device_type': 'gpu',
                'seed':42
                }
    
    #日付
    train_start_date = "201406010000"
    train_end_date = "201407010000"
    test_start_date = "201407010000"
    test_end_date = "201408010000"

    # train_start_date = "201308150000"
    # train_end_date = "20130817000"
    # test_start_date = "201308170000"
    # test_end_date = "20130818000"

    #特徴量
    interpolated_dir = "H:\study\output\StackingOpt\EDA005"
    pred_dir = "H:\study\output\StackingOpt\EDA006"

    features = ['two_weeks_max', 'id', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'year_sin', 'year_cos',\
                'prev_30m_generation', 'id_lat', 'id_lng', 'id_lat_mesh', 'id_lng_mesh', 'pvrate', 'observed_max']

    use_interpolated_features = True
    use_pred_features = True

    target = 'nv2'

    #oofで保存するcol
    saved_cols = ["datetime","id","fold","observed_max","generation",target,"pred"]



def main(cfg):

    if cfg.DEBUG:
        OUTPUT_DIR = f'H:/study/output/DEBUG/{cfg.note_num}/'
    else:
        OUTPUT_DIR = f'H:/study/output/{cfg.note_num}/'

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    utils.set_seeds()
    # 時系列の分割設定
    train_date_list = utils.create_time_series_data(cfg.train_start_date,cfg.train_end_date)
    train_date_list_split = np.array_split(train_date_list, cfg.n_splits)

    test_dates = utils.create_time_series_data(cfg.test_start_date,cfg.test_end_date)

    #データセットの読み込み
    unique_id = ['10000095', '10000269', '1020000002', '1110000001', '1110000010', '1110000011', '1110000012', '1110000013', '1110000014', '1110000015', '1160000025', '1160000090', '1160000091', '1160000182', '1160000185', '1160000253', '1160000387', '1160000402', '1160000419', '1160000420', '1160000423', '1270000026', '1280000048', '1550000001', '1650000004', '1680000001', '1680000002', '1680000003', '1680000004', '1680000010', '1680000017', '1680000021', '1680000033', '1680000047', '1680000054', '1680000057', '1680000063', '1680000067', '1680000080', '1680000081', '1680000097', '1680000107', '1680000108', '1680000112', '1680000151', '1680000152', '1680000213', '1680000216', '1680000217', '1680000218', '1680000223', '1680000228', '1680000285', '1680000287', '1680000327', '1680000364', '2220000001', '2220000002', '2220000003', '2730000001', '2910000002', '3000000007', '3000000012', '3000000042', '5000000044', '5000000045', '6000000016', '6000000017', '6060000016', '6060000017', '6060000018', '6170000016', '6170000123', '6170000124', '6170000125', '6620000065', '6620000066', '6620000088', '6620000089', '6620000111', '6620000117', '6620000118', '6620000121', '6620000122', '6620000123', '6620000124', '6620000131', '6620000132', '6910000180', '6910000198', '6910000200', '6910000206', '6910000216', '6910000217', '6910000239', '6910000240', '6910000249', '6910000250', '6910000421', '6910000424', '6910000425', '6910000469', '6910000470']
    unique_id = [int(i) for i in unique_id]
    to_unique_id = [str(num).zfill(10) for num in unique_id]
    df = utils.get_preprocessing_data(to_unique_id)
    df.drop_duplicates(inplace=True)
    df.drop_duplicates(subset=["id","datetime"],inplace=True)
    df = df.groupby('id').apply(utils.prev_30m_generation).reset_index(level=0, drop=True)
    id_all_data = pd.read_csv("H:\study\preprocessing_data\id_all_data.csv",encoding='shift_jis')
    df = df.merge(id_all_data,on=["id"],how="left")
    df.dropna(subset=["year"],inplace=True) #utils.prev_30m_generationで30分間隔のデータセットになっているため欠損が出ている。
    df["nv2"] = df["generation"] / df["observed_max"]

    #oof作成用
    df["pred"] = 0
    df.loc[df.datetime.isin(test_dates),"fold"] = "test"
    
    for fold in range(len(train_date_list_split)):
        print(f"\nFold {fold + 1}")
        train_dates = np.concatenate(train_date_list_split[:fold] + train_date_list_split[fold+1:])
        valid_dates = train_date_list_split[fold]

        X_train, y_train = df.loc[df.datetime.isin(train_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(train_dates),cfg.target]
        X_valid, y_valid = df.loc[df.datetime.isin(valid_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(valid_dates),cfg.target]
        X_test, y_test = df.loc[df.datetime.isin(test_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(test_dates),cfg.target]
        df.loc[df.datetime.isin(valid_dates),"fold"] = fold

        if cfg.use_interpolated_features:
            X_train_interpolated, y_train_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,train_dates)
            X_valid_interpolated, y_valid_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,valid_dates)
            X_test_interpolated, y_test_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,test_dates)
            
            X_train_interpolated = pd.DataFrame(X_train_interpolated)
            X_train_interpolated["datetime"] = pd.to_datetime(train_dates, format="%Y%m%d%H%M")

            X_valid_interpolated = pd.DataFrame(X_valid_interpolated)
            X_valid_interpolated["datetime"] = pd.to_datetime(valid_dates, format="%Y%m%d%H%M")

            X_test_interpolated = pd.DataFrame(X_test_interpolated)
            X_test_interpolated["datetime"] = pd.to_datetime(test_dates, format="%Y%m%d%H%M")

            X_train = X_train.merge(X_train_interpolated,on=["datetime"],how="left")
            X_valid = X_valid.merge(X_valid_interpolated,on=["datetime"],how="left")
            X_test = X_test.merge(X_test_interpolated,on=["datetime"],how="left")

        if cfg.use_pred_features:
            X_train_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,train_dates)
            X_valid_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,valid_dates)
            X_test_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,test_dates)
            
            X_train_interpolated_pred  = pd.DataFrame(X_train_interpolated_pred )
            X_train_interpolated_pred ["datetime"] = pd.to_datetime(train_dates, format="%Y%m%d%H%M")

            X_valid_interpolated_pred = pd.DataFrame(X_valid_interpolated_pred)
            X_valid_interpolated_pred["datetime"] = pd.to_datetime(valid_dates, format="%Y%m%d%H%M")

            X_test_interpolated_pred = pd.DataFrame(X_test_interpolated_pred)
            X_test_interpolated_pred["datetime"] = pd.to_datetime(test_dates, format="%Y%m%d%H%M")

            X_train = X_train.merge(X_train_interpolated_pred,on=["datetime"],how="left")
            X_valid = X_valid.merge(X_valid_interpolated_pred,on=["datetime"],how="left")
            X_test = X_test.merge(X_test_interpolated_pred,on=["datetime"],how="left")

        X_train.drop("datetime",axis=1,inplace=True)
        X_valid.drop("datetime",axis=1,inplace=True)
        X_test.drop("datetime",axis=1,inplace=True)

        # Train LightGBM model
        model = utils.train_lgbm(X_train, y_train, X_valid, y_valid, cfg.lgb_params)
        save_path = OUTPUT_DIR + f"/lgbm_fold{fold}.txt"
        model.save_model(save_path)

        # Evaluate model
        valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
        mse = utils.compute_mse(y_valid, valid_preds)
        mae = utils.compute_mae(y_valid, valid_preds)
        print(f"Fold {fold + 1} MSE: {mse}, MAE: {mae}")

        # Make predictions for the test set
        test_preds = model.predict(X_test, num_iteration=model.best_iteration)

        df.loc[df.datetime.isin(valid_dates),"pred"] = valid_preds
        df.loc[df.datetime.isin(test_dates),"pred"] += test_preds


    df.loc[df.datetime.isin(test_dates),"pred"] /= len(train_date_list_split)
    df.loc[df.datetime.isin(train_date_list+test_dates),cfg.saved_cols].to_csv(OUTPUT_DIR+"oof.csv",index=False)

    oof_mse = utils.compute_mse(df.loc[df.datetime.isin(train_date_list),cfg.target] , df.loc[df.datetime.isin(train_date_list),"pred"])
    test_mse = utils.compute_mse(df.loc[df.datetime.isin(test_dates),cfg.target] , df.loc[df.datetime.isin(test_dates),"pred"])

    oof_mae = utils.compute_mae(df.loc[df.datetime.isin(train_date_list),cfg.target] , df.loc[df.datetime.isin(train_date_list),"pred"])
    test_mae = utils.compute_mae(df.loc[df.datetime.isin(test_dates),cfg.target] , df.loc[df.datetime.isin(test_dates),"pred"])

    print('-'*40)
    print(f"Overall Out-of-Fold RMSE: {np.sqrt(oof_mse):.4f}")
    print(f"Overall Out-of-Fold MAE: {oof_mae:.4f}")
    print()
    print(f"Overall Test RMSE: {np.sqrt(test_mse):.4f}")
    print(f"Overall Test MAE: {test_mae:.4f}")
    print('-'*40)
    
    
if __name__ == "__main__":
    main(CFG)


Fold 1




[1]	valid_0's l1: 0.290026
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.277227
[3]	valid_0's l1: 0.264772
[4]	valid_0's l1: 0.253809
[5]	valid_0's l1: 0.242767
[6]	valid_0's l1: 0.232327
[7]	valid_0's l1: 0.222566
[8]	valid_0's l1: 0.213745
[9]	valid_0's l1: 0.204832
[10]	valid_0's l1: 0.19646
[11]	valid_0's l1: 0.188774
[12]	valid_0's l1: 0.181755
[13]	valid_0's l1: 0.174528
[14]	valid_0's l1: 0.167735
[15]	valid_0's l1: 0.161466
[16]	valid_0's l1: 0.155667
[17]	valid_0's l1: 0.150006
[18]	valid_0's l1: 0.14433
[19]	valid_0's l1: 0.138989
[20]	valid_0's l1: 0.133907
[21]	valid_0's l1: 0.129207
[22]	valid_0's l1: 0.124871
[23]	valid_0's l1: 0.12058
[24]	valid_0's l1: 0.116477
[25]	valid_0's l1: 0.113037
[26]	valid_0's l1: 0.109369
[27]	valid_0's l1: 0.105735
[28]	valid_0's l1: 0.10261
[29]	valid_0's l1: 0.0995761
[30]	valid_0's l1: 0.0969479
[31]	valid_0's l1: 0.0941792
[32]	valid_0's l1: 0.0914066
[33]	valid_0's l1: 0.0890568
[34]	valid_0's l1: 0.0



[1]	valid_0's l1: 0.222376
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.213356
[3]	valid_0's l1: 0.204717
[4]	valid_0's l1: 0.196862
[5]	valid_0's l1: 0.188882
[6]	valid_0's l1: 0.18176
[7]	valid_0's l1: 0.175017
[8]	valid_0's l1: 0.16824
[9]	valid_0's l1: 0.162405
[10]	valid_0's l1: 0.156423
[11]	valid_0's l1: 0.150787
[12]	valid_0's l1: 0.145302
[13]	valid_0's l1: 0.140166
[14]	valid_0's l1: 0.13544
[15]	valid_0's l1: 0.130996
[16]	valid_0's l1: 0.126628
[17]	valid_0's l1: 0.122543
[18]	valid_0's l1: 0.118543
[19]	valid_0's l1: 0.114856
[20]	valid_0's l1: 0.111398
[21]	valid_0's l1: 0.107979
[22]	valid_0's l1: 0.104757
[23]	valid_0's l1: 0.10187
[24]	valid_0's l1: 0.0989791
[25]	valid_0's l1: 0.0964561
[26]	valid_0's l1: 0.0941038
[27]	valid_0's l1: 0.0918166
[28]	valid_0's l1: 0.0896815
[29]	valid_0's l1: 0.0876775
[30]	valid_0's l1: 0.0857816
[31]	valid_0's l1: 0.0839329
[32]	valid_0's l1: 0.0820878
[33]	valid_0's l1: 0.0803239
[34]	valid_0's l1



[1]	valid_0's l1: 0.198755
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.190729
[3]	valid_0's l1: 0.183407
[4]	valid_0's l1: 0.176458
[5]	valid_0's l1: 0.169654
[6]	valid_0's l1: 0.16319
[7]	valid_0's l1: 0.157289
[8]	valid_0's l1: 0.151349
[9]	valid_0's l1: 0.146052
[10]	valid_0's l1: 0.140865
[11]	valid_0's l1: 0.136093
[12]	valid_0's l1: 0.13184
[13]	valid_0's l1: 0.127455
[14]	valid_0's l1: 0.123161
[15]	valid_0's l1: 0.119186
[16]	valid_0's l1: 0.115393
[17]	valid_0's l1: 0.111832
[18]	valid_0's l1: 0.108533
[19]	valid_0's l1: 0.105453
[20]	valid_0's l1: 0.102382
[21]	valid_0's l1: 0.0996712
[22]	valid_0's l1: 0.0969345
[23]	valid_0's l1: 0.0947823
[24]	valid_0's l1: 0.0923937
[25]	valid_0's l1: 0.0901744
[26]	valid_0's l1: 0.0879951
[27]	valid_0's l1: 0.0859543
[28]	valid_0's l1: 0.084413
[29]	valid_0's l1: 0.0828319
[30]	valid_0's l1: 0.0811645
[31]	valid_0's l1: 0.0796453
[32]	valid_0's l1: 0.0782425
[33]	valid_0's l1: 0.0769945
[34]	valid_0'



[1]	valid_0's l1: 0.224973
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.216411
[3]	valid_0's l1: 0.208305
[4]	valid_0's l1: 0.200317
[5]	valid_0's l1: 0.193165
[6]	valid_0's l1: 0.186385
[7]	valid_0's l1: 0.179114
[8]	valid_0's l1: 0.172629
[9]	valid_0's l1: 0.166602
[10]	valid_0's l1: 0.160815
[11]	valid_0's l1: 0.155251
[12]	valid_0's l1: 0.149894
[13]	valid_0's l1: 0.14514
[14]	valid_0's l1: 0.1406
[15]	valid_0's l1: 0.136177
[16]	valid_0's l1: 0.132118
[17]	valid_0's l1: 0.127934
[18]	valid_0's l1: 0.124226
[19]	valid_0's l1: 0.120477
[20]	valid_0's l1: 0.116957
[21]	valid_0's l1: 0.114027
[22]	valid_0's l1: 0.110807
[23]	valid_0's l1: 0.107871
[24]	valid_0's l1: 0.104977
[25]	valid_0's l1: 0.10233
[26]	valid_0's l1: 0.0999277
[27]	valid_0's l1: 0.0975295
[28]	valid_0's l1: 0.0952251
[29]	valid_0's l1: 0.0930589
[30]	valid_0's l1: 0.0911385
[31]	valid_0's l1: 0.0892101
[32]	valid_0's l1: 0.0873656
[33]	valid_0's l1: 0.0856743
[34]	valid_0's l1: 



[1]	valid_0's l1: 0.332329
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.319086
[3]	valid_0's l1: 0.306265
[4]	valid_0's l1: 0.293695
[5]	valid_0's l1: 0.282319
[6]	valid_0's l1: 0.271168
[7]	valid_0's l1: 0.261162
[8]	valid_0's l1: 0.252187
[9]	valid_0's l1: 0.243439
[10]	valid_0's l1: 0.235267
[11]	valid_0's l1: 0.227795
[12]	valid_0's l1: 0.22074
[13]	valid_0's l1: 0.213258
[14]	valid_0's l1: 0.206553
[15]	valid_0's l1: 0.199934
[16]	valid_0's l1: 0.193523
[17]	valid_0's l1: 0.187307
[18]	valid_0's l1: 0.18211
[19]	valid_0's l1: 0.176673
[20]	valid_0's l1: 0.171515
[21]	valid_0's l1: 0.166833
[22]	valid_0's l1: 0.162385
[23]	valid_0's l1: 0.157813
[24]	valid_0's l1: 0.153431
[25]	valid_0's l1: 0.14969
[26]	valid_0's l1: 0.145502
[27]	valid_0's l1: 0.142524
[28]	valid_0's l1: 0.138866
[29]	valid_0's l1: 0.13573
[30]	valid_0's l1: 0.133546
[31]	valid_0's l1: 0.130752
[32]	valid_0's l1: 0.127711
[33]	valid_0's l1: 0.124952
[34]	valid_0's l1: 0.122708



[1]	valid_0's l1: 0.230302
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.220663
[3]	valid_0's l1: 0.211173
[4]	valid_0's l1: 0.202355
[5]	valid_0's l1: 0.193963
[6]	valid_0's l1: 0.18686
[7]	valid_0's l1: 0.17929
[8]	valid_0's l1: 0.172259
[9]	valid_0's l1: 0.166272
[10]	valid_0's l1: 0.160059
[11]	valid_0's l1: 0.154427
[12]	valid_0's l1: 0.149073
[13]	valid_0's l1: 0.143756
[14]	valid_0's l1: 0.138824
[15]	valid_0's l1: 0.134119
[16]	valid_0's l1: 0.130086
[17]	valid_0's l1: 0.125876
[18]	valid_0's l1: 0.122147
[19]	valid_0's l1: 0.118412
[20]	valid_0's l1: 0.115237
[21]	valid_0's l1: 0.112078
[22]	valid_0's l1: 0.109267
[23]	valid_0's l1: 0.106466
[24]	valid_0's l1: 0.1035
[25]	valid_0's l1: 0.100839
[26]	valid_0's l1: 0.098482
[27]	valid_0's l1: 0.0962438
[28]	valid_0's l1: 0.0941522
[29]	valid_0's l1: 0.0923234
[30]	valid_0's l1: 0.090224
[31]	valid_0's l1: 0.0884455
[32]	valid_0's l1: 0.0870757
[33]	valid_0's l1: 0.0855729
[34]	valid_0's l1: 0.



[1]	valid_0's l1: 0.269291
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.259215
[3]	valid_0's l1: 0.249072
[4]	valid_0's l1: 0.239721
[5]	valid_0's l1: 0.230803
[6]	valid_0's l1: 0.222021
[7]	valid_0's l1: 0.213382
[8]	valid_0's l1: 0.205746
[9]	valid_0's l1: 0.198644
[10]	valid_0's l1: 0.192152
[11]	valid_0's l1: 0.185682
[12]	valid_0's l1: 0.179767
[13]	valid_0's l1: 0.174485
[14]	valid_0's l1: 0.168873
[15]	valid_0's l1: 0.163679
[16]	valid_0's l1: 0.158625
[17]	valid_0's l1: 0.153925
[18]	valid_0's l1: 0.150068
[19]	valid_0's l1: 0.145537
[20]	valid_0's l1: 0.14128
[21]	valid_0's l1: 0.137365
[22]	valid_0's l1: 0.13371
[23]	valid_0's l1: 0.130104
[24]	valid_0's l1: 0.126769
[25]	valid_0's l1: 0.123564
[26]	valid_0's l1: 0.120749
[27]	valid_0's l1: 0.117558
[28]	valid_0's l1: 0.114782
[29]	valid_0's l1: 0.112448
[30]	valid_0's l1: 0.10982
[31]	valid_0's l1: 0.107653
[32]	valid_0's l1: 0.105515
[33]	valid_0's l1: 0.104069
[34]	valid_0's l1: 0.10212



[1]	valid_0's l1: 0.189391
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.182869
[3]	valid_0's l1: 0.176797
[4]	valid_0's l1: 0.171906
[5]	valid_0's l1: 0.167032
[6]	valid_0's l1: 0.162297
[7]	valid_0's l1: 0.157317
[8]	valid_0's l1: 0.15283
[9]	valid_0's l1: 0.148546
[10]	valid_0's l1: 0.144705
[11]	valid_0's l1: 0.141391
[12]	valid_0's l1: 0.138427
[13]	valid_0's l1: 0.135434
[14]	valid_0's l1: 0.132547
[15]	valid_0's l1: 0.129554
[16]	valid_0's l1: 0.126634
[17]	valid_0's l1: 0.123979
[18]	valid_0's l1: 0.121491
[19]	valid_0's l1: 0.1193
[20]	valid_0's l1: 0.117108
[21]	valid_0's l1: 0.114673
[22]	valid_0's l1: 0.112642
[23]	valid_0's l1: 0.110851
[24]	valid_0's l1: 0.108821
[25]	valid_0's l1: 0.106927
[26]	valid_0's l1: 0.105424
[27]	valid_0's l1: 0.104012
[28]	valid_0's l1: 0.102703
[29]	valid_0's l1: 0.101367
[30]	valid_0's l1: 0.0997819
[31]	valid_0's l1: 0.0985253
[32]	valid_0's l1: 0.0972242
[33]	valid_0's l1: 0.0962767
[34]	valid_0's l1: 0.0



[1]	valid_0's l1: 0.20743
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.201931
[3]	valid_0's l1: 0.196582
[4]	valid_0's l1: 0.190926
[5]	valid_0's l1: 0.186605
[6]	valid_0's l1: 0.182367
[7]	valid_0's l1: 0.177387
[8]	valid_0's l1: 0.173209
[9]	valid_0's l1: 0.169146
[10]	valid_0's l1: 0.16538
[11]	valid_0's l1: 0.161762
[12]	valid_0's l1: 0.158783
[13]	valid_0's l1: 0.155772
[14]	valid_0's l1: 0.152639
[15]	valid_0's l1: 0.149876
[16]	valid_0's l1: 0.147528
[17]	valid_0's l1: 0.144877
[18]	valid_0's l1: 0.142724
[19]	valid_0's l1: 0.140378
[20]	valid_0's l1: 0.138231
[21]	valid_0's l1: 0.136262
[22]	valid_0's l1: 0.134431
[23]	valid_0's l1: 0.132288
[24]	valid_0's l1: 0.130261
[25]	valid_0's l1: 0.128479
[26]	valid_0's l1: 0.12706
[27]	valid_0's l1: 0.125557
[28]	valid_0's l1: 0.124273
[29]	valid_0's l1: 0.122801
[30]	valid_0's l1: 0.121516
[31]	valid_0's l1: 0.120113
[32]	valid_0's l1: 0.118831
[33]	valid_0's l1: 0.117696
[34]	valid_0's l1: 0.11649



[1]	valid_0's l1: 0.19118
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l1: 0.184757
[3]	valid_0's l1: 0.178741
[4]	valid_0's l1: 0.172759
[5]	valid_0's l1: 0.167163
[6]	valid_0's l1: 0.161655
[7]	valid_0's l1: 0.156832
[8]	valid_0's l1: 0.152248
[9]	valid_0's l1: 0.147937
[10]	valid_0's l1: 0.143816
[11]	valid_0's l1: 0.13978
[12]	valid_0's l1: 0.135709
[13]	valid_0's l1: 0.132083
[14]	valid_0's l1: 0.128668
[15]	valid_0's l1: 0.125418
[16]	valid_0's l1: 0.122428
[17]	valid_0's l1: 0.11966
[18]	valid_0's l1: 0.116865
[19]	valid_0's l1: 0.114371
[20]	valid_0's l1: 0.111875
[21]	valid_0's l1: 0.109721
[22]	valid_0's l1: 0.107669
[23]	valid_0's l1: 0.105597
[24]	valid_0's l1: 0.103622
[25]	valid_0's l1: 0.102003
[26]	valid_0's l1: 0.100246
[27]	valid_0's l1: 0.0987078
[28]	valid_0's l1: 0.097133
[29]	valid_0's l1: 0.0957503
[30]	valid_0's l1: 0.094584
[31]	valid_0's l1: 0.0932975
[32]	valid_0's l1: 0.0921043
[33]	valid_0's l1: 0.0910746
[34]	valid_0's l1: 0.

# LGBM002.py

In [4]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
import utils

import argparse
import inspect
import importlib

def main(cfg):

    if cfg.DEBUG:
        OUTPUT_DIR = f'H:/study/output/DEBUG/{cfg.note_num}/'
    else:
        OUTPUT_DIR = f'H:/study/output/{cfg.note_num}/'

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    utils.set_seeds()
    # 時系列の分割設定
    train_date_list = utils.create_time_series_data(cfg.train_start_date,cfg.train_end_date)
    train_date_list_split = np.array_split(train_date_list, cfg.n_splits)

    test_dates = utils.create_time_series_data(cfg.test_start_date,cfg.test_end_date)

    #データセットの読み込み
    unique_id = ['10000095', '10000269', '1020000002', '1110000001', '1110000010', '1110000011', '1110000012', '1110000013', '1110000014', '1110000015', '1160000025', '1160000090', '1160000091', '1160000182', '1160000185', '1160000253', '1160000387', '1160000402', '1160000419', '1160000420', '1160000423', '1270000026', '1280000048', '1550000001', '1650000004', '1680000001', '1680000002', '1680000003', '1680000004', '1680000010', '1680000017', '1680000021', '1680000033', '1680000047', '1680000054', '1680000057', '1680000063', '1680000067', '1680000080', '1680000081', '1680000097', '1680000107', '1680000108', '1680000112', '1680000151', '1680000152', '1680000213', '1680000216', '1680000217', '1680000218', '1680000223', '1680000228', '1680000285', '1680000287', '1680000327', '1680000364', '2220000001', '2220000002', '2220000003', '2730000001', '2910000002', '3000000007', '3000000012', '3000000042', '5000000044', '5000000045', '6000000016', '6000000017', '6060000016', '6060000017', '6060000018', '6170000016', '6170000123', '6170000124', '6170000125', '6620000065', '6620000066', '6620000088', '6620000089', '6620000111', '6620000117', '6620000118', '6620000121', '6620000122', '6620000123', '6620000124', '6620000131', '6620000132', '6910000180', '6910000198', '6910000200', '6910000206', '6910000216', '6910000217', '6910000239', '6910000240', '6910000249', '6910000250', '6910000421', '6910000424', '6910000425', '6910000469', '6910000470']
    unique_id = [int(i) for i in unique_id]
    to_unique_id = [str(num).zfill(10) for num in unique_id]
    df = utils.get_preprocessing_data(to_unique_id)
    df.drop_duplicates(inplace=True)
    df.drop_duplicates(subset=["id","datetime"],inplace=True)
    df = df.groupby('id').apply(utils.prev_30m_generation).reset_index(level=0, drop=True)
    id_all_data = pd.read_csv("H:\study\preprocessing_data\id_all_data.csv",encoding='shift_jis')
    df = df.merge(id_all_data,on=["id"],how="left")
    df.dropna(subset=["year"],inplace=True) #utils.prev_30m_generationで30分間隔のデータセットになっているため欠損が出ている。
    df["nv2"] = df["generation"] / df["observed_max"]

    #oof作成用
    df["pred"] = 0
    df.loc[df.datetime.isin(test_dates),"fold"] = "test"
    
    for fold in range(len(train_date_list_split)):
        print(f"\nFold {fold + 1}")
        train_dates = np.concatenate(train_date_list_split[:fold] + train_date_list_split[fold+1:])
        valid_dates = train_date_list_split[fold]

        X_train, y_train = df.loc[df.datetime.isin(train_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(train_dates),cfg.target]
        X_valid, y_valid = df.loc[df.datetime.isin(valid_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(valid_dates),cfg.target]
        X_test, y_test = df.loc[df.datetime.isin(test_dates),cfg.features+["datetime"]],df.loc[df.datetime.isin(test_dates),cfg.target]
        df.loc[df.datetime.isin(valid_dates),"fold"] = fold

        if cfg.use_interpolated_features:
            X_train_interpolated, y_train_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,train_dates)
            X_valid_interpolated, y_valid_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,valid_dates)
            X_test_interpolated, y_test_interpolated = utils.get_interpolated_mesh_data(cfg.interpolated_dir,test_dates)
            
            X_train_interpolated = pd.DataFrame(X_train_interpolated)
            X_train_interpolated["datetime"] = pd.to_datetime(train_dates, format="%Y%m%d%H%M")

            X_valid_interpolated = pd.DataFrame(X_valid_interpolated)
            X_valid_interpolated["datetime"] = pd.to_datetime(valid_dates, format="%Y%m%d%H%M")

            X_test_interpolated = pd.DataFrame(X_test_interpolated)
            X_test_interpolated["datetime"] = pd.to_datetime(test_dates, format="%Y%m%d%H%M")

            X_train = X_train.merge(X_train_interpolated,on=["datetime"],how="left")
            X_valid = X_valid.merge(X_valid_interpolated,on=["datetime"],how="left")
            X_test = X_test.merge(X_test_interpolated,on=["datetime"],how="left")

        if cfg.use_pred_features:
            X_train_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,train_dates)
            X_valid_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,valid_dates)
            X_test_interpolated_pred = utils.get_pred_interpolated_mesh_data(cfg.pred_dir,test_dates)
            
            X_train_interpolated_pred  = pd.DataFrame(X_train_interpolated_pred )
            X_train_interpolated_pred ["datetime"] = pd.to_datetime(train_dates, format="%Y%m%d%H%M")

            X_valid_interpolated_pred = pd.DataFrame(X_valid_interpolated_pred)
            X_valid_interpolated_pred["datetime"] = pd.to_datetime(valid_dates, format="%Y%m%d%H%M")

            X_test_interpolated_pred = pd.DataFrame(X_test_interpolated_pred)
            X_test_interpolated_pred["datetime"] = pd.to_datetime(test_dates, format="%Y%m%d%H%M")

            X_train = X_train.merge(X_train_interpolated_pred,on=["datetime"],how="left")
            X_valid = X_valid.merge(X_valid_interpolated_pred,on=["datetime"],how="left")
            X_test = X_test.merge(X_test_interpolated_pred,on=["datetime"],how="left")

        X_train.drop("datetime",axis=1,inplace=True)
        X_valid.drop("datetime",axis=1,inplace=True)
        X_test.drop("datetime",axis=1,inplace=True)

        # Train LightGBM model
        model = utils.train_lgbm(X_train, y_train, X_valid, y_valid, cfg.lgb_params)
        save_path = OUTPUT_DIR + f"/lgbm_fold{fold}.txt"
        model.save_model(save_path)

        # Evaluate model
        valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
        mse = utils.compute_mse(y_valid, valid_preds)
        mae = utils.compute_mae(y_valid, valid_preds)
        print(f"Fold {fold + 1} MSE: {mse}, MAE: {mae}")

        # Make predictions for the test set
        test_preds = model.predict(X_test, num_iteration=model.best_iteration)

        df.loc[df.datetime.isin(valid_dates),"pred"] = valid_preds
        df.loc[df.datetime.isin(test_dates),"pred"] += test_preds


    df.loc[df.datetime.isin(test_dates),"pred"] /= len(train_date_list_split)
    df.loc[df.datetime.isin(train_date_list+test_dates),cfg.saved_cols].to_csv(OUTPUT_DIR+"oof.csv",index=False)

    oof_mse = utils.compute_mse(df.loc[df.datetime.isin(train_date_list),cfg.target] , df.loc[df.datetime.isin(train_date_list),"pred"])
    test_mse = utils.compute_mse(df.loc[df.datetime.isin(test_dates),cfg.target] , df.loc[df.datetime.isin(test_dates),"pred"])

    oof_mae = utils.compute_mae(df.loc[df.datetime.isin(train_date_list),cfg.target] , df.loc[df.datetime.isin(train_date_list),"pred"])
    test_mae = utils.compute_mae(df.loc[df.datetime.isin(test_dates),cfg.target] , df.loc[df.datetime.isin(test_dates),"pred"])

    print('-'*40)
    print(f"Overall Out-of-Fold RMSE: {np.sqrt(oof_mse):.4f}")
    print(f"Overall Out-of-Fold MAE: {oof_mae:.4f}")
    print()
    print(f"Overall Test RMSE: {np.sqrt(test_mse):.4f}")
    print(f"Overall Test MAE: {test_mae:.4f}")
    print('-'*40)
    
    
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="CFG1", help="設定クラスを選択（CFG1 または CFG2 など）")
    args = parser.parse_args()

    config_module = importlib.import_module("LGBM_config")
    cfg_classes = {name: cls for name, cls in inspect.getmembers(config_module, inspect.isclass) if name.startswith("CFG")}

    if args.config in cfg_classes:
        cfg = cfg_classes[args.config]()
    else:
        raise ValueError("無効な設定クラス名が指定されました。")

    main(cfg)

usage: ipykernel_launcher.py [-h] [--config CONFIG]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9063 --control=9061 --hb=9060 --Session.signature_scheme="hmac-sha256" --Session.key=b"436d0c35-fa32-4092-9e4a-6ea5b07f2915" --shell=9062 --transport="tcp" --iopub=9064 --f=c:\Users\Yosui\AppData\Roaming\jupyter\runtime\kernel-v2-9864RwtWACbA1e3c.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
