In [1]:
# !pip install autogluon

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [3]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('./electric_train.csv', index_col=0)
df_train.columns = [i.split('.')[1] for i in df_train.columns]

In [4]:
df_train

Unnamed: 0,num,tm,hh24,n,stn,sum_qctr,sum_load,n_mean_load,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,week_name,elec
1,4821,2021-01-01 01:00:00,1,11,884,6950,751.32,68.606449,2.2,62.7,1.8,0.0,-1.0,4,0,99.56
2,4821,2021-01-01 02:00:00,2,11,884,6950,692.60,68.606449,2.3,63.1,2.1,0.0,-0.6,4,0,91.78
3,4821,2021-01-01 03:00:00,3,11,884,6950,597.48,68.606449,2.2,62.4,2.5,0.0,-1.3,4,0,79.17
4,4821,2021-01-01 04:00:00,4,11,884,6950,553.48,68.606449,1.7,63.5,1.7,0.0,-0.2,4,0,73.34
5,4821,2021-01-01 05:00:00,5,11,884,6950,526.24,68.606449,1.7,63.0,1.6,0.0,-0.8,4,0,69.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7593351,20947,2022-12-31 20:00:00,20,23,671,34200,6779.84,225.461986,2.7,46.3,3.1,0.0,-0.4,5,1,130.74
7593352,20947,2022-12-31 21:00:00,21,23,671,34200,6802.40,225.461986,2.6,46.8,3.1,0.0,-0.5,5,1,131.18
7593353,20947,2022-12-31 22:00:00,22,23,671,34200,6706.68,225.461986,2.4,47.4,2.1,0.0,0.2,5,1,129.33
7593354,20947,2022-12-31 23:00:00,23,23,671,34200,6355.88,225.461986,2.5,47.0,2.1,0.0,0.3,5,1,122.57


In [5]:
# !pip install holidays



In [6]:
import holidays

# CDD 계산 함수
def calculate_cdd(temperature, ref_CDD=24):
    return np.maximum(0, temperature - ref_CDD)
# HDD 계산 함수
def calculate_hdd(temperature, ref_HDD=18):
    return np.maximum(0, ref_HDD - temperature)

def eda_df(df:pd.DataFrame) -> pd.DataFrame:

    # nph_ws_10m 결측치 이전과 앞 시간의 평균으로 대치
    tmp_df = df[df['nph_ws_10m']==-99]
    error_times = tmp_df.tm.value_counts().index
    just_1hour = pd.tseries.offsets.DateOffset(hours=1)

    for error_time in error_times:
        error_time_ts = pd.to_datetime(error_time)
        before_time = (error_time_ts-just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        after_time = (error_time_ts+just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        tmp_df2 = df[df.num.isin(tmp_df[tmp_df.tm==error_time].num.unique())]
        tmp_df2 = tmp_df2[tmp_df2.tm.isin([before_time, after_time])].pivot_table('nph_ws_10m', 'num', 'tm', 'max')
        for error_num, value in zip(tmp_df2.index, tmp_df2.mean(axis=1).values):
            tmp_df3 = df[df.num==error_num]
            df.loc[tmp_df3[tmp_df3.nph_ws_10m==-99].index, 'nph_ws_10m'] = value

    # nph_rn_60m 결측치 이전과 앞 시간의 평균으로 대치

    tmp_df = df[df.nph_rn_60m==-99]
    error_times = tmp_df.tm.value_counts().index
    just_1hour = pd.tseries.offsets.DateOffset(hours=1)

    for error_time in error_times:
        error_time_ts = pd.to_datetime(error_time)
        before_time = (error_time_ts-just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        after_time = (error_time_ts+just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        tmp_df2 = df[df.num.isin(tmp_df[tmp_df.tm==error_time].num.unique())]
        tmp_df2 = tmp_df2[tmp_df2.tm.isin([before_time, after_time])].pivot_table('nph_rn_60m', 'num', 'tm', 'max')
        for error_num, value in zip(tmp_df2.index, tmp_df2.mean(axis=1).values):
            tmp_df3 = df[df.num==error_num]
            df.loc[tmp_df3[tmp_df3.nph_rn_60m==-99].index, 'nph_rn_60m'] = value
    
    # 결측치 rows 제외
    for col in df.select_dtypes(exclude=object).columns:
        df = df[df[col]>-99]

    df['day'] = df['tm'].apply(lambda x: x[:10])
    df['day'] = pd.to_datetime(df['day'])
    df['time'] = df['tm'].apply(lambda x: int(x[11:13]))

    df.loc[df['time']==0, 'day'] = df.loc[df['time'] == 0, 'day'] - pd.DateOffset(days=1)
    df.loc[df['time']==0, 'time'] = 24

    # 냉방도일 기준 온도 설정
    ref_CDD = 24
    # CDD 컬럼 추가
    df['cdd'] = calculate_cdd(df['nph_ta'], ref_CDD)
    # 일 단위로 CDD 합산
    df['accumulate_cdd'] = df.groupby(['num', 'day'])['cdd'].transform('sum')

    # 난방도일 기준 온도 설정
    ref_HDD = 18
    # HDD 컬럼 추가
    df['hdd'] = calculate_hdd(df['nph_ta'], ref_HDD)
    # 일 단위로 HDD 합산
    df['accumulate_hdd'] = df.groupby(['num', 'day'])['hdd'].transform('sum')

    # 불쾌지수
    df['thi'] = (1.8*df['nph_ta']) - (0.55*(1-df['nph_hm']/100)*(1.8*df['nph_ta']-26)) + 32

    # 휴일
    df['평일/주말'] = df['day'].dt.weekday
    df['평일/주말'] = df['평일/주말'].apply(lambda x: 1 if x in [5,6] else 0)
    df['holiday'] = df['day'].dt.date.isin(holidays.CountryHoliday('KR')).astype(int)
    df['주말'] = (df['day'].dt.weekday >= 5).astype(int)

    # 개인 더미 변수
    # df['year'] = df.tm.str[:4].astype(int)
    # df['dawn'] = df.hh24.apply(lambda x: 1 if x in [1,2,3,4,5,6,20,21,22,23,24] else 0)
    # df['morning'] = df.hh24.apply(lambda x: 1 if x in [7,8,9,10,11,12,13] else 0)
    # df['afternoon'] = df.hh24.apply(lambda x: 1 if x in [14,15,16,17,18,19] else 0)
    # df['summer'] = df.tm.str[5:7].astype(int).apply(lambda x: 1 if x in [5,6,7,8,9] else 0)
    # df['winter'] = df.tm.str[5:7].astype(int).apply(lambda x: 1 if x in [1,2,3,11,12] else 0)
    return df

use_cols = ['num', 'hh24', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m', 'nph_ta_chi', 'weekday', 
            'cdd', 'accumulate_cdd', 'hdd','accumulate_hdd', 'thi', '평일/주말', 'holiday', '주말', 
            # 'year', 'dawn', 'morning', 'afternoon', 'summer', 'winter',
            'elec']
df_train = eda_df(df_train)[use_cols]
df_train.columns

Index(['num', 'hh24', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m',
       'nph_ta_chi', 'weekday', 'cdd', 'accumulate_cdd', 'hdd',
       'accumulate_hdd', 'thi', '평일/주말', 'holiday', '주말', 'elec'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# 실제 코드에서는 df = pd.read_csv('your_file.csv') 와 같이 로드합니다.
df = df_train.iloc[:,1:]

# 특성과 레이블 분리
X = df
y = df['elec'].values

# # 데이터 정규화
# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from autogluon.tabular import TabularDataset, TabularPredictor
train_data = TabularDataset(X_train)
save_path = 'automodel'
predictor =  TabularPredictor(
    label = 'elec',
    path=save_path,
    eval_metric = 'pearsonr',
    problem_type = 'regression'
)
predictor.fit(
    train_data = train_data,
    # presets = 'best_quality',
    time_limit = 3600*8
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.7
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
Memory Avail:       20.58 GB / 31.68 GB (65.0%)
Disk Space Avail:   358.29 GB / 453.79 GB (79.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ... Time limit = 28800s
AutoGluon will save models to "automodel"
Train Data Rows:    6074680

[1000]	valid_set's l2: 46.8038	valid_set's pearsonr: 0.961971
[2000]	valid_set's l2: 45.6001	valid_set's pearsonr: 0.962968
[3000]	valid_set's l2: 44.9152	valid_set's pearsonr: 0.963535
[4000]	valid_set's l2: 44.4159	valid_set's pearsonr: 0.963948
[5000]	valid_set's l2: 44.03	valid_set's pearsonr: 0.964267
[6000]	valid_set's l2: 43.6935	valid_set's pearsonr: 0.964545
[7000]	valid_set's l2: 43.3961	valid_set's pearsonr: 0.964791
[8000]	valid_set's l2: 43.1244	valid_set's pearsonr: 0.965015
[9000]	valid_set's l2: 42.895	valid_set's pearsonr: 0.965205
[10000]	valid_set's l2: 42.686	valid_set's pearsonr: 0.965377


	0.9654	 = Validation score   (pearsonr)
	464.45s	 = Training   runtime
	5.22s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 28180.47s of the 28180.47s of remaining time.


[1000]	valid_set's l2: 40.7982	valid_set's pearsonr: 0.966935
[2000]	valid_set's l2: 39.2488	valid_set's pearsonr: 0.968211
[3000]	valid_set's l2: 38.3002	valid_set's pearsonr: 0.968992
[4000]	valid_set's l2: 37.519	valid_set's pearsonr: 0.969635
[5000]	valid_set's l2: 36.9296	valid_set's pearsonr: 0.970119
[6000]	valid_set's l2: 36.4173	valid_set's pearsonr: 0.97054
[7000]	valid_set's l2: 35.9239	valid_set's pearsonr: 0.970946
[8000]	valid_set's l2: 35.5403	valid_set's pearsonr: 0.971261
[9000]	valid_set's l2: 35.1778	valid_set's pearsonr: 0.971559
[10000]	valid_set's l2: 34.8029	valid_set's pearsonr: 0.971867


	0.9719	 = Validation score   (pearsonr)
	371.37s	 = Training   runtime
	4.13s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 27804.42s of the 27804.42s of remaining time.
	0.9707	 = Validation score   (pearsonr)
	1558.69s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 26245.05s of the 26245.05s of remaining time.
	0.969	 = Validation score   (pearsonr)
	1636.98s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesMSE ... Training model for up to 24607.96s of the 24607.96s of remaining time.
	0.969	 = Validation score   (pearsonr)
	765.36s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 23841.92s of the 23841.92s of remaining time.
Metric pearsonr is not supported by this model - using mean_squared_error instead
	0.9656	 = Validation score   (pearsonr)
	3751.04s	 = Training   runtime
	0.38s	 = Validation runti

[1000]	valid_set's l2: 38.6846	valid_set's pearsonr: 0.968677
[2000]	valid_set's l2: 36.9849	valid_set's pearsonr: 0.970075
[3000]	valid_set's l2: 35.7414	valid_set's pearsonr: 0.971096
[4000]	valid_set's l2: 34.7821	valid_set's pearsonr: 0.971884
[5000]	valid_set's l2: 33.9569	valid_set's pearsonr: 0.972562
[6000]	valid_set's l2: 33.2859	valid_set's pearsonr: 0.973112
[7000]	valid_set's l2: 32.7756	valid_set's pearsonr: 0.973531
[8000]	valid_set's l2: 32.2596	valid_set's pearsonr: 0.973953
[9000]	valid_set's l2: 31.8066	valid_set's pearsonr: 0.974324
[10000]	valid_set's l2: 31.4091	valid_set's pearsonr: 0.97465


	0.9746	 = Validation score   (pearsonr)
	559.4s	 = Training   runtime
	5.52s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 2879.19s of the 13344.38s of remaining time.
	Ensemble Weights: {'LightGBMLarge': 0.52, 'KNeighborsDist': 0.44, 'XGBoost': 0.04}
	0.9794	 = Validation score   (pearsonr)
	0.4s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 15457.85s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 3807.0 rows/s (60747 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("automodel")


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x241169bb790>

In [9]:
predictor.leaderboard()
test_data = TabularDataset(X_test)
train_pred = predictor.predict(train_data.drop(columns=['elec']))
test_pred = predictor.predict(test_data.drop(columns=['elec']))

from sklearn.metrics import r2_score
print(r2_score(y_train, train_pred))
print(r2_score(y_test, test_pred))

0.9821756069825591
0.9591286539348302


In [10]:
predictor.evaluate(test_data, silent=True)

{'pearsonr': 0.9793953491589603,
 'root_mean_squared_error': -5.082585450507701,
 'mean_squared_error': -25.832674861712576,
 'mean_absolute_error': -3.5456218946052824,
 'r2': 0.9591286539348302,
 'median_absolute_error': -2.5330589294433565}

In [11]:
predictor.evaluate(train_data, silent=True)

{'pearsonr': 0.9911412134553057,
 'root_mean_squared_error': -3.352999685997801,
 'mean_squared_error': -11.24260689430135,
 'mean_absolute_error': -2.4170664479175783,
 'r2': 0.9821756069825591,
 'median_absolute_error': -1.7901092529296818}

In [12]:
predictor.leaderboard(test_data)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.979395,0.979409,pearsonr,397.348872,15.956706,2234.36745,0.039694,0.001992,0.395796,2,True,12
1,LightGBMLarge,0.974841,0.97465,pearsonr,139.727816,5.520766,559.402981,139.727816,5.520766,559.402981,1,True,11
2,XGBoost,0.973421,0.973246,pearsonr,192.040208,7.454323,1608.219734,192.040208,7.454323,1608.219734,1,True,9
3,LightGBM,0.971963,0.971867,pearsonr,100.531869,4.132765,371.369265,100.531869,4.132765,371.369265,1,True,4
4,KNeighborsDist,0.971725,0.971853,pearsonr,65.541154,2.979624,66.348938,65.541154,2.979624,66.348938,1,True,2
5,RandomForestMSE,0.970732,0.970733,pearsonr,7.129673,0.261278,1558.685398,7.129673,0.261278,1558.685398,1,True,5
6,CatBoost,0.969084,0.969034,pearsonr,1.139276,0.043678,1636.97957,1.139276,0.043678,1636.97957,1,True,6
7,ExtraTreesMSE,0.969028,0.969035,pearsonr,7.586999,0.257333,765.361213,7.586999,0.257333,765.361213,1,True,7
8,NeuralNetFastAI,0.965525,0.965606,pearsonr,9.658902,0.375,3751.039418,9.658902,0.375,3751.039418,1,True,8
9,LightGBMXT,0.965483,0.965377,pearsonr,130.544851,5.222679,464.447529,130.544851,5.222679,464.447529,1,True,3


In [13]:
df_test = pd.read_csv('electric_test.csv')
df_test

Unnamed: 0,NUM,TM,HH24,STN,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,week_name,elect
0,4816,2023-01-01 01:00:00,1,752,3.0,68.6,2.9,0.0,-0.1,6,1.0,
1,4816,2023-01-01 02:00:00,2,752,3.1,69.4,2.7,0.0,0.3,6,1.0,
2,4816,2023-01-01 03:00:00,3,752,3.6,68.3,2.3,0.0,1.2,6,1.0,
3,4816,2023-01-01 04:00:00,4,752,4.0,69.2,3.1,0.0,1.1,6,1.0,
4,4816,2023-01-01 05:00:00,5,752,4.2,69.5,2.5,0.0,2.0,6,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
2829473,12322,2023-12-31 20:00:00,20,901,5.6,70.8,3.2,0.0,5.8,6,1.0,
2829474,12322,2023-12-31 21:00:00,21,901,5.3,69.1,3.6,0.0,4.5,6,1.0,
2829475,12322,2023-12-31 22:00:00,22,901,5.1,70.6,3.0,0.0,5.1,6,1.0,
2829476,12322,2023-12-31 23:00:00,23,901,5.2,69.7,3.8,0.0,5.1,6,1.0,


In [14]:
in_col = []
for i in df_test.columns:
    in_col.append(i.lower())

df_test.columns = in_col
df_test

Unnamed: 0,num,tm,hh24,stn,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,week_name,elect
0,4816,2023-01-01 01:00:00,1,752,3.0,68.6,2.9,0.0,-0.1,6,1.0,
1,4816,2023-01-01 02:00:00,2,752,3.1,69.4,2.7,0.0,0.3,6,1.0,
2,4816,2023-01-01 03:00:00,3,752,3.6,68.3,2.3,0.0,1.2,6,1.0,
3,4816,2023-01-01 04:00:00,4,752,4.0,69.2,3.1,0.0,1.1,6,1.0,
4,4816,2023-01-01 05:00:00,5,752,4.2,69.5,2.5,0.0,2.0,6,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
2829473,12322,2023-12-31 20:00:00,20,901,5.6,70.8,3.2,0.0,5.8,6,1.0,
2829474,12322,2023-12-31 21:00:00,21,901,5.3,69.1,3.6,0.0,4.5,6,1.0,
2829475,12322,2023-12-31 22:00:00,22,901,5.1,70.6,3.0,0.0,5.1,6,1.0,
2829476,12322,2023-12-31 23:00:00,23,901,5.2,69.7,3.8,0.0,5.1,6,1.0,


In [15]:
import holidays

# CDD 계산 함수
def calculate_cdd(temperature, ref_CDD=24):
    return np.maximum(0, temperature - ref_CDD)
# HDD 계산 함수
def calculate_hdd(temperature, ref_HDD=18):
    return np.maximum(0, ref_HDD - temperature)


def eda_df(df:pd.DataFrame) -> pd.DataFrame:

    # nph_ws_10m 결측치 이전과 앞 시간의 평균으로 대치
    tmp_df = df[df['nph_ws_10m']==-99]
    error_times = tmp_df.tm.value_counts().index
    just_1hour = pd.tseries.offsets.DateOffset(hours=1)

    for error_time in error_times:
        error_time_ts = pd.to_datetime(error_time)
        before_time = (error_time_ts-just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        after_time = (error_time_ts+just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        tmp_df2 = df[df.num.isin(tmp_df[tmp_df.tm==error_time].num.unique())]
        tmp_df2 = tmp_df2[tmp_df2.tm.isin([before_time, after_time])].pivot_table('nph_ws_10m', 'num', 'tm', 'max')
        for error_num, value in zip(tmp_df2.index, tmp_df2.mean(axis=1).values):
            tmp_df3 = df[df.num==error_num]
            df.loc[tmp_df3[tmp_df3.nph_ws_10m==-99].index, 'nph_ws_10m'] = value

    # nph_rn_60m 결측치 이전과 앞 시간의 평균으로 대치

    tmp_df = df[df.nph_rn_60m==-99]
    error_times = tmp_df.tm.value_counts().index
    just_1hour = pd.tseries.offsets.DateOffset(hours=1)

    for error_time in error_times:
        error_time_ts = pd.to_datetime(error_time)
        before_time = (error_time_ts-just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        after_time = (error_time_ts+just_1hour).strftime('%Y-%m-%d %H:%M:%S')
        tmp_df2 = df[df.num.isin(tmp_df[tmp_df.tm==error_time].num.unique())]
        tmp_df2 = tmp_df2[tmp_df2.tm.isin([before_time, after_time])].pivot_table('nph_rn_60m', 'num', 'tm', 'max')
        for error_num, value in zip(tmp_df2.index, tmp_df2.mean(axis=1).values):
            tmp_df3 = df[df.num==error_num]
            df.loc[tmp_df3[tmp_df3.nph_rn_60m==-99].index, 'nph_rn_60m'] = value
    
    # 결측치 rows 제외
    # for col in df.select_dtypes(exclude='object').columns[:-1]:
        # df = df[df[col]>-99]

    df['day'] = df['tm'].apply(lambda x: x[:10])
    df['day'] = pd.to_datetime(df['day'])
    df['time'] = df['tm'].apply(lambda x: int(x[11:13]))

    df.loc[df['time']==0, 'day'] = df.loc[df['time'] == 0, 'day'] - pd.DateOffset(days=1)
    df.loc[df['time']==0, 'time'] = 24

    # 냉방도일 기준 온도 설정
    ref_CDD = 24
    # CDD 컬럼 추가
    df['cdd'] = calculate_cdd(df['nph_ta'], ref_CDD)
    # 일 단위로 CDD 합산
    df['accumulate_cdd'] = df.groupby(['num', 'day'])['cdd'].transform('sum')

    # 난방도일 기준 온도 설정
    ref_HDD = 18
    # HDD 컬럼 추가
    df['hdd'] = calculate_hdd(df['nph_ta'], ref_HDD)
    # 일 단위로 HDD 합산
    df['accumulate_hdd'] = df.groupby(['num', 'day'])['hdd'].transform('sum')

    # 불쾌지수
    df['thi'] = (1.8*df['nph_ta']) - (0.55*(1-df['nph_hm']/100)*(1.8*df['nph_ta']-26)) + 32

    # 휴일
    df['평일/주말'] = df['day'].dt.weekday
    df['평일/주말'] = df['평일/주말'].apply(lambda x: 1 if x in [5,6] else 0)
    df['holiday'] = df['day'].dt.date.isin(holidays.CountryHoliday('KR')).astype(int)
    df['주말'] = (df['day'].dt.weekday >= 5).astype(int)

    # 개인 더미 변수
    # df['year'] = df.tm.str[:4].astype(int)
    # df['dawn'] = df.hh24.apply(lambda x: 1 if x in [1,2,3,4,5,6,20,21,22,23,24] else 0)
    # df['morning'] = df.hh24.apply(lambda x: 1 if x in [7,8,9,10,11,12,13] else 0)
    # df['afternoon'] = df.hh24.apply(lambda x: 1 if x in [14,15,16,17,18,19] else 0)
    # df['summer'] = df.tm.str[5:7].astype(int).apply(lambda x: 1 if x in [5,6,7,8,9] else 0)
    # df['winter'] = df.tm.str[5:7].astype(int).apply(lambda x: 1 if x in [1,2,3,11,12] else 0)
    return df

use_cols = ['num', 'hh24', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m', 'nph_ta_chi', 'weekday', 
            'cdd', 'accumulate_cdd', 'hdd','accumulate_hdd', 'thi', '평일/주말', 'holiday', '주말', 
            # 'year', 'dawn', 'morning', 'afternoon', 'summer', 'winter',
            # 'elec'
            ]
df_test = eda_df(df_test)[use_cols]
df_test.columns

Index(['num', 'hh24', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m',
       'nph_ta_chi', 'weekday', 'cdd', 'accumulate_cdd', 'hdd',
       'accumulate_hdd', 'thi', '평일/주말', 'holiday', '주말'],
      dtype='object')

In [16]:
df_test

Unnamed: 0,num,hh24,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,cdd,accumulate_cdd,hdd,accumulate_hdd,thi,평일/주말,holiday,주말
0,4816,1,3.0,68.6,2.9,0.0,-0.1,6,0.0,0.0,15.0,281.4,40.957620,1,0,1
1,4816,2,3.1,69.4,2.7,0.0,0.3,6,0.0,0.0,14.9,281.4,41.016686,1,0,1
2,4816,3,3.6,68.3,2.3,0.0,1.2,6,0.0,0.0,14.4,281.4,41.883312,1,0,1
3,4816,4,4.0,69.2,3.1,0.0,1.1,6,0.0,0.0,14.0,281.4,42.384720,1,0,1
4,4816,5,4.2,69.5,2.5,0.0,2.0,6,0.0,0.0,13.8,281.4,42.653310,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829473,12322,20,5.6,70.8,3.2,0.0,5.8,6,0.0,0.0,12.4,257.8,44.636752,1,0,1
2829474,12322,21,5.3,69.1,3.6,0.0,4.5,6,0.0,0.0,12.7,257.8,44.337377,1,0,1
2829475,12322,22,5.1,70.6,3.0,0.0,5.1,6,0.0,0.0,12.9,257.8,43.899794,1,0,1
2829476,12322,23,5.2,69.7,3.8,0.0,5.1,6,0.0,0.0,12.8,257.8,44.133056,1,0,1


In [17]:
predictor.leaderboard()
test_data = TabularDataset(df_test)

test_pred1 = predictor.predict(test_data)

In [18]:
test_pred1

0           92.444077
1           82.370392
2           73.673882
3           71.839767
4           68.347939
              ...    
2829473    130.210861
2829474    129.997742
2829475    125.867340
2829476    119.383987
2829477    109.802032
Name: elec, Length: 2829478, dtype: float32

In [19]:
df = pd.read_csv('electric_test.csv')
df

Unnamed: 0,NUM,TM,HH24,STN,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,week_name,elect
0,4816,2023-01-01 01:00:00,1,752,3.0,68.6,2.9,0.0,-0.1,6,1.0,
1,4816,2023-01-01 02:00:00,2,752,3.1,69.4,2.7,0.0,0.3,6,1.0,
2,4816,2023-01-01 03:00:00,3,752,3.6,68.3,2.3,0.0,1.2,6,1.0,
3,4816,2023-01-01 04:00:00,4,752,4.0,69.2,3.1,0.0,1.1,6,1.0,
4,4816,2023-01-01 05:00:00,5,752,4.2,69.5,2.5,0.0,2.0,6,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
2829473,12322,2023-12-31 20:00:00,20,901,5.6,70.8,3.2,0.0,5.8,6,1.0,
2829474,12322,2023-12-31 21:00:00,21,901,5.3,69.1,3.6,0.0,4.5,6,1.0,
2829475,12322,2023-12-31 22:00:00,22,901,5.1,70.6,3.0,0.0,5.1,6,1.0,
2829476,12322,2023-12-31 23:00:00,23,901,5.2,69.7,3.8,0.0,5.1,6,1.0,


In [20]:
df[df.columns[-1]] = test_pred1

In [21]:
df.to_csv('240259.csv')

In [22]:
df

Unnamed: 0,NUM,TM,HH24,STN,nph_ta,nph_hm,nph_ws_10m,nph_rn_60m,nph_ta_chi,weekday,week_name,elect
0,4816,2023-01-01 01:00:00,1,752,3.0,68.6,2.9,0.0,-0.1,6,1.0,92.444077
1,4816,2023-01-01 02:00:00,2,752,3.1,69.4,2.7,0.0,0.3,6,1.0,82.370392
2,4816,2023-01-01 03:00:00,3,752,3.6,68.3,2.3,0.0,1.2,6,1.0,73.673882
3,4816,2023-01-01 04:00:00,4,752,4.0,69.2,3.1,0.0,1.1,6,1.0,71.839767
4,4816,2023-01-01 05:00:00,5,752,4.2,69.5,2.5,0.0,2.0,6,1.0,68.347939
...,...,...,...,...,...,...,...,...,...,...,...,...
2829473,12322,2023-12-31 20:00:00,20,901,5.6,70.8,3.2,0.0,5.8,6,1.0,130.210861
2829474,12322,2023-12-31 21:00:00,21,901,5.3,69.1,3.6,0.0,4.5,6,1.0,129.997742
2829475,12322,2023-12-31 22:00:00,22,901,5.1,70.6,3.0,0.0,5.1,6,1.0,125.867340
2829476,12322,2023-12-31 23:00:00,23,901,5.2,69.7,3.8,0.0,5.1,6,1.0,119.383987
