In [46]:
!pip install sktime



In [47]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [48]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = '/content/drive/MyDrive/데이콘 캐글 컴페티션/DACON_23.07_PowerConsumption/'
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [49]:
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

pd.set_option('display.max_columns', 30)


In [50]:
train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")
building = pd.read_csv(f"{DATA_PATH}building_info.csv")
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

In [51]:
def _smape(true, pred):
    true = np.array(true)
    pred = np.array(pred)
    output = np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 100
    return output
smape = make_scorer(_smape, greater_is_better=False)

In [52]:
train_ft = pd.merge(train,building,on='건물번호',how='left')

In [53]:
test_ft = pd.merge(test,building,on='건물번호',how='left')

## Feature 추가 - 1차

In [54]:
train_ft.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', '건물유형', '연면적(m2)', '냉방면적(m2)',
       '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'],
      dtype='object')

In [55]:
# 변수 영문명으로 변경
train_eng_cols = ['num_date_time','building_num', 'date_time', 'temp', 'precip', 'wind_ms', 'humidity', 'sunshine',
                  'solar_rad', 'energy_consum', 'building_type', 'floor_area', 'cooling_area', 'solar_capa',
                  'ess_capa','pcs_capa']

train_ft.columns = train_eng_cols

In [56]:
test_ft.columns

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)',
       'PCS용량(kW)'],
      dtype='object')

In [57]:
test_eng_cols = ['num_date_time','building_num', 'date_time', 'temp', 'precip', 'wind_ms', 'humidity','building_type', 'floor_area',
                 'cooling_area', 'solar_capa', 'ess_capa','pcs_capa']

test_ft.columns = test_eng_cols

In [58]:
date = pd.to_datetime(train_ft.date_time)
train_ft['hour'] = date.dt.hour
train_ft['day'] = date.dt.weekday
train_ft['month'] = date.dt.month
train_ft['week'] = date.dt.isocalendar().week

In [59]:
date = pd.to_datetime(test_ft.date_time)
test_ft['hour'] = date.dt.hour
test_ft['day'] = date.dt.weekday
test_ft['month'] = date.dt.month
test_ft['week'] = date.dt.isocalendar().week

In [60]:
train_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'sunshine', 'solar_rad', 'energy_consum',
       'building_type', 'floor_area', 'cooling_area', 'solar_capa', 'ess_capa',
       'pcs_capa', 'hour', 'day', 'month', 'week'],
      dtype='object')

In [61]:
#######################################
## 건물별, 요일별, 시간별 발전량 평균 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train_ft['day_hour_mean'] = train_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 204000/204000 [03:50<00:00, 884.34it/s] 


In [62]:

#######################################
## 건물별 시간별 발전량 평균 넣어주기
#######################################
power_hour_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train_ft['hour_mean'] = train_ft.progress_apply(lambda x : power_hour_mean.loc[(power_hour_mean.building_num == x['building_num']) & (power_hour_mean.hour == x['hour']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 204000/204000 [01:57<00:00, 1739.73it/s]


In [63]:
#######################################
## 건물별 시간별 발전량 표준편차 넣어주기
#######################################
power_hour_std = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour'], aggfunc = np.std).reset_index()
tqdm.pandas()
train_ft['hour_std'] = train_ft.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.building_num == x['building_num']) & (power_hour_std.hour == x['hour']) ,'energy_consum'].values[0], axis = 1)


100%|██████████| 204000/204000 [01:56<00:00, 1752.61it/s]


In [64]:
test_ft['hour_std'] = test_ft.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.building_num == x['building_num']) & (power_hour_std.hour == x['hour']) ,'energy_consum'].values[0], axis = 1)


100%|██████████| 16800/16800 [00:08<00:00, 1969.31it/s]


In [65]:
#######################################
# 건물별, 유형별, 시간별 발전량 평균 계산
#######################################
power_mean = pd.pivot_table(train_ft, values='energy_consum', index=['building_num', 'building_type', 'hour'], aggfunc=np.mean).reset_index()

# tqdm을 이용한 데이터프레임에 열 추가
tqdm.pandas()
def add_power_mean(row):
    mean_value = power_mean.loc[
        (power_mean.building_num == row['building_num']) &
        (power_mean.building_type == row['building_type']) &
        (power_mean.hour == row['hour']),
        'energy_consum'
    ].values[0]
    return mean_value

train_ft['hour_type_mean'] = train_ft.progress_apply(add_power_mean, axis=1)


 70%|███████   | 143509/204000 [02:34<01:05, 926.08it/s]


KeyboardInterrupt: ignored

In [None]:
### 공휴일 변수 추가 --> 건물별 휴무일 변수 추가
train_ft['holiday'] = train_ft.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)
# train.loc[('2020-08-17'<=train.date_time)&(train.date_time<'2020-08-18'), 'holiday'] = 1

## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
train_ft['sin_time'] = np.sin(2*np.pi*train_ft.hour/24)
train_ft['cos_time'] = np.cos(2*np.pi*train_ft.hour/24)

## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
train_ft['THI'] = 9/5*train_ft['temp'] - 0.55*(1-train_ft['humidity']/100)*(9/5*train_ft['humidity']-26)+32



In [None]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = train_ft[train_ft['building_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
train_ft['CDH'] = cdhs

# train_ft.drop(['non_elec','solar','hour'], axis = 1, inplace = True)
train_ft.head()

In [None]:
## save the preprocessed data
train_ft.to_csv(f'{DATA_PATH}train_featured_1.csv',index=False)

In [None]:
train_ft = pd.read_csv(f'{DATA_PATH}train_featured_1.csv')

# Feature 추가 - 2차

In [None]:
train_ft

In [None]:
train_ft.info()

In [None]:
!pip install shap

In [None]:

train_ft['ess_capa'] = train_ft['ess_capa'].replace('-',0).astype(float)
train_ft['solar_capa'] = train_ft['solar_capa'].replace('-',0).astype(float)
train_ft['pcs_capa'] = train_ft['pcs_capa'].replace('-',0).astype(float)

- 불쾌지수, 불쾌지수 CDH

In [None]:
# train_ft['discomfort'] = 0.81 * train_ft['temp'] + 0.01 * train_ft['humidity'] * (0.99 * train_ft['temp'] - 14.3) + 46.3

# 불쾌지수 계산 함수
def calculate_discomfort(temp, humidity):
    discomfort = 0.81 * temp + 0.01 * humidity * (0.99 * temp - 14.3) + 46.3
    return discomfort

# 불쾌지수 계산하여 discomfort 열 추가
train_ft['discomfort'] = calculate_discomfort(train_ft['temp'], train_ft['humidity'])

# CDH 계산
cdhs = np.array([])
for num in range(1, 101):
    temp = train_ft[train_ft['building_num'] == num]
    cdh = np.cumsum(temp['discomfort'].values - 26)  # discomfort 값에서 26을 뺀 후 누적 합 계산
    cdhs = np.concatenate([cdhs, cdh])
train_ft['discomfort_CDH'] = cdhs



- 일조량, 일사량 결측치 처리

In [None]:
#일조량 건물별, 요일별, 시간별 평균으로 train_ft 결측치 처리, test_ft에 sunshine값 추가

avg_sunshine_train = train_ft.groupby(['building_num', 'day','hour'])['sunshine'].mean().reset_index()
avg_sunshine_train.rename(columns={'sunshine': 'sunshine_avg'}, inplace=True)

# 결측치 채우기
train_ft = train_ft.merge(avg_sunshine_train, on=['building_num','day', 'hour'], how='left')
train_ft['sunshine'] = train_ft['sunshine'].fillna(train_ft['sunshine_avg'])

# 불필요한 열 제거
train_ft.drop('sunshine_avg', axis=1, inplace=True)


In [None]:
#일사량 건물별, 요일별, 시간별 평균으로 train_ft 결측치 처리, test_ft에 solar rad값 추가

avg_solar_rad_train = train_ft.groupby(['building_num','day','hour'])['solar_rad'].mean().reset_index()
avg_solar_rad_train.rename(columns={'solar_rad': 'solar_rad_avg'}, inplace=True)

# 결측치 채우기
train_ft = train_ft.merge(avg_solar_rad_train, on=['building_num','day', 'hour'], how='left')
train_ft['solar_rad'] = train_ft['solar_rad'].fillna(train_ft['solar_rad_avg'])

# 불필요한 열 제거
train_ft.drop('solar_rad_avg', axis=1, inplace=True)

In [None]:
train_ft['solar_gen'] = train_ft['solar_rad'] * train_ft['sunshine'] * train_ft['solar_capa'] * 0.15

# CDH 계산
cdhs_solar = np.array([])
for num in range(1, 101):
    temp = train_ft[train_ft['building_num'] == num]
    cdh_solar = np.cumsum(temp['solar_gen'].values)  # solar_gen 값에서 26을 뺀 후 누적 합 계산
    cdhs_solar = np.concatenate([cdhs_solar, cdh_solar])
train_ft['CDH_solar'] = cdhs_solar

In [None]:
train_ft['cooling_ratio'] = train_ft['cooling_area'] / train_ft['floor_area']


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import shap

# 데이터 로드 및 전처리
# 예시로 간단한 데이터 생성
cols = ['energy_consum','num_date_time', 'date_time', 'building_type']
X = train_ft.drop(columns=cols, axis=1)
feature_names = X.columns  # 특성의 이름을 저장

y = train_ft['energy_consum']  # 타겟 변수

# XGBoost 모델 생성
model = XGBRegressor(random_state=42)
model.fit(X, y)

# SHAP 값을 계산
explainer = shap.Explainer(model)
shap_values = explainer(X)

# SHAP 값을 시각화하여 피쳐 중요도 확인
shap.summary_plot(shap_values, X)



In [None]:
shap_means = np.abs(shap_values.values).mean(axis=0)

# 결과를 DataFrame으로 저장
shap_features = pd.DataFrame({'SHAP Features': feature_names, 'SHAP_mean': shap_means})

# SHAP 평균 순으로 정렬
shap_features = shap_features.sort_values(by='SHAP_mean', ascending=False)
shap_features

In [None]:
train_ft.columns

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import shap

# 데이터 로드 및 전처리
# 예시로 간단한 데이터 생성
cols = ['num_date_time','energy_consum', 'date_time', 'building_type']
X = train_ft.drop(columns=cols, axis=1)

y = train_ft['energy_consum']  # 타겟 변수

# LightGBM 모델 생성
model = LGBMRegressor(random_state=42)
model.fit(X, y)

# SHAP 값을 계산
explainer = shap.Explainer(model)
shap_values = explainer(X)

# SHAP 값을 시각화하여 피쳐 중요도 확인
shap.summary_plot(shap_values, X)


In [None]:
# 'day_hour_mean','discomfort_CDH','week','discomfort','hour_mean','CDH','hour_std','day','sin_time'

In [None]:
shap_means = np.abs(shap_values.values).mean(axis=0)

# 결과를 DataFrame으로 저장
shap_features = pd.DataFrame({'SHAP Features': feature_names, 'SHAP_mean': shap_means})

# SHAP 평균 순으로 정렬
shap_features = shap_features.sort_values(by='SHAP_mean', ascending=False)
shap_features

In [None]:
# mask = shap_features['SHAP_mean'] >9
# selected_columns = shap_features[mask]['SHAP Features'].tolist()

In [None]:
train_ft.columns

In [None]:
train_ft.to_csv(f'{DATA_PATH}train_featured_2(full).csv', index=False)
# train_ft.to_csv(f'{DATA_PATH}train_featured_1.csv',index=False)

In [None]:
selected_columns = ['day_hour_mean','discomfort_CDH','week','discomfort','hour_mean','CDH','hour_std','day','sin_time']

In [None]:
# selected_train_ft =
train_ft[selected_columns].to_csv(f'{DATA_PATH}train_selected_2.csv', index=False)

# Feature 추가 - 3차

In [None]:
train_ft = pd.read_csv(f'{DATA_PATH}train_featured_2(full).csv')

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 중간값 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.median).reset_index()
tqdm.pandas()
train_ft['day_hour_median'] = train_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 표준편차 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.std).reset_index()
tqdm.pandas()
train_ft['day_hour_std'] = train_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 최빈값 넣어주기
#######################################

from scipy.stats import mode

# 건물별, 요일별, 시간별 최빈값 계산
def mode_value(x):
    return mode(x)[0][0]  # mode 함수는 mode와 count 값을 반환하므로 첫 번째 mode 값을 선택

power_mode = pd.pivot_table(
    train_ft,
    values='energy_consum',
    index=['building_num', 'hour', 'day'],
    aggfunc=mode_value
).reset_index()

# 최빈값 값을 train_ft에 추가
train_ft['day_hour_mode'] = train_ft.progress_apply(
    lambda x: power_mode.loc[
        (power_mode.building_num == x['building_num']) &
        (power_mode.hour == x['hour']) &
        (power_mode.day == x['day']),
        'energy_consum'
    ].values[0],
    axis=1
)

In [None]:
# 일에 대한 주기성을 감지하기 위한 피처 추가
train_ft['day_of_year'] = train_ft['month'] * 30 + train_ft['day']  # 간단한 근사 값
train_ft['cos_day_of_year'] = np.cos(2 * np.pi * train_ft['day_of_year'] / 365)
train_ft['sin_day_of_year'] = np.sin(2 * np.pi * train_ft['day_of_year'] / 365)

# 요일에 대한 sin, cos 피처 추가
train_ft['cos_weekday'] = np.cos(2 * np.pi * train_ft['day'] / 7)  # 7일 기준으로 cos 값 추가
train_ft['sin_weekday'] = np.sin(2 * np.pi * train_ft['day'] / 7)  # 7일 기준으로 sin 값 추가

# 주차별 cos, sin 피처 추가
train_ft['cos_week'] = np.cos(2 * np.pi * train_ft['day_of_year'] / 52)  # 52주 기준으로 cos 값 추가
train_ft['sin_week'] = np.sin(2 * np.pi * train_ft['day_of_year'] / 52)  # 52주 기준으로 sin 값 추가

# 월에 대한 cos, sin 피처 추가
train_ft['cos_month'] = np.cos(2 * np.pi * train_ft['month'] / 12)  # 12달 기준으로 cos 값 추가
train_ft['sin_month'] = np.sin(2 * np.pi * train_ft['month'] / 12)  # 12달 기준으로 sin 값 추가

In [None]:
train_ft.columns

In [None]:
train_ft

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import shap

# 데이터 로드 및 전처리
# 예시로 간단한 데이터 생성
cols = ['energy_consum','num_date_time', 'date_time', 'building_type']
X = train_ft.drop(columns=cols, axis=1)
feature_names = X.columns  # 특성의 이름을 저장

y = train_ft['energy_consum']  # 타겟 변수

# XGBoost 모델 생성
model = XGBRegressor(random_state=42)
model.fit(X, y)

# SHAP 값을 계산
explainer = shap.Explainer(model)
shap_values = explainer(X)

# SHAP 값을 시각화하여 피쳐 중요도 확인
shap.summary_plot(shap_values, X)

In [None]:
shap_means = np.abs(shap_values.values).mean(axis=0)

# 결과를 DataFrame으로 저장
shap_features = pd.DataFrame({'SHAP Features': feature_names, 'SHAP_mean': shap_means})

# SHAP 평균 순으로 정렬
shap_features = shap_features.sort_values(by='SHAP_mean', ascending=False)
shap_features

In [45]:
train_ft

Unnamed: 0,num_date_time,building_num,date_time,temp,precip,wind_ms,humidity,sunshine,solar_rad,energy_consum,building_type,floor_area,cooling_area,solar_capa,ess_capa,pcs_capa,hour,day,month,week
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28,건물기타,110634.00,39570.00,-,-,-,0,2,6,22
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36,건물기타,110634.00,39570.00,-,-,-,1,2,6,22
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88,건물기타,110634.00,39570.00,-,-,-,2,2,6,22
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76,건물기타,110634.00,39570.00,-,-,-,3,2,6,22
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40,건물기타,110634.00,39570.00,-,-,-,4,2,6,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04,호텔및리조트,57497.84,40035.23,-,-,-,19,2,8,34
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96,호텔및리조트,57497.84,40035.23,-,-,-,20,2,8,34
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12,호텔및리조트,57497.84,40035.23,-,-,-,21,2,8,34
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08,호텔및리조트,57497.84,40035.23,-,-,-,22,2,8,34


In [33]:
features = ['day_hour_mean', 'discomfort_CDH', 'week',
       'sin_time', 'cos_week','THI', 'hour', 'day', 'wind_ms', 'temp', 'month', 'humidity','precip']

In [34]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

# 데이터 로드 및 전처리
# 예시로 간단한 데이터 생성
# cols = ['num_date_time','energy_consum', 'date_time', 'building_type']
X = train_ft['features']

y = train_ft['energy_consum']  # 타겟 변수

# LightGBM 모델 생성
model = LGBMRegressor(random_state=42)
model.fit(X, y)

# SHAP 값을 계산
explainer = shap.Explainer(model)
shap_values = explainer(X)

# SHAP 값을 시각화하여 피쳐 중요도 확인
shap.summary_plot(shap_values, X)


ModuleNotFoundError: ignored

In [None]:
shap_means = np.abs(shap_values.values).mean(axis=0)

# 결과를 DataFrame으로 저장
shap_features = pd.DataFrame({'SHAP Features': feature_names, 'SHAP_mean': shap_means})

# SHAP 평균 순으로 정렬
shap_features = shap_features.sort_values(by='SHAP_mean', ascending=False)
shap_features

Unnamed: 0,SHAP Features,SHAP_mean
16,day_hour_mean,1313.828974
39,day_hour_median,148.215132
26,discomfort_CDH,56.667006
15,week,44.946751
41,day_hour_mode,43.214507
25,discomfort,33.260888
40,day_hour_std,25.058457
24,CDH,24.599104
17,hour_mean,21.338553
21,sin_time,8.846939


In [None]:
shap_features['SHAP Features'].values

array(['day_hour_mean', 'day_hour_median', 'discomfort_CDH', 'week',
       'day_hour_mode', 'discomfort', 'day_hour_std', 'CDH', 'hour_mean',
       'sin_time', 'cos_week', 'floor_area', 'day_of_year', 'cos_weekday',
       'THI', 'sin_week', 'building_num', 'hour_std', 'cooling_ratio',
       'cooling_area', 'hour', 'solar_capa', 'day', 'wind_ms', 'temp',
       'cos_time', 'month', 'solar_rad', 'sin_weekday', 'solar_gen',
       'cos_day_of_year', 'humidity', 'precip', 'ess_capa', 'sunshine',
       'CDH_solar', 'pcs_capa', 'sin_day_of_year', 'holiday', 'cos_month',
       'sin_month', 'hour_type_mean'], dtype=object)

In [None]:
features = ['day_hour_mean', 'discomfort_CDH', 'week',
       'sin_time', 'cos_week','THI', 'hour', 'day', 'wind_ms', 'temp', 'month', 'humidity','precip']

In [None]:
train_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'sunshine', 'solar_rad', 'energy_consum',
       'building_type', 'floor_area', 'cooling_area', 'solar_capa', 'ess_capa',
       'pcs_capa', 'hour', 'day', 'month', 'week', 'day_hour_mean',
       'hour_mean', 'hour_std', 'hour_type_mean', 'holiday', 'sin_time',
       'cos_time', 'THI', 'CDH', 'discomfort', 'discomfort_CDH', 'solar_gen',
       'CDH_solar', 'cooling_ratio'],
      dtype='object')

In [None]:
# mask = shap_features['SHAP_mean'] >9
selected_columns = ['day_hour_mean', 'day_hour_median', 'discomfort_CDH', 'week',
       'day_hour_mode', 'discomfort', 'day_hour_std', 'CDH', 'hour_mean',
       'sin_time', 'cos_week', 'day_of_year']

In [None]:
train_ft.to_csv(f'{DATA_PATH}train_featured_3.csv', index=False)
# train_ft.to_csv(f'{DATA_PATH}train_featured_1.csv',index=False)

In [None]:
# selected_train_ft =
train_ft[selected_columns].to_csv(f'{DATA_PATH}train_selected_3.csv', index=False)

### test_ft에 추가

In [None]:
test_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'building_type', 'floor_area', 'cooling_area',
       'solar_capa', 'ess_capa', 'pcs_capa', 'hour', 'day', 'month', 'week'],
      dtype='object')

- day_hour_mean

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 평균 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.mean).reset_index()
tqdm.pandas()
test_ft['day_hour_mean'] = test_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 16800/16800 [00:14<00:00, 1125.31it/s]


- day_hour_median

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 중간값 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.median).reset_index()
tqdm.pandas()
test_ft['day_hour_median'] = test_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 16800/16800 [00:15<00:00, 1118.86it/s]


- discomfort

In [None]:
def calculate_discomfort(temp, humidity):
    discomfort = 0.81 * temp + 0.01 * humidity * (0.99 * temp - 14.3) + 46.3
    return discomfort

# 불쾌지수 계산하여 discomfort 열 추가
test_ft['discomfort'] = calculate_discomfort(test_ft['temp'], test_ft['humidity'])


- discomfort_CDH

In [None]:
# train_ft['discomfort'] = 0.81 * train_ft['temp'] + 0.01 * train_ft['humidity'] * (0.99 * train_ft['temp'] - 14.3) + 46.3

# 불쾌지수 계산 함수
def calculate_discomfort(temp, humidity):
    discomfort = 0.81 * temp + 0.01 * humidity * (0.99 * temp - 14.3) + 46.3
    return discomfort

# CDH 계산
cdhs = np.array([])
for num in range(1, 101):
    temp = test_ft[test_ft['building_num'] == num]
    cdh = np.cumsum(temp['discomfort'].values - 26)  # discomfort 값에서 26을 뺀 후 누적 합 계산
    cdhs = np.concatenate([cdhs, cdh])
test_ft['discomfort_CDH'] = cdhs



- day_hour_mode

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 최빈값 넣어주기
#######################################

from scipy.stats import mode

# 건물별, 요일별, 시간별 최빈값 계산
def mode_value(x):
    return mode(x)[0][0]  # mode 함수는 mode와 count 값을 반환하므로 첫 번째 mode 값을 선택

power_mode = pd.pivot_table(
    train_ft,
    values='energy_consum',
    index=['building_num', 'hour', 'day'],
    aggfunc=mode_value
).reset_index()

# 최빈값 값을 test_ft에 추가
test_ft['day_hour_mode'] = test_ft.progress_apply(
    lambda x: power_mode.loc[
        (power_mode.building_num == x['building_num']) &
        (power_mode.hour == x['hour']) &
        (power_mode.day == x['day']),
        'energy_consum'
    ].values[0],
    axis=1
)

  return mode(x)[0][0]  # mode 함수는 mode와 count 값을 반환하므로 첫 번째 mode 값을 선택
100%|██████████| 16800/16800 [00:14<00:00, 1129.88it/s]


- day_hour_std

In [None]:
#######################################
## 건물별, 요일별, 시간별 발전량 표준편차 넣어주기
#######################################
power_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour', 'day'], aggfunc = np.std).reset_index()
tqdm.pandas()
test_ft['day_hour_std'] = test_ft.progress_apply(lambda x : power_mean.loc[(power_mean.building_num == x['building_num']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 16800/16800 [00:14<00:00, 1162.56it/s]


- CDH

In [None]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = test_ft[test_ft['building_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
test_ft['CDH'] = cdhs

- hour_mean

In [None]:

power_hour_mean = pd.pivot_table(train_ft, values = 'energy_consum', index = ['building_num', 'hour'], aggfunc = np.mean).reset_index()
tqdm.pandas()
test_ft['hour_mean'] = test_ft.progress_apply(lambda x : power_hour_mean.loc[(power_hour_mean.building_num == x['building_num']) & (power_hour_mean.hour == x['hour']) ,'energy_consum'].values[0], axis = 1)

100%|██████████| 16800/16800 [00:16<00:00, 1042.54it/s]


- sin_time

In [None]:

test_ft['sin_time'] = np.sin(2*np.pi*test_ft.hour/24)


- cos_week

In [None]:
test_ft['cos_week'] = np.cos(2 * np.pi * test_ft['day_of_year'] / 52)  # 52주 기준으로 cos 값 추가

- day_of_year

In [None]:
test_ft['day_of_year'] = test_ft['month'] * 30 + test_ft['day']

In [None]:
test_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'building_type', 'floor_area', 'cooling_area',
       'solar_capa', 'ess_capa', 'pcs_capa', 'hour', 'day', 'month', 'week',
       'day_hour_mean', 'discomfort', 'discomfort_CDH', 'day_hour_median',
       'day_hour_mode', 'day_hour_std', 'sin_time', 'hour_mean', 'CDH',
       'day_of_year', 'cos_week'],
      dtype='object')

In [37]:
test_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'building_type', 'floor_area', 'cooling_area',
       'solar_capa', 'ess_capa', 'pcs_capa', 'hour', 'day', 'month', 'week',
       'day_hour_mean', 'discomfort', 'discomfort_CDH', 'day_hour_median',
       'day_hour_mode', 'day_hour_std', 'sin_time', 'hour_mean', 'CDH',
       'day_of_year', 'cos_week'],
      dtype='object')

In [None]:
# mask = shap_features['SHAP_mean'] >9
selected_columns = ['day_hour_mean', 'day_hour_median', 'discomfort_CDH', 'week',
       'day_hour_mode', 'discomfort', 'day_hour_std', 'CDH', 'hour_mean',
       'sin_time', 'cos_week', 'day_of_year']

In [38]:
test_ft['THI'] = 9/5*test_ft['temp'] - 0.55*(1-test_ft['humidity']/100)*(9/5*test_ft['humidity']-26)+32


In [39]:
test_ft.to_csv(f'{DATA_PATH}test_featured_3.csv', index=False)
# train_ft.to_csv(f'{DATA_PATH}train_featured_1.csv',index=False)

In [40]:
# selected_train_ft =
test_ft[selected_columns].to_csv(f'{DATA_PATH}test_selected_3.csv', index=False)

In [None]:
#######################################
# 건물별, 유형별, 시간별 발전량 평균 계산
#######################################
power_mean = pd.pivot_table(train_ft, values='energy_consum', index=['building_num', 'building_type', 'hour'], aggfunc=np.mean).reset_index()

# tqdm을 이용한 데이터프레임에 열 추가
tqdm.pandas()
def add_power_mean(row):
    mean_value = power_mean.loc[
        (power_mean.building_num == row['building_num']) &
        (power_mean.building_type == row['building_type']) &
        (power_mean.hour == row['hour']),
        'energy_consum'
    ].values[0]
    return mean_value

train_ft['hour_type_mean'] = train_ft.progress_apply(add_power_mean, axis=1)


100%|██████████| 204000/204000 [02:28<00:00, 1375.72it/s]


In [None]:
### 공휴일 변수 추가 --> 건물별 휴무일 변수 추가
train_ft['holiday'] = train_ft.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)
# train.loc[('2020-08-17'<=train.date_time)&(train.date_time<'2020-08-18'), 'holiday'] = 1

## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
train_ft['sin_time'] = np.sin(2*np.pi*train_ft.hour/24)
train_ft['cos_time'] = np.cos(2*np.pi*train_ft.hour/24)

## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
train_ft['THI'] = 9/5*train_ft['temp'] - 0.55*(1-train_ft['humidity']/100)*(9/5*train_ft['humidity']-26)+32



In [None]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = train_ft[train_ft['building_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])
train_ft['CDH'] = cdhs

# train_ft.drop(['non_elec','solar','hour'], axis = 1, inplace = True)
train_ft.head()

Unnamed: 0,num_date_time,building_num,date_time,temp,precip,wind_ms,humidity,sunshine,solar_rad,energy_consum,building_type,floor_area,cooling_area,solar_capa,ess_capa,pcs_capa,hour,day,month,week,day_hour_mean,hour_mean,hour_std,hour_type_mean,holiday,sin_time,cos_time,THI,CDH
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28,건물기타,110634.0,39570.0,-,-,-,0,2,6,22,1774.744615,1706.318118,446.882767,1706.318118,0,0.0,1.0,49.6576,-7.4
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36,건물기타,110634.0,39570.0,-,-,-,1,2,6,22,1687.347692,1622.620235,439.662704,1622.620235,0,0.258819,0.965926,47.7625,-15.4
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88,건물기타,110634.0,39570.0,-,-,-,2,2,6,22,1571.483077,1506.971294,412.071906,1506.971294,0,0.5,0.866025,47.2225,-23.7
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76,건물기타,110634.0,39570.0,-,-,-,3,2,6,22,1522.153846,1437.365647,391.205981,1437.365647,0,0.707107,0.707107,44.7856,-33.0
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4,건물기타,110634.0,39570.0,-,-,-,4,2,6,22,1506.793846,1447.321412,381.099697,1447.321412,0,0.866025,0.5,49.0061,-40.6


# >> feature importance

In [43]:
train_ft.columns

Index(['num_date_time', 'building_num', 'date_time', 'temp', 'precip',
       'wind_ms', 'humidity', 'sunshine', 'solar_rad', 'energy_consum',
       'building_type', 'floor_area', 'cooling_area', 'solar_capa', 'ess_capa',
       'pcs_capa', 'hour', 'day', 'month', 'week'],
      dtype='object')

# Feature 추가 - 4차


## 재유형화 (Clustering) 지도학습


#### 건물 유형별 (12개)
- 결정된 12개의 건물 유형을 정답값으로 두고, 클러스터링함. 그리고 그 클러스터 분석 지수를 feature로 추가함.

In [None]:
train_ft

In [None]:
train_ft.columns

#'num_date_time', 'building_num', 'date_time', 'temp',
    #    'precip', 'wind_ms', 'humidity', 'sunshine', 'solar_rad',
    #    'energy_consum', 'building_type', 'floor_area', 'cooling_area',
    #    'solar_capa', 'ess_capa', 'pcs_capa', 'hour', 'day', 'month', 'week',
    #    'day_hour_mean', 'hour_mean', 'hour_std', 'hour_type_mean', 'holiday',
    #    'sin_time', 'cos_time', 'THI', 'CDH'],
    #   dtype='object')

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# 데이터 로드
# 데이터는 train_ft와 비슷한 형식으로 로드되어 있다고 가정합니다.

# 건물 유형을 정답값으로 사용하여 클러스터링 평가
true_labels = train_ft['building_type']  # 정답값으로 건물 유형 사용
n_clusters = len(true_labels.unique())  # 건물 유형 개수를 클러스터 개수로 설정

# K-means 클러스터링 수행
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
predicted_labels = kmeans.fit_predict(train_ft[['building_num',]])  # 적절한 특성(features)으로 변경

# predicted_labels = kmeans.fit_predict(train_ft[['feature1', 'feature2', 'feature3']])  # 적절한 특성(features)으로 변경

# 클러스터링 결과 평가
ari_score = adjusted_rand_score(true_labels, predicted_labels)
print(f"Adjusted Rand Index Score: {ari_score}")

ValueError: ignored

#### 건물별 (100개)
- 결정된 100 개의 건물을 정답값으로 두고, 클러스터링함. 그리고 그 클러스터 분석 지수를 feature로 추가함.

#### 건물의 시간대별 (24개)

#### 그래프 모양별

In [None]:
## sktime library으로 마지막 일주일을 validation set으로 설정

y = train_ft.loc[train_ft.num == 7, 'energy_consum']
x = train_ft.loc[train_ft.num == 7, ].iloc[:, 3:]

y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168) # 24시간*7일 = 168

print('train data shape\nx:{}, y:{}'.format(x_train.shape, y_train.shape))

plot_series(y_train, y_valid, markers=[',' , ','])
plt.show()

### 칼럼 처리 및 영문화

- object 칼럼 처리:  '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'

In [None]:
train_ft['태양광용량(kW)'] = train_ft['태양광용량(kW)'].replace('-', np.nan).astype(float)
train_ft['ESS저장용량(kWh)'] = train_ft['ESS저장용량(kWh)'].replace('-', np.nan).astype(float)
train_ft['PCS용량(kW)'] = train_ft['PCS용량(kW)'].replace('-', np.nan).astype(float)

In [None]:
test_ft['태양광용량(kW)'] = test_ft['태양광용량(kW)'].replace('-', np.nan).astype(float)
test_ft['ESS저장용량(kWh)'] = test_ft['ESS저장용량(kWh)'].replace('-', np.nan).astype(float)
test_ft['PCS용량(kW)'] = test_ft['PCS용량(kW)'].replace('-', np.nan).astype(float)

In [None]:
train_ft

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28,건물기타,110634.00,39570.00,,,
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36,건물기타,110634.00,39570.00,,,
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88,건물기타,110634.00,39570.00,,,
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76,건물기타,110634.00,39570.00,,,
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40,건물기타,110634.00,39570.00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04,호텔및리조트,57497.84,40035.23,,,
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96,호텔및리조트,57497.84,40035.23,,,
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12,호텔및리조트,57497.84,40035.23,,,
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08,호텔및리조트,57497.84,40035.23,,,


In [None]:
train_ft.isnull().sum()

num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
일조(hr)            75182
일사(MJ/m2)         87913
전력소비량(kWh)            0
건물유형                  0
연면적(m2)               0
냉방면적(m2)              0
태양광용량(kW)        130560
ESS저장용량(kWh)     193800
PCS용량(kW)        193800
dtype: int64

- object 칼럼 처리 : 'num_date_time'제거 , '일시' to datetime

In [None]:
train_ft['일시'] = pd.to_datetime(train_ft['일시'])

# 월, 일, 시간 추출하여 새로운 칼럼 추가
train_ft['월'] = train_ft['일시'].dt.month
train_ft['일'] = train_ft['일시'].dt.day
train_ft['시간'] = train_ft['일시'].dt.hour

# 요일 추출하여 새로운 칼럼 추가 (월요일: 0, 일요일: 6)
train_ft['요일'] = train_ft['일시'].dt.weekday

#일자번호 추가
train_ft['일자번호'] = train_ft['일시'].dt.dayofyear

#주간번호 추가
# train_ft['주간번호'] = train_ft['일시'].dt.isocalendar().week.astype(float)


In [None]:
test_ft['일시'] = pd.to_datetime(train_ft['일시'])

# 월, 일, 시간 추출하여 새로운 칼럼 추가
test_ft['월'] = test_ft['일시'].dt.month
test_ft['일'] = test_ft['일시'].dt.day
test_ft['시간'] = test_ft['일시'].dt.hour

# 요일 추출하여 새로운 칼럼 추가 (월요일: 0, 일요일: 6)
test_ft['요일'] = test_ft['일시'].dt.weekday

#일자번호 추가
test_ft['일자번호'] = test_ft['일시'].dt.dayofyear

#주간번호 추가
# test_ft['주간번호'] = test_ft['일시'].dt.isocalendar().week


In [None]:
train_ft = train_ft.drop('num_date_time', axis=1)
test_ft = test_ft.drop('num_date_time', axis=1)

In [None]:
# train_ft.fillna(0)
# test_ft.fillna(0)

- 칼럼추가

- 데이터 칼럼 영문화

In [None]:
train_ft.columns

Index(['건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)',
       '일사(MJ/m2)', '전력소비량(kWh)', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)',
       'ESS저장용량(kWh)', 'PCS용량(kW)', '월', '일', '시간', '요일', '일자번호'],
      dtype='object')

In [None]:
# 변수 영문명으로 변경
train_eng_cols = ['building_num', 'date_time', 'temp', 'precip', 'wind_ms', 'humidity', 'sunshine',
                  'solar_rad', 'energy_consum', 'building_type', 'floor_area', 'cooling_area', 'solar_capa',
                  'ess_capa','pcs_capa','month','day','hour','day_of_week','day_of_year']

train_ft.columns = train_eng_cols
# test_df.columns = test_eng_cols

In [None]:
test_eng_cols = ['building_num', 'date_time', 'temp', 'precip', 'wind_ms', 'humidity','building_type', 'floor_area',
                 'cooling_area', 'solar_capa', 'ess_capa','pcs_capa','month','day','hour',
                 'day_of_week','day_of_year']

test_ft.columns = test_eng_cols

#

[링크 텍스트](https://)### 유형 및 태양광 여부 확인