<a href="https://colab.research.google.com/github/nohse/SAI_ML_STUDY/blob/main/Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

import sklearn
import xgboost
from xgboost import XGBRegressor

import random
from torch.utils.data import TensorDataset, Dataset, DataLoader

from datetime import datetime

In [None]:
building_info = pd.read_csv('/content/gdrive/MyDrive/Dacon/PP/datasets/building_info.csv')
sample_submission = pd.read_csv('/content/gdrive/MyDrive/Dacon/PP/datasets/sample_submission.csv')
test = pd.read_csv('/content/gdrive/MyDrive/Dacon/PP/datasets/test.csv')
train = pd.read_csv('/content/gdrive/MyDrive/Dacon/PP/datasets/train.csv')

In [None]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
train.drop('num_date_time', axis = 1, inplace=True)

test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test.drop('num_date_time', axis = 1, inplace=True)

building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)

In [None]:
def SMAPE(true, pred):
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 200

In [None]:
building_info

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other Buildings,110634.00,39570.00,-,-,-
1,2,Other Buildings,122233.47,99000.00,-,-,-
2,3,Other Buildings,171243.00,113950.00,40,-,-
3,4,Other Buildings,74312.98,34419.62,60,-,-
4,5,Other Buildings,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...
95,96,Hotel and Resort,93314.00,60500.00,-,-,-
96,97,Hotel and Resort,55144.67,25880.00,-,-,-
97,98,Hotel and Resort,53578.62,17373.75,-,-,-
98,99,Hotel and Resort,53499.00,40636.00,-,-,-


In [None]:
building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)

In [None]:
building_info

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,solar_power_utility,ess_utility
0,1,Other Buildings,110634.00,39570.00,-,-,-,0,0
1,2,Other Buildings,122233.47,99000.00,-,-,-,0,0
2,3,Other Buildings,171243.00,113950.00,40,-,-,1,0
3,4,Other Buildings,74312.98,34419.62,60,-,-,1,0
4,5,Other Buildings,205884.00,150000.00,-,2557,1000,0,1
...,...,...,...,...,...,...,...,...,...
95,96,Hotel and Resort,93314.00,60500.00,-,-,-,0,0
96,97,Hotel and Resort,55144.67,25880.00,-,-,-,0,0
97,98,Hotel and Resort,53578.62,17373.75,-,-,-,0,0
98,99,Hotel and Resort,53499.00,40636.00,-,-,-,0,0


In [None]:
train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')

In [None]:
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,solar_power_utility,ess_utility
0,1,20220601 00,18.6,,0.9,42.0,,,1085.28,Other Buildings,110634.00,39570.00,-,-,-,0,0
1,1,20220601 01,18.0,,1.1,45.0,,,1047.36,Other Buildings,110634.00,39570.00,-,-,-,0,0
2,1,20220601 02,17.7,,1.5,45.0,,,974.88,Other Buildings,110634.00,39570.00,-,-,-,0,0
3,1,20220601 03,16.7,,1.4,48.0,,,953.76,Other Buildings,110634.00,39570.00,-,-,-,0,0
4,1,20220601 04,18.4,,2.8,43.0,,,986.40,Other Buildings,110634.00,39570.00,-,-,-,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,57497.84,40035.23,-,-,-,0,0
203996,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,57497.84,40035.23,-,-,-,0,0
203997,100,20220824 21,21.3,,1.0,92.0,,,825.12,Hotel and Resort,57497.84,40035.23,-,-,-,0,0
203998,100,20220824 22,21.0,,0.3,94.0,,,640.08,Hotel and Resort,57497.84,40035.23,-,-,-,0,0


In [None]:
train['windspeed']= train.windspeed.interpolate()
train['humidity']= train.humidity.interpolate()

In [None]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')
train['hour'] = train['date_time'].dt.hour
train['day'] = train['date_time'].dt.day
train['month'] = train['date_time'].dt.month
train['day_of_week'] = train['date_time'].dt.dayofweek

test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['day_of_week'] = test['date_time'].dt.dayofweek

In [None]:
def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    result_dict = {}

    grouped_temp = dataframe.groupby(['building_number', 'month', 'day'])[target_column].agg(aggregation_func)

    for (building, month, day), value in grouped_temp.items():
        result_dict.setdefault(building, {}).setdefault(month, {})[day] = value

    dataframe[output_column] = [
        result_dict.get(row['building_number'], {}).get(row['month'], {}).get(row['day'], None)
        for _, row in dataframe.iterrows()
    ]

train['day_max_temperature'] = 0.0
train['day_mean_temperature'] = 0.0

calculate_day_values(train, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(train, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(train, 'temperature', 'day_min_temperature', 'min')

train['day_temperature_range'] = train['day_max_temperature'] - train['day_min_temperature']

calculate_day_values(test, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(test, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(test, 'temperature', 'day_min_temperature', 'min')

test['day_temperature_range'] = test['day_max_temperature'] - test['day_min_temperature']

In [None]:
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,...,solar_power_utility,ess_utility,hour,day,month,day_of_week,day_max_temperature,day_mean_temperature,day_min_temperature,day_temperature_range
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,,,1085.28,Other Buildings,...,0,0,0,1,6,2,28.4,22.266667,16.3,12.1
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,,,1047.36,Other Buildings,...,0,0,1,1,6,2,28.4,22.266667,16.3,12.1
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,,,974.88,Other Buildings,...,0,0,2,1,6,2,28.4,22.266667,16.3,12.1
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,,,953.76,Other Buildings,...,0,0,3,1,6,2,28.4,22.266667,16.3,12.1
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,,,986.40,Other Buildings,...,0,0,4,1,6,2,28.4,22.266667,16.3,12.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,2022-08-24 19:00:00,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,...,0,0,19,24,8,2,26.0,22.958333,20.7,5.3
203996,100,2022-08-24 20:00:00,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,...,0,0,20,24,8,2,26.0,22.958333,20.7,5.3
203997,100,2022-08-24 21:00:00,21.3,,1.0,92.0,,,825.12,Hotel and Resort,...,0,0,21,24,8,2,26.0,22.958333,20.7,5.3
203998,100,2022-08-24 22:00:00,21.0,,0.3,94.0,,,640.08,Hotel and Resort,...,0,0,22,24,8,2,26.0,22.958333,20.7,5.3


In [None]:
outlier_list = [68973,71013,112384,123132,150739,150740,150741,150742,
                150883,150884,150885,150886,138904,193120,193121,152393]

train.drop(outlier_list, axis=0,inplace=True)

In [None]:
temp_hol = {2 : ['2022-06-17'],
    5 : ['2022-07-25','2022-08-02','2022-08-09','2022-08-16'],
    11 : ['2022-06-17'], 12 : ['2022-07-02'], 17 : ['2022-06-18','2022-07-25'],
    21 : ['2022-07-01','2022-07-03','2022-07-17','2022-07-30'],
    37 : ['2022-06-20','2022-07-11','2022-08-08'],
    38 : ['2022-06-13','2022-07-25','2022-08-01'],
    39 : ['2022-07-18','2022-08-08'],
    40 : ['2022-06-20','2022-07-18','2022-08-08'],
    41 : ['2022-06-27','2022-07-25','2022-08-08'],
    42 : ['2022-06-13','2022-07-11','2022-08-22'],
    54 : ['2022-08-16','2022-08-17'],74 : ['2022-06-03'],
    75 : ['2022-06-15','2022-06-17','2022-06-20','2022-06-21'],
    86 : ['2022-06-10','2022-08-10'],
    89 : ['2022-07-09'], 91 : ['2022-06-13','2022-07-11','2022-08-22','2022-06-08'], 92 : ['2022-07-30']}

mask = train.apply(lambda x: x['building_number'] in temp_hol and str(x['date_time'])[:10] in temp_hol[x['building_number']], axis=1)

train.drop(train[mask].index, axis=0, inplace=True)

train.reset_index(drop=True, inplace=True)

In [None]:
holi_weekday = ['2022-06-01', '2022-06-06', '2022-08-15']

train['holiday'] = np.where((train.day_of_week >= 5) | (train.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)
test['holiday'] = np.where((test.day_of_week >= 5) | (test.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)

In [None]:
holi_sun = ['2022-06-12', '2022-06-26', '2022-07-10', '2022-07-24', '2022-08-14', '2022-08-28']

train['Sunday_holiday'] = np.where((train.day_of_week == 6) & (train.date_time.dt.strftime('%Y-%m-%d').isin(holi_sun)), 1, 0)
test['Sunday_holiday'] = np.where((test.day_of_week == 6) & (test.date_time.dt.strftime('%Y-%m-%d').isin(holi_sun)), 1, 0)

In [None]:
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,...,hour,day,month,day_of_week,day_max_temperature,day_mean_temperature,day_min_temperature,day_temperature_range,holiday,Sunday_holiday
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,,,1085.28,Other Buildings,...,0,1,6,2,28.4,22.266667,16.3,12.1,1,0
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,,,1047.36,Other Buildings,...,1,1,6,2,28.4,22.266667,16.3,12.1,1,0
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,,,974.88,Other Buildings,...,2,1,6,2,28.4,22.266667,16.3,12.1,1,0
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,,,953.76,Other Buildings,...,3,1,6,2,28.4,22.266667,16.3,12.1,1,0
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,,,986.40,Other Buildings,...,4,1,6,2,28.4,22.266667,16.3,12.1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202899,100,2022-08-24 19:00:00,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,...,19,24,8,2,26.0,22.958333,20.7,5.3,0,0
202900,100,2022-08-24 20:00:00,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,...,20,24,8,2,26.0,22.958333,20.7,5.3,0,0
202901,100,2022-08-24 21:00:00,21.3,,1.0,92.0,,,825.12,Hotel and Resort,...,21,24,8,2,26.0,22.958333,20.7,5.3,0,0
202902,100,2022-08-24 22:00:00,21.0,,0.3,94.0,,,640.08,Hotel and Resort,...,22,24,8,2,26.0,22.958333,20.7,5.3,0,0


In [None]:
train['sin_hour'] = np.sin(2 * np.pi * train['hour']/23.0)
train['cos_hour'] = np.cos(2 * np.pi * train['hour']/23.0)
test['sin_hour'] = np.sin(2 * np.pi * test['hour']/23.0)
test['cos_hour'] = np.cos(2 * np.pi * test['hour']/23.0)

train['sin_date'] = -np.sin(2 * np.pi * (train['month']+train['day']/31)/12)
train['cos_date'] = -np.cos(2 * np.pi * (train['month']+train['day']/31)/12)
test['sin_date'] = -np.sin(2 * np.pi * (test['month']+test['day']/31)/12)
test['cos_date'] = -np.cos(2 * np.pi * (test['month']+test['day']/31)/12)

train['sin_month'] = -np.sin(2 * np.pi * train['month']/12.0)
train['cos_month'] = -np.cos(2 * np.pi * train['month']/12.0)
test['sin_month'] = -np.sin(2 * np.pi * test['month']/12.0)
test['cos_month'] = -np.cos(2 * np.pi * test['month']/12.0)

train['sin_dayofweek'] = -np.sin(2 * np.pi * (train['day_of_week']+1)/7.0)
train['cos_dayofweek'] = -np.cos(2 * np.pi * (train['day_of_week']+1)/7.0)
test['sin_dayofweek'] = -np.sin(2 * np.pi * (test['day_of_week']+1)/7.0)
test['cos_dayofweek'] = -np.cos(2 * np.pi * (test['day_of_week']+1)/7.0)

In [None]:
def CDH(xs):
    cumsum = np.cumsum(xs - 26)
    return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))

def calculate_and_add_cdh(dataframe):
    cdhs = []
    for i in range(1, 101):
        temp = dataframe[dataframe['building_number'] == i]['temperature'].values
        cdh = CDH(temp)
        cdhs.append(cdh)
    return np.concatenate(cdhs)

train['CDH'] = calculate_and_add_cdh(train)
test['CDH'] = calculate_and_add_cdh(test)

In [None]:
train['THI'] = 9/5*train['temperature'] - 0.55*(1-train['humidity']/100)*(9/5*train['humidity']-26)+32

test['THI'] = 9/5*test['temperature'] - 0.55*(1-test['humidity']/100)*(9/5*test['humidity']-26)+32

In [None]:
train['WCT'] = 13.12 + 0.6125*train['temperature'] - 11.37*(train['windspeed']**0.16) + 0.3965*(train['windspeed']**0.16)*train['temperature']
test['WCT'] = 13.12 + 0.6125*test['temperature'] - 11.37*(test['windspeed']**0.16) + 0.3965*(test['windspeed']**0.16)*test['temperature']

In [None]:
power_mean = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour', 'day_of_week'], aggfunc=np.mean).reset_index()
power_mean.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_mean']

power_std = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour', 'day_of_week'], aggfunc=np.std).reset_index()
power_std.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_std']

power_hour_mean = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour'], aggfunc=np.mean).reset_index()
power_hour_mean.columns = ['building_number', 'hour', 'hour_mean']

power_hour_std = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour'], aggfunc=np.std).reset_index()
power_hour_std.columns = ['building_number', 'hour', 'hour_std']

train = train.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_hour_mean, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_mean, on=['building_number', 'hour'], how='left')

train = train.merge(power_hour_std, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_std, on=['building_number', 'hour'], how='left')

train = train.reset_index(drop=True)

In [None]:
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,...,cos_month,sin_dayofweek,cos_dayofweek,CDH,THI,WCT,day_hour_mean,day_hour_std,hour_mean,hour_std
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,,,1085.28,Other Buildings,...,1.0,-0.433884,0.900969,-7.4,49.6576,20.584184,1774.744615,517.982222,1706.318118,446.882767
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,,,1047.36,Other Buildings,...,1.0,-0.433884,0.900969,-15.4,47.7625,19.846954,1687.347692,500.769931,1622.620235,439.662704
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,,,974.88,Other Buildings,...,1.0,-0.433884,0.900969,-23.7,47.2225,19.317610,1571.483077,465.227458,1506.971294,412.071906
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,,,953.76,Other Buildings,...,1.0,-0.433884,0.900969,-33.0,44.7856,18.337658,1522.153846,436.601091,1437.365647,391.205981
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,,,986.40,Other Buildings,...,1.0,-0.433884,0.900969,-40.6,49.0061,19.585934,1506.793846,405.518091,1447.321412,381.099697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202899,100,2022-08-24 19:00:00,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,...,0.5,-0.433884,0.900969,-16.6,63.6624,25.094857,964.873846,163.825489,1010.462118,161.399578
202900,100,2022-08-24 20:00:00,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,...,0.5,-0.433884,0.900969,-17.6,62.4024,24.244918,882.184615,153.076049,928.125176,137.566008
202901,100,2022-08-24 21:00:00,21.3,,1.0,92.0,,,825.12,Hotel and Resort,...,0.5,-0.433884,0.900969,-20.1,64.1976,23.241700,779.095385,143.415686,830.032941,128.300189
202902,100,2022-08-24 22:00:00,21.0,,0.3,94.0,,,640.08,Hotel and Resort,...,0.5,-0.433884,0.900969,-22.8,65.0744,23.472275,663.267692,105.147190,723.100235,112.464079


In [None]:
from sklearn.model_selection import KFold, GroupKFold

In [None]:
class CFG:
  folds = 5
  n_layers = 1
  hidden_size = 64
  input_size = 29
  output = 1
  learning_rate = 1e-4
  batch_size = 32
  epoch = 1

In [None]:
#X = train.drop(['solar_power_capacity', 'ess_capacity', 'pcs_capacity',
                #'power_consumption','rainfall', 'sunshine', 'solar_radiation',
                #'hour','day','month','day_of_week','date_time'],axis =1 )

#y = train[['building_type','power_consumption']]

test_X = test.drop(['solar_power_capacity', 'ess_capacity', 'pcs_capacity','rainfall',
                   'hour','month','day_of_week','day','date_time'], axis=1)

In [None]:
X_features = ['building_number', 'temperature', 'windspeed', 'humidity',
              'total_area', 'cooling_area', 'solar_power_utility',
              'ess_utility', 'day_max_temperature', 'day_mean_temperature',
              'day_min_temperature', 'day_temperature_range', 'holiday',
              'Sunday_holiday', 'sin_hour', 'cos_hour', 'sin_date', 'cos_date',
              'sin_month', 'cos_month', 'sin_dayofweek', 'cos_dayofweek', 'CDH',
              'THI', 'WCT', 'day_hour_mean', 'day_hour_std', 'hour_mean', 'hour_std']

y_features = 'power_consumption'

In [None]:
type_list = []
for value in train.building_type.values:
    if value not in type_list:
        type_list.append(value)

In [None]:
type_list

['Other Buildings',
 'Public',
 'University',
 'Data Center',
 'Department Store and Outlet',
 'Hospital',
 'Commercial',
 'Apartment',
 'Research Institute',
 'Knowledge Industry Center',
 'Discount Mart',
 'Hotel and Resort']

In [None]:
class CustomDataset(Dataset):
  def __init__(self, df, cfg):
    self.cfg = cfg
    self.df = df.copy()
    self.X_features = X_features
    self.y_features = y_features

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    inputs = torch.tensor(self.df.loc[idx, self.X_features],  dtype=torch.float32)
    outputs = torch.tensor(self.df.loc[idx, self.y_features], dtype= torch.float32)

    return inputs, outputs

In [None]:
import torch.nn as nn

In [None]:
class CustomModel(nn.Module):
  def __init__(self, cfg):
    super(CustomModel, self).__init__()
    self.cfg = cfg
    self.cnn = nn.Sequential(*([
        nn.Linear(self.cfg.input_size, self.cfg.hidden_size), nn.LeakyReLU(),] + [nn.Linear(self.cfg.hidden_size, self.cfg.hidden_size), nn.LeakyReLU()] * self.cfg.n_layers)
    )

    self.head = nn.Sequential(
        nn.Linear(self.cfg.hidden_size, self.cfg.output),
        nn.LeakyReLU(),
    )

  def forward(self, x):
    x = self.cnn(x)
    x = self.head(x)

    return x

In [None]:
import torch.optim as optim
device = 'cpu'

model = CustomModel(CFG).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=CFG.learning_rate)

In [None]:
train_ = train[train['building_type'] == 'Other Buildings']
train_ = train_.drop(['building_type'], axis=1).reset_index(drop=True)

kf = KFold(n_splits=CFG.folds)
kf = kf.split(train_)
train_['fold'] = -1

oof1 = np.zeros((len(train_), 1))

for fold, (train_index, valid_index) in enumerate(kf):
  train_.loc[valid_index, 'fold'] = fold

In [None]:
import torch
from tqdm.notebook import tqdm

In [None]:
score = 0

for fold in range(CFG.folds):
  smape = 0
  preds, gt = [], []

  train_folds = train_[train_['fold'] != fold].reset_index(drop=True)
  valid_folds = train_[train_['fold'] == fold].reset_index(drop=True)

  train_dataset = CustomDataset(train_folds, CFG)
  train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)

  valid_dataset = CustomDataset(valid_folds, CFG)
  valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False)

  for epoch in range(CFG.epoch):

    model.train()
    for inputs, output in tqdm(train_loader):
      optimizer.zero_grad()

      inputs = inputs.to(device)
      output = output.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, output)

      loss.backward()
      optimizer.step()

  torch.save(model, f'model_fold{fold}_pth')

  model.eval()
  with torch.no_grad():
    for inputs, output in tqdm(valid_loader):
      inputs = inputs.to(device)
      outputs = model(inputs)

      pred = outputs.detach().cpu().numpy()
      preds.extend(pred)
      gt.extend(np.array(output))

  preds = np.array(preds)
  gt = np.array(gt)
  print(f'fold {fold} SMAPE: ', SMAPE(gt, preds))
  score += SMAPE(gt, preds)

print('SMAPE CV: ', score / CFG.folds)

       building_number           date_time  temperature  rainfall  windspeed  \
0                    3 2022-08-24 15:00:00         26.6       NaN        2.0   
1                    3 2022-08-24 16:00:00         26.3       NaN        2.4   
2                    3 2022-08-24 17:00:00         26.9       NaN        1.6   
3                    3 2022-08-24 18:00:00         26.2       NaN        2.5   
4                    3 2022-08-24 19:00:00         25.5       NaN        2.9   
...                ...                 ...          ...       ...        ...   
24340               15 2022-08-24 19:00:00         21.7       NaN        1.6   
24341               15 2022-08-24 20:00:00         21.8       NaN        1.7   
24342               15 2022-08-24 21:00:00         21.4       NaN        1.1   
24343               15 2022-08-24 22:00:00         21.4       NaN        0.3   
24344               15 2022-08-24 23:00:00         21.5       NaN        0.7   

       humidity  sunshine  solar_radiat

  0%|          | 0/761 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

  0%|          | 0/191 [00:00<?, ?it/s]

fold 0 SMAPE:  47.81213402748108
       building_number           date_time  temperature  rainfall  windspeed  \
0                    1 2022-06-01 00:00:00         18.6       NaN        0.9   
1                    1 2022-06-01 01:00:00         18.0       NaN        1.1   
2                    1 2022-06-01 02:00:00         17.7       NaN        1.5   
3                    1 2022-06-01 03:00:00         16.7       NaN        1.4   
4                    1 2022-06-01 04:00:00         18.4       NaN        2.8   
...                ...                 ...          ...       ...        ...   
24340               15 2022-08-24 19:00:00         21.7       NaN        1.6   
24341               15 2022-08-24 20:00:00         21.8       NaN        1.7   
24342               15 2022-08-24 21:00:00         21.4       NaN        1.1   
24343               15 2022-08-24 22:00:00         21.4       NaN        0.3   
24344               15 2022-08-24 23:00:00         21.5       NaN        0.7   

      

  0%|          | 0/761 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

  0%|          | 0/191 [00:00<?, ?it/s]

fold 1 SMAPE:  64.01821970939636
       building_number           date_time  temperature  rainfall  windspeed  \
0                    1 2022-06-01 00:00:00         18.6       NaN        0.9   
1                    1 2022-06-01 01:00:00         18.0       NaN        1.1   
2                    1 2022-06-01 02:00:00         17.7       NaN        1.5   
3                    1 2022-06-01 03:00:00         16.7       NaN        1.4   
4                    1 2022-06-01 04:00:00         18.4       NaN        2.8   
...                ...                 ...          ...       ...        ...   
24341               15 2022-08-24 19:00:00         21.7       NaN        1.6   
24342               15 2022-08-24 20:00:00         21.8       NaN        1.7   
24343               15 2022-08-24 21:00:00         21.4       NaN        1.1   
24344               15 2022-08-24 22:00:00         21.4       NaN        0.3   
24345               15 2022-08-24 23:00:00         21.5       NaN        0.7   

      

  0%|          | 0/761 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

  0%|          | 0/191 [00:00<?, ?it/s]

fold 2 SMAPE:  65.95290303230286
       building_number           date_time  temperature  rainfall  windspeed  \
0                    1 2022-06-01 00:00:00         18.6       NaN        0.9   
1                    1 2022-06-01 01:00:00         18.0       NaN        1.1   
2                    1 2022-06-01 02:00:00         17.7       NaN        1.5   
3                    1 2022-06-01 03:00:00         16.7       NaN        1.4   
4                    1 2022-06-01 04:00:00         18.4       NaN        2.8   
...                ...                 ...          ...       ...        ...   
24341               15 2022-08-24 19:00:00         21.7       NaN        1.6   
24342               15 2022-08-24 20:00:00         21.8       NaN        1.7   
24343               15 2022-08-24 21:00:00         21.4       NaN        1.1   
24344               15 2022-08-24 22:00:00         21.4       NaN        0.3   
24345               15 2022-08-24 23:00:00         21.5       NaN        0.7   

      

  0%|          | 0/761 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

  0%|          | 0/191 [00:00<?, ?it/s]

fold 3 SMAPE:  59.859561920166016
       building_number           date_time  temperature  rainfall  windspeed  \
0                    1 2022-06-01 00:00:00         18.6       NaN        0.9   
1                    1 2022-06-01 01:00:00         18.0       NaN        1.1   
2                    1 2022-06-01 02:00:00         17.7       NaN        1.5   
3                    1 2022-06-01 03:00:00         16.7       NaN        1.4   
4                    1 2022-06-01 04:00:00         18.4       NaN        2.8   
...                ...                 ...          ...       ...        ...   
24341               13 2022-06-02 05:00:00         18.1       NaN        2.5   
24342               13 2022-06-02 06:00:00         18.1       NaN        2.4   
24343               13 2022-06-02 07:00:00         19.1       NaN        3.9   
24344               13 2022-06-02 08:00:00         20.8       NaN        4.5   
24345               13 2022-06-02 09:00:00         22.3       NaN        4.4   

     

  0%|          | 0/761 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

  0%|          | 0/191 [00:00<?, ?it/s]

fold 4 SMAPE:  26.245015859603882
SMAPE CV:  52.77756690979004


In [None]:
train_ = train[train['building_type'] == 'Other Buildings']
train_ = train_.drop(['building_type'], axis=1).reset_index(drop=True)
oof1 = np.zeros((len(train_), 1))

kf = KFold(n_splits=CFG.folds)
kf = kf.split(train_)

for fold, (train_index, valid_index) in enumerate(kf):
  X_train, y_train = train_.loc[train_index, X_features], train_.loc[train_index, y_features]
  X_valid, y_valid = train_.loc[valid_index, X_features], train_.loc[valid_index, y_features]

  model = build_model()
  model.fit(X_train, y_train,
            validation_data = (X_valid, y_valid),
            batch_size = 16, epochs = 1)

  oof1[valid_index, ] = model.predict(X_valid)

In [None]:
import tensorflow as tf

In [None]:
def build_model():

    inp = tf.keras.Input(shape=(29, 1))

    x = tf.keras.layers.GRU(units=16, return_sequences=True)(inp)
    x = tf.keras.layers.GRU(units=16, return_sequences=True)(x)
    x = tf.keras.layers.GRU(units=16, return_sequences=False)(x)
    x = tf.keras.layers.Dense(1,activation='linear')(x) # OUTPUT SHAPE IS 5
    model = tf.keras.Model(inputs=inp, outputs=x)

    opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
    loss = tf.keras.losses.MeanSquaredError()
    model.compile(loss=loss, optimizer = opt)

    return model

In [None]:
model = build_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 29, 1)]           0         
                                                                 
 gru (GRU)                   (None, 29, 16)            912       
                                                                 
 gru_1 (GRU)                 (None, 29, 16)            1632      
                                                                 
 gru_2 (GRU)                 (None, 16)                1632      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 4193 (16.38 KB)
Trainable params: 4193 (16.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,...,cos_month,sin_dayofweek,cos_dayofweek,CDH,THI,WCT,day_hour_mean,day_hour_std,hour_mean,hour_std
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,,,1085.28,Other Buildings,...,1.0,-0.433884,0.900969,-7.4,49.6576,20.584184,1774.744615,517.982222,1706.318118,446.882767
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,,,1047.36,Other Buildings,...,1.0,-0.433884,0.900969,-15.4,47.7625,19.846954,1687.347692,500.769931,1622.620235,439.662704
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,,,974.88,Other Buildings,...,1.0,-0.433884,0.900969,-23.7,47.2225,19.317610,1571.483077,465.227458,1506.971294,412.071906
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,,,953.76,Other Buildings,...,1.0,-0.433884,0.900969,-33.0,44.7856,18.337658,1522.153846,436.601091,1437.365647,391.205981
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,,,986.40,Other Buildings,...,1.0,-0.433884,0.900969,-40.6,49.0061,19.585934,1506.793846,405.518091,1447.321412,381.099697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202899,100,2022-08-24 19:00:00,23.1,,0.9,86.0,0.5,,881.04,Hotel and Resort,...,0.5,-0.433884,0.900969,-16.6,63.6624,25.094857,964.873846,163.825489,1010.462118,161.399578
202900,100,2022-08-24 20:00:00,22.4,,1.3,86.0,0.0,,798.96,Hotel and Resort,...,0.5,-0.433884,0.900969,-17.6,62.4024,24.244918,882.184615,153.076049,928.125176,137.566008
202901,100,2022-08-24 21:00:00,21.3,,1.0,92.0,,,825.12,Hotel and Resort,...,0.5,-0.433884,0.900969,-20.1,64.1976,23.241700,779.095385,143.415686,830.032941,128.300189
202902,100,2022-08-24 22:00:00,21.0,,0.3,94.0,,,640.08,Hotel and Resort,...,0.5,-0.433884,0.900969,-22.8,65.0744,23.472275,663.267692,105.147190,723.100235,112.464079


In [None]:
train_ = train[train['building_type'] == 'Other Buildings']
train_ = train_.drop(['building_type'], axis=1).reset_index(drop=True)
oof1 = np.zeros((len(train_), 1))

kf = KFold(n_splits=CFG.folds)
kf = kf.split(train_)

for fold, (train_index, valid_index) in enumerate(kf):
  X_train, y_train = train_.loc[train_index, X_features], train_.loc[train_index, y_features]
  X_valid, y_valid = train_.loc[valid_index, X_features], train_.loc[valid_index, y_features]

  model = build_model()
  model.fit(X_train, y_train,
            validation_data = (X_valid, y_valid),
            batch_size = 16, epochs = 1)

  oof1[valid_index, ] = model.predict(X_valid)



In [None]:
print("SMAPE: ", SMAPE(np.array(train_[y_features]), oof1))

In [None]:
train_ = train[train['building_type'] == 'Public']
train_ = train_.drop(['building_type'], axis=1).reset_index(drop=True)
oof2 = np.zeros((len(train_), 1))

kf = KFold(n_splits=CFG.folds)
kf = kf.split(train_)

for fold, (train_index, valid_index) in enumerate(kf):
  X_train, y_train = train_.loc[train_index, X_features], train_.loc[train_index, y_features]
  X_valid, y_valid = train_.loc[valid_index, X_features], train_.loc[valid_index, y_features]

  model = build_model()
  model.fit(X_train, y_train,
            validation_data = (X_valid, y_valid),
            batch_size = 4, epochs = 1)

  oof2[valid_index, ] = model.predict(X_valid)

In [None]:
print("SMAPE: ", SMAPE(np.array(train_[y_features]), oof2))

SMAPE:  198.0119987213825


In [None]:
oofs = []

for i in type_list:
  train_ = train[train['building_type'] == i]
  train_ = train_.drop(['building_type'], axis=1).reset_index(drop=True)
  oof = np.zeros((len(train_), 1))

  kf = KFold(n_splits=CFG.folds)
  kf = kf.split(train_)

  for fold, (train_index, valid_index) in enumerate(kf):
    X_train, y_train = train_.loc[train_index, X_features], train_.loc[train_index, y_features]
    X_valid, y_valid = train_.loc[valid_index, X_features], train_.loc[valid_index, y_features]

    model = build_model()
    model.fit(X_train, y_train,
              validation_data = (X_valid, y_valid),
              batch_size = 4, epochs = 1)

    oof[valid_index, ] = model.predict(X_valid)

  oofs.extend(oof)

In [None]:
xgb_best_params = pd.read_csv('5_07 xgb 파라미터.csv')
xgb_best_params['building_type'] = type_list
xgb_best_params.set_index('building_type',inplace=True)

In [None]:
kf = KFold(n_splits = 7,shuffle=True,random_state=RANDOM_SEED)
answer_df = pd.DataFrame(columns=['answer'])
pred_df = pd.DataFrame(columns=['pred'])

for i in type_list:
    x = X[(X.building_type == i)]
    y = Y[(Y.building_type == i)]
    X_test = test_X[test_X.building_type==i]

    x = pd.get_dummies(x, columns=['building_number'], drop_first=False)
    X_test = pd.get_dummies(X_test, columns=['building_number'], drop_first=False)

    x = x.drop(['building_type'],axis =1)
    X_test = X_test.drop(['building_type'],axis =1)
    y = y['power_consumption']
    x_columns = np.array(x.columns)
    x = np.array(x) ; y = np.array(y)

    j = 0
    xgb_fold_smape = []
    answer_list = []
    pred = pd.DataFrame(index = range(0,y.shape[0]), columns=['pred'])
    answer = pd.DataFrame(columns=['answer'])

    for train_index, valid_index in kf.split(x):
        j += 1

        X_train, X_valid = x[train_index], x[valid_index]
        Y_train, Y_valid = y[train_index], y[valid_index]
        Y_train = np.log(Y_train) ; Y_valid = np.log(Y_valid)


        evals = [(X_train,Y_train),(X_valid,Y_valid)]
        xgb_model = XGBRegressor(learning_rate = 0.05,n_estimators = 5000,
                             max_depth = int(xgb_best_params.loc[i]['max_depth']),
                             random_state = RANDOM_SEED,
                            subsample = xgb_best_params.loc[i]['subsample'],
                             colsample_bytree = xgb_best_params.loc[i]['colsample_bytree'],
                             min_child_weight = int(xgb_best_params.loc[i]['min_child_weight']),
                             objective=weighted_mse(xgb_best_params.loc[i]['alpha']))

        xgb_model.fit(X_train, Y_train, early_stopping_rounds = 100,
                       eval_metric = custom_smape, eval_set = evals, verbose = False)
        xgb_pred = xgb_model.predict(X_valid)
        xgb_pred = np.exp(xgb_pred)
        pred['pred'][valid_index] = xgb_pred
        xgb_smape = smape(np.exp(Y_valid),xgb_pred)
        xgb_answer = xgb_model.predict(X_test)
        answer_list.append(np.exp(xgb_answer))
        xgb_fold_smape.append(xgb_smape)


        if j == 7:
            sorted_idx = xgb_model.feature_importances_.argsort()
            plt.figure(figsize=(8,15))
            plt.barh(x_columns[sorted_idx],  xgb_model.feature_importances_[sorted_idx])
            plt.xlabel('%s model XGB Feature Importance'%(i))
            plt.show()


    type_answer = sum(answer_list) / len(answer_list)

    answer.answer = type_answer
    answer_df = pd.concat([answer_df,answer],ignore_index=True)
    pred_df = pd.concat([pred_df,pred],ignore_index=True)

    avg_smape = sum(xgb_fold_smape) / len(xgb_fold_smape)
    print('Building type = %s : XGBRegressor Model SMAPE : %.4f' %(i,avg_smape))


total_score = smape(Y.power_consumption,pred_df.pred)
print('Total SMAPE : %.4f'%(total_score))