In [1]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')
building = pd.read_csv('data/building_info.csv')

In [2]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
train.drop(['num_date_time','sunshine','solar_radiation'], axis = 1, inplace=True)

In [3]:
test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test.drop('num_date_time', axis = 1, inplace=True)

In [4]:
building = building.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

In [5]:
building.head()

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,건물기타,110634.0,39570.0,-,-,-
1,2,건물기타,122233.47,99000.0,-,-,-
2,3,건물기타,171243.0,113950.0,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.0,150000.0,-,2557,1000


In [6]:
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building['building_type'] = building['building_type'].replace(translation_dict)

In [7]:
building.head()

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other Buildings,110634.0,39570.0,-,-,-
1,2,Other Buildings,122233.47,99000.0,-,-,-
2,3,Other Buildings,171243.0,113950.0,40,-,-
3,4,Other Buildings,74312.98,34419.62,60,-,-
4,5,Other Buildings,205884.0,150000.0,-,2557,1000


## csv 병합

In [8]:
train = pd.merge(train, building, on='building_number', how='left')
test = pd.merge(test, building, on='building_number', how='left')

In [9]:
train.head()

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,power_consumption,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,20220601 00,18.6,,0.9,42.0,1085.28,Other Buildings,110634.0,39570.0,-,-,-
1,1,20220601 01,18.0,,1.1,45.0,1047.36,Other Buildings,110634.0,39570.0,-,-,-
2,1,20220601 02,17.7,,1.5,45.0,974.88,Other Buildings,110634.0,39570.0,-,-,-
3,1,20220601 03,16.7,,1.4,48.0,953.76,Other Buildings,110634.0,39570.0,-,-,-
4,1,20220601 04,18.4,,2.8,43.0,986.4,Other Buildings,110634.0,39570.0,-,-,-


In [60]:
test.head()

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,20220825 00,23.5,0.0,2.2,72,Other Buildings,110634.0,39570.0,-,-,-
1,1,20220825 01,23.0,0.0,0.9,72,Other Buildings,110634.0,39570.0,-,-,-
2,1,20220825 02,22.7,0.0,1.5,75,Other Buildings,110634.0,39570.0,-,-,-
3,1,20220825 03,22.1,0.0,1.3,78,Other Buildings,110634.0,39570.0,-,-,-
4,1,20220825 04,21.8,0.0,1.0,77,Other Buildings,110634.0,39570.0,-,-,-


In [61]:
building.head()

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other Buildings,110634.0,39570.0,-,-,-
1,2,Other Buildings,122233.47,99000.0,-,-,-
2,3,Other Buildings,171243.0,113950.0,40,-,-
3,4,Other Buildings,74312.98,34419.62,60,-,-
4,5,Other Buildings,205884.0,150000.0,-,2557,1000


## train전처리

In [10]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')

# date time feature 생성
train['hour'] = train['date_time'].dt.hour
train['day'] = train['date_time'].dt.day
train['month'] = train['date_time'].dt.month
train['year'] = train['date_time'].dt.year
train.drop('date_time',axis = 1,inplace = True)

In [20]:
train['solar_power_capacity'] = train['solar_power_capacity'].apply(lambda x : 0 if x=='-' else x)
train['ess_capacity'] = train['ess_capacity'].apply(lambda x : 0 if x=='-' else x)
train['pcs_capacity'] = train['pcs_capacity'].apply(lambda x : 0 if x=='-' else x)

In [13]:
train = train.fillna(0)

In [21]:
test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')

# date time feature 생성
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['year'] = test['date_time'].dt.year
test.drop('date_time',axis = 1,inplace = True)

In [22]:
test['solar_power_capacity'] = test['solar_power_capacity'].apply(lambda x : 0 if x=='-' else x)
test['ess_capacity'] = test['ess_capacity'].apply(lambda x : 0 if x=='-' else x)
test['pcs_capacity'] = test['pcs_capacity'].apply(lambda x : 0 if x=='-' else x)

In [23]:
test = test.fillna(0)

In [24]:
train = pd.get_dummies(train)

In [25]:
from lightgbm import LGBMRegressor

In [26]:
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

train_x = train.drop('power_consumption',axis = 1)
train_y = train['power_consumption']

train_X, valid_X, train_Y, valid_Y = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42)

In [27]:
model = LGBMRegressor(random_state=46)

In [40]:
lgbm_params = {"learning_rate": [0.01, 0.05, 0.1],
               "n_estimators": [500,1000, 1500,2000]}

In [41]:
lgbm_gs_best = GridSearchCV(model,
                            lgbm_params,
                            cv=5,
                            n_jobs=-1,
                            verbose=True).fit(train_x, train_y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 204000, number of used features: 66
[LightGBM] [Info] Start training from score 2451.036462


In [42]:
model.set_params(**lgbm_gs_best.best_params_).fit(train_X, valid_X.values.ravel())

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 204000, number of used features: 66
[LightGBM] [Info] Start training from score 2451.036462


In [48]:
import pandas as pd
import numpy as np
import os
import glob


def smape(true, pred):
    v = 2 * abs(pred - true) / (abs(pred) + abs(true))
    output = np.mean(v) * 100
    return output

In [49]:
pred = model.predict(valid_X)
print(smape(valid_Y, pred))

19.40814244208724


In [43]:
test_x = pd.get_dummies(test)

In [44]:
preds = model.predict(test_x)

In [45]:
submission['answer'] = preds

In [46]:
submission.to_csv('second_answer.csv',index = False)

In [47]:
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1250.120258
1,1_20220825 01,1194.399325
2,1_20220825 02,1201.010559
3,1_20220825 03,1151.117578
4,1_20220825 04,1144.067555
...,...,...
16795,100_20220831 19,1164.617221
16796,100_20220831 20,1094.148613
16797,100_20220831 21,995.379212
16798,100_20220831 22,876.081030
