In [1]:
import sys
!pip install -q sktime
import sktime
import tqdm as tq
import xgboost as xgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

# train = pd.read_csv('/kaggle/input/818818/train_818.csv')
# valid = pd.read_csv('/kaggle/input/818818/valid_818.csv')
# Seed NumPy's global random generator for this session.
# Note: get_train2() calls np.random.seed(0) itself, so the global state is reset when it runs.
np.random.seed(42)

def SMAPE(true, pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(|true - pred| / (|true| + |pred|)) * 200``.

    Parameters
    ----------
    true, pred : array-like
        Actual and predicted values. They are compared position by position
        after conversion to float arrays.

    Returns
    -------
    float
        The SMAPE value. Positions where both ``true`` and ``pred`` are zero
        contribute an error of 0 (a perfect prediction) instead of the 0/0 NaN
        that would otherwise turn the whole score into NaN.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Only divide where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 200

def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns ``(grad, hess)``:
    the first and second derivatives, with respect to ``pred``, of a squared
    error whose weight is ``alpha`` where ``label > pred`` (under-prediction)
    and 1 everywhere else.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # Per-sample loss weight: alpha for under-predictions, 1 otherwise.
        weight = np.where(error > 0, alpha, 1.0)
        gradient = -2 * weight * error
        hessian = 2 * weight
        return gradient, hessian
    return weighted_mse_fixed



In [2]:
def _add_calendar_features(frame):
    """Add hour/dow/month/week/day columns parsed from ``frame['date_time']`` (in place)."""
    stamp = pd.to_datetime(frame['date_time'])
    frame['hour'] = stamp.dt.hour
    frame['dow'] = stamp.dt.weekday
    frame['month'] = stamp.dt.month
    frame['week'] = stamp.dt.isocalendar().week
    frame['day'] = stamp.dt.day


def _merge_target_stat(train, test, keys, stat, name):
    """Aggregate ``train['target']`` by ``keys`` with ``stat`` and left-merge it onto both frames as ``name``."""
    table = train.groupby(keys)['target'].agg(stat).rename(name).reset_index()
    return (pd.merge(train, table, on=keys, how='left'),
            pd.merge(test, table, on=keys, how='left'))


def _even_week_sunday_flag(stamp):
    """Return 1 for Sundays whose (ISO week - ISO week of the month's first day + 1) is even, else 0."""
    week = stamp.dt.isocalendar().week.astype('int64')
    month_start = stamp - pd.to_timedelta(stamp.dt.day - 1, unit='D')
    first_week = month_start.dt.isocalendar().week.astype('int64')
    even_week = (week - first_week + 1) % 2 == 0
    return (even_week & (stamp.dt.weekday == 6)).astype('int64')


def _assign_base_holidays(frame):
    """Create ``holiday`` and the helper column ``week_of_month`` on ``frame`` (in place).

    Expects ``frame`` to carry ``building``, ``dow`` and a datetime ``date`` column.
    Rules are applied in order, later ones overriding earlier ones.
    """
    building = frame['building']
    day = frame['date'].dt.normalize()

    def mark(selected, days):
        frame.loc[selected & day.isin(pd.to_datetime(days)), 'holiday'] = 1

    # Default: Saturday and Sunday are holidays.
    frame['holiday'] = (frame['dow'] >= 5).astype('int64')

    # Buildings whose only regular day off is Monday, plus individual closure days.
    monday_closed = (
        (3, ['2022-07-31', '2022-07-23', '2022-07-20']),
        (2, ['2022-06-07', '2022-06-17']),
        (54, ['2022-08-16', '2022-08-17']),
    )
    for number, extra_days in monday_closed:
        own = building == number
        frame.loc[own, 'holiday'] = 0
        frame.loc[own & (frame['dow'] == 0), 'holiday'] = 1
        mark(own, extra_days)

    # Dates flagged for every building except 14, which gets its own date instead.
    mark(building != 14, ['2022-06-01', '2022-06-06', '2022-08-15'])
    mark(building == 14, ['2022-06-14'])

    # Buildings 87-92: only the Sundays picked out by the even-week rule are holidays.
    frame['week_of_month'] = _even_week_sunday_flag(frame['date'])
    alternating = building.isin([87, 88, 89, 90, 91, 92])
    frame.loc[alternating, 'holiday'] = 0
    frame.loc[alternating & (frame['week_of_month'] == 1), 'holiday'] = 1

    # Building 85 never has a holiday.
    frame.loc[building == 85, 'holiday'] = 0

    # Building 86: holidays are an explicit list of dates.
    own = building == 86
    frame.loc[own, 'holiday'] = 0
    mark(own, ['2022-06-10', '2022-08-10', '2022-07-10',
               '2022-07-24', '2022-06-26', '2022-07-30'])


def _override_rotating_closures(frame):
    """Replace ``holiday`` for buildings 37-42 with their explicit closure dates (in place)."""
    closures = {
        37: ['2022-06-20', '2022-07-11', '2022-08-08', '2022-06-17'],
        38: ['2022-06-13', '2022-07-25', '2022-08-01'],
        39: ['2022-07-18', '2022-08-08'],
        40: ['2022-06-20', '2022-07-18', '2022-06-17', '2022-08-08'],
        41: ['2022-06-27', '2022-07-25', '2022-08-08'],
        42: ['2022-06-13', '2022-07-11', '2022-08-22'],
    }
    day = frame['date'].dt.normalize()
    frame.loc[frame['building'].isin(list(closures)), 'holiday'] = 0
    for number, days in closures.items():
        frame.loc[(frame['building'] == number) & day.isin(pd.to_datetime(days)), 'holiday'] = 1


def _cooling_degree_hours(temps):
    """Trailing sum of ``temps - 26`` over the current and up to 11 preceding entries."""
    return np.array([np.sum(temps[max(0, i - 11):i + 1] - 26) for i in range(len(temps))])


def _add_weather_features(frame):
    """Add sin_time, cos_time, THI, WC and CDH columns to ``frame`` (in place)."""
    ## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
    frame['sin_time'] = np.sin(2*np.pi*frame['hour']/24)
    frame['cos_time'] = np.cos(2*np.pi*frame['hour']/24)

    # Temperature-humidity (discomfort) index. The last factor uses the
    # temperature term; the previous code used humidity there by mistake.
    frame['THI'] = 9/5*frame['temp'] - 0.55*(1-frame['hum']/100)*(9/5*frame['temp']-26)+32

    # Wind-chill style index built from temperature and wind speed.
    frame['WC'] = 13.12+0.6215*frame['temp']-13.947*frame['wind']**0.16+0.486*frame['temp']*frame['wind']**0.16

    # Computed per building and written back by index, so the result does not
    # depend on the buildings being numbered 1..100 or stored contiguously.
    cdh = pd.Series(np.nan, index=frame.index)
    for _, temps in frame.groupby('building')['temp']:
        cdh.loc[temps.index] = _cooling_degree_hours(temps.to_numpy())
    frame['CDH'] = cdh


def get_train2(data_dir='/kaggle/input/big-one'):
    """Load train.csv / test.csv from ``data_dir`` and build the model features.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train.csv`` and ``test.csv``. Defaults to the
        Kaggle input path originally hard-coded in this notebook.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames start with ``num_date_time, building, date_time`` followed
        by the feature columns in identical order; ``train`` additionally has
        ``target``. Later cells slice features positionally with
        ``.iloc[:, 3:]``, so the column order produced here matters.

    Notes
    -----
    * Holiday rules are applied identically to both frames. Date-specific rules
      used to be applied to ``train`` only, and the building-86 rule for
      ``test`` was masked with ``train['building']`` so it never matched.
    * Timestamps are parsed with pandas' format inference everywhere, instead
      of mixing inference with an explicit ``'%Y-%m-%d %H'`` format.
    * ``building_info.csv`` is no longer read: nothing consumed it.
    """
    # Kept so the global RNG state seen by later cells is the same as before.
    np.random.seed(0)

    cols = ['num_date_time', 'building', 'date_time', 'temp', 'prec','wind', 'hum', 'target']

    train = pd.read_csv(f'{data_dir}/train.csv').drop(['일조(hr)', '일사(MJ/m2)'], axis=1)
    train.columns = cols
    # Wind and humidity gaps are forward-filled; every other gap becomes 0.
    train[['wind', 'hum']] = train[['wind', 'hum']].ffill()
    train = train.fillna(0)

    test = pd.read_csv(f'{data_dir}/test.csv')
    test.columns = cols[:-1]

    # Calendar features.
    _add_calendar_features(train)
    _add_calendar_features(test)

    # Mean consumption per (building, hour, day-of-week), learned from train only.
    train, test = _merge_target_stat(train, test, ['building', 'hour', 'dow'], 'mean', 'dow_hour_mean')

    # Holiday flags.
    for frame in (train, test):
        frame['date'] = pd.to_datetime(frame['date_time'])
        _assign_base_holidays(frame)

    # NOTE(review): these aggregates are computed from the holiday flags as they
    # stand here; buildings 37-42 get their flags overridden further down, so
    # holiday_mean / holiday_std for those buildings reflect the weekend-based
    # flags. The original ordering is preserved - confirm whether it is intended.
    train, test = _merge_target_stat(train, test, ['building', 'hour', 'holiday'], 'mean', 'holiday_mean')
    train, test = _merge_target_stat(train, test, ['building', 'hour', 'holiday'], 'std', 'holiday_std')
    train, test = _merge_target_stat(train, test, ['building', 'hour'], 'mean', 'hour_mean')
    train, test = _merge_target_stat(train, test, ['building', 'hour'], 'std', 'hour_std')

    for frame in (train, test):
        _add_weather_features(frame)
        frame['week'] = frame['week'].astype(np.int32)
        _override_rotating_closures(frame)

    helper_columns = ['hour', 'week_of_month', 'prec', 'day', 'date']
    train = train.drop(columns=helper_columns)
    test = test.drop(columns=helper_columns)

    print('done')
    return train, test
# Build the feature frames once; the cells below read the module-level `train` and `test`.
train, test = get_train2()

done


In [None]:
import optuna
import optuna.logging
from tqdm import tqdm
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Per-building hyper-parameter search: for each building, hold out the last
# 168 rows, run an Optuna study minimising validation SMAPE, and keep the best
# trial's row. The result stays in `df` (one row per building) and is also
# written to parameters.csv.
best_trials = []

for i in tqdm(range(100)):
    building_rows = train.loc[train.building == i+1]
    y = building_rows['target']
    x = building_rows.iloc[:, 3:].drop(['target'], axis=1)
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)

    def objective(trial):
        param = {
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
            'gamma': trial.suggest_float('gamma', 1e-3, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
            'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6]),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        }

        # Named `model` so the `xgb` module alias from the import cell is not shadowed.
        model = XGBRegressor(**param, n_estimators=100, learning_rate=0.01)
        model.fit(x_train, y_train, verbose=False)

        return SMAPE(y_valid, model.predict(x_valid))

    # A seeded sampler makes the search reproducible from run to run.
    study = optuna.create_study(direction='minimize', study_name=None,
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=500)

    best_trials.append(study.trials_dataframe().sort_values(by=['value'], ascending=[True]).head(1))

# Concatenate once instead of growing a DataFrame inside the loop.
df = pd.concat(best_trials).reset_index(drop=True)
df.to_csv('parameters.csv', index=False)
df.head(5)
# params_colsample_bytree 0.9 params_gamma 8.161415 params_max_depth 6 params_min_child_weight 47 
# params_reg_alpha 6.721675 params_reg_lambda 6.064121 params_subsample 1

 30%|███       | 30/100 [53:36<1:58:15, 101.36s/it]

In [None]:
# Per-building validation: hold out the last 168 rows, train with early
# stopping under the asymmetric objective, and record the validation SMAPE and
# the selected number of boosting rounds.
scores = []   # validation SMAPE of each building
best_it = []  # number of boosting rounds chosen by early stopping, per building
for i in tqdm(range(100)):
    building_rows = train.loc[train.building == i+1]
    y = building_rows['target']
    x = building_rows.iloc[:, 3:].drop(['target'], axis=1)
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)

    # Only settings that differ from the library defaults are spelled out.
    # The duplicated aliases (eta/learning_rate, seed/random_state) and the
    # deprecated gpu_id argument were dropped; 0.00999999978 is just the
    # single-precision rendering of 0.01.
    xgb_reg = XGBRegressor(base_score=0.5, colsample_bytree=0.8, learning_rate=0.01,
                           max_depth=5, min_child_weight=6, n_estimators=10000,
                           random_state=0, subsample=0.9, tree_method='exact',
                           early_stopping_rounds=300,
                           objective=weighted_mse(100))

#     xgb_reg = XGBRegressor(colsample_bytree=df.params_colsample_bytree[i], gamma=df.params_gamma[i], max_depth=df.params_max_depth[i],
#                           min_child_weight=df.params_min_child_weight[i], reg_alpha=df.params_reg_alpha[i],
#                           reg_lambda=df.params_reg_lambda[i], subsample=df.params_subsample[i], 
#                           n_estimators=10000, early_stopping_rounds=300, eval_metric=SMAPE)

    # Early stopping looks at the last eval set only, so the training set is
    # not evaluated every round any more.
    xgb_reg.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    y_pred = xgb_reg.predict(x_valid)

    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb_reg.best_iteration+1) ## best_iteration is zero-based, so the tree count is this value + 1
print(sum(scores)/len(scores)) # original author's note: 10

In [None]:
# Final models: for each building, train on all of its rows with the number of
# rounds found during validation, average the predictions of several seeds,
# and write the submission file.
building_preds = []

for i in tqdm(range(100)):
    # These slices do not depend on the seed, so they are taken once per building.
    in_building = train.building == i+1
    y_train = train.loc[in_building, 'target']
    x_train = train.loc[in_building].iloc[:, 3:].drop(['target'], axis=1)
    x_test = test.loc[test.building == i+1].iloc[:, 3:]

    seed_preds = []   # one prediction array per seed
    for seed in [0,1,2,3,4]:
        # random_state is the only seed argument now. Previously random_state=0
        # was passed together with seed=seed; both name the same booster
        # setting, so which one took effect was not under this code's control.
        # The objective matches the validation cell, because best_it[i] was
        # selected under weighted_mse(100).
        model = XGBRegressor(base_score=0.5, colsample_bytree=0.8, learning_rate=0.01,
                             max_depth=5, min_child_weight=6, n_estimators=best_it[i],
                             random_state=seed, subsample=0.9, tree_method='exact',
                             objective=weighted_mse(100))

#         xgb = XGBRegressor(colsample_bytree=df.params_colsample_bytree[i], gamma=df.params_gamma[i], max_depth=df.params_max_depth[i],
#                       min_child_weight=df.params_min_child_weight[i], reg_alpha=df.params_reg_alpha[i],
#                       reg_lambda=df.params_reg_lambda[i], subsample=df.params_subsample[i], 
#                       n_estimators=best_it[i])

        model.fit(x_train, y_train)
        seed_preds.append(model.predict(x_test))

    # Prediction for building (i+1) = mean over the seeds.
    building_preds.append(np.mean(seed_preds, axis=0, dtype=np.float64))

preds = np.concatenate(building_preds)

submission = pd.read_csv('/kaggle/input/big-one/sample_submission.csv')
# Predictions are assigned by position: building 1's rows first, then building 2's, ...
# NOTE(review): assumes sample_submission.csv lists rows in that same order - confirm.
if len(preds) != len(submission):
    raise ValueError(f'expected {len(submission)} predictions, got {len(preds)}')
submission['answer'] = preds
submission.to_csv('./submission_xgb.csv', index = False)
submission