In [1]:
# Experiment identifiers. VERSION selects the prepared dataset directory and,
# together with SUB_VERSION, the directory the trained models are written to.
# SUB_VERSION also selects which numeric-feature list (COLUMNS_X_NUM_<n>) is loaded.
VERSION = 144
SUB_VERSION = 0
DATASET_VERSION = f'EE_DATASET_{VERSION}'
TRAIN_VERSION = f'EE_TRAIN_{VERSION}_{SUB_VERSION}'
# Root directory under which the 'dataset' and 'model' sub-directories live.
PATH = 'data'

In [2]:
import numpy as np
import pandas as pd
import polars as pl
import pickle
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Registers the %memit / %%memit magics used by the training cell.
%load_ext memory_profiler

# Display limits for polars and pandas tables (rows: 101, pandas columns: 50).
pl.Config.set_tbl_rows(101)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 101)

In [3]:
import random

# Single seed used for Python's `random`, NumPy's global RNG and the model parameters.
SEED = 0

# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# does not change hashing in the already-running kernel; it is only inherited by
# child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Maximum number of validation windows tried per building in the training cell.
N_FOLDS = 12
# Number of leading rows per building that are excluded from the CV train/valid indices.
INPUT_LENGTHS = 74
# NOTE(review): OUTPUT_LENGTHS is not referenced in the visible part of this notebook.
OUTPUT_LENGTHS = 1
# Rows per validation window (24 * 7 = 168; presumably one week of hourly data — TODO confirm).
VALIDATION_LENGTHS = 24 * 7
# Rows dropped from the training set on each side of a validation window.
VALIDATION_SPACE = 24

In [5]:
# Directory holding the prepared dataset artefacts for this dataset version.
dataset_path = os.path.join(PATH, 'dataset', DATASET_VERSION)
print(dataset_path)

# Output directory for this training run. os.makedirs creates the intermediate
# PATH/model directory as well, so a single call is enough.
train_path = os.path.join(PATH, 'model', TRAIN_VERSION)
os.makedirs(train_path, exist_ok=True)
print(train_path)

data\dataset\EE_DATASET_144
data\model\EE_TRAIN_144_0


In [6]:
def pkl_save(obj, path, file_name):
    """Pickle ``obj`` to ``<path>/<file_name>.pkl``.

    Args:
        obj: Any picklable Python object.
        path: Existing directory to write into.
        file_name: File stem; the ``.pkl`` extension is appended here.
    """
    target = os.path.join(path, f'{file_name}.pkl')
    with open(target, mode='wb') as handle:
        pickle.dump(obj, handle)

def pkl_load(path, file_name):
    """Load and return the object pickled at ``<path>/<file_name>.pkl``.

    Args:
        path: Directory containing the pickle file.
        file_name: File stem; the ``.pkl`` extension is appended here.

    Returns:
        The unpickled object.

    Note:
        ``pickle.load`` can execute arbitrary code embedded in the file, so
        only use this on files produced by a trusted pipeline.
    """
    f_name = os.path.join(path, f'{file_name}.pkl')
    # Use a context manager so the file handle is closed deterministically
    # (the bare open() previously leaked it until garbage collection).
    with open(f_name, 'rb') as file:
        obj = pickle.load(file)

    return obj

In [7]:
# Normalisation statistics, indexed per building (mean_std[b]); compute_score reads
# the 'mean' and 'std' entries to undo the standardisation of the predictions.
mean_std = pkl_load(dataset_path, 'power_log1p')

# COLUMNS_X_CAT maps each categorical column to its number of categories
# (create_numpy_dataset uses the value as the one-hot width).
COLUMNS_X_CAT = pkl_load(dataset_path, 'COLUMNS_X_CAT')
# Numeric feature names for this SUB_VERSION.
COLUMNS_X_NUM = pkl_load(dataset_path, f'COLUMNS_X_NUM_{SUB_VERSION}')
# All feature column names: categorical first, then numeric.
COLUMNS_X = list(COLUMNS_X_CAT) + COLUMNS_X_NUM
# Target column names extracted by create_numpy_dataset.
COLUMNS_Y = pkl_load(dataset_path, 'COLUMNS_Y')

# Training table (polars); the training cell filters it by the 'building' column.
df = pl.read_csv(f'{dataset_path}/train.csv')

In [8]:
# Sanity check: list the feature columns (categorical names first, then numeric).
print(COLUMNS_X)

['whc', 'sin_hour', 'cos_hour', 'temperature_squared', 'temperature_squared_mean', 'THI', 'THI_mean', 'humidity_squared', 'power_log1p_stdd_mean', 'power_log1p_stdd_shift', 'power_log1p_stdd_cumweek_mean_shift', 'power_log1p_stdd_thisweek_mean_shift']


In [9]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``100 / n * sum(2 * |pred - true| / (|true| + |pred|))`` over the
    flattened inputs. Positions where both values are zero contribute zero.

    Args:
        y_true: Array-like of actual values (any shape; flattened internally).
        y_pred: Array-like of predicted values with the same number of elements.

    Returns:
        The SMAPE as a float.
    """
    # CONVERT TO NUMPY
    # np.array() copies, so the callers' data is never modified, and it also
    # accepts lists / Series rather than only ndarrays. The dtype is preserved.
    y_true = np.array(y_true).reshape(-1)
    y_pred = np.array(y_pred).reshape(-1)

    # WHEN BOTH EQUAL ZERO, METRIC IS ZERO
    # Overwrite such pairs with (1, 1): numerator becomes 0 and denominator 2,
    # which avoids a 0/0 division.
    both = np.abs(y_true) + np.abs(y_pred)
    idx = np.where(both == 0)[0]
    y_true[idx] = 1
    y_pred[idx] = 1

    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [10]:
def compute_score(model, X_valid, y_valid, mean_std):
    """Score a fitted model on a validation set using SMAPE.

    The model's raw predictions are optionally de-standardised with
    ``mean_std['std']`` / ``mean_std['mean']`` (when ``mean_std`` is truthy),
    then mapped back through ``expm1`` before being compared with
    ``y_valid['power']``.

    Returns:
        Tuple ``(smape_value, predictions)`` where ``predictions`` are the
        back-transformed predictions.
    """
    raw = model.predict(X_valid)
    if mean_std:
        # Undo the standardisation applied to the log1p target.
        raw = raw * mean_std['std'] + mean_std['mean']
    p = np.expm1(raw)

    return smape(y_valid['power'], p), p

In [11]:
from xgboost import XGBRegressor

def train_xgb_model(x_train, y_train, x_valid, y_valid, path, suffix):
    """Fit an XGBoost regressor with early stopping and save it to disk.

    Both the training and the validation data are passed as evaluation sets;
    training is capped at 10000 rounds with ``early_stopping_rounds=100``.
    The fitted model is written to ``<path>/xgb_<suffix>.xgb``.

    Returns:
        The fitted ``XGBRegressor``.
    """
    # NOTE(review): 'gpu_hist' requires a GPU build of XGBoost; newer releases
    # prefer tree_method='hist' with device='cuda' — confirm against the
    # installed version.
    model = XGBRegressor(
        objective='reg:pseudohubererror',
        eval_metric='mae',
        n_estimators=10000,
        early_stopping_rounds=100,
        tree_method='gpu_hist',
        learning_rate=0.02,
        max_depth=4,
        subsample=0.7,
        colsample_bylevel=0.7,
        n_jobs=-1,
        seed=SEED,
        alpha=0.1,
    )
    watchlist = [(x_train, y_train), (x_valid, y_valid)]
    model.fit(x_train, y_train, eval_set=watchlist, verbose=1000)
    model.save_model(os.path.join(path, f'xgb_{suffix}.xgb'))

    return model

def train_xgb_inference_model(n_estimators, x_train, y_train, path, suffix):
    """Fit an XGBoost regressor for a fixed number of rounds and save it.

    No validation data or early stopping is used: the model is trained for
    exactly ``n_estimators`` rounds, with the training data as the only
    evaluation set (for progress logging). The fitted model is written to
    ``<path>/xgb_<suffix>.xgb``.

    Returns:
        The fitted ``XGBRegressor``.
    """
    model = XGBRegressor(
        objective='reg:pseudohubererror',
        eval_metric='mae',
        n_estimators=n_estimators,
        tree_method='gpu_hist',
        learning_rate=0.02,
        max_depth=4,
        subsample=0.7,
        colsample_bylevel=0.7,
        n_jobs=-1,
        seed=SEED,
        alpha=0.1,
    )
    model.fit(x_train, y_train, eval_set=[(x_train, y_train)], verbose=1000)
    model.save_model(os.path.join(path, f'xgb_{suffix}.xgb'))

    return model

In [12]:
from catboost import CatBoostRegressor, Pool

def train_cat_model(x_train, y_train, x_valid, y_valid, path, suffix):
    """Fit a CatBoost regressor with early stopping and save it to disk.

    The validation data is supplied as the evaluation set; training is capped
    at 10000 iterations with ``early_stopping_rounds=100``. The fitted model
    is written to ``<path>/cat_<suffix>.cbm``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    model = CatBoostRegressor(
        loss_function='RMSE',
        eval_metric='MAE',
        iterations=10000,
        early_stopping_rounds=100,
        depth=4,
        subsample=0.8,
        colsample_bylevel=0.7,
        random_seed=SEED,
        # l2_leaf_reg=4,
    )
    model.fit(
        Pool(x_train, y_train),
        eval_set=Pool(x_valid, y_valid),
        verbose=1000,
    )
    model.save_model(os.path.join(path, f'cat_{suffix}.cbm'))

    return model

def train_cat_inference_model(n_estimators, x_train, y_train, path, suffix):
    """Fit a CatBoost regressor for a fixed number of iterations and save it.

    No validation data or early stopping is used: the model is trained for
    exactly ``n_estimators`` iterations on the given training data and then
    written to ``<path>/cat_<suffix>.cbm``.

    Args:
        n_estimators: Number of boosting iterations.
        x_train: Training feature matrix.
        y_train: Training target values.
        path: Existing directory to save the model into.
        suffix: Identifier embedded in the saved file name.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Only a training pool is needed here. The previous version also built a
    # validation pool from `x_valid` / `y_valid`, which are not parameters of
    # this function (NameError unless stray globals exist) and was never used.
    train_pool = Pool(x_train, y_train)
    cat_params = {
        'loss_function': 'RMSE',
        'eval_metric': 'MAE',
        'iterations': n_estimators,
        'depth': 4,
        'subsample': 0.8,
        'colsample_bylevel': 0.7,
        'random_seed': SEED,
        #'l2_leaf_reg': 4,
    }
    model = CatBoostRegressor(**cat_params)
    model.fit(train_pool,
              verbose=1000,)
    f_name = os.path.join(path, f'cat_{suffix}.cbm')
    model.save_model(f_name)

    return model

In [13]:
def train_gbdt_model(model_type, x_train, y_train, x_valid, y_valid, path, suffix):
    """Train a cross-validation model of the requested GBDT flavour.

    Args:
        model_type: ``'xgb'`` for XGBoost or ``'cat'`` for CatBoost.
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target (used for early stopping).
        path: Directory the fitted model is saved into.
        suffix: Identifier embedded in the saved file name.

    Returns:
        The fitted model returned by the selected trainer.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError`` on return).
    """
    if model_type == 'xgb':
        model = train_xgb_model(x_train, y_train, x_valid, y_valid, path, suffix)
    elif model_type == 'cat':
        model = train_cat_model(x_train, y_train, x_valid, y_valid, path, suffix)
    else:
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'xgb' or 'cat'")

    return model

def train_gbdt_inference_model(model_type, n_estimators, x_train, y_train, path, suffix):
    """Train a fixed-length inference model of the requested GBDT flavour.

    Args:
        model_type: ``'xgb'`` for XGBoost or ``'cat'`` for CatBoost.
        n_estimators: Number of boosting rounds to train for.
        x_train, y_train: Training features and target.
        path: Directory the fitted model is saved into.
        suffix: Identifier embedded in the saved file name.

    Returns:
        The fitted model returned by the selected trainer.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError`` on return).
    """
    if model_type == 'xgb':
        model = train_xgb_inference_model(n_estimators, x_train, y_train, path, suffix)
    elif model_type == 'cat':
        model = train_cat_inference_model(n_estimators, x_train, y_train, path, suffix)
    else:
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'xgb' or 'cat'")

    return model

In [14]:
def create_numpy_dataset(df, c_cat, c_num, c_y):
    """Convert a data frame into a float32 feature matrix and target arrays.

    Args:
        df: Frame supporting ``df[col].to_numpy()`` and ``df[[col]].to_numpy()``.
        c_cat: Mapping of categorical column name -> number of categories;
            each such column is one-hot encoded to that width.
        c_num: Iterable of numeric column names, each kept as a single column.
        c_y: Iterable of target column names.

    Returns:
        Tuple ``(X, y)``: ``X`` is a 2-D float32 array with the one-hot blocks
        followed by the numeric columns; ``y`` maps each target name to a
        float32 column array.
    """
    # Keyed by column name so a name listed twice contributes only one block.
    blocks = {name: np.eye(c_cat[name])[df[name].to_numpy()] for name in c_cat}
    blocks.update({name: df[[name]].to_numpy() for name in c_num})
    features = np.concatenate(list(blocks.values()), axis=1).astype(np.float32)

    targets = {name: df[[name]].to_numpy().astype(np.float32) for name in c_y}

    return features, targets

In [None]:
%%time
%%memit

# Per-building cross-validation followed by a final "inference" fit.
# For every building id b in 1..100:
#   * validation windows of VALIDATION_LENGTHS rows are taken from the END of the
#     building's rows, walking backwards (fold 0 is the last window);
#   * VALIDATION_SPACE rows around each window are removed from the training set;
#   * one XGBoost model is fitted per fold, scored with compute_score and plotted;
#   * a final model is fitted on the whole building with the mean best iteration.
# Collected results:
#   ss[b]               -> list of fold SMAPE scores
#   best_iters[b]       -> list of model.best_iteration values
#   feature_importances -> one gain-importance dict per fold model
#models = {}
ss = {}
best_iters = {}
feature_importances = []

for b in range(1, 101):
    #models[f'b{b}'] = {}
    ss[b] = []
    best_iters[b] = []
    tmp = df.filter(df['building']==b)
    # NOTE(review): `chunks` is not used anywhere else in this cell.
    chunks = len(tmp) // N_FOLDS
    # Row positions eligible for CV; the first INPUT_LENGTHS rows are skipped
    # (presumably because lagged features are incomplete there — TODO confirm).
    indices = list(range(INPUT_LENGTHS, len(tmp)))
    for f in range(N_FOLDS):
        print('-----------------------------')
        print(f'b{b}f{f}')
        print()
        
        if f == 0:
            # Last fold: exclude the validation window plus a leading gap from training,
            # then narrow valid_indices down to the window itself.
            valid_indices = indices[-(f+1)*VALIDATION_LENGTHS-VALIDATION_SPACE:]
            train_indices = list(set(indices) - set(valid_indices))
            valid_indices = indices[-(f+1)*VALIDATION_LENGTHS:]
        else:
            # Earlier folds: exclude the window plus a gap on BOTH sides from training,
            # then narrow valid_indices down to the window itself.
            valid_indices = indices[-(f+1)*VALIDATION_LENGTHS-VALIDATION_SPACE: -f*VALIDATION_LENGTHS+VALIDATION_SPACE]
            train_indices = list(set(indices) - set(valid_indices))
            valid_indices = indices[-(f+1)*VALIDATION_LENGTHS: -f*VALIDATION_LENGTHS]
        '''
        Public Score: 25, 26, 27 (3)
        Private Score: 25, 26, 27, 28, 29, 30, 31 (7)
        we're going to focus on Private Score.
        '''
        # Skip folds that cannot supply a full validation window
        # (the `== 0` test is already implied by the `<` test).
        if len(valid_indices) == 0 or len(valid_indices) < VALIDATION_LENGTHS:
            continue
        train = tmp[train_indices]
        valid = tmp[valid_indices]
        X_train, y_train = create_numpy_dataset(train, COLUMNS_X_CAT, COLUMNS_X_NUM, COLUMNS_Y)
        X_valid, y_valid = create_numpy_dataset(valid, COLUMNS_X_CAT, COLUMNS_X_NUM, COLUMNS_Y)

        suffix = f'b{b}f{f}'
        # The model is fitted on the 'power_log1p_stdd' target; compute_score
        # back-transforms predictions with this building's mean/std before scoring.
        model = train_gbdt_model('xgb', X_train, y_train['power_log1p_stdd'], X_valid, y_valid['power_log1p_stdd'], train_path, suffix)
        s, p = compute_score(model, X_valid, y_valid, mean_std[b])
        ss[b].append(s)
        # NOTE(review): XGBoost's best_iteration is a 0-based index, so the best model
        # has best_iteration + 1 trees — confirm whether the inference fit should add 1.
        best_iters[b].append(model.best_iteration)
        feature_importances.append(model.get_booster().get_score(importance_type='gain'))

        # Attach predictions to the validation rows and plot actual vs predicted power
        # over the 'index' column; train rows have no 'power_pred' after the diagonal concat.
        valid = valid.with_columns(
            pl.from_numpy(p, schema=['power_pred'])
        )
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(11, 3))
        data = pl.concat([train, valid], how='diagonal').to_pandas()
        _ = sns.lineplot(data=data, x='index', y='power', estimator='mean', errorbar=None, ax=ax)
        _ = sns.lineplot(data=data, x='index', y='power_pred', estimator='mean', errorbar=None, ax=ax)
        _ = ax.set_title(f'{b} {f}')
        plt.show()
        
        # Release per-fold objects before the next fold to keep memory usage flat.
        del train, valid, X_train, y_train, X_valid, y_valid, model, suffix, s, p, fig, ax, data
        gc.collect()
    # Per-building summary: fold scores, their mean, and the best-iteration statistics.
    # NOTE(review): if every fold was skipped, ss[b] / best_iters[b] are empty, np.mean
    # returns nan and int(nan) raises ValueError.
    print(f'b{b} Overall {N_FOLDS} fold CV')
    print(*np.round(ss[b], 2))
    print(f'{np.mean(ss[b]):.2f}')
    print(*best_iters[b])
    print(int(np.mean(best_iters[b])))
    print(int(np.median(best_iters[b])))

    # Final model for this building, trained for the mean best iteration of the folds.
    # NOTE(review): this fit uses every row of `tmp`, including the first INPUT_LENGTHS
    # rows that the CV folds above excluded — confirm this is intended.
    X_train, y_train = create_numpy_dataset(tmp, COLUMNS_X_CAT, COLUMNS_X_NUM, COLUMNS_Y)
    suffix = f'b{b}_inference'
    model = train_gbdt_inference_model('xgb', int(np.mean(best_iters[b])), X_train, y_train['power_log1p_stdd'], train_path, suffix)

In [16]:
# Score table: one row per building, one column per completed fold (ragged
# lists are padded with NaN by from_dict).
ss_df = pd.DataFrame.from_dict(data=ss, orient='index')
# Extra 'mean' row: average of each fold column across buildings.
ss_df.loc['mean'] = ss_df.mean(axis='index')
# Extra 'mean' column: average across folds for each row (including the 'mean' row).
ss_df['mean'] = ss_df.mean(axis='columns')
display(ss_df)
ss_df.to_csv(f'{train_path}/ss_df.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,mean
1,6.05692,9.098327,8.61171,8.010885,4.772969,4.872916,7.791214,10.033974,12.822971,5.983303,7.111261,7.742404
2,8.728576,9.226416,9.404702,9.253359,6.778168,6.784786,5.432215,6.724028,8.035076,11.242687,14.894928,8.773176
3,12.905882,10.380771,11.021073,11.30788,16.443727,12.364982,7.303857,7.746586,7.953749,8.58694,9.168696,10.471286
4,5.518073,7.310429,4.729692,5.026267,5.351066,4.058631,5.717899,5.906774,4.77175,4.080981,4.760328,5.202899
5,4.705423,7.507447,6.05144,5.17944,5.060048,4.056581,6.343985,12.861227,9.24757,4.026193,6.179084,6.474404
6,4.333073,4.235118,3.462181,3.922126,3.521625,4.002359,4.976073,3.455142,4.190004,4.388855,5.743599,4.202741
7,7.322439,5.600375,7.151128,7.306892,5.043459,4.935871,,,,,,6.226694
8,4.579963,5.703129,4.105885,3.958545,4.882026,4.071086,4.284735,3.894476,4.40857,4.920531,5.165588,4.543139
9,3.273014,4.246264,4.21624,3.988293,2.674942,3.271141,3.794066,3.225328,4.287265,5.800051,7.848396,4.238636
10,6.968164,6.552689,6.237467,7.800804,6.896465,5.535828,6.003906,3.915703,7.151181,6.049575,9.277217,6.580818


In [17]:
# The models were fitted on plain NumPy arrays, so XGBoost reports importances
# under positional names ('f0', 'f1', ...). The one-hot categorical columns come
# first in the feature matrix, so the numeric features start at offset cat_init.
cat_init = sum(COLUMNS_X_CAT.values())
fnum = [f'f{i}' for i in range(cat_init, cat_init + len(COLUMNS_X_NUM))]

# One row per fitted fold model, one column per numeric feature. reindex() is used
# instead of plain column selection because get_score() omits features that a model
# never split on; a feature unused by every model would otherwise raise KeyError.
# Such features now show up as NaN.
feature_importances_df = pd.DataFrame(feature_importances).reindex(columns=fnum)
feature_importances_df.columns = COLUMNS_X_NUM
feature_importances_df = feature_importances_df.T
feature_importances_df['mean'] = feature_importances_df.mean(axis=1)

# Rank features by mean gain once and reuse the result for display and printing.
feature_importances_ranked = feature_importances_df[['mean']].sort_values('mean', ascending=False)
display(feature_importances_ranked)
print(feature_importances_ranked[:12].index.to_list())
feature_importances_df.to_csv(f'{train_path}/feature_importances_df.csv')

Unnamed: 0,mean
power_log1p_stdd_mean,11.225888
power_log1p_stdd_shift,9.112626
power_log1p_stdd_thisweek_mean_shift,3.538538
cos_hour,2.691489
THI_mean,2.21936
temperature_squared_mean,1.367848
THI,1.304061
sin_hour,1.113261
power_log1p_stdd_cumweek_mean_shift,0.710931
temperature_squared,0.629741


['power_log1p_stdd_mean', 'power_log1p_stdd_shift', 'power_log1p_stdd_thisweek_mean_shift', 'cos_hour', 'THI_mean', 'temperature_squared_mean', 'THI', 'sin_hour', 'power_log1p_stdd_cumweek_mean_shift', 'temperature_squared', 'humidity_squared']
