In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
from tsm.data_selector import data_subset_by_dict
import matplotlib.pyplot as plt
import numpy as np

In [4]:
train_data = pd.read_pickle('kaggle/input/ashrae-energy-prediction/train.pkl')

## Level 6A)
* 1 use: 4\
* 1 meter reading - meter 0
* Weather features

In [34]:
level_6a = {'primary_use': 4, 'meter': 0}
level_6a_cols = ['building_id', 'meter_reading', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction','wind_speed']

In [35]:
data_6a = data_subset_by_dict(train_data, level_6a, level_6a_cols)

In [36]:
# Many buildings?
len(set(data_6a.building_id))

145

In [37]:
data_6a.meter_reading.ewm(com=0.8).mean().tail()

20216063     13.279575
20216064     11.124256
20216065     14.110780
20216090     92.243680
20216098    129.649967
Name: meter_reading, dtype: float64

In [18]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [19]:
from tsm.eval_metrics import root_mean_squared_log_error

In [20]:
SEED = 7

In [21]:
lgb_reg_params = {'objective':'regression',  'boosting_type':'gbdt', 'metric':'rmse',
                  'n_jobs':-1, 'learning_rate':0.07, 'num_leaves': 2**8, 'max_depth':-1,
                  'tree_learner':'serial', 'colsample_bytree': 0.7, 'subsample_freq':1,
                  'subsample':0.5, 'max_bin':255, 'verbose':1, 'seed': SEED, 'early_stopping_rounds': 40} 

In [22]:
# Cross Validation
from sklearn.model_selection import KFold

In [23]:
K = 5
kf = KFold(n_splits=K)

In [50]:
feat_cols = [x for x in level_6a_cols if x != 'meter_reading']

In [51]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    data_6a1_tr_x = data_6a.iloc[tr_ix[:int(len(tr_ix)*.9)],:][feat_cols]
    data_6a1_tr_y = data_6a.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a1_val_x = data_6a.iloc[tr_ix[int(len(tr_ix)*.9):],:][feat_cols]
    data_6a1_val_y = data_6a.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a1_tr_x, data_6a1_tr_y)
    lgb_eval = lgb.Dataset(data_6a1_val_x, data_6a1_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a1_ts_x = data_6a.iloc[ts_ix,:][feat_cols]
    data_6a1_ts_y = data_6a.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a1_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a1_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 82.8401	valid_1's rmse: 89.9542
[40]	training's rmse: 60.1001	valid_1's rmse: 82.9771
[60]	training's rmse: 54.329	valid_1's rmse: 84.0334
Early stopping, best iteration is:
[35]	training's rmse: 62.6793	valid_1's rmse: 82.6418
Iteration done, nrmlse: 2.161364548450659
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 77.6344	valid_1's rmse: 91.7351
[40]	training's rmse: 53.4194	valid_1's rmse: 85.1312
[60]	training's rmse: 47.5567	valid_1's rmse: 86.3733
Early stopping, best iteration is:
[39]	training's rmse: 53.5883	valid_1's rmse: 85.0971
Iteration done, nrmlse: 2.0400296826390014
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 81.5901	valid_1's rmse: 102.236
[40]	training's rmse: 60.1182	valid_1's rmse: 98.8657
[60]	training's rmse: 53.9892	valid_1's rmse: 100.588
Early stopping, best iteration is:
[33]	training's rmse: 64.3432	vali

In [42]:
from tsm.data_selector import get_series_past_k_lags

In [65]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    k_air = get_series_past_k_lags(data_6a['air_temperature'], k=[3 * (2**x) for x in range(6)]).fillna(99).reset_index(drop=True)
    k_dew = get_series_past_k_lags(data_6a['dew_temperature'], k=[3 * (2**x) for x in range(6)]).fillna(99).reset_index(drop=True)
    data_6a2 = pd.concat([data_6a.reset_index(drop=True), k_air, k_dew], axis=1, sort=False)
    data_6a2_tr_x = data_6a2.iloc[tr_ix[:int(len(tr_ix)*.9)],:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_tr_y = data_6a2.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a2_val_x = data_6a2.iloc[tr_ix[int(len(tr_ix)*.9):],:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_val_y = data_6a2.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a2_tr_x, data_6a2_tr_y)
    lgb_eval = lgb.Dataset(data_6a2_val_x, data_6a2_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a2_ts_x = data_6a2.iloc[ts_ix,:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_ts_y = data_6a2.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a2_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a2_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 72.4483	valid_1's rmse: 98.4111
[40]	training's rmse: 51.2874	valid_1's rmse: 97.2414
[60]	training's rmse: 45.8293	valid_1's rmse: 97.1034
Early stopping, best iteration is:
[30]	training's rmse: 58.4144	valid_1's rmse: 96.5205
Iteration done, nrmlse: 2.110309949385576
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 67.1509	valid_1's rmse: 100.904
[40]	training's rmse: 45.5705	valid_1's rmse: 100.702
[60]	training's rmse: 40.5828	valid_1's rmse: 103.486
Early stopping, best iteration is:
[28]	training's rmse: 53.9049	valid_1's rmse: 99.881
Iteration done, nrmlse: 2.116670609552658
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 72.3268	valid_1's rmse: 110.005
[40]	training's rmse: 52.2432	valid_1's rmse: 110.743
[60]	training's rmse: 46.3092	valid_1's rmse: 112.64
Early stopping, best iteration is:
[24]	training's rmse: 64.9345	valid_

In [66]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    k_air_e_001 = data_6a.air_temperature.ewm(alpha=0.01).mean().reset_index(drop=True)
    k_air_e_001.name = 'k_air_e_001'
    k_dew_e_001 = data_6a.dew_temperature.ewm(alpha=0.01).mean().reset_index(drop=True)
    k_dew_e_001.name = 'k_dew_e_001'
    k_air_e_01 = data_6a.air_temperature.ewm(alpha=0.1).mean().reset_index(drop=True)
    k_air_e_01.name = 'k_air_e_01'
    k_dew_e_01 = data_6a.dew_temperature.ewm(alpha=0.1).mean().reset_index(drop=True)
    k_dew_e_01.name = 'k_dew_e_01'
    k_air_e_025 = data_6a.air_temperature.ewm(alpha=0.25).mean().reset_index(drop=True)
    k_air_e_025.name = 'k_air_e_025'
    k_dew_e_025 = data_6a.dew_temperature.ewm(alpha=0.25).mean().reset_index(drop=True)
    k_dew_e_025.name = 'k_dew_e_025'
    data_6a3 = pd.concat([data_6a.reset_index(drop=True), k_air_e_001,k_air_e_01, k_air_e_025, k_dew_e_001, k_dew_e_01, k_dew_e_025], axis=1, sort=False)
    data_6a3_tr_x = data_6a3.iloc[tr_ix[:int(len(tr_ix)*.9)],:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_tr_y = data_6a3.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a3_val_x = data_6a3.iloc[tr_ix[int(len(tr_ix)*.9):],:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_val_y = data_6a3.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a3_tr_x, data_6a3_tr_y)
    lgb_eval = lgb.Dataset(data_6a3_val_x, data_6a3_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a3_ts_x = data_6a3.iloc[ts_ix,:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_ts_y = data_6a3.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a3_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a3_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 69.9383	valid_1's rmse: 96.525
[40]	training's rmse: 53.1254	valid_1's rmse: 96.381
[60]	training's rmse: 47.3866	valid_1's rmse: 97.884
Early stopping, best iteration is:
[26]	training's rmse: 63.5542	valid_1's rmse: 95.4657
Iteration done, nrmlse: 2.1968590566300246
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 66.0385	valid_1's rmse: 103.795
[40]	training's rmse: 48.8384	valid_1's rmse: 104.64
[60]	training's rmse: 43.403	valid_1's rmse: 105.468
Early stopping, best iteration is:
[24]	training's rmse: 61.3371	valid_1's rmse: 103.63
Iteration done, nrmlse: 2.092771433241134
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 70.2691	valid_1's rmse: 111.016
[40]	training's rmse: 54.0296	valid_1's rmse: 112.089
[60]	training's rmse: 48.2066	valid_1's rmse: 113.027
Early stopping, best iteration is:
[22]	training's rmse: 68.1507	valid_1's

## Level 6B)
* 1 use: 0
* 1 meter reading - meter 1
* Weather features

In [67]:
level_6a = {'primary_use': 0, 'meter': 1}
level_6a_cols = ['building_id', 'site_id', 'meter_reading', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction','wind_speed']

In [68]:
data_6a = data_subset_by_dict(train_data, level_6a, level_6a_cols)

In [69]:
# Many buildings?
len(set(data_6a.building_id))

216

In [70]:
feat_cols = [x for x in level_6a_cols if x != 'meter_reading']

In [71]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    data_6a1_tr_x = data_6a.iloc[tr_ix[:int(len(tr_ix)*.9)],:][feat_cols]
    data_6a1_tr_y = data_6a.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a1_val_x = data_6a.iloc[tr_ix[int(len(tr_ix)*.9):],:][feat_cols]
    data_6a1_val_y = data_6a.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a1_tr_x, data_6a1_tr_y)
    lgb_eval = lgb.Dataset(data_6a1_val_x, data_6a1_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a1_ts_x = data_6a.iloc[ts_ix,:][feat_cols]
    data_6a1_ts_y = data_6a.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a1_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a1_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 3134.98	valid_1's rmse: 4950.83
[40]	training's rmse: 2718.94	valid_1's rmse: 4958.92
[60]	training's rmse: 2568.75	valid_1's rmse: 4981.97
Early stopping, best iteration is:
[21]	training's rmse: 3088.38	valid_1's rmse: 4950.28
Iteration done, nrmlse: 3.2945231936682275
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 3063.1	valid_1's rmse: 4973.14
[40]	training's rmse: 2675.39	valid_1's rmse: 4978
Early stopping, best iteration is:
[19]	training's rmse: 3119.61	valid_1's rmse: 4972.44
Iteration done, nrmlse: 2.8737188324442475
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 3067.99	valid_1's rmse: 4975.5
[40]	training's rmse: 2656	valid_1's rmse: 4985.36
Early stopping, best iteration is:
[19]	training's rmse: 3109.98	valid_1's rmse: 4974.4
Iteration done, nrmlse: 2.18080974095189
Training until validation scores don't improve for 40 

In [75]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    k_air = get_series_past_k_lags(data_6a['air_temperature'], k=[1, 2] + [3 * (2**x) for x in range(6)]).fillna(99).reset_index(drop=True)
    k_dew = get_series_past_k_lags(data_6a['dew_temperature'], k=[1, 2] + [3 * (2**x) for x in range(6)]).fillna(99).reset_index(drop=True)
    data_6a2 = pd.concat([data_6a.reset_index(drop=True), k_air, k_dew], axis=1, sort=False)
    data_6a2_tr_x = data_6a2.iloc[tr_ix[:int(len(tr_ix)*.9)],:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_tr_y = data_6a2.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a2_val_x = data_6a2.iloc[tr_ix[int(len(tr_ix)*.9):],:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_val_y = data_6a2.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a2_tr_x, data_6a2_tr_y)
    lgb_eval = lgb.Dataset(data_6a2_val_x, data_6a2_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a2_ts_x = data_6a2.iloc[ts_ix,:][[x for x in list(data_6a2) if x != 'meter_reading']]
    data_6a2_ts_y = data_6a2.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a2_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a2_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2677.69	valid_1's rmse: 4956.71
[40]	training's rmse: 2130.4	valid_1's rmse: 4925.67
[60]	training's rmse: 1832.96	valid_1's rmse: 4930.88
Early stopping, best iteration is:
[35]	training's rmse: 2211.2	valid_1's rmse: 4920.56
Iteration done, nrmlse: 2.891556871974376
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2621.63	valid_1's rmse: 4978.63
[40]	training's rmse: 2065.37	valid_1's rmse: 4984.67
[60]	training's rmse: 1758.02	valid_1's rmse: 4989.88
Early stopping, best iteration is:
[27]	training's rmse: 2357.99	valid_1's rmse: 4978.46
Iteration done, nrmlse: 2.689779236982014
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2505.84	valid_1's rmse: 4974.92
[40]	training's rmse: 1956.49	valid_1's rmse: 4973.17
[60]	training's rmse: 1646.98	valid_1's rmse: 4969.87
[80]	training's rmse: 1507.52	valid_1's rmse: 4969.2
[100]	training's r

In [78]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_6a):
    k_air = get_series_past_k_lags(data_6a['air_temperature'], k=[1, 2, 3, 6]).fillna(99).reset_index(drop=True)
    k_dew = get_series_past_k_lags(data_6a['dew_temperature'], k=[1, 2, 3, 6]).fillna(99).reset_index(drop=True)
    k_air_e_01 = data_6a.air_temperature.ewm(alpha=0.1).mean().reset_index(drop=True)
    k_air_e_01.name = 'k_air_e_01'
    k_dew_e_01 = data_6a.dew_temperature.ewm(alpha=0.1).mean().reset_index(drop=True)
    k_dew_e_01.name = 'k_dew_e_01'
    k_air_e_025 = data_6a.air_temperature.ewm(alpha=0.25).mean().reset_index(drop=True)
    k_air_e_025.name = 'k_air_e_025'
    k_dew_e_025 = data_6a.dew_temperature.ewm(alpha=0.25).mean().reset_index(drop=True)
    k_dew_e_025.name = 'k_dew_e_025'
    data_6a3 = pd.concat([data_6a.reset_index(drop=True), k_air, k_air_e_001,k_air_e_01, k_air_e_025, k_dew, k_dew_e_001, k_dew_e_01, k_dew_e_025], axis=1, sort=False)
    data_6a3_tr_x = data_6a3.iloc[tr_ix[:int(len(tr_ix)*.9)],:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_tr_y = data_6a3.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_6a3_val_x = data_6a3.iloc[tr_ix[int(len(tr_ix)*.9):],:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_val_y = data_6a3.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_6a3_tr_x, data_6a3_tr_y)
    lgb_eval = lgb.Dataset(data_6a3_val_x, data_6a3_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_6a3_ts_x = data_6a3.iloc[ts_ix,:][[x for x in list(data_6a3) if x != 'meter_reading']]
    data_6a3_ts_y = data_6a3.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_6a3_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_6a3_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2734.24	valid_1's rmse: 4978.18
[40]	training's rmse: 2110.4	valid_1's rmse: 4976.44
[60]	training's rmse: 1835.68	valid_1's rmse: 4990.21
Early stopping, best iteration is:
[34]	training's rmse: 2245.21	valid_1's rmse: 4974.43
Iteration done, nrmlse: 2.8958435500973945
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2676.11	valid_1's rmse: 4977.07
[40]	training's rmse: 2070.11	valid_1's rmse: 4974.36
[60]	training's rmse: 1795.88	valid_1's rmse: 4979.35
Early stopping, best iteration is:
[37]	training's rmse: 2132.68	valid_1's rmse: 4972.46
Iteration done, nrmlse: 2.451909444460404
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 2559.84	valid_1's rmse: 4976.97
[40]	training's rmse: 1907.09	valid_1's rmse: 4974.72
[60]	training's rmse: 1669.25	valid_1's rmse: 4976.74
Early stopping, best iteration is:
[28]	training's rmse: 2161.46	vali