In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tsm.data_selector import data_subset_by_dict
import matplotlib.pyplot as plt
import numpy as np

In [3]:
train_data = pd.read_pickle('kaggle/input/ashrae-energy-prediction/train.pkl')

In [12]:
train_data.primary_use.value_counts()

0     8165504
6     4394864
1     2264917
4     2146413
9     1662826
3      398527
7      242222
8      213796
5      125713
2      114090
11     112657
15     111861
12      96519
13      77627
14      56203
10      32361
Name: primary_use, dtype: int64

## Level 5A)
* 1 primary use
* 1 meter reading - meter 0
* Site ID

In [13]:
level_5a = {'primary_use': 1, 'meter': 0}
level_5a_cols = ['building_id', 'site_id', 'meter_reading', 'dt_m', 'dt_w', 'dt_d', 'dt_hour', 
                 'dt_day_week', 'dt_day_month', 'dt_week_month']

In [14]:
data_5a = data_subset_by_dict(train_data, level_5a, level_5a_cols)

In [15]:
data_5a.head()

Unnamed: 0,building_id,site_id,meter_reading,dt_m,dt_w,dt_d,dt_hour,dt_day_week,dt_day_month,dt_week_month
10,10,0,0.0,1,53,1,0,4,1,1
39,40,0,0.0,1,53,1,0,4,1,1
57,59,0,0.0,1,53,1,0,4,1,1
85,87,0,0.0,1,53,1,0,4,1,1
86,88,0,0.0,1,53,1,0,4,1,1


In [16]:
len(set(data_5a.site_id))

14

In [17]:
len(set(data_5a.building_id))

179

In [18]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [19]:
from tsm.eval_metrics import root_mean_squared_log_error

In [20]:
SEED = 7

In [21]:
lgb_reg_params = {'objective':'regression',  'boosting_type':'gbdt', 'metric':'rmse',
                  'n_jobs':-1, 'learning_rate':0.07, 'num_leaves': 2**8, 'max_depth':-1,
                  'tree_learner':'serial', 'colsample_bytree': 0.7, 'subsample_freq':1,
                  'subsample':0.5, 'max_bin':255, 'verbose':1, 'seed': SEED, 'early_stopping_rounds': 40} 

In [22]:
# Cross Validation
from sklearn.model_selection import KFold

In [23]:
K = 5
kf = KFold(n_splits=K)

In [25]:
feat_cols = ['building_id', 'site_id', 'dt_m', 'dt_w', 'dt_d', 'dt_hour', 
             'dt_day_week', 'dt_day_month', 'dt_week_month']

In [26]:
nrmlses = []
for tr_ix, ts_ix in kf.split(data_5a):
    data_5a1_tr_x = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:][feat_cols]
    data_5a1_tr_y = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_5a1_val_x = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:][feat_cols]
    data_5a1_val_y = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_5a1_tr_x, data_5a1_tr_y)
    lgb_eval = lgb.Dataset(data_5a1_val_x, data_5a1_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_5a1_ts_x = data_5a.iloc[ts_ix,:][feat_cols]
    data_5a1_ts_y = data_5a.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_5a1_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_5a1_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)



Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 104.504	valid_1's rmse: 104.533
[40]	training's rmse: 64.4752	valid_1's rmse: 68.0111
[60]	training's rmse: 55.5058	valid_1's rmse: 62.8308
[80]	training's rmse: 50.7556	valid_1's rmse: 62.0566
[100]	training's rmse: 48.7052	valid_1's rmse: 62.3951
Early stopping, best iteration is:
[76]	training's rmse: 51.24	valid_1's rmse: 61.8567


  return np.sqrt(np.sum(np.power(np.log(predicted + 1) - np.log(actual + 1), 2)) / len(actual))


Iteration done, nrmlse: 0.9147511177005977
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 107.267	valid_1's rmse: 107.079
[40]	training's rmse: 67.9961	valid_1's rmse: 68.3983
[60]	training's rmse: 59.0902	valid_1's rmse: 62.771
[80]	training's rmse: 54.3492	valid_1's rmse: 61.198
[100]	training's rmse: 52.2522	valid_1's rmse: 61.557
[120]	training's rmse: 50.7028	valid_1's rmse: 61.864
Early stopping, best iteration is:
[80]	training's rmse: 54.3492	valid_1's rmse: 61.198
Iteration done, nrmlse: 1.0905780289853297
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 103.899	valid_1's rmse: 106.688
[40]	training's rmse: 67.0821	valid_1's rmse: 68.2716
[60]	training's rmse: 58.8813	valid_1's rmse: 62.8168
[80]	training's rmse: 54.7206	valid_1's rmse: 61.5554
[100]	training's rmse: 52.8112	valid_1's rmse: 62.0648
[120]	training's rmse: 51.3197	valid_1's rmse: 62.2418
Early stopping, best iteration is:
[80]	training's rmse:

In [38]:
feat_cols = ['building_id', 'site_id_mean', 'dt_m', 'dt_w', 'dt_d', 'dt_hour', 
             'dt_day_week', 'dt_day_month', 'dt_week_month']

In [39]:
# Simple building_id mean
nrmlses = []
for tr_ix, ts_ix in kf.split(data_5a):
    full_training_data = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:]
    site_id_means = full_training_data.groupby('site_id')['meter_reading'].mean().to_dict()
    full_training_data['site_id_mean'] = full_training_data.site_id.apply(lambda x: site_id_means[x])
    data_5a2_tr_x = full_training_data[feat_cols]
    data_5a2_tr_y = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    data_5a2_val_x = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:]
    data_5a2_val_x['site_id_mean'] = data_5a2_val_x.site_id.apply(lambda x: site_id_means[x])
    data_5a2_val_x = data_5a2_val_x[feat_cols]
    data_5a2_val_y = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    lgb_train = lgb.Dataset(data_5a2_tr_x, data_5a2_tr_y)
    lgb_eval = lgb.Dataset(data_5a2_val_x, data_5a2_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    data_5a2_ts_x = data_5a.iloc[ts_ix,:]
    data_5a2_ts_x['site_id_mean'] = data_5a2_ts_x.site_id.apply(lambda x: site_id_means[x])
    data_5a2_ts_x = data_5a2_ts_x[feat_cols]
    data_5a2_ts_y = data_5a.iloc[ts_ix,:].meter_reading
    lgbm_hat = (gbm_regress.predict(data_5a2_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_5a2_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 104.526	valid_1's rmse: 104.565
[40]	training's rmse: 64.5215	valid_1's rmse: 67.9205
[60]	training's rmse: 55.5031	valid_1's rmse: 63.0459
[80]	training's rmse: 50.6891	valid_1's rmse: 62.1984
[100]	training's rmse: 48.5753	valid_1's rmse: 62.8547
Early stopping, best iteration is:
[74]	training's rmse: 51.5427	valid_1's rmse: 62.1577


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Iteration done, nrmlse: 0.9337120409527903
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 107.32	valid_1's rmse: 106.444
[40]	training's rmse: 68.0428	valid_1's rmse: 67.6736
[60]	training's rmse: 59.0753	valid_1's rmse: 62.2627
[80]	training's rmse: 54.3752	valid_1's rmse: 60.8162
[100]	training's rmse: 52.2662	valid_1's rmse: 61.1037
Early stopping, best iteration is:
[79]	training's rmse: 54.5537	valid_1's rmse: 60.7362
Iteration done, nrmlse: 1.1056979977728238
Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 103.905	valid_1's rmse: 106.59
[40]	training's rmse: 67.0124	valid_1's rmse: 68.4711
[60]	training's rmse: 58.8923	valid_1's rmse: 63.1222
[80]	training's rmse: 54.7208	valid_1's rmse: 62.0003
[100]	training's rmse: 52.8181	valid_1's rmse: 62.4144
[120]	training's rmse: 51.4515	valid_1's rmse: 62.5848
Early stopping, best iteration is:
[81]	training's rmse: 54.6571	valid_1's rmse: 61.9961
Iteration done, nrm

In [42]:
# Site_id mean by month
nrmlses = []
for tr_ix, ts_ix in kf.split(data_5a):
    # Building Means
    full_training_data = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:]
    site_id_means = full_training_data.groupby(['site_id', 'dt_m'], as_index=False)['meter_reading'].mean()
    sid_dicts = {}
    for sid in list(set(site_id_means.site_id)):
        sid_data = site_id_means[site_id_means.site_id == sid][['dt_m', 'meter_reading']].to_dict('records')
        sid_dicts[sid] = sid_data
    # Using Means
    sid_means_tr = pd.DataFrame(full_training_data.site_id.apply(lambda x: [x['meter_reading'] for x in sid_dicts[x]]).values.tolist(),
                      columns=['site_id_dt_m_{}_mean'.format(m) for m in [mm['dt_m'] for mm in sid_dicts[sid]]]).reset_index(drop=True)
    data_4a2_tr_x = full_training_data.reset_index(drop=True).merge(sid_means_tr, left_index=True, right_index=True)
    data_4a2_tr_x = data_4a2_tr_x[[x for x in list(data_4a2_tr_x) if x not in ['meter_reading', 'site_id']]].fillna(0)
    data_4a2_tr_y = data_5a.iloc[tr_ix[:int(len(tr_ix)*.9)],:].meter_reading
    
    data_4a2_val_x = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:]
    bid_means_val = pd.DataFrame(data_4a2_val_x.site_id.apply(lambda x: [x['meter_reading'] for x in sid_dicts[x]]).values.tolist(),
                  columns=['site_id_dt_m_{}_mean'.format(m) for m in [mm['dt_m'] for mm in sid_dicts[sid]]]).reset_index(drop=True)
    data_4a2_val_x = data_4a2_val_x.reset_index(drop=True).merge(bid_means_val, left_index=True, right_index=True)
    data_4a2_val_x = data_4a2_val_x[[x for x in list(data_4a2_val_x) if x not in ['meter_reading', 'site_id']]].fillna(0)
    data_4a2_val_y = data_5a.iloc[tr_ix[int(len(tr_ix)*.9):],:].meter_reading
    
    lgb_train = lgb.Dataset(data_4a2_tr_x, data_4a2_tr_y)
    lgb_eval = lgb.Dataset(data_4a2_val_x, data_4a2_val_y)
    gbm_regress = lgb.train(lgb_reg_params, lgb_train, num_boost_round=2000, valid_sets=(lgb_train, lgb_eval),
                            early_stopping_rounds=20, verbose_eval = 20)
    
    data_4a2_ts_x = data_5a.iloc[ts_ix,:]
    bid_means_ts = pd.DataFrame(data_4a2_ts_x.site_id.apply(lambda x: [x['meter_reading'] for x in sid_dicts[x]]).values.tolist(),
                  columns=['site_id_dt_m_{}_mean'.format(m) for m in [mm['dt_m'] for mm in sid_dicts[sid]]]).reset_index(drop=True)
    data_4a2_ts_x = data_4a2_ts_x.reset_index(drop=True).merge(bid_means_ts, left_index=True, right_index=True)
    data_4a2_ts_x = data_4a2_ts_x[[x for x in list(data_4a2_ts_x) if x not in ['meter_reading', 'site_id']]].fillna(0)
    data_4a2_ts_y = data_5a.iloc[ts_ix,:].meter_reading
    
    lgbm_hat = (gbm_regress.predict(data_4a2_ts_x, num_iteration=gbm_regress.best_iteration))
    nrmlse = root_mean_squared_log_error(lgbm_hat, data_4a2_ts_y)
    print('Iteration done, nrmlse:', nrmlse)
    nrmlses.append(nrmlse)
print('FinalScore:', sum(nrmlses) / K)

Training until validation scores don't improve for 40 rounds
[20]	training's rmse: 92.2488	valid_1's rmse: 91.4311
[40]	training's rmse: 59.4937	valid_1's rmse: 64.2938
[60]	training's rmse: 52.2074	valid_1's rmse: 62.4267
[80]	training's rmse: 48.7975	valid_1's rmse: 63.0233
Early stopping, best iteration is:
[56]	training's rmse: 53.0344	valid_1's rmse: 62.3055
Iteration done, nrmlse: 0.9871468957185453


AssertionError: 10 columns passed, passed data had 11 columns