# How do different types of models perform on our sample data?

Let's start by looking just at the first meter (Electricity)

In [1]:
import pandas as pd

In [2]:
from tsm.evaluators import k_fold_validator

In [3]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [4]:
import joblib

In [12]:
data = pd.read_pickle('data/prep/train_meter_0.pkl')

In [6]:
SEED = 42

We need to exclude some features from actual modelling

In [19]:
exclude = ['timestamp', 'suspicious_day', 'suspicious_month']
y_col = ['log_meter_reading']
x_cols = [x for x in list(data) if x not in exclude + y_col]

Default choice: **no using** suspicious

In [13]:
# Meter 0: if suspcious_day → remove
data.day_suspicious.value_counts(normalize=True)

0    0.923804
1    0.076196
Name: day_suspicious, dtype: float64

In [18]:
data = data[data.day_suspicious == False].reset_index(drop=True)

## Baseline: DummyRegressor

In [20]:
from sklearn.dummy import DummyRegressor

In [21]:
dummy = DummyRegressor("mean")

In [22]:
errors = []
for tr_idx, ts_idx in k_fold_validator(k=3, data=data.index.values, shuffle=True, random_state=SEED):
    
    x = data[x_cols].values
    y = data[y_col].values
    x_tr, x_ts, y_tr, y_ts = x[tr_idx], x[ts_idx], y[tr_idx], y[ts_idx]
    
    dummy.fit(x_tr, y_tr)
    rmsle = sqrt(mean_squared_error(y_ts, dummy.predict(x_ts)))
    print('Fold rmsle:', rmsle)
    errors.append(rmsle)

print('Overall rmsle:', sum(errors) / len(errors))

Fold rmsle: 1.4189585024174038
Fold rmsle: 1.4184159343497882
Fold rmsle: 1.4192611647343643
Overall rmsle: 1.418878533833852


## RandomForestRegressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
rfreg = RandomForestRegressor(n_estimators=7, max_depth=10, n_jobs=-1)

In [25]:
%%time
errors = []
for tr_idx, ts_idx in k_fold_validator(k=3, data=data.index.values, shuffle=True, random_state=SEED):
    
    x = data[x_cols].values
    y = data[y_col].values
    x_tr, x_ts, y_tr, y_ts = x[tr_idx], x[ts_idx], y[tr_idx], y[ts_idx]
    
    rfreg.fit(x_tr, y_tr.ravel())
    rmsle = sqrt(mean_squared_error(y_ts, rfreg.predict(x_ts)))
    print('Fold rmsle:', rmsle)
    errors.append(rmsle)

print('Overall rmsle:', sum(errors) / len(errors))

Fold rmsle: 0.6144330476351351
Fold rmsle: 0.6134558442236404
Fold rmsle: 0.6153539690786224
Overall rmsle: 0.6144142869791326
CPU times: user 2h 2min 28s, sys: 10.2 s, total: 2h 2min 38s
Wall time: 18min 6s


## LightGBM

In [7]:
import lightgbm as lgb

In [8]:
lgb_reg_params = {'objective':'regression',  'boosting_type':'gbdt', 'metric':'rmse',
                  'n_jobs':-1, 'learning_rate':0.07, 'num_leaves': 2**8, 'max_depth':-1,
                  'tree_learner':'serial', 'colsample_bytree': 0.7, 'subsample_freq':1,
                  'subsample':0.5, 'max_bin': 255, 'verbose':1, 'seed': SEED}

In [None]:
%%time
errors = []
for tr_idx, ts_idx in k_fold_validator(k=3, data=data.index.values, shuffle=True, random_state=42):
    
    x = data[x_cols].values
    y = data[y_col].values
    x_tr, x_ts, y_tr, y_ts = x[tr_idx], x[ts_idx], y[tr_idx], y[ts_idx]
    
    
    lgb_train = lgb.Dataset(x_tr, y_tr.ravel())
    lgb_eval = lgb.Dataset(x_ts, y_ts.ravel())
    lgb_reg = lgb.train(lgb_reg_params, lgb_train, valid_sets=(lgb_train, lgb_eval),num_boost_round=1000,early_stopping_rounds=100,verbose_eval=10)
    rmsle = sqrt(mean_squared_error(y_ts, lgb_reg.predict(x_ts, num_iteration=lgb_reg.best_iteration)))
    print('Fold rmsle:', rmsle)
    errors.append(rmsle)

print('Overall rmsle:', sum(errors) / len(errors))

Training until validation scores don't improve for 100 rounds
[10]	training's rmse: 1.12948	valid_1's rmse: 1.13001
[20]	training's rmse: 0.81353	valid_1's rmse: 0.814139
[30]	training's rmse: 0.689982	valid_1's rmse: 0.690583
[40]	training's rmse: 0.606731	valid_1's rmse: 0.607398
[50]	training's rmse: 0.557475	valid_1's rmse: 0.558293
[60]	training's rmse: 0.522376	valid_1's rmse: 0.523213
[70]	training's rmse: 0.49405	valid_1's rmse: 0.495039
[80]	training's rmse: 0.473465	valid_1's rmse: 0.474588
[90]	training's rmse: 0.451841	valid_1's rmse: 0.453155
[100]	training's rmse: 0.436818	valid_1's rmse: 0.438296
[110]	training's rmse: 0.424357	valid_1's rmse: 0.426011
[120]	training's rmse: 0.413872	valid_1's rmse: 0.415652
[130]	training's rmse: 0.404167	valid_1's rmse: 0.406205
[140]	training's rmse: 0.395855	valid_1's rmse: 0.39796
[150]	training's rmse: 0.388486	valid_1's rmse: 0.390745
[160]	training's rmse: 0.381966	valid_1's rmse: 0.384341
[170]	training's rmse: 0.375417	valid_1'

In [None]:
joblib.dump(lgb_reg, 'objects/lgb_reg_met0_rmsle_{}.pkl'.format(sum(errors) / len(errors)))

## RandomForestRegressor

In [8]:
import xgboost as xgb

In [9]:
%%time
errors = []
for tr_idx, ts_idx in k_fold_validator(k=3, data=data.index.values, shuffle=True, random_state=SEED):
    
    x = data[x_cols].values
    y = data[y_col].values
    x_tr, x_ts, y_tr, y_ts = x[tr_idx], x[ts_idx], y[tr_idx], y[ts_idx]
    
    dtrain = xgb.DMatrix(x_tr, label=y_tr)
    dtest = xgb.DMatrix(x_ts, label=y_ts)
    num_round = 10
    param = {}
    param = {'max_depth': 10, 'eta': 0.1, 'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)
    
    rmsle = sqrt(mean_squared_error(y_ts, bst.predict(dtest, ntree_limit=bst.best_ntree_limit)))
    print('Fold rmsle:', rmsle)
    errors.append(rmsle)

print('Overall rmsle:', sum(errors) / len(errors))

[0]	eval-rmse:3.50534	train-rmse:3.50419
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 10 rounds.
[1]	eval-rmse:3.17519	train-rmse:3.17394
[2]	eval-rmse:2.88094	train-rmse:2.87974
[3]	eval-rmse:2.61653	train-rmse:2.61559
[4]	eval-rmse:2.3811	train-rmse:2.38012
[5]	eval-rmse:2.17181	train-rmse:2.17077
[6]	eval-rmse:1.98368	train-rmse:1.98284
[7]	eval-rmse:1.81761	train-rmse:1.81672
[8]	eval-rmse:1.66999	train-rmse:1.66908
[9]	eval-rmse:1.53877	train-rmse:1.53789
Fold rmsle: 1.5388655173404837


KeyboardInterrupt: 