In [1]:
import sys
import gc
import time
import argparse
import numpy as np
import pandas as pd
import h5py
from datetime import datetime
from tsforest import forecaster
from tsforest.metrics import compute_rmse, compute_rmsle
from utils import reduce_mem_usage
from config import get_model_params
from scaling import target_transform, target_inverse_transform
from precompute import precompute_model, precompute_models
import optuna
import copy

In [2]:
# Calendar-derived feature names that are withheld from the model to avoid
# data leakage (passed to the forecaster as `exclude_features`).
EXCLUDE_FEATURES = [
    "year",
    "quarter",
    "month",
    "days_in_month",
    "year_week",
    "year_day",
    "month_day",
    "year_day_cos",
    "year_day_sin",
    "year_week_cos",
    "year_week_sin",
    "month_cos",
    "month_sin",
    "month_progress",
]

# Forecaster class names this notebook accepts; the chosen name is looked up
# as an attribute of `tsforest.forecaster`.
AVAILABLE_CLASSES = [
    "CatBoostForecaster",
    "LightGBMForecaster",
    "XGBoostForecaster",
    "H2OGBMForecaster",
]

In [3]:
# Name of the forecaster class to use; must be one of AVAILABLE_CLASSES.
model_class_name = "LightGBMForecaster"
if model_class_name not in AVAILABLE_CLASSES:
    # Fail loudly: a bare `sys.exit()` raises SystemExit with a *success*
    # status, which hides the misconfiguration when the notebook is executed
    # non-interactively. A ValueError stops execution with a clear error.
    raise ValueError(f"{model_class_name} is not a valid model class.")
# Resolve the class object from the `tsforest.forecaster` module by name.
model_class = getattr(forecaster, model_class_name)

***

In [4]:
# Site and meter type used to filter both the training and the leak data.
site_id, meter = 0, 0

In [5]:
# Training readings for the selected site/meter, with the timestamp and target
# columns renamed to `ds` / `y`.
train_data = (
    pd.read_hdf('../data/train_data.h5', 'train_data')
    .query(f"site_id == {site_id} & meter == {meter}")
    .rename(columns={"timestamp": "ds", "meter_reading": "y"})
)

# Leaked readings for the same site/meter: memory-reduced, restricted to
# 2017-01-01 onwards, and renamed to the same `ds` / `y` convention.
leak_data = (
    pd.read_feather("../data/leakage.feather")
    .query(f"site_id == {site_id} & meter == {meter}")
    .pipe(reduce_mem_usage)
    .query("timestamp >= '2017-01-01 00:00:00'")
    .rename(columns={"timestamp": "ds", "meter_reading": "y"})
)

Mem. usage decreased to 96.98 Mb (58.7% reduction)


In [6]:
# Distinct building ids present in the training rows, in ascending order.
np.sort(train_data["building_id"].unique())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104])

In [7]:
# Distinct building ids present in the leak rows, in ascending order
# (to be compared by eye with the training ids listed in the previous cell).
np.sort(leak_data["building_id"].unique())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104])

In [8]:
# Stack the leak rows (restricted to the training columns, in the same order)
# below the training rows and renumber the index from 0.
# NOTE: `train_data` is reassigned from itself, so re-running this cell without
# reloading the data appends the leak rows a second time.
train_data = pd.concat(
    [train_data, leak_data[train_data.columns]],
    ignore_index=True,
)

In [9]:
# Every column of the combined frame except the target `y`.
# (The validation index is derived in a later cell, after the building subset
# has been taken and the index has been reset.)
predict_columns = [column for column in train_data.columns if column != "y"]

In [10]:
# Model the target on a log1p scale.
# NOTE: this overwrites `y` in place; running the cell twice applies the
# transform twice.
train_data["y"] = np.log1p(train_data["y"].to_numpy())

***
Keep only a subset of the buildings (the first half of the unique building ids).

In [11]:
# Distinct building ids in order of first appearance; keep the first half.
buildings = train_data["building_id"].unique()
n_buildings = len(buildings)
buildings = buildings[: n_buildings // 2]

# Restrict the frame to those buildings and renumber the rows from 0, so that
# the validation index below refers to positions in the subsetted frame.
train_data = train_data[train_data["building_id"].isin(buildings)].reset_index(drop=True)

# Rows from 2017-05-21 onwards form the validation set.
valid_index = train_data.query("ds >= '2017-05-21 00:00:00'").index

***
Sanity checks: the training period and the leak period should contain the same buildings and meters.

In [13]:
# The leak rows were filtered to 2017 onwards when loaded, so the
# 2016-12-31 23:00 cutoff is used here to tell the two periods apart.
buildings_train = train_data.query("ds <= '2016-12-31 23:00:00'")["building_id"].unique()
buildings_leak = train_data.query("ds > '2016-12-31 23:00:00'")["building_id"].unique()

# Both set differences are expected to be empty.
print(f"Building in train_data but not in leak_data: {set(buildings_train).difference(buildings_leak)}")
print(f"Building in leak_data but not in train_data: {set(buildings_leak).difference(buildings_train)}")

Building in train_data but not in leak_data: set()
Building in leak_data but not in train_data: set()


In [14]:
# Meter types present on each side of the 2016-12-31 23:00 cutoff; both
# periods should list the same meter(s).
meters_train = train_data.query("ds <= '2016-12-31 23:00:00'")["meter"].unique()
meters_leak = train_data.query("ds > '2016-12-31 23:00:00'")["meter"].unique()

print(f"Meters in train_data: {meters_train}")
print(f"Meters in leak_data: {meters_leak}")

Meters in train_data: [0]
Meters in leak_data: [0]


***

In [15]:
# Forecaster configuration: calendar feature sets only, the leakage-prone
# calendar features excluded, one series per building, and no detrending or
# target scaling.
model_kwargs = dict(
    feature_sets=['calendar', 'calendar_cyclical'],
    exclude_features=EXCLUDE_FEATURES,
    categorical_features=dict(building_id="default", primary_use="default"),
    ts_uid_columns=["building_id"],
    detrend=False,
    target_scaler=None,
)

# Build the reusable forecaster object from the data and the validation index.
# NOTE(review): the exact work done by `precompute_model` lives in the
# `precompute` module and is not visible here; the result exposes
# `train_features`, `valid_features` and `input_features`, used below.
precomputed_model = precompute_model(train_data, valid_index, model_class_name, model_kwargs)

Mem. usage decreased to 30.94 Mb (66.8% reduction)
Mem. usage decreased to 48.77 Mb (66.8% reduction)


***

In [16]:
# Overlap check: inner-join the train and validation features on their shared
# key columns. An empty result means no (ds, meter, building_id) row appears in
# both splits.
(
    precomputed_model.train_features[["ds", "meter", "building_id"]]
    .merge(
        precomputed_model.valid_features[["ds", "meter", "building_id"]],
        how="inner",
    )
)

Unnamed: 0,ds,meter,building_id


In [17]:
# Display the feature names recorded on the precomputed model, to confirm by
# eye that none of the names in EXCLUDE_FEATURES made it through.
precomputed_model.input_features

['week_day',
 'hour',
 'hour_cos',
 'hour_sin',
 'week_day_cos',
 'week_day_sin',
 'building_id',
 'meter',
 'site_id',
 'square_feet',
 'air_temperature',
 'dew_temperature',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'primary_use']

***

In [18]:
# Start from the project defaults for this model class, then override the
# hyperparameters tuned for this experiment.
model_params = get_model_params(model_class_name)
model_params.update({
    "early_stopping_rounds": 50,
    "num_leaves": 511,
    "learning_rate": 0.01,
    "lambda_l2": 0.,
    "feature_fraction": 0.7,
    "min_data_in_leaf": 5,
})
print(model_params)

{'boosting_type': 'gbrt', 'objective': 'regression', 'num_iterations': 1000, 'early_stopping_rounds': 50, 'num_leaves': 511, 'min_data_in_leaf': 5, 'learning_rate': 0.01, 'feature_fraction': 0.7, 'lambda_l2': 0.0, 'verbosity': 1}


In [19]:
# Work on a deep copy so that `precomputed_model` itself is left unmodified
# and can be copied again for further runs.
fcaster = copy.deepcopy(precomputed_model)
# Attach the hyperparameters chosen above to the copy.
fcaster.set_params(model_params=model_params)

In [20]:
# Train the forecaster; `verbose_eval=10` is forwarded through `fit_kwargs`
# (the captured log below reports the validation score every 10 rounds).
fcaster.fit(fit_kwargs=dict(verbose_eval=10))

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's l2: 1.06238
[20]	valid_0's l2: 0.873228
[30]	valid_0's l2: 0.720302
[40]	valid_0's l2: 0.595575
[50]	valid_0's l2: 0.494659
[60]	valid_0's l2: 0.41316
[70]	valid_0's l2: 0.355759
[80]	valid_0's l2: 0.313006
[90]	valid_0's l2: 0.269327
[100]	valid_0's l2: 0.236167
[110]	valid_0's l2: 0.211384
[120]	valid_0's l2: 0.184159
[130]	valid_0's l2: 0.162353
[140]	valid_0's l2: 0.145007
[150]	valid_0's l2: 0.131148
[160]	valid_0's l2: 0.120068
[170]	valid_0's l2: 0.111839
[180]	valid_0's l2: 0.104779
[190]	valid_0's l2: 0.0991488
[200]	valid_0's l2: 0.0958384
[210]	valid_0's l2: 0.0922484
[220]	valid_0's l2: 0.0899045
[230]	valid_0's l2: 0.0877741
[240]	valid_0's l2: 0.0862289
[250]	valid_0's l2: 0.0850406
[260]	valid_0's l2: 0.0842232
[270]	valid_0's l2: 0.0837593
[280]	valid_0's l2: 0.0833434
[290]	valid_0's l2: 0.0831058
[300]	valid_0's l2: 0.082971
[310]	valid_0's l2: 0.0829398
[320]	valid_0's l2: 0.0829483
[330]	va

In [21]:
# Square root of the best validation l2 score, i.e. the validation RMSE.
# The target was log1p-transformed earlier, so this error is on the log1p scale.
fcaster.model.model.best_score["valid_0"]["l2"] ** 0.5

0.28798354192000836

***