In [1]:
import sys
import gc
import time
import argparse
import numpy as np
import pandas as pd
import h5py
from datetime import datetime
from tsforest import forecaster
from tsforest.metrics import compute_rmse, compute_rmsle
from utils import reduce_mem_usage
from config import get_model_params
from scaling import target_transform, target_inverse_transform
from precompute import precompute_model, precompute_models
import optuna
import copy

In [2]:
# excluded features to avoid data leakage
EXCLUDE_FEATURES = ["year","days_in_month","year_day",
                    "month_day","year_day_cos","year_day_sin"]

AVAILABLE_CLASSES = ["CatBoostForecaster",
                     "LightGBMForecaster",
                     "XGBoostForecaster",
                     "H2OGBMForecaster"]

In [3]:
model_class_name = "LightGBMForecaster"
if model_class_name not in AVAILABLE_CLASSES:
    print(f"{model_class_name} is not a valid model class.")
    sys.exit()
model_class = getattr(forecaster, model_class_name)

***

In [93]:
(leak.loc[:, ["site_id","meter"]]
 .drop_duplicates()
 .groupby("site_id")
 .count())

Unnamed: 0_level_0,meter
site_id,Unnamed: 1_level_1
0,2
1,1
2,3
4,1
15,3


In [94]:
leak.query("site_id == 0").meter.unique()

array([0, 1])

In [95]:
leak.query("site_id == 1").meter.unique()

array([0])

In [96]:
leak.query("site_id == 2").meter.unique()

array([0, 1, 3])

In [97]:
leak.query("site_id == 4").meter.unique()

array([0])

In [98]:
leak.query("site_id == 15").meter.unique()

array([0, 2, 1])

***

In [232]:
site_id = 1
meter = 0

In [233]:
train_data = (pd.read_hdf('../data/train_data.h5', 'train_data')
              .query(f"site_id == {site_id} & meter == {meter}"))
train_data.rename({"timestamp":"ds", "meter_reading":"y"}, axis=1, inplace=True)
leak_data = (pd.read_feather("../data/leakage.feather")
             .query(f"site_id == {site_id} & meter == {meter}")
             .pipe(reduce_mem_usage)
             .query("timestamp >= '2017-01-01 00:00:00'"))
leak_data.rename({"timestamp":"ds", "meter_reading":"y"}, axis=1, inplace=True)

Mem. usage decreased to 55.19 Mb (57.7% reduction)


In [234]:
train_data = (pd.concat([train_data, leak_data.loc[:, train_data.columns]])
              .reset_index(drop=True))

In [235]:
valid_index = train_data.query("ds >= '2017-01-01 00:00:00'").index
predict_columns = [feat for feat in train_data.columns if feat!="y"]

In [236]:
train_data["y"] = np.log1p(train_data["y"].values)

***
some validations

In [237]:
diff = set(fcaster.train_features.building_id.unique()) - set(fcaster.valid_features.building_id.unique()) 
print(f"Building not present in train_data: {diff}")

Building not present in train_data: {245}


In [238]:
print(f"Meters in train_data: {train_data.meter.unique()}")
print(f"Meters in leak_data: {leak_data.meter.unique()}")

Meters in train_data: [0]
Meters in leak_data: [0]


***

In [239]:
model_kwargs = {"feature_sets":['calendar', 'calendar_cyclical'],
                "exclude_features":EXCLUDE_FEATURES,
                "categorical_features":{"building_id":"default",
                                        "meter":"default",
                                        "site_id":"default",
                                        "primary_use":"default"},
                "ts_uid_columns":["building_id","meter"],
                "detrend":False,
                "target_scaler":None}
precomputed_model = precompute_model(train_data, valid_index, model_class_name, model_kwargs)

Mem. usage decreased to 30.76 Mb (66.4% reduction)
Mem. usage decreased to 60.15 Mb (66.4% reduction)


***

In [240]:
pd.merge(precomputed_model.train_features.loc[:, ["ds","meter","building_id"]],
         precomputed_model.valid_features.loc[:, ["ds","meter","building_id"]],
         how="inner")

Unnamed: 0,ds,meter,building_id


***

In [288]:
model_params = get_model_params(model_class_name)
model_params["early_stopping_rounds"] = 50
model_params['num_leaves'] = 8
model_params["learning_rate"] = 0.005
model_params["lambda_l2"] = 3.
model_params["feature_fraction"] = 0.9
print(model_params)

{'boosting_type': 'gbrt', 'objective': 'regression', 'num_iterations': 1000, 'early_stopping_rounds': 50, 'num_leaves': 8, 'min_data_in_leaf': 20, 'learning_rate': 0.005, 'feature_fraction': 0.9, 'lambda_l2': 3.0, 'verbosity': 1}


In [289]:
fcaster = copy.deepcopy(precomputed_model)
fcaster.set_params(model_params=model_params)

In [290]:
fcaster.fit(fit_kwargs={"verbose_eval":10})

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's l2: 1.56429
[20]	valid_0's l2: 1.47448
[30]	valid_0's l2: 1.39852
[40]	valid_0's l2: 1.32792
[50]	valid_0's l2: 1.26463
[60]	valid_0's l2: 1.21019
[70]	valid_0's l2: 1.16819
[80]	valid_0's l2: 1.12327
[90]	valid_0's l2: 1.0843
[100]	valid_0's l2: 1.04874
[110]	valid_0's l2: 1.01721
[120]	valid_0's l2: 0.989444
[130]	valid_0's l2: 0.964925
[140]	valid_0's l2: 0.943894
[150]	valid_0's l2: 0.925319
[160]	valid_0's l2: 0.909375
[170]	valid_0's l2: 0.895114
[180]	valid_0's l2: 0.882598
[190]	valid_0's l2: 0.871903
[200]	valid_0's l2: 0.862691
[210]	valid_0's l2: 0.85492
[220]	valid_0's l2: 0.848656
[230]	valid_0's l2: 0.843092
[240]	valid_0's l2: 0.838615
[250]	valid_0's l2: 0.833704
[260]	valid_0's l2: 0.829894
[270]	valid_0's l2: 0.826458
[280]	valid_0's l2: 0.8241
[290]	valid_0's l2: 0.822672
[300]	valid_0's l2: 0.821641
[310]	valid_0's l2: 0.820123
[320]	valid_0's l2: 0.819949
[330]	valid_0's l2: 0.819464
[340]

In [291]:
y_pred = (fcaster.predict(leak_data.loc[:, predict_columns])).y_pred.values
y_real = train_data.loc[valid_index, "y"].values

In [292]:
def compute_rmse(yreal, ypred):
    return np.sqrt(np.mean((yreal-ypred)**2))

In [293]:
compute_rmse(y_real, y_pred)

0.9043525789823939

In [294]:
0.844813**0.5

0.919137095323652

***