In [1]:
import os
import sys
import time
import argparse
import numpy as np
import pandas as pd
import h5py
import copy
from datetime import datetime
from tsforest import forecaster
from tsforest.metrics import compute_rmse, compute_rmsle
from utils import reduce_mem_usage
from config import get_model_params
from scaling import target_transform, target_inverse_transform
from precompute import precompute_model, precompute_models
import optuna
from optuna.integration import LightGBMPruningCallback

In [2]:
# Forecaster class names this notebook accepts; the chosen name is later
# resolved with getattr on the imported `forecaster` module.
AVAILABLE_CLASSES = [
    "CatBoostForecaster",
    "LightGBMForecaster",
    "XGBoostForecaster",
    "H2OGBMForecaster",
]

In [3]:
# Generated feature names handed to the forecaster as `exclude_features`.
EXCLUDE_FEATURES = [
    "year",
    "quarter",
    "month",
    "days_in_month",
    "year_week",
    "year_day",
    "month_day",
    "year_day_cos",
    "year_day_sin",
    "year_week_cos",
    "year_week_sin",
    "month_cos",
    "month_sin",
    "month_progress",
]

In [4]:
# input params: the forecaster class to use, and the meter/site pair that
# selects which train/leak CSV files are read
model_class_name = "LightGBMForecaster"
meter, site = 0, 2

In [5]:
# Fail fast on an unsupported class name. Raising an exception (rather than
# print + bare sys.exit()) halts "Run All" with a real error and avoids
# reporting a configuration mistake as a successful exit (status 0).
if model_class_name not in AVAILABLE_CLASSES:
    raise ValueError(
        f"{model_class_name} is not a valid model class. "
        f"Expected one of: {', '.join(AVAILABLE_CLASSES)}."
    )
# Resolve the class object by name on the imported `forecaster` module.
model_class = getattr(forecaster, model_class_name)

In [6]:
# loading train data; `timestamp`/`meter_reading` are renamed to `ds`/`y`
train_data = (
    pd.read_csv(f"../mirrors/train_data_meter{meter}_site{site}.csv", parse_dates=["timestamp"])
    .rename(columns={"timestamp":"ds", "meter_reading":"y"})
)
# loading leak data: same renaming, keeping only rows from 2017-01-01 onwards
# (the filter runs before the rename because it refers to `timestamp`)
leak_data = (
    pd.read_csv(f"../mirrors/leak_data_meter{meter}_site{site}.csv", parse_dates=["timestamp"])
    .query("timestamp >= '2017-01-01 00:00:00'")
    .rename(columns={"timestamp":"ds", "meter_reading":"y"})
)

In [7]:
# number of distinct buildings in the training data (a missing id would count as one value)
train_data["building_id"].nunique(dropna=False)

135

In [8]:
# number of distinct buildings in the leak data (a missing id would count as one value)
leak_data["building_id"].nunique(dropna=False)

134

In [9]:
# merge of both datasets (currently disabled: leak_data is kept separate and
# is passed to fcaster.evaluate further down)
#train_data = (pd.concat([train_data, leak_data.loc[:, train_data.columns]])
#              .reset_index(drop=True))

In [11]:
# total number of training rows
train_data.shape[0]

1174568

In [12]:
# row count after dropping fully duplicated rows; matching the total row
# count means there are no exact duplicates
train_data.drop_duplicates().shape[0]

1174568

In [13]:
# Move building area and the target onto the log1p scale in both frames.
# NOTE: this overwrites the columns, so re-running the cell applies log1p twice.
for frame in (train_data, leak_data):
    for column in ("square_feet", "y"):
        frame[column] = np.log1p(frame[column].values)

In [14]:
# index for validation data
#valid_index = train_data.query("site_id != 0 & ds >= '2017-01-01 00:00:00'").index
#valid_index = valid_index.union(train_data.query("site_id == 0 & ds >= '2017-05-21 00:00:00'").index)

In [15]:
# drop the columns that are not used as model inputs, identically in both frames
for frame in (train_data, leak_data):
    frame.drop(columns=["site_id","meter"], inplace=True)
# every remaining column except the target `y`
predict_columns = list(train_data.columns[train_data.columns != "y"])

In [16]:
# schema check after the renames and column drops: dtypes, non-null counts, memory
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174568 entries, 0 to 1174567
Data columns (total 10 columns):
building_id           1174568 non-null int64
ds                    1174568 non-null datetime64[ns]
y                     1174568 non-null float64
primary_use           1174568 non-null object
square_feet           1174568 non-null float64
air_temperature       1174568 non-null float64
dew_temperature       1174568 non-null float64
sea_level_pressure    1174568 non-null float64
wind_direction        1174568 non-null float64
wind_speed            1174568 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 89.6+ MB


In [40]:
# summary statistics of the training target (already on the log1p scale)
train_data["y"].describe()

count    1.174568e+06
mean     4.296970e+00
std      1.290925e+00
min      9.950331e-03
25%      3.460723e+00
50%      4.480967e+00
75%      5.148889e+00
max      8.069208e+00
Name: y, dtype: float64

In [41]:
# summary statistics of the leak target (already on the log1p scale)
leak_data["y"].describe()

count    2.314564e+06
mean     4.278492e+00
std      1.282440e+00
min      9.950331e-03
25%      3.430433e+00
50%      4.429983e+00
75%      5.155846e+00
max      8.007567e+00
Name: y, dtype: float64

***

In [17]:
# keyword arguments handed to precompute_model for building the forecaster
model_kwargs = dict(
    feature_sets=['calendar', 'calendar_cyclical'],
    exclude_features=EXCLUDE_FEATURES,
    categorical_features=dict(building_id="default", primary_use="default"),
    ts_uid_columns=["building_id"],
    detrend=False,
    target_scaler=None,
)

In [18]:
# Build the forecaster once from train_data via the project helper
# precompute.precompute_model; the cells below inspect the resulting object.
# The second positional argument is None — presumably the validation frame
# (valid_features shows nothing below); TODO confirm against precompute_model.
precomputed_model = precompute_model(train_data, None, model_class_name, model_kwargs)

Mem. usage decreased to 73.93 Mb (73.4% reduction)


In [19]:
# schema of the feature frame stored on the precomputed model (dtypes, non-null counts)
precomputed_model.train_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1174568 entries, 0 to 1174567
Data columns (total 30 columns):
ds                    1174568 non-null datetime64[ns]
quarter               1174568 non-null int8
month                 1174568 non-null int8
days_in_month         1174568 non-null int8
year_week             1174568 non-null int8
year_day              1174568 non-null int16
month_day             1174568 non-null int8
week_day              1174568 non-null int8
hour                  1174568 non-null int8
month_progress        1174568 non-null float16
hour_cos              1174568 non-null float16
hour_sin              1174568 non-null float16
week_day_cos          1174568 non-null float16
week_day_sin          1174568 non-null float16
year_day_cos          1174568 non-null float16
year_day_sin          1174568 non-null float16
year_week_cos         1174568 non-null float16
year_week_sin         1174568 non-null float16
month_cos             1174568 non-null float16
month_sin 

In [20]:
# feature names exposed as `input_features`; in the recorded output none of
# the EXCLUDE_FEATURES names appear
precomputed_model.input_features

['week_day',
 'hour',
 'hour_cos',
 'hour_sin',
 'week_day_cos',
 'week_day_sin',
 'building_id',
 'square_feet',
 'air_temperature',
 'dew_temperature',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'primary_use']

In [21]:
# no output is recorded for this cell — consistent with valid_features being
# None because no validation data was supplied; TODO confirm
precomputed_model.valid_features

In [22]:
# model hyperparameters are still empty at this point; they are set on a copy
# via set_params further down
precomputed_model.model_params

{}

In [23]:
# confirms the categorical_features mapping from model_kwargs is stored on the model
precomputed_model.categorical_features

{'building_id': 'default', 'primary_use': 'default'}

***

In [42]:
# Start from the project defaults for this model class, layer the chosen
# hyperparameters on top, and set learning_rate last so it overrides both.
default_model_params = get_model_params(model_class_name)
best_model_params = dict(
    num_leaves=32,
    min_data_in_leaf=5,
    feature_fraction=0.7,
    lambda_l2=3.0,
    num_iterations=720,
)
model_params = {**default_model_params, **best_model_params, "learning_rate": 0.01}
model_params

{'boosting_type': 'gbrt',
 'objective': 'regression',
 'num_iterations': 720,
 'early_stopping_rounds': 50,
 'num_leaves': 32,
 'min_data_in_leaf': 5,
 'learning_rate': 0.01,
 'feature_fraction': 0.7,
 'lambda_l2': 3.0,
 'verbosity': 1}

In [43]:
# work on a deep copy so that set_params (and the later fit) act on `fcaster`
# and leave `precomputed_model` itself unmodified
fcaster = copy.deepcopy(precomputed_model)
fcaster.set_params(model_params=model_params)

In [44]:
# train the forecaster with the parameters set above
# NOTE(review): model_params contains early_stopping_rounds=50, yet no
# validation data appears to have been given to precompute_model — confirm
# how the forecaster handles early stopping in that case.
fcaster.fit()

In [45]:
# score on the leak data; `y` in leak_data was log1p-transformed earlier, so
# assuming evaluate compares predictions with leak_data["y"] as-is, this RMSE
# is on the log1p scale (i.e. an RMSLE in terms of raw meter readings)
fcaster.evaluate(leak_data, metric="rmse")

0.40156463828339795

***