In [1]:
import os
import sys
import time
import argparse
import numpy as np
import pandas as pd
import h5py
import copy
from datetime import datetime
from tsforest import forecaster
from tsforest.metrics import compute_rmse, compute_rmsle
from utils import reduce_mem_usage
from config import get_model_params
from scaling import target_transform, target_inverse_transform
from precompute import precompute_model, precompute_models
import optuna
from optuna.integration import LightGBMPruningCallback

In [2]:
# Forecaster class names this notebook accepts for `model_class_name`.
AVAILABLE_CLASSES = [
    "CatBoostForecaster",
    "LightGBMForecaster",
    "XGBoostForecaster",
    "H2OGBMForecaster",
]

# Features excluded from the model to avoid data leakage.
EXCLUDE_FEATURES = [
    "year",
    "quarter",
    "month",
    "days_in_month",
    "year_week",
    "year_day",
    "month_day",
    "year_day_cos",
    "year_day_sin",
    "year_week_cos",
    "year_week_sin",
    "month_cos",
    "month_sin",
    "month_progress",
]

In [3]:
# Name of the forecaster class to use; it is checked against AVAILABLE_CLASSES
# and then looked up as an attribute of the `forecaster` module.
model_class_name = "LightGBMForecaster"

In [4]:
# Fail fast when an unsupported model class is requested.
# An exception is raised instead of calling sys.exit(): a bare sys.exit()
# terminates with status 0 (i.e. "success") and leaves no traceback, which
# hides the configuration error from both notebook and script runs.
if model_class_name not in AVAILABLE_CLASSES:
    raise ValueError(
        f"{model_class_name} is not a valid model class. "
        f"Available classes: {AVAILABLE_CLASSES}"
    )

# Resolve the class object by name from the imported `forecaster` module.
model_class = getattr(forecaster, model_class_name)

In [5]:
# Site and meter identifiers; they are interpolated into the CSV file names
# (`..._meter{meter}_site{site}.csv`) read by the data-loading cells.
site = 0
meter = 0

In [7]:
# Read the training CSV for the selected meter/site, parsing `timestamp` as
# datetimes, and rename the time/target columns to "ds" and "y".
train_data = (
    pd.read_csv(
        f"../mirrors/train_data_meter{meter}_site{site}.csv",
        parse_dates=["timestamp"],
    )
    .rename(columns={"timestamp": "ds", "meter_reading": "y"})
)

In [8]:
# Read the leak CSV for the selected meter/site, keep only rows timestamped
# on or after 2017-01-01, and rename the time/target columns to "ds" and "y".
leak_data = (
    pd.read_csv(
        f"../mirrors/leak_data_meter{meter}_site{site}.csv",
        parse_dates=["timestamp"],
    )
    .query("timestamp >= '2017-01-01 00:00:00'")
    .rename(columns={"timestamp": "ds", "meter_reading": "y"})
)

In [9]:
# Append the leak rows (restricted to the training columns, in the same order)
# below the training rows with a fresh 0..n-1 index, then apply log(1 + x) to
# the `square_feet` feature and to the target `y`.
# NOTE: this cell rebinds `train_data` from itself, so it must be run exactly
# once per kernel session.
train_data = (
    pd.concat(
        [train_data, leak_data[train_data.columns]],
        ignore_index=True,
    )
    .assign(
        square_feet=lambda frame: np.log1p(frame["square_feet"].values),
        y=lambda frame: np.log1p(frame["y"].values),
    )
)

In [23]:
# Row labels of the validation set: rows from sites other than 0 dated from
# 2017-01-01 onwards, united with rows from site 0 dated from 2017-05-21 onwards.
valid_index = (
    train_data.query("site_id != 0 & ds >= '2017-01-01 00:00:00'").index
    .union(
        train_data.query("site_id == 0 & ds >= '2017-05-21 00:00:00'").index
    )
)

In [25]:
# Remove the `site_id` and `meter` columns from the training frame.
train_data = train_data.drop(columns=["site_id", "meter"])

# Every remaining column except the target "y", in the frame's column order.
predict_columns = train_data.columns[train_data.columns != "y"].tolist()

In [26]:
# Keyword arguments handed to `precompute_model` together with the model class name.
model_kwargs = dict(
    feature_sets=["calendar", "calendar_cyclical"],
    exclude_features=EXCLUDE_FEATURES,
    categorical_features=dict(building_id="default", primary_use="default"),
    ts_uid_columns=["building_id"],
    detrend=False,
    target_scaler=None,
)

# Run the project's precompute step on the prepared frame and validation index.
precomputed_model = precompute_model(
    train_data,
    valid_index,
    model_class_name,
    model_kwargs,
)

Mem. usage decreased to 58.23 Mb (73.8% reduction)
Mem. usage decreased to 92.75 Mb (73.8% reduction)
