In [1]:
import sys
import time
import argparse
import numpy as np
import pandas as pd
import h5py
from datetime import datetime
from tsforest import forecaster
from tsforest.metrics import compute_rmse, compute_rmsle
from scripts.utils import reduce_mem_usage
from scripts.config import get_model_params
from scripts.scaling import target_transform, target_inverse_transform
import optuna

In [16]:
train_data = pd.read_hdf('data/train_data_nw.h5', 'train_data')
train_data.rename({"timestamp":"ds", "meter_reading":"y"}, axis=1, inplace=True)

robust_scaler = pd.read_csv("data/robust_scaler.csv")
train_data = target_transform(train_data, robust_scaler, target="y")

In [17]:
test_data = pd.read_hdf('data/test_data_nw.h5', 'test_data')
test_data.rename({"timestamp":"ds"}, axis=1, inplace=True)

In [None]:
# timestamp of the starting execution time
timestamp = datetime.now().strftime("%Y/%m/%d, %H:%M:%S").replace("/","-").replace(" ","")

AVAILABLE_CLASSES = ["CatBoostForecaster",
                     "LightGBMForecaster",
                     "XGBoostForecaster",
                     "H2OGBMForecaster"]

model_class_name = "LightGBMForecaster"
if model_class_name not in AVAILABLE_CLASSES:
    print(f"{model_class_name} is not a valid model class.")
    sys.exit()
model_class = getattr(forecaster, model_class_name)

print("[INFO] loading data")
tic = time.time()
train_data = pd.read_hdf('data/train_data_nw.h5', 'train_data')
train_data.rename({"timestamp":"ds", "meter_reading":"y"}, axis=1, inplace=True)

robust_scaler = pd.read_csv("data/robust_scaler.csv")
train_data = target_transform(train_data, robust_scaler, target="y")

test_data = pd.read_hdf('data/test_data_nw.h5', 'test_data')
test_data.rename({"timestamp":"ds"}, axis=1, inplace=True)
tac = time.time()
print(f"[INFO] time elapsed loading data: {(tac-tic)/60.} min.\n")

In [2]:
N_FOLDS = 4
i = 0
train_features = pd.read_hdf(f"data/train_features_nw_{N_FOLDS}fold.h5", key=f"fold{i}")
valid_features = pd.read_hdf(f"data/valid_features_nw_{N_FOLDS}fold.h5", key=f"fold{i}")

In [3]:
len(train_features)+len(valid_features)

19866224

In [7]:
model_class = getattr(forecaster, model_class_name)

In [8]:
model_kwargs = {"model_params":get_model_params(model_class_name),
                "feature_sets":['calendar', 'calendar_cyclical'],
                "exclude_features":["year","days_in_month"],
                "categorical_features":{"building_id":"default",
                                        "meter":"default",
                                        "site_id":"default",
                                        "primary_use":"default"},
                "ts_uid_columns":["building_id","meter"],
                "detrend":False,
                "target_scaler":None}

In [9]:
model_kwargs

{'model_params': {'boosting_type': 'gbrt',
  'objective': 'regression',
  'num_iterations': 2000,
  'early_stopping_round': 100,
  'num_leaves': 724,
  'min_data_in_leaf': 20,
  'learning_rate': 0.1,
  'feature_fraction': 1.0,
  'verbosity': 1},
 'feature_sets': ['calendar', 'calendar_cyclical'],
 'exclude_features': ['year', 'days_in_month'],
 'categorical_features': {'building_id': 'default',
  'meter': 'default',
  'site_id': 'default',
  'primary_use': 'default'},
 'ts_uid_columns': ['building_id', 'meter'],
 'detrend': False,
 'target_scaler': None}

In [10]:
fcaster = model_class(**model_kwargs)

In [11]:
fcaster.set_features(train_features, valid_features)

In [12]:
fcaster.fit()

In [20]:
train_data.head()

Unnamed: 0,building_id,meter,ds,y,site_id,primary_use,square_feet
0,46,0,2016-01-01,2.793029,0,Retail,9045
1,105,0,2016-01-01,-1.717601,1,Education,50623
2,106,0,2016-01-01,-0.3746,1,Education,5374
3,106,3,2016-01-01,0.0,1,Education,5374
4,107,0,2016-01-01,-0.45045,1,Education,97532


In [21]:
valid_features.loc[:, test_data.columns].head()

Unnamed: 0,row_id,building_id,meter,ds,site_id,primary_use,square_feet
158073,,46,0,2016-01-04 00:00:00,0,6,9045
160269,,46,0,2016-01-04 01:00:00,0,6,9045
162463,,46,0,2016-01-04 02:00:00,0,6,9045
164660,,46,0,2016-01-04 03:00:00,0,6,9045
166857,,46,0,2016-01-04 04:00:00,0,6,9045


In [22]:
fcaster.predict(valid_features.loc[:, test_data.columns])

ValueError: 'predict_data' shoud have the same columns as 'train_data' except for 'y'.

In [18]:
fcaster.train_data = train_data

In [None]:
fcaster.

***

In [9]:
def objective(trial):
    model_kwargs = {"feature_sets":['calendar', 'calendar_cyclical'],
                    "exclude_features":["year","days_in_month"],
                    "categorical_features":{"building_id":"default",
                                            "meter":"default",
                                            "site_id":"default",
                                            "primary_use":"default"},
                    "ts_uid_columns":["building_id","meter"],
                    "detrend":False,
                    "target_scaler":None}
    model_params = {
        "num_leaves":int(trial.suggest_loguniform('num_leaves', 2**5, 2**10+1)),
        "learning_rate":trial.suggest_uniform('learning_rate', 0.01, 0.31),
        "min_data_in_leaf":int(trial.suggest_discrete_uniform("min_data_in_leaf", 20, 40, 20)),
        "feature_fraction":trial.suggest_discrete_uniform("feature_fraction", 0.9, 1.0, 0.1),
        "lambda_l2":trial.suggest_discrete_uniform("lambda_l2", 0., 1.0, 1.0)
    }
    model_kwargs["model_params"] = model_params

    valid_errors = list()
    for i in range(N_FOLDS):
        fcaster = model_class(**model_kwargs)
        
        print(f"[INFO] preparing the features - fold: {i}")
        tic = time.time()
        train_features = pd.read_hdf(f"data/train_features_nw_{N_FOLDS}fold.h5", key=f"fold{i}")
        valid_features = pd.read_hdf(f"data/valid_features_nw_{N_FOLDS}fold.h5", key=f"fold{i}")
        fcaster.set_features(train_features, valid_features)
        tac = time.time()
        print(f"[INFO] time elapsed preparing the features: {(tac-tic)/60.} min.\n")

        print(f"[INFO] fitting the model - fold: {i}")
        tic = time.time()
        fcaster.fit()
        tac = time.time()
        print(f"[INFO] time elapsed fitting the model: {(tac-tic)/60.} min.\n")

        print(f"[INFO] evaluating the model - fold: {i}")
        tic = time.time()
        valid_predictions = fcaster.predict(train_data.loc[valid_index, test_data.columns])

        y_real = (target_inverse_transform(train_data.loc[valid_index, :], robust_scaler, target="y")).y.values
        y_pred_val = (target_inverse_transform(valid_predictions, robust_scaler, target="y_pred")).y_pred.values

        y_pred_val[y_pred_val<0] = 0   
        valid_error = compute_rmsle(y_real, y_pred_val)
        valid_errors.append(valid_error)
        print(f"[INFO] validation error on fold{i}: {valid_error}")
        tac = time.time()
        print(f"[INFO] time elapsed evaluating the model: {(tac-tic)/60.} min.\n")
    
    return np.mean(valid_errors)

In [10]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)
print(study.best_trial)

[INFO] preparing the features - fold: 0
Mem. usage decreased to 909.41 Mb (68.0% reduction)
Mem. usage decreased to 303.12 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.511084524790446 min.

[INFO] fitting the model - fold: 0
[INFO] time elapsed fitting the model: 0.5713108539581299 min.

[INFO] evaluating the model - fold: 0
[INFO] validation error on fold0: 1.0577284210559563
[INFO] time elapsed evaluating the model: 0.5155432899792989 min.

[INFO] preparing the features - fold: 1
Mem. usage decreased to 909.43 Mb (68.0% reduction)
Mem. usage decreased to 303.11 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.377772835890452 min.

[INFO] fitting the model - fold: 1
[INFO] time elapsed fitting the model: 0.6975279927253724 min.

[INFO] evaluating the model - fold: 1
[INFO] validation error on fold1: 0.9311259959313566
[INFO] time elapsed evaluating the model: 0.4482148369153341 min.

[INFO] preparing the features - fold: 2
Mem. usage decreased t

[I 2019-12-01 15:31:03,817] Finished trial#0 resulted in value: 1.0275343797143832. Current best value is 1.0275343797143832 with parameters: {'num_leaves': 77.85339833286481, 'learning_rate': 0.06825700128818045, 'min_data_in_leaf': 40.0, 'feature_fraction': 0.9, 'lambda_l2': 1.0}.


[INFO] preparing the features - fold: 0
Mem. usage decreased to 909.41 Mb (68.0% reduction)
Mem. usage decreased to 303.12 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.4303881804148357 min.

[INFO] fitting the model - fold: 0
[INFO] time elapsed fitting the model: 1.0551750500996908 min.

[INFO] evaluating the model - fold: 0
[INFO] validation error on fold0: 1.1834674143432384
[INFO] time elapsed evaluating the model: 0.5362764159838359 min.

[INFO] preparing the features - fold: 1
Mem. usage decreased to 909.43 Mb (68.0% reduction)
Mem. usage decreased to 303.11 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.3582255919774373 min.

[INFO] fitting the model - fold: 1
[INFO] time elapsed fitting the model: 1.1953715244928995 min.

[INFO] evaluating the model - fold: 1
[INFO] validation error on fold1: 1.0115509158094549
[INFO] time elapsed evaluating the model: 0.5771966218948364 min.

[INFO] preparing the features - fold: 2
Mem. usage decreased

[I 2019-12-01 15:47:15,431] Finished trial#1 resulted in value: 1.1903450947898508. Current best value is 1.0275343797143832 with parameters: {'num_leaves': 77.85339833286481, 'learning_rate': 0.06825700128818045, 'min_data_in_leaf': 40.0, 'feature_fraction': 0.9, 'lambda_l2': 1.0}.


[INFO] preparing the features - fold: 0
Mem. usage decreased to 909.41 Mb (68.0% reduction)
Mem. usage decreased to 303.12 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.479695693651835 min.

[INFO] fitting the model - fold: 0
[INFO] time elapsed fitting the model: 0.4575957218805949 min.

[INFO] evaluating the model - fold: 0
[INFO] validation error on fold0: 1.5926178562139242
[INFO] time elapsed evaluating the model: 0.4326853394508362 min.

[INFO] preparing the features - fold: 1
Mem. usage decreased to 909.43 Mb (68.0% reduction)
Mem. usage decreased to 303.11 Mb (68.0% reduction)
[INFO] time elapsed preparing the features: 2.4856940428415935 min.

[INFO] fitting the model - fold: 1
[INFO] time elapsed fitting the model: 0.771871789296468 min.

[INFO] evaluating the model - fold: 1
[INFO] validation error on fold1: 0.9156246276553192
[INFO] time elapsed evaluating the model: 0.45116540988286336 min.

[INFO] preparing the features - fold: 2
Mem. usage decreased 

[I 2019-12-01 16:01:10,479] Finished trial#2 resulted in value: 1.3358126205787708. Current best value is 1.0275343797143832 with parameters: {'num_leaves': 77.85339833286481, 'learning_rate': 0.06825700128818045, 'min_data_in_leaf': 40.0, 'feature_fraction': 0.9, 'lambda_l2': 1.0}.


FrozenTrial(number=0, state=TrialState.COMPLETE, value=1.0275343797143832, datetime_start=datetime.datetime(2019, 12, 1, 15, 16, 47, 857505), datetime_complete=datetime.datetime(2019, 12, 1, 15, 31, 3, 816359), params={'num_leaves': 77.85339833286481, 'learning_rate': 0.06825700128818045, 'min_data_in_leaf': 40.0, 'feature_fraction': 0.9, 'lambda_l2': 1.0}, distributions={'num_leaves': LogUniformDistribution(high=1025, low=32), 'learning_rate': UniformDistribution(high=0.31, low=0.01), 'min_data_in_leaf': DiscreteUniformDistribution(high=40, low=20, q=20), 'feature_fraction': DiscreteUniformDistribution(high=1.0, low=0.9, q=0.1), 'lambda_l2': DiscreteUniformDistribution(high=1.0, low=0.0, q=1.0)}, user_attrs={}, system_attrs={'_number': 0}, intermediate_values={}, trial_id=0)


In [11]:
print(study.best_trial)

FrozenTrial(number=0, state=TrialState.COMPLETE, value=1.0275343797143832, datetime_start=datetime.datetime(2019, 12, 1, 15, 16, 47, 857505), datetime_complete=datetime.datetime(2019, 12, 1, 15, 31, 3, 816359), params={'num_leaves': 77.85339833286481, 'learning_rate': 0.06825700128818045, 'min_data_in_leaf': 40.0, 'feature_fraction': 0.9, 'lambda_l2': 1.0}, distributions={'num_leaves': LogUniformDistribution(high=1025, low=32), 'learning_rate': UniformDistribution(high=0.31, low=0.01), 'min_data_in_leaf': DiscreteUniformDistribution(high=40, low=20, q=20), 'feature_fraction': DiscreteUniformDistribution(high=1.0, low=0.9, q=0.1), 'lambda_l2': DiscreteUniformDistribution(high=1.0, low=0.0, q=1.0)}, user_attrs={}, system_attrs={'_number': 0}, intermediate_values={}, trial_id=0)


In [14]:
df = study.trials_dataframe()

In [15]:
df

Unnamed: 0_level_0,number,state,value,datetime_start,datetime_complete,params,params,params,params,params,system_attrs
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,feature_fraction,lambda_l2,learning_rate,min_data_in_leaf,num_leaves,_number
0,0,TrialState.COMPLETE,1.027534,2019-12-01 15:16:47.857505,2019-12-01 15:31:03.816359,0.9,1.0,0.068257,40.0,77.853398,0
1,1,TrialState.COMPLETE,1.190345,2019-12-01 15:31:03.824479,2019-12-01 15:47:15.430479,0.9,1.0,0.013493,20.0,99.059872,1
2,2,TrialState.COMPLETE,1.335813,2019-12-01 15:47:15.437217,2019-12-01 16:01:10.478800,1.0,1.0,0.083087,40.0,49.01649,2


***