In [1]:
import pandas as pd
from scipy.stats import distributions
import itertools
from tqdm.notebook import tqdm
import time

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.storage import get_demand_model_data, store_results, get_results_df, get_demand_orig_dest_model_data, get_availability_model_data
from sklearn.experimental import enable_halving_search_cv
from modules.xgboost import perform_randomized_grid_search, get_best_hyperparameters, model_already_evaluated, evaluate

  from pandas import MultiIndex, Int64Index


# XGBoost Training
In this notebook, we train XGBoost as an additional model. We decided to put in this extra effort because we recently learned about XGBoost and recognized the large bicycle-sharing dataset as a perfect fit for it.  
With the right hyperparameters, XGBoost scales extremely well to large datasets.
For example, we decided to use `hist` as the method for building the trees. This means that during the tree-building process, the algorithm does not try every possible split but only some at certain quantiles. This approach is known as the (weighted) quantile sketch. In addition, it uses an approximation of the real histograms to determine the quantiles, which is even faster.  

To find the best hyperparameters, we perform a randomized hyperparameter search based on successive halving. This approach is an enhanced version of the successive halving algorithm we used for SVMs.
Instead of trying out every combination specified by an explicit hyperparameter grid, the algorithm tries out a random subset.
Randomized search has multiple advantages compared to grid search.
Firstly, distributions instead of discrete values can be defined. Secondly, the number of hyperparameter combinations to be tested can be easily altered. Thirdly, randomized searches are more efficient than grid searches.


In [3]:
# Search space handed to the randomized hyperparameter search.
# Settings drawn from a scipy distribution:
sampled_settings = {
    "eta": distributions.uniform(0, 0.3),  # uniform(loc, scale): values in [0, 0.3]
    "gamma": distributions.uniform(0, 5),  # uniform(loc, scale): values in [0, 5]
    "max_depth": distributions.randint(1, 10),  # integers 1..9 (upper bound is exclusive)
    "lambda": distributions.expon(),  # standard exponential distribution
}
# Settings with a single candidate value:
fixed_settings = {
    "booster": ["gbtree"],
    "tree_method": ["hist"],
}
hyperparameter_grid = {**sampled_settings, **fixed_settings}

In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Hyperparameter Search

In [5]:
# First-stage hyperparameter search, once per outcome. An outcome is skipped
# when its results file already contains rows.
search_setups = {
    'demand': (XGBOOST_FIRST_STAGE_DEMAND_RESULTS_PATH, get_demand_model_data),
    'availability': (XGBOOST_FIRST_STAGE_AVAILABILITY_RESULTS_PATH, get_availability_model_data),
}
for outcome, (result_path, load_model_data) in search_setups.items():
    search_already_stored = not get_results_df(result_path).empty
    if search_already_stored:
        print(f"Already performed grid search for {outcome}")
        continue
    search_results = perform_randomized_grid_search(
        load_model_data,
        hyperparameter_grid,
        TUNE_H3_RESOLUTION,
        TUNE_TIME_INTERVAL_LENGTH,
    )
    store_results(search_results, result_path)

Already performed grid search for demand
Already performed grid search for availability


In [6]:
# Load the stored first-stage demand search results and display them ordered
# by their test-score rank (rank 1 first).
res = get_results_df(XGBOOST_FIRST_STAGE_DEMAND_RESULTS_PATH)
res.sort_values("rank_test_score")

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_eta,param_gamma,param_lambda,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
1503,6,153819,1.253628,0.080693,0.021332,0.002862,gbtree,0.081692,0.091953,2.456889,...,-13.050668,0.884757,1,-11.670141,-11.501171,-11.378437,-11.860564,-11.539376,-11.589938,0.164219
1502,6,153819,1.103867,0.031088,0.018577,0.000411,gbtree,0.154679,2.444233,1.099205,...,-13.626666,1.055941,2,-12.466102,-12.301324,-12.230210,-12.669970,-12.158934,-12.365308,0.183256
1500,5,51273,0.517885,0.015503,0.009318,0.001029,gbtree,0.081692,0.091953,2.456889,...,-14.680344,1.518570,3,-10.037725,-9.629830,-9.737628,-11.334504,-10.079148,-10.163767,0.609976
1501,5,51273,0.444084,0.012859,0.008346,0.000331,gbtree,0.154679,2.444233,1.099205,...,-15.329992,1.709187,4,-10.827068,-10.849388,-10.944522,-12.151200,-11.144852,-11.183406,0.496734
1499,5,51273,0.452911,0.012183,0.009282,0.001599,gbtree,0.089171,1.499596,1.359431,...,-16.046733,1.793300,5,-12.580637,-11.961383,-12.302119,-13.817094,-12.433894,-12.619026,0.633094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,0,211,0.061079,0.009042,0.004153,0.000214,gbtree,0.003647,3.370100,0.053228,...,-223.201994,266.234970,1500,-39.781105,-28.049236,-38.738112,-46.206308,-46.596891,-39.874330,6.729328
652,0,211,0.079253,0.002553,0.004052,0.000170,gbtree,0.000695,0.220810,0.418220,...,-224.152720,265.603102,1501,-57.190265,-37.842771,-55.060084,-64.433728,-65.389961,-55.983362,9.912446
267,0,211,0.085730,0.010827,0.004940,0.001609,gbtree,0.002846,0.263419,0.092711,...,-227.299311,281.665833,1502,-37.602344,-24.917929,-36.082163,-43.316048,-43.819009,-37.147499,6.833217
295,0,211,0.093715,0.006574,0.004455,0.000359,gbtree,0.001544,4.099735,0.259581,...,-227.774339,280.359382,1503,-48.908906,-32.567209,-47.245316,-55.258048,-56.230843,-48.042064,8.485092


In [7]:
# All (H3 resolution, time interval length) pairs to train on: the full cross
# product of the configured values, followed by the extra configured pairs.
grid_resolutions = list(
    itertools.product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS)
)
resolutions = grid_resolutions + ADDITIONAL_PREDICTIVE_RESOLUTIONS

# One work item per (resolution, outcome); both outcomes of a resolution are
# adjacent, demand first.
iterator = [
    (resolution, outcome)
    for resolution in resolutions
    for outcome in ['demand', 'availability']
]

### Training For All Resolutions
Now that we have found a good set of hyperparameters, we can train the model for all resolutions.

In [8]:
# Second stage: for every (resolution, outcome) work item, evaluate a model
# with the best hyperparameters from that outcome's first-stage search and
# append one result row to the outcome's second-stage results file.
# Work items that already have a stored result are skipped.
#
# Each outcome maps to (second-stage results path, first-stage results path,
# model-data loader). The loader must match the outcome: this loop used to pass
# the demand loader for both outcomes, so "availability" models were fitted on
# demand data.
# NOTE(review): availability rows already in the second-stage results file were
# presumably produced that way; they are skipped by the "already done" check
# below, so remove them to have them recomputed.
stage_setups = {
    "demand": (
        XGBOOST_SECOND_STAGE_DEMAND_RESULTS_PATH,
        XGBOOST_FIRST_STAGE_DEMAND_RESULTS_PATH,
        get_demand_model_data,
    ),
    "availability": (
        XGBOOST_SECOND_STAGE_AVAILABILITY_RESULTS_PATH,
        XGBOOST_FIRST_STAGE_AVAILABILITY_RESULTS_PATH,
        get_availability_model_data,
    ),
}
# Pad outcome names to a common width so the progress lines align.
outcome_label_width = max(len(name) for name in stage_setups)

for (h3_res, time_interval_length), outcome in tqdm(iterator):
    padded_outcome = outcome.ljust(outcome_label_width)
    print_output = f"{padded_outcome} - {h3_res} - {time_interval_length}"
    # "\r" lets the final status line overwrite the "started" line.
    tqdm.write(print_output + " started", end="\r")

    result_path, first_stage_result_path, get_model_data = stage_setups[outcome]
    if model_already_evaluated(result_path, h3_res, time_interval_length):
        tqdm.write(print_output + " already done")
        continue

    hyperparameters = get_best_hyperparameters(get_results_df(first_stage_result_path))
    start = time.time()
    res = evaluate(
        get_model_data,
        hyperparameters,
        h3_res,
        time_interval_length,
    )
    duration = time.time() - start

    # Store the evaluation as a single row, tagged with the resolution and
    # the wall-clock time the evaluation took.
    res = pd.DataFrame(res, index=[0])
    res["h3_res"] = h3_res
    res["time_interval_length"] = time_interval_length
    res["train_duration"] = duration

    store_results(res, result_path)
    # Trailing spaces blank out the remainder of the longer "started" text.
    tqdm.write(print_output + " done    ")


  0%|          | 0/18 [00:00<?, ?it/s]

demand       - 7 - 1 already done
availability - 7 - 1 already done
demand       - 7 - 2 already done
availability - 7 - 2 already done
demand       - 7 - 6 already done
availability - 7 - 6 already done
demand       - 7 - 24 already done
availability - 7 - 24 already done
demand       - 8 - 1 already done
availability - 8 - 1 already done
demand       - 8 - 2 already done
availability - 8 - 2 already done
demand       - 8 - 6 already done
availability - 8 - 6 already done
demand       - 8 - 24 already done
availability - 8 - 24 already done
demand       - 9 - 24 already done
availability - 9 - 24 already done


Similar to SVMs and Neural Networks, we also train one additional model for origin-destination pairs.

In [9]:
# Additional demand model for origin-destination pairs. Its result row is
# identified in the "h3_res" column by the origin-destination resolution written
# twice (a resolution of 8 gives 88). The model is only evaluated when no such
# row is stored yet.
od_h3_key = int(f"{ORIGIN_DESTINATION_H3_RESOLUTION}{ORIGIN_DESTINATION_H3_RESOLUTION}")
results_df = get_results_df(XGBOOST_SECOND_STAGE_DEMAND_RESULTS_PATH)
stored_od_rows = results_df[results_df["h3_res"] == od_h3_key]
if stored_od_rows.empty:
    hyperparameters = get_best_hyperparameters(
        get_results_df(XGBOOST_FIRST_STAGE_DEMAND_RESULTS_PATH)
    )
    start = time.time()
    # The data loader ignores both arguments it is called with.
    res = evaluate(
        lambda _, __: get_demand_orig_dest_model_data(),
        hyperparameters,
        ORIGIN_DESTINATION_H3_RESOLUTION,
        ORIGIN_DESTINATION_H3_RESOLUTION,
    )
    duration = time.time() - start

    res = pd.DataFrame(res, index=[0])
    res["h3_res"] = od_h3_key
    # NOTE(review): `time_interval_length` is not assigned in this cell; it is
    # whatever value the training loop over `iterator` left behind (NameError on
    # a fresh kernel if that loop did not run). Confirm the intended interval
    # length and set it explicitly.
    res["time_interval_length"] = time_interval_length
    res["train_duration"] = duration
    store_results(res, XGBOOST_SECOND_STAGE_DEMAND_RESULTS_PATH)