In [None]:
# Our own modules are imported from the parent directory of the working directory.
# Readers are not expected to add that directory to PYTHONPATH themselves, so each
# notebook puts it on sys.path for the running session only.
import os, sys

module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import pandas as pd
from sklearn_extra.cluster import KMedoids
from modules.config import PATH_SCENARIOS, N_REDUCED_SCNEARIOS, N_SCENARIOS, PATH_SCENARIOS_REDUCED, PATH_SCENARIO_PROBABILITY

# Scenario Reduction
To reduce our scenarios we will use k-medoids clustering. k-medoids is very similar to k-means; however, a cluster center in k-medoids is not the mean of all points belonging to that cluster, but rather a point of the cluster itself. For our purpose this is a lot more meaningful, because every center is an actual scenario: there can be no unrealistic centers whose demand values are not whole numbers.

In [None]:
# Load the full scenario set from the pickle at PATH_SCENARIOS.
# NOTE(review): unpickling can execute arbitrary code, so this path should only
# ever point at a file produced by this project's own pipeline.
scenarios = pd.read_pickle(PATH_SCENARIOS)

We transform the dataframe so that one entry corresponds to exactly one scenario.

In [None]:
# Pivot to wide format: every remaining row is one scenario and every column is
# one (start_hex_ids, end_hex_ids, time, vehicle_types) combination.
# fill_value=0 treats a combination that is missing for a scenario as zero demand
# (the same convention the final reindex uses). Without it such cells become NaN,
# which the clustering step cannot handle.
scenarios = scenarios.unstack(
    level=['start_hex_ids', 'end_hex_ids', 'time', 'vehicle_types'],
    fill_value=0,
)

In [None]:
# Cluster the scenarios (one row each) into N_REDUCED_SCNEARIOS groups.
# random_state is fixed so that the selected medoids are reproducible.
kmedoids = KMedoids(n_clusters=N_REDUCED_SCNEARIOS, random_state=0)
kmedoids.fit(scenarios.values)

We save which scenario was assigned to which center, so that we can later calculate the probability of each center.

In [None]:
# kmedoids.medoid_indices_ holds row *positions* in the fitted matrix. Translate
# them into the scenario ids stored in the index, so the assignment stays correct
# even if the index is not exactly 0..n-1 in sorted order.
medoid_scenario_ids = scenarios.index[kmedoids.medoid_indices_]

# cluster label (0..k-1) -> scenario id of that cluster's medoid
label_to_scenario_id_map = dict(enumerate(medoid_scenario_ids))

# One row per original scenario, holding the scenario id of the medoid it was
# assigned to. Series.map does a single dictionary lookup per row, which is the
# direct way to express a relabelling.
scenario_reduction_assignment = pd.DataFrame(index=scenarios.index)
scenario_reduction_assignment['cluster_label'] = (
    pd.Series(kmedoids.labels_, index=scenarios.index).map(label_to_scenario_id_map)
)

In [None]:
# medoid_indices_ are row positions in the matrix passed to fit(); look them up in
# the index to obtain the actual scenario ids. For a sorted 0..n-1 index the two
# coincide, but going through the index does not depend on that.
selected_scenario_ids = scenarios.index.to_numpy()[kmedoids.medoid_indices_]

In [None]:
# Pick the medoid rows out of the wide frame. `.loc` selects by index label, so
# this relies on selected_scenario_ids containing scenario ids (index labels).
selected_scenarios = scenarios.loc[selected_scenario_ids]

In [None]:
# Undo the earlier unstack: move the four column levels back into the row index,
# so the selected scenarios are in long format again.
selected_scenarios = selected_scenarios.stack(['start_hex_ids', 'end_hex_ids', 'time', 'vehicle_types'])

In [None]:
# Scenario ids of the selected medoids, in order of first appearance ...
scenario_id_list = selected_scenarios.index.get_level_values('scenarios').unique().tolist()
# ... and the mapping that renumbers them consecutively starting at 0.
scenario_reset_map = {
    scenario_id: new_id for new_id, scenario_id in enumerate(scenario_id_list)
}

In [None]:
# Renumber the selected scenarios to 0..k-1. `level='scenarios'` restricts the
# renaming to the scenario level: without it, rename applies the mapping to the
# labels of *every* index level, so a time or vehicle type label that happens to
# equal a scenario id would silently be rewritten as well.
selected_scenarios = selected_scenarios.rename(index=scenario_reset_map, level='scenarios')

# Apply the same renumbering to the assignment table, so that its cluster labels
# keep pointing at the renumbered scenarios.
scenario_reduction_assignment['cluster_label'] = scenario_reduction_assignment['cluster_label'].replace(scenario_reset_map)

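We calculate the probability of each center as the sum of the probabilities of the scenarios that are assigned to that center. Since every original scenario is equally likely, this is the number of assigned scenarios divided by the total number of scenarios.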

In [None]:
# Count how many original scenarios were assigned to each reduced scenario and
# turn that count into a probability by dividing by the total number of scenarios.
scenario_probability = (
    scenario_reduction_assignment
    .reset_index()
    .groupby('cluster_label')
    .count()
    .rename(columns={'scenarios': 'n_scenarios'})
    .rename_axis('scenarios')
)
scenario_probability['probability'] = scenario_probability['n_scenarios'] / N_SCENARIOS

We now have the reduced number of scenarios. The last preparation we have to make is to reindex the scenarios so that the demand index contains the complete Cartesian product of the regions. This is necessary because it might happen that there are no trips starting in a region, but there are trips ending there. As our model will only use one set of regions, we need to address this issue.

In [None]:
# All regions that occur as a trip origin or as a trip destination.
start_hex_ids = set(selected_scenarios.index.get_level_values('start_hex_ids'))
end_hex_ids = set(selected_scenarios.index.get_level_values('end_hex_ids'))

# Sort the union so the list has a reproducible order. Iterating a plain set can
# give a different order on every kernel start (e.g. for string ids, whose hashes
# are randomised per process), which would change the row order of the saved
# scenarios from run to run.
hex_ids = sorted(start_hex_ids | end_hex_ids)

In [None]:
# Full Cartesian product of scenario x start region x end region x time x vehicle
# type. Both region levels use the same hex_ids list; the remaining levels use
# the values observed in the selected scenarios. Level names are taken from the
# names of the individual indexes.
observed_index = selected_scenarios.index
index_levels = [
    observed_index.get_level_values('scenarios').unique(),
    pd.Index(hex_ids, name="start_hex_ids"),
    pd.Index(hex_ids, name="end_hex_ids"),
    observed_index.get_level_values('time').unique(),
    observed_index.get_level_values('vehicle_types').unique(),
]
complete_index = pd.MultiIndex.from_product(index_levels)

In [None]:
# Expand to the complete index; combinations that were not present before get the
# value 0 instead of NaN.
selected_scenarios = selected_scenarios.reindex(complete_index, fill_value=0)

In [None]:
# Persist both results as pickles, creating the target folders first if needed.
for frame, target_path in (
    (scenario_probability, PATH_SCENARIO_PROBABILITY),
    (selected_scenarios, PATH_SCENARIOS_REDUCED),
):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    frame.to_pickle(target_path)

In [None]:
# Sanity check: number of distinct values per index level and per value column
# of the reduced scenarios.
selected_scenarios.reset_index().nunique()

scenarios         4
start_hex_ids    29
end_hex_ids      29
time              3
vehicle_types     3
demand           85
dtype: int64