In [1]:
# The notebooks import project-local modules that live one directory above
# the notebook's working directory. Readers are not expected to configure
# PYTHONPATH themselves, so each notebook registers that directory on
# sys.path for the lifetime of the kernel.
import os
import sys

module_path = os.path.abspath('..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [2]:
from scipy.stats import poisson
import numpy as np
import pandas as pd
from timeit import default_timer as timer
from modules.config import (
    PATH_TRIPS_GROUPED,
    PERIOD_DURATION,
    N_SCENARIOS,
    N_REALIZATIONS,
    PATH_SCENARIOS,
    MODE_IS_WEEKEND,
)
from modules.helpers import format_bytes


# Scenario Generation
In this notebook we will use the previously aggregated trip data to generate an arbitrary amount of scenarios, while ensuring that the generated scenarios follow a scenario tree structure.  

#### A scenario tree where each node has two branches
![Scenario Tree](../resources/tree.png)  
Each branch in a scenario tree corresponds to a possible realization of a random variable.  
In our case, the number of realizations is configurable in the `config.py`.
<hr>

We start by reading the grouped trip data from the pickle file.  
We also add the `is_weekend` column, as we will only use scenario data from either weekends or weekdays.  
This weekend decision is configurable in the `config.py`.

In [3]:
# Load the grouped trip data from the pickle file configured in config.py.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# produced by this project's own pipeline.
trips = pd.read_pickle(PATH_TRIPS_GROUPED)

In [4]:
# Give the three index levels explicit names ...
trips = trips.rename_axis(['start_hex_id', 'end_hex_id', 'daytime'])
# ... and then turn them into ordinary columns.
trips = trips.reset_index()

In [5]:
# Derive the two grouping keys from the timestamp column:
#   is_weekend - True for Saturday/Sunday (dayofweek 5 or 6)
#   time       - time of day without the date part
trips = trips.assign(
    is_weekend=trips['daytime'].dt.dayofweek >= 5,
    time=trips['daytime'].dt.time,
)

To generate scenarios we sample from the Poisson distribution.    
As the estimator of its rate we use the maximum likelihood estimate, which is the sample mean.  
So for $n$ samples $ k_i \in \mathbb{N} $, $ i = 1, \dots, n $, we get the maximum likelihood estimate with:  
$ \lambda_{MLE} = \frac{1}{n}\sum_{i=1}^{n}k_i$


In [6]:
# Poisson rate estimates: the mean of every remaining (demand) column for
# each combination of weekend flag, time of day, origin and destination.
mu = (
    trips
    .drop(columns='daytime')
    .groupby(['is_weekend', 'time', 'start_hex_id', 'end_hex_id'])
    .mean()
)

To ensure the tree structure we have to generate increasingly more demand values for subsequent time periods, e.g. if the number of realizations is equal to 2, we would generate only 1 demand value for the first period, 2 for the second period, 4 for the third, 8 for the fourth and so on...  
If there were only one demand value per time period and we had 3 time periods with 2 realizations each, then the demand values (in scenario tree structure) could be represented by the following matrix, where each row is one scenario and each column is one time period.
$$ \begin{pmatrix}
2 & 3 & 2\\
2 & 3 & 3\\
2 & 4 & 1\\
2 & 4 & 2
\end{pmatrix}
$$
  
Let $r$ be the number of realizations of each random variable, $d$ the duration of each period, $n$ the number of scenarios and $t$ a specific time interval.  
So for example:  
$r = 2, d=4, n=32, t \in \{0, 4, 8, 12, 16, 20\}$  
Then we can calculate the number of differing scenarios (scenario groups) for a specific time interval with:  
$\Large r^{\frac{t}{d}}$  
Also we can calculate the number of scenarios in each group with:  
$\Large\frac{n}{r^{\frac{t}{d}}}$

In [7]:
# Inspect which period start times occur in the data; their hours are used
# in the next cell to build the per-period batch layout.
trips['time'].unique()

array([datetime.time(0, 0), datetime.time(12, 0)], dtype=object)

In [8]:
# Hour of day at which each observed period starts.
hours_list = [period_start.hour for period_start in trips['time'].unique()]


def _batch_layout(hour):
    """Return the scenario-tree layout for the period starting at `hour`.

    The period index is ``hour // PERIOD_DURATION``. At that depth the tree has
    ``N_REALIZATIONS ** index`` distinct nodes (`n_batches`), and each node is
    shared by ``N_SCENARIOS / n_batches`` scenarios (`batch_size`).

    Both values are derived from the same integer period index so that
    ``n_batches * batch_size == N_SCENARIOS`` always holds. Previously the
    exponent was truncated for `n_batches` but not for `batch_size`, so the
    two could disagree when `hour` was not a multiple of PERIOD_DURATION.

    Raises:
        ValueError: if N_SCENARIOS cannot be split evenly across the nodes,
            which would otherwise yield scenario arrays of the wrong length.
    """
    period_index = int(hour // PERIOD_DURATION)
    n_batches = N_REALIZATIONS ** period_index
    if N_SCENARIOS % n_batches != 0:
        raise ValueError(
            f"N_SCENARIOS={N_SCENARIOS} is not divisible by the {n_batches} "
            f"scenario groups required for the period starting at hour {hour}"
        )
    # Key order matters: consumers may unpack .values() as (n_batches, batch_size).
    return {
        'n_batches': n_batches,
        'batch_size': N_SCENARIOS // n_batches,
    }


# TODO rename these variables!
batch_map = {hour: _batch_layout(hour) for hour in hours_list}
batch_map

{0: {'n_batches': 1, 'batch_size': 128},
 12: {'n_batches': 128, 'batch_size': 1}}

In [9]:
start = timer()

# One seeded generator shared by every draw keeps the run reproducible while
# giving each (origin, destination, vehicle type) its own random numbers.
# Seeding every rvs() call with the same integer, as was done before, restarts
# the identical random stream each time and makes all demands move in lockstep.
rng = np.random.default_rng(42)

scenario_dict = {}
# assumes the columns of `mu` are ordered kick_scooter, car, bicycle - TODO confirm
for (time, start_hex_id, end_hex_id), (mu_kick_scooter, mu_car, mu_bicycle) in mu.loc[
    MODE_IS_WEEKEND
].iterrows():
    layout = batch_map[time.hour]
    n_batches = layout['n_batches']
    batch_size = layout['batch_size']

    # Insertion order (kick_scooter, bicycle, car) fixes the row order of the
    # resulting dataframe.
    mean_by_vehicle_type = {
        "kick_scooter": mu_kick_scooter,
        "bicycle": mu_bicycle,
        "car": mu_car,
    }
    for vehicle_type, mean_demand in mean_by_vehicle_type.items():
        # Draw one value per tree node of this period, then repeat it for every
        # scenario that passes through that node.
        node_demand = poisson.rvs(mean_demand, size=n_batches, random_state=rng)
        scenario_dict[(start_hex_id, end_hex_id, time, vehicle_type)] = np.repeat(
            node_demand, batch_size
        )

end = timer()
print(f"Succesfully generated {N_SCENARIOS} scenarios in {(end - start):.2f} seconds")


Succesfully generated 128 scenarios in 1.05 seconds


We now collect the generated samples in the `scenarios` dataframe, which holds the configured number of scenarios.

In [10]:
# Each dict key becomes one row and each array position one column, so the
# frame has one column per scenario.
scenarios = pd.DataFrame.from_dict(scenario_dict, orient="index")
# The dict is no longer needed once its content lives in the dataframe.
del scenario_dict

In [11]:
# The row labels are (start hex, end hex, time, vehicle type) tuples;
# convert them into a proper four-level MultiIndex.
scenarios.index = pd.MultiIndex.from_tuples(scenarios.index)
# Move the scenario columns into an additional innermost index level and
# turn the resulting Series back into a one-column dataframe.
scenarios = scenarios.stack().to_frame()

In [12]:
# Label the five index levels and give the single value column a name.
level_names = ['start_hex_ids', 'end_hex_ids', 'time', 'vehicle_types', 'scenarios']
scenarios.index = scenarios.index.set_names(level_names)
scenarios = scenarios.rename(columns={0: 'demand'})

After transforming the scenarios back to our previous data format we can now save the scenario data as a pickle file.  
We could use this data as a direct input for our model, however with a large number of scenarios the Linear Program that is underlying our model can take very long to solve.  
Therefore we will reduce the generated scenarios, so that we have a smaller subset that still represents the original dataset.

In [13]:
# Make sure the target folder exists before writing. os.path.dirname returns
# '' when PATH_SCENARIOS is a bare file name, and os.makedirs('') raises, so
# the folder is only created when the path has a directory component.
scenario_dir = os.path.dirname(PATH_SCENARIOS)
if scenario_dir:
    os.makedirs(scenario_dir, exist_ok=True)
scenarios.to_pickle(PATH_SCENARIOS)

# Report the size of the written file.
print(f"scenario filesize: {format_bytes(os.path.getsize(PATH_SCENARIOS))}")

scenario filesize: 8.03 megabytes


In [14]:
# Sanity check: number of distinct values per index level and for the
# demand column.
scenarios.reset_index().nunique()

start_hex_ids     27
end_hex_ids       29
time               2
vehicle_types      3
scenarios        128
demand           345
dtype: int64

In [15]:
scenarios.head(3)
# free the memory
%reset -f
import gc
gc.collect()
# this still does not free all memory for some reason
# we recommend to close the notebook after execution or restart the kernel manually

0