In [1]:
# The notebooks import our own helper modules, which are not on PYTHONPATH by default.
# Instead of asking the reader to change their environment, each notebook adds the
# parent of the current working directory to sys.path for the lifetime of the kernel.
import os, sys
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    # Append only once so that re-running the cell does not grow sys.path.
    sys.path.append(module_path)

In [2]:
from timeit import default_timer as timer
import pandas as pd
from modules.config import PATH_TRIPS, PERIOD_DURATION, PATH_TRIPS_GROUPED

# Scenario Extraction
In this notebook we will convert the given dataset into data that our model can process.  
One entry of the dataset currently represents one trip. So the most important columns are the trip's starting and ending location and time, as well as the vehicle type of the vehicle that was used for the trip.  
We will now aggregate that data so that the resulting data represents the number of trips made with a certain vehicle type in a certain time period, starting in a certain region, ending in a certain region.  
We will use the aggregated data as the demand for our model.  
  
$ d_{ijtm} $  
where $i$ and $j$ are the starting and ending regions, $t$ is the time period, $m$ is the vehicle type and $d$ is the number of trips.

In [3]:
# Load the trip-level dataset from the pickle file at PATH_TRIPS.
# NOTE(review): unpickling can execute arbitrary code, so PATH_TRIPS must point to a trusted file.
trips = pd.read_pickle(PATH_TRIPS)

Assign each trip a certain period $t$.

In [4]:
# Floor each trip's start time to the beginning of its PERIOD_DURATION-hour period.
# The lowercase 'h' alias is used because pandas 2.2 deprecated the uppercase 'H' alias.
trips['datetime_start_floored'] = trips['datetime_start'].dt.floor('%dh' % PERIOD_DURATION)

Group trips by regions $i$ $j$, period $t$ and vehicle type $m$.

In [5]:
# Grouping keys: vehicle type m, start region i, end region j and period t.
group_keys = ["vehicleType", "start_hex_id", "end_hex_id", "datetime_start_floored"]

# Count the trips in each group; the count is the demand for that combination.
trip_counts = trips.groupby(group_keys).size()
trips_grouped = trip_counts.to_frame("demand")


Currently we only have entries where the number of trips is larger than 0. However, we want to have an entry for every combination of $t$, $i$, $j$ and $m$. Therefore we reindex the dataframe.  
We also check that we do not change any existing entries, by comparing the values before and after the reindex for a sample entry.

In [6]:
# Remember one existing entry so we can verify that the reindex leaves it untouched.
sanity_index = trips_grouped.index[0]
# Select the value by column label; integer-position [] on a label-indexed Series is deprecated.
sanity_check_entry_1 = trips_grouped.loc[sanity_index, "demand"]

# Every combination of vehicle type, start region, end region and period seen in the data.
full_index = pd.MultiIndex.from_product([
    trips['vehicleType'].unique(),
    trips['start_hex_id'].unique(),
    trips['end_hex_id'].unique(),
    trips['datetime_start_floored'].unique(),
])

# Combinations without any trip appear as NaN after the reindex and get a demand of 0.
trips_grouped = trips_grouped.reindex(full_index)
trips_grouped = trips_grouped.fillna(0)

sanity_check_entry_2 = trips_grouped.loc[sanity_index, "demand"]
sanity_check = sanity_check_entry_1 == sanity_check_entry_2

# Parenthesise the conditional so the "sanity check: " label is printed on failure as well.
print("sanity check: " + ("️️✔️" if sanity_check else "❌"))

sanity check: ️️✔️


In [7]:
# Pull the first index level (the vehicle type) out into a regular column.
# The level has no name, so pandas calls the new column "level_0"; give it a readable name.
vehicle_level_as_column = trips_grouped.reset_index(level=0)
trips_grouped = vehicle_level_as_column.rename(columns={"level_0": "vehicle_type"})

We now move the vehicle type out of the index and into the columns, so that the index represents $i$, $j$ and $t$, and there is one demand column per vehicle type $m$.

In [8]:
# Map each vehicle type to its demand Series; the remaining index levels stay on each Series.
demand_dict = {
    vehicle_type: trips_grouped.loc[trips_grouped['vehicle_type'] == vehicle_type, 'demand']
    for vehicle_type in trips_grouped['vehicle_type'].unique()
}

In [9]:
# Build one demand column per vehicle type.
trips_seperated = pd.DataFrame(demand_dict)

# Split the sample key into its vehicle type (now a column label) and the remaining index levels.
sanity_vehicle_type = list(sanity_index)[0]
sanity_index_short = list(sanity_index)[1:]

# The reshaped frame must still hold the same value for the sample entry.
sanity_check_entry_3 = trips_seperated.loc[tuple(sanity_index_short), sanity_vehicle_type]
sanity_check = sanity_check_entry_2 == sanity_check_entry_3

# Parenthesise the conditional so the "sanity check: " label is printed on failure as well.
print("sanity check: " + ("️️✔️" if sanity_check else "❌"))

sanity check: ️️✔️


In [10]:
# Preview the first three rows of the per-vehicle-type demand table.
trips_seperated.head(3)

Unnamed: 0,Unnamed: 1,Unnamed: 2,kick_scooter,bicycle,car
871fa199affffff,871fa199affffff,2019-12-29 00:00:00,20.0,0.0,7.0
871fa199affffff,871fa199affffff,2019-12-29 12:00:00,37.0,1.0,19.0
871fa199affffff,871fa199affffff,2019-12-12 00:00:00,82.0,7.0,6.0


We save the resulting data as a pickle file. The data is now in the correct format for our model (except for the scenario tree structure). We will now use this data to generate an arbitrary number of scenarios and simultaneously ensure that the generated scenarios form a scenario tree. This basically means that for the first period all demand values are the same (the root of the scenario tree), and for subsequent periods more and more demand values will differ. 

In [11]:
start = timer()

# Make sure the target directory exists before writing.
# A path without a directory component has an empty dirname, and os.makedirs('')
# raises FileNotFoundError, so only create a directory when there is one.
output_dir = os.path.dirname(PATH_TRIPS_GROUPED)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
trips_seperated.to_pickle(PATH_TRIPS_GROUPED)

end = timer()
print(f"Succesfully saved dataframe to pickle in {(end - start):.2f} seconds")

Succesfully saved dataframe to pickle in 0.01 seconds


In [12]:
# Count the distinct values in each index level and in each demand column.
trips_seperated.reset_index().nunique()

level_0          27
level_1          29
level_2         204
kick_scooter    537
bicycle          39
car              81
dtype: int64