In [36]:
import pandas as pd
import h3
import itertools
from tqdm.notebook import tqdm

In [37]:
import sys, os
sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.preprocessing import aggregate

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
trips = pd.read_parquet(TRIPS_PATH.replace(".pkl", ".parquet"))
relocations = pd.read_parquet(RELOCATIONS_PATH.replace(".pkl", ".parquet"))
movements = pd.read_parquet(MOVEMENTS_PATH.replace(".pkl", ".parquet"))
trips.head(2)

Unnamed: 0_level_0,p_spot_start,b_battery_pack_start,b_pedelec_battery_start,p_address_start,lat_start,b_lock_types,p_name_start,p_number_start,p_bikes_start,p_uid_start,...,p_terminal_type_end,p_booked_bikes_end,lng_end,p_bike_types_end,in_free_flexzone_end,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1242,False,,0.0,,51.361585,frame_lock,BIKE 23341,0.0,1,12110922,...,free,0.0,12.367135,"{""71"": 5}",True,True,trip,0.9548,7.0,8.183999
18171,False,,0.0,,51.306756,frame_lock,BIKE 23341,0.0,1,8507302,...,,0.0,12.377438,"{""71"": 1}",False,True,trip,0.279204,295.0,0.056787


In [54]:
trip_dfs = []
relocation_dfs = []
movement_dfs = []

for h3_res, time_interval_length, df_type in tqdm(
    list(
        itertools.product(
            CALC_H3_RESOLUTIONS,
            CALC_TIME_INTERVAL_LENGTHS,
            ["trip", "relocation", "movement"],
        )
    )
):
    df = aggregate(
        trips
        if df_type == "trip"
        else relocations
        if df_type == "relocation"
        else movements,
        h3_res,
        time_interval_length,
        start_and_end=df_type == "movement",
    )
    df = df.reset_index()
    df["h3_res"] = h3_res
    df["time_interval_length"] = time_interval_length

    if df_type == "trip":
        trip_dfs.append(df)
    elif df_type == "relocation":
        relocation_dfs.append(df)
    elif df_type == "movement":
        movement_dfs.append(df)


  0%|          | 0/24 [00:00<?, ?it/s]

In [56]:
trip_df = pd.concat(trip_dfs)
relocation_df = pd.concat(relocation_dfs)
movement_df = pd.concat(movement_dfs)

trip_df.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,881f1a1643fffff,881f1a8c83fffff,2,8,1
1,2019-01-20,881f1a1645fffff,881f1a8d9bfffff,1,8,1


In [57]:
trip_df.to_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
relocation_df.to_parquet(RELOCATIONS_GROUPED_SPATIO_TEMPORAL_PATH)
movement_df.to_parquet(MOVEMENTS_GROUPED_SPATIO_TEMPORAL_PATH)

In [16]:
if not 'start_hex_id' in trips.columns:
	trips["start_hex_id"] = trips.apply(
		lambda row: h3.geo_to_h3(row["lat_start"], row["lng_start"], H3_RESOLUTION), axis=1
	)
	trips["end_hex_id"] = trips.apply(
		lambda row: h3.geo_to_h3(row["lat_end"], row["lng_end"], H3_RESOLUTION), axis=1
	)
	relocations["start_hex_id"] = relocations.apply(
		lambda row: h3.geo_to_h3(row["lat_start"], row["lng_start"], H3_RESOLUTION), axis=1
	)
	relocations["end_hex_id"] = relocations.apply(
		lambda row: h3.geo_to_h3(row["lat_end"], row["lng_end"], H3_RESOLUTION), axis=1
	)
	movements["start_hex_id"] = movements.apply(
		lambda row: h3.geo_to_h3(row["lat_start"], row["lng_start"], H3_RESOLUTION), axis=1
	)
	movements["end_hex_id"] = movements.apply(
		lambda row: h3.geo_to_h3(row["lat_end"], row["lng_end"], H3_RESOLUTION), axis=1
	)

	trips.to_parquet(TRIPS_PATH)
	relocations.to_parquet(RELOCATIONS_PATH)
	movements.to_parquet(MOVEMENTS_PATH)

In [17]:

trips.to_parquet(TRIPS_PATH)
relocations.to_parquet(RELOCATIONS_PATH)
movements.to_parquet(MOVEMENTS_PATH)

In [18]:
trips[
    ["start_hex_id", "lat_start", "lng_start", "end_hex_id", "lat_end", "lng_end"]
].head(2)


Unnamed: 0_level_0,start_hex_id,lat_start,lng_start,end_hex_id,lat_end,lng_end
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1242,881f1a8c81fffff,51.361585,12.357547,881f1a8c8bfffff,51.367748,12.367135
18171,881f1a1601fffff,51.306756,12.380606,881f1a1601fffff,51.308302,12.377438


### Aggregate Trips Spatio-Temporally

In [19]:

trips['datetime_start_floored'] = trips['datetime_start'].dt.floor(f'{TIME_INTERVAL_LENGTH}H')
relocations['datetime_start_floored'] = relocations['datetime_start'].dt.floor(f'{TIME_INTERVAL_LENGTH}H')
movements['datetime_start_floored'] = movements['datetime_start'].dt.floor(f'{TIME_INTERVAL_LENGTH}H')
movements['datetime_end_floored'] = movements['datetime_end'].dt.floor(f'{TIME_INTERVAL_LENGTH}H')

In [20]:
trips.datetime_start_floored.head(2)

index
1242    2019-01-20 18:00:00
18171   2019-01-28 12:00:00
Name: datetime_start_floored, dtype: datetime64[ns]

In [21]:
trips_grouped = (
    trips.groupby(["datetime_start_floored", "start_hex_id", "end_hex_id"])
    .size()
    .to_frame("demand")
)
relocations_grouped = (
    relocations.groupby(["datetime_start_floored", "start_hex_id", "end_hex_id"])
    .size()
    .to_frame("relocations")
)

# we group movements by both start and end location and time, as we will use this
# to accurately calculate the availability of bicycles at each location at each time
movements_grouped = (
    movements.groupby(
        ["datetime_start_floored", "datetime_end_floored", "start_hex_id", "end_hex_id"]
    )
    .size()
    .to_frame("movements")
)
trips_grouped.head(2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,demand
datetime_start_floored,start_hex_id,end_hex_id,Unnamed: 3_level_1
2019-01-20,881f1a1601fffff,881f1a1659fffff,3
2019-01-20,881f1a1609fffff,881f1a1643fffff,2


In [22]:
from modules.aggregate import aggregate
mov = aggregate(trips, H3_RESOLUTION, TIME_INTERVAL_LENGTH, start_and_end=True)

In [23]:
from pandas.util import hash_pandas_object
h_movements = hash_pandas_object(movements_grouped).sum()
h_trips = hash_pandas_object(trips_grouped).sum()

In [24]:
hash_pandas_object(mov).sum() == h_movements

False

In [25]:
mov.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,demand
datetime_start_floored,datetime_end_floored,start_hex_id,end_hex_id,Unnamed: 4_level_1
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1601fffff,881f1a1659fffff,3
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1609fffff,881f1a1643fffff,2
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1609fffff,881f1a1659fffff,1
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a160dfffff,881f1a1641fffff,1
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a160dfffff,881f1a1659fffff,1
...,...,...,...,...
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1ab96dfffff,881f1a165bfffff,1
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1ab96dfffff,881f1a8d99fffff,1
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1ab96dfffff,881f1ab965fffff,1
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1ab96dfffff,881f1ab96dfffff,1


In [26]:
movements_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,movements
datetime_start_floored,datetime_end_floored,start_hex_id,end_hex_id,Unnamed: 4_level_1
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1601fffff,881f1a1601fffff,4
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1601fffff,881f1a1659fffff,3
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1609fffff,881f1a1609fffff,2
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1609fffff,881f1a1643fffff,2
2019-01-20 00:00:00,2019-01-20 00:00:00,881f1a1609fffff,881f1a1659fffff,1
...,...,...,...,...
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1ab96dfffff,881f1ab96dfffff,8
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1abb25fffff,881f1abb25fffff,2
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1abb27fffff,881f1abb25fffff,1
2019-12-30 18:00:00,2019-12-30 18:00:00,881f1abb31fffff,881f1abb31fffff,1


In [27]:
trips_grouped.index.names = ["time_interval_start", "start_hex_id", "end_hex_id"]
relocations_grouped.index.names = ["time_interval_start", "start_hex_id", "end_hex_id"]
movements_grouped.index.names = ["time_interval_start", "time_interval_end", "start_hex_id", "end_hex_id"]

In [28]:
trips_grouped.to_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
relocations_grouped.to_parquet(RELOCATIONS_GROUPED_SPATIO_TEMPORAL_PATH)
movements_grouped.to_parquet(MOVEMENTS_GROUPED_SPATIO_TEMPORAL_PATH)