In [1]:
import pandas as pd
import h3
import itertools
from tqdm.notebook import tqdm

In [2]:
import sys, os
# Add the parent directory to the import path; presumably that is where the
# `modules` package imported below lives (the notebook sits one level deeper).
sys.path.append(os.path.abspath('..'))
# IPython autoreload in mode 2: modules are re-imported before each cell runs,
# so edits to the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): the star import is presumably the source of the *_PATH and
# CALC_* constants used in later cells -- confirm against modules/config.py.
from modules.config import *
from modules.preprocessing import aggregate, reduce_mem_usage

# Trips Aggregation
In this notebook we will aggregate the trips spatially and temporally.  

For spatial aggregation we use the Hexagonal Hierarchical Geospatial Indexing System ([H3](https://h3geo.org/)), which partitions the earth into multiple hexagons of varying sizes depending on the chosen resolutions.
We use the resolutions 7, 8 and 9, where a single hexagon covers an area between $0.1\,\mathrm{km}^2$ and $5\,\mathrm{km}^2$.  
Additionally, we aggregate trips across multiple time intervals of lengths 1, 2, 6 and 24 hours.  

Our aggregation will give information about the number of trips/relocations/movements that start in each hexagon at a given time.

Additionally, we will perform a second aggregation, where we use a two-dimensional spatial aggregation, which results in origin-destination pairs. This aggregation will give information about the number of trips/relocations/movements that start in a specific region and end in another region at a given time.


In [3]:
# Each configured path points at a ".pkl" file; the parquet copy with the same
# stem is read instead. The three frames are loaded in one pass.
trips, relocations, movements = (
    pd.read_parquet(pkl_path.replace(".pkl", ".parquet"))
    for pkl_path in (TRIPS_PATH, RELOCATIONS_PATH, MOVEMENTS_PATH)
)
# Show the first two trips as a quick sanity check of the loaded columns.
trips.head(2)

Unnamed: 0,p_spot_start,b_battery_pack_start,b_pedelec_battery_start,p_address_start,lat_start,b_lock_types,p_name_start,p_number_start,p_bikes_start,p_uid_start,...,p_terminal_type_end,p_booked_bikes_end,lng_end,p_bike_types_end,in_free_flexzone_end,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed
1242,False,,0.0,,51.361585,frame_lock,BIKE 23341,0.0,1,12110922,...,free,0.0,12.367135,"{""71"": 5}",True,True,trip,0.9548,7.0,8.183999
18171,False,,0.0,,51.306756,frame_lock,BIKE 23341,0.0,1,8507302,...,,0.0,12.377438,"{""71"": 1}",False,True,trip,0.279204,295.0,0.056787


In [4]:
# One result list per record type; the concat cell further down consumes them.
trip_dfs, relocation_dfs, movement_dfs = [], [], []

# Lookup tables: which frame each record type is aggregated from, and which
# list collects the aggregated result.
sources = {"trip": trips, "relocation": relocations, "movement": movements}
sinks = {"trip": trip_dfs, "relocation": relocation_dfs, "movement": movement_dfs}

# Materialise the full grid (resolution x interval length x record type) up
# front so the progress bar knows the total number of iterations.
combinations = list(
    itertools.product(
        CALC_H3_RESOLUTIONS,
        CALC_TIME_INTERVAL_LENGTHS,
        list(sources),
    )
)

for h3_res, time_interval_length, df_type in tqdm(combinations):
    # `start_and_end` is switched on only for movements.
    df = aggregate(
        sources[df_type],
        h3_res,
        time_interval_length,
        start_and_end=(df_type == "movement"),
    ).reset_index()

    # Tag every row with the parameters it was aggregated under, so the frames
    # of different combinations can be stacked into a single table later.
    df["h3_res"] = h3_res
    df["time_interval_length"] = time_interval_length

    sinks[df_type].append(df)


  0%|          | 0/36 [00:00<?, ?it/s]

In [5]:
# Stack the per-combination frames into one table per record type.
# The original index labels are kept, so they repeat across the stacked parts.
trip_df, relocation_df, movement_df = map(
    pd.concat, (trip_dfs, relocation_dfs, movement_dfs)
)

trip_df.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1


In [6]:
# Summed per-column memory usage of the trip frame, converted from bytes to MiB.
trip_df.memory_usage().sum() / (1024 * 1024)

227.1425552368164

Because the data is quite large, we now try to reduce the memory consumption by changing the data type of each column to the smallest possible data type that preserves all information.

In [7]:
# Run the memory reduction on each aggregated frame in the same order as before
# (trips, relocations, movements) and rebind the names to the returned frames.
trip_df, relocation_df, movement_df = (
    reduce_mem_usage(frame) for frame in (trip_df, relocation_df, movement_df)
)

Memory usage of dataframe is 227.14 MB
Memory usage after optimization is: 198.75 MB
Decreased by 12.5%
Memory usage of dataframe is 33.54 MB
Memory usage after optimization is: 29.35 MB
Decreased by 12.5%
Memory usage of dataframe is 428.28 MB
Memory usage after optimization is: 381.44 MB
Decreased by 10.9%


In [8]:
# Write each aggregated frame to its configured parquet destination.
for frame, target_path in (
    (trip_df, TRIPS_GROUPED_SPATIO_TEMPORAL_PATH),
    (relocation_df, RELOCATIONS_GROUPED_SPATIO_TEMPORAL_PATH),
    (movement_df, MOVEMENTS_GROUPED_SPATIO_TEMPORAL_PATH),
):
    frame.to_parquet(target_path)

In [9]:
# Final look at the aggregated trip frame, rendered as this cell's output.
trip_df

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1
2,2019-01-20,871f1a164ffffff,871f1a8cbffffff,1,7,1
3,2019-01-20,871f1a164ffffff,871f1a8d9ffffff,3,7,1
4,2019-01-20,871f1a165ffffff,871f1ab96ffffff,1,7,1
...,...,...,...,...,...,...
494293,2019-12-30,891f1abb267ffff,891f1abb27bffff,2,9,24
494294,2019-12-30,891f1abb27bffff,891f1abb243ffff,1,9,24
494295,2019-12-30,891f1abb63bffff,891f1a8cda7ffff,1,9,24
494296,2019-12-30,891f1abb63bffff,891f1ab968bffff,1,9,24
