In [1]:
import pandas as pd
import h3
import itertools
from tqdm.notebook import tqdm

In [2]:
import sys, os
sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.preprocessing import aggregate, reduce_mem_usage

In [3]:
trips = pd.read_parquet(TRIPS_PATH.replace(".pkl", ".parquet"))
relocations = pd.read_parquet(RELOCATIONS_PATH.replace(".pkl", ".parquet"))
movements = pd.read_parquet(MOVEMENTS_PATH.replace(".pkl", ".parquet"))
trips.head(2)

Unnamed: 0,p_spot_start,b_lock_types,p_terminal_type_start,p_place_type_start,p_number_start,p_uid_start,b_number,b_boardcomputer,datetime_start,lng_start,...,lat_end,b_pedelec_battery_end,p_bike_types_end,b_battery_pack_end,in_free_flexzone_end,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed
500,False,frame_lock,,12,0.0,12086406,23316,7551004484,2019-01-20 01:54:00,12.319806,...,51.33856,0.0,"{""71"": 1}",,True,True,trip,3.126285,14.0,13.398364
747,False,frame_lock,,12,0.0,12100930,23316,7551004484,2019-01-20 11:45:00,12.361363,...,51.332102,0.0,"{""71"": 1}",,False,True,trip,0.920725,6.0,9.207247


In [4]:
trip_dfs = []
relocation_dfs = []
movement_dfs = []

for h3_res, time_interval_length, df_type in tqdm(
    list(
        itertools.product(
            CALC_H3_RESOLUTIONS,
            CALC_TIME_INTERVAL_LENGTHS,
            ["trip", "relocation", "movement"],
        )
    )
):
    df = aggregate(
        trips
        if df_type == "trip"
        else relocations
        if df_type == "relocation"
        else movements,
        h3_res,
        time_interval_length,
        start_and_end=df_type == "movement",
    )
    df = df.reset_index()
    df["h3_res"] = h3_res
    df["time_interval_length"] = time_interval_length

    if df_type == "trip":
        trip_dfs.append(df)
    elif df_type == "relocation":
        relocation_dfs.append(df)
    elif df_type == "movement":
        movement_dfs.append(df)


  0%|          | 0/36 [00:00<?, ?it/s]

In [5]:
trip_df = pd.concat(trip_dfs)
relocation_df = pd.concat(relocation_dfs)
movement_df = pd.concat(movement_dfs)

trip_df.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1


In [6]:
trip_df.memory_usage().sum() / 1024**2

227.1425552368164

In [7]:
trip_df = reduce_mem_usage(trip_df)
relocation_df = reduce_mem_usage(relocation_df)
movement_df = reduce_mem_usage(movement_df)

Memory usage of dataframe is 227.14 MB
Memory usage after optimization is: 198.75 MB
Decreased by 12.5%
Memory usage of dataframe is 33.54 MB
Memory usage after optimization is: 29.35 MB
Decreased by 12.5%
Memory usage of dataframe is 428.28 MB
Memory usage after optimization is: 381.44 MB
Decreased by 10.9%


In [8]:
trip_df.to_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
relocation_df.to_parquet(RELOCATIONS_GROUPED_SPATIO_TEMPORAL_PATH)
movement_df.to_parquet(MOVEMENTS_GROUPED_SPATIO_TEMPORAL_PATH)

In [10]:
trip_df

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1
2,2019-01-20,871f1a164ffffff,871f1a8cbffffff,1,7,1
3,2019-01-20,871f1a164ffffff,871f1a8d9ffffff,3,7,1
4,2019-01-20,871f1a165ffffff,871f1ab96ffffff,1,7,1
...,...,...,...,...,...,...
494293,2019-12-30,891f1abb267ffff,891f1abb27bffff,2,9,24
494294,2019-12-30,891f1abb27bffff,891f1abb243ffff,1,9,24
494295,2019-12-30,891f1abb63bffff,891f1a8cda7ffff,1,9,24
494296,2019-12-30,891f1abb63bffff,891f1ab968bffff,1,9,24
