In [32]:
import pandas as pd
import h3
import itertools
from tqdm.notebook import tqdm

In [33]:
import sys, os
sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.preprocessing import aggregate, reduce_mem_usage

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
trips = pd.read_parquet(TRIPS_PATH.replace(".pkl", ".parquet"))
relocations = pd.read_parquet(RELOCATIONS_PATH.replace(".pkl", ".parquet"))
movements = pd.read_parquet(MOVEMENTS_PATH.replace(".pkl", ".parquet"))
trips.head(2)

Unnamed: 0_level_0,p_spot_start,b_battery_pack_start,b_pedelec_battery_start,p_address_start,lat_start,b_lock_types,p_name_start,p_number_start,p_bikes_start,p_uid_start,...,p_terminal_type_end,p_booked_bikes_end,lng_end,p_bike_types_end,in_free_flexzone_end,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1242,False,,0.0,,51.361585,frame_lock,BIKE 23341,0.0,1,12110922,...,free,0.0,12.367135,"{""71"": 5}",True,True,trip,0.9548,7.0,8.183999
18171,False,,0.0,,51.306756,frame_lock,BIKE 23341,0.0,1,8507302,...,,0.0,12.377438,"{""71"": 1}",False,True,trip,0.279204,295.0,0.056787


In [4]:
trip_dfs = []
relocation_dfs = []
movement_dfs = []

for h3_res, time_interval_length, df_type in tqdm(
    list(
        itertools.product(
            CALC_H3_RESOLUTIONS,
            CALC_TIME_INTERVAL_LENGTHS,
            ["trip", "relocation", "movement"],
        )
    )
):
    df = aggregate(
        trips
        if df_type == "trip"
        else relocations
        if df_type == "relocation"
        else movements,
        h3_res,
        time_interval_length,
        start_and_end=df_type == "movement",
    )
    df = df.reset_index()
    df["h3_res"] = h3_res
    df["time_interval_length"] = time_interval_length

    if df_type == "trip":
        trip_dfs.append(df)
    elif df_type == "relocation":
        relocation_dfs.append(df)
    elif df_type == "movement":
        movement_dfs.append(df)


  0%|          | 0/36 [00:00<?, ?it/s]

In [5]:
trip_df = pd.concat(trip_dfs)
relocation_df = pd.concat(relocation_dfs)
movement_df = pd.concat(movement_dfs)

trip_df.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1


In [34]:
trip_df.memory_usage().sum() / 1024**2

227.1425552368164

In [36]:
trip_df = reduce_mem_usage(trip_df)
relocation_df = reduce_mem_usage(relocation_df)
movement_df = reduce_mem_usage(movement_df)

Memory usage of dataframe is 198.75 MB
Memory usage after optimization is: 198.75 MB
Decreased by 0.0%
Memory usage of dataframe is 33.54 MB
Memory usage after optimization is: 29.35 MB
Decreased by 12.5%
Memory usage of dataframe is 428.28 MB
Memory usage after optimization is: 381.44 MB
Decreased by 10.9%


In [37]:
trip_df.to_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
relocation_df.to_parquet(RELOCATIONS_GROUPED_SPATIO_TEMPORAL_PATH)
movement_df.to_parquet(MOVEMENTS_GROUPED_SPATIO_TEMPORAL_PATH)