In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
# import aggregated trips and reset index
trips_aggregated_df = pd.read_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
trips_aggregated_df.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length
0,2019-01-20,871f1a164ffffff,871f1a164ffffff,1,7,1
1,2019-01-20,871f1a164ffffff,871f1a8c8ffffff,2,7,1


In [4]:
trips_aggregated_df = trips_aggregated_df[trips_aggregated_df['h3_res'].apply(lambda x: x in PREDICTIVE_H3_RESOLUTIONS)]
print(trips_aggregated_df.h3_res.unique())
print(trips_aggregated_df.index.size)

[7 8]
2161033


In [5]:
# add time features
trips_aggregated_df['hour'] = trips_aggregated_df.datetime_start_floored.dt.hour
trips_aggregated_df['day'] = trips_aggregated_df.datetime_start_floored.dt.day
trips_aggregated_df['weekday'] = trips_aggregated_df.datetime_start_floored.dt.weekday
trips_aggregated_df['month'] = trips_aggregated_df.datetime_start_floored.dt.month
trips_aggregated_df = trips_aggregated_df.drop(columns=['datetime_start_floored'])
trips_aggregated_df.head(2)

Unnamed: 0,start_hex_id,end_hex_id,demand,h3_res,time_interval_length,hour,day,weekday,month
0,871f1a164ffffff,871f1a164ffffff,1,7,1,0,20,6,1
1,871f1a164ffffff,871f1a8c8ffffff,2,7,1,0,20,6,1


In [6]:
# add poi data on start and end hexagon of the trip
hexagons_with_pois_df = pd.read_parquet(HEXAGON_WITH_POIS_PATH)

trips_aggregated_df = pd.merge(trips_aggregated_df, hexagons_with_pois_df, left_on="start_hex_id", right_on="hex")
trips_aggregated_df = trips_aggregated_df.drop(columns={"hex", "hex_and_neighbors", "h3_res_x"})

# add '_start' suffix to poi columns
trips_aggregated_df = trips_aggregated_df.rename(
    columns={
        "sustenance_poi": "sustenance_poi_start",
        "public_transport_poi": "public_transport_poi_start",
        "education_poi": "education_poi_start",
        "arts_and_culture_poi": "arts_and_culture_poi_start",
        "sports_poi": "sports_poi_start",
    }
)
trips_aggregated_df.head(2)

Unnamed: 0,start_hex_id,end_hex_id,demand,time_interval_length,hour,day,weekday,month,h3_res_y,sustenance_poi_start,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start
0,871f1a164ffffff,871f1a164ffffff,1,1,0,20,6,1,7,862,503,101,40,45
1,871f1a164ffffff,871f1a164ffffff,1,1,0,20,6,1,7,862,503,101,40,45


In [7]:
trips_aggregated_df = pd.merge(trips_aggregated_df, hexagons_with_pois_df, left_on="end_hex_id", right_on="hex")
trips_aggregated_df = trips_aggregated_df.drop(columns={"hex", "hex_and_neighbors", "h3_res_y"})

# add '_end' suffix to poi columns
trips_aggregated_df = trips_aggregated_df.rename(
    columns={
        "sustenance_poi": "sustenance_poi_end",
        "public_transport_poi": "public_transport_poi_end",
        "education_poi": "education_poi_end",
        "arts_and_culture_poi": "arts_and_culture_poi_end",
        "sports_poi": "sports_poi_end",
    }
)
trips_aggregated_df.head(2)

Unnamed: 0,start_hex_id,end_hex_id,demand,time_interval_length,hour,day,weekday,month,sustenance_poi_start,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start,h3_res,sustenance_poi_end,public_transport_poi_end,education_poi_end,arts_and_culture_poi_end,sports_poi_end
0,871f1a164ffffff,871f1a164ffffff,1,1,0,20,6,1,862,503,101,40,45,7,862,503,101,40,45
1,871f1a164ffffff,871f1a164ffffff,1,1,0,20,6,1,862,503,101,40,45,7,862,503,101,40,45


In [9]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [10]:
trips_aggregated_df = reduce_mem_usage(trips_aggregated_df)

Memory usage of dataframe is 5687.60 MB
Memory usage after optimization is: 1263.93 MB
Decreased by 77.8%


In [None]:
# encode start and end hexagons as one-hot vectors
start_hex_dummies = pd.get_dummies(trips_aggregated_df.start_hex_id, prefix="start_")
end_hex_dummies = pd.get_dummies(trips_aggregated_df.end_hex_id, prefix="end_")
trips_aggregated_df = pd.concat([trips_aggregated_df, start_hex_dummies, end_hex_dummies], axis=1)
trips_aggregated_df = trips_aggregated_df.drop(columns=['start_hex_id', 'end_hex_id'])
trips_aggregated_df.head(2)

MemoryError: Unable to allocate 9.98 GiB for an array with shape (275, 38979597) and data type uint8

In [None]:
trips_aggregated_df.to_parquet(MODEL_DATA_PATH)