In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys, os, gc

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
# Load the aggregated trips and split them into two disjoint frames:
#   * trips_aggregated_df_reduced - rows at the origin-destination H3 resolution
#     AND the origin-destination time interval length (index reset)
#   * trips_aggregated_df         - every other resolution / interval combination
trips_aggregated_df_init = pd.read_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)

od_mask = (
    trips_aggregated_df_init['h3_res'].eq(ORIGIN_DESTINATION_H3_RESOLUTION)
    & trips_aggregated_df_init['time_interval_length'].eq(ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH)
)

trips_aggregated_df = trips_aggregated_df_init[~od_mask]
trips_aggregated_df_reduced = trips_aggregated_df_init[od_mask].reset_index(drop=True)

In [4]:
# report how many rows the unfiltered demand dataset contains
total_entries = len(trips_aggregated_df_init.index)
print(f"Total number of entries in the demand dataset for all h3 resolutions and all time intervals: {total_entries}")

Total number of entries in the demand dataset for all h3 resolutions and all time intervals: 4253147


In [5]:
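# NOTE: disabled experiment - cyclical (sin/cos) encoding of hour and month.
# hour: mapped onto a 24-step cycle
# trips_aggregated_df['hour_sin'] = np.sin(trips_aggregated_df.hour*(2.*np.pi/24))
# trips_aggregated_df['hour_cos'] = np.cos(trips_aggregated_df.hour*(2.*np.pi/24))
# month: mapped onto a 12-step cycle, shifted so that month 1 has angle 0
# trips_aggregated_df['month_sin'] = np.sin((trips_aggregated_df.month-1)*(2.*np.pi/12))
# trips_aggregated_df['month_cos'] = np.cos((trips_aggregated_df.month-1)*(2.*np.pi/12))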

In [6]:
# Load the temporally aggregated weather data and shrink its memory footprint:
# every column is first cast to float16, then the interval length is stored as uint8.
weather_df = (
    pd.read_parquet(WEATHER_AGGR_TEMPORAL_PATH)
    .astype(np.float16)
    .astype({'time_interval_length': np.uint8})
)
weather_df.head(2)

Unnamed: 0_level_0,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_interval_length
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 00:00:00,7.398438,7.398438,7.398438,2.800781,8.0,0.0,1
2019-01-01 01:00:00,7.699219,7.699219,7.699219,2.900391,8.0,0.0,1


In [7]:
# Attach the weather observations to both trip frames.
# Rows are matched on the floored start time (MESS_DATUM on the weather side) and
# on the interval length; the left join keeps trips without a matching weather row.
weather_join = dict(
    right=weather_df,
    how='left',
    left_on=['datetime_start_floored', 'time_interval_length'],
    right_on=['MESS_DATUM', 'time_interval_length'],
)

trips_aggregated_df = trips_aggregated_df.merge(**weather_join)
trips_aggregated_df_reduced = trips_aggregated_df_reduced.merge(**weather_join)

In [8]:
# the weather frame has been merged into both trip frames; release its memory
del weather_df
gc.collect()

141

In [9]:
# Load the per-hexagon POI counts, discard the neighbour column and store the
# resolution as well as every POI count as uint16 to save memory.
hexagons_with_pois_df = (
    pd.read_parquet(HEXAGON_WITH_POIS_PATH)
    .drop(columns=["hex_and_neighbors"])
    .astype(dict.fromkeys(
        [
            "h3_res",
            "sustenance_poi",
            "public_transport_poi",
            "education_poi",
            "arts_and_culture_poi",
            "sports_poi",
        ],
        np.uint16,
    ))
)

In [10]:
# Add the POI counts of the START hexagon to both trip frames.
# The merge uses pandas' default inner join, so trips whose start hexagon is
# missing from the POI table are dropped. The helper key column "hex" is removed
# afterwards and the POI columns receive a "_start" suffix.
start_poi_names = {
    poi: f"{poi}_start"
    for poi in (
        "sustenance_poi",
        "public_transport_poi",
        "education_poi",
        "arts_and_culture_poi",
        "sports_poi",
    )
}

trips_aggregated_df = (
    trips_aggregated_df
    .merge(hexagons_with_pois_df.drop(columns=["h3_res"]), left_on="start_hex_id", right_on="hex")
    .drop(columns=["hex"])
    .rename(columns=start_poi_names)
)

trips_aggregated_df_reduced = (
    trips_aggregated_df_reduced
    .merge(hexagons_with_pois_df.drop(columns=["h3_res"]), left_on="start_hex_id", right_on="hex")
    .drop(columns=["hex"])
    .rename(columns=start_poi_names)
)
trips_aggregated_df_reduced.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res,time_interval_length,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,sustenance_poi_start,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start
0,2019-01-20,871f1a160ffffff,871f1a160ffffff,4,7,24,-9.101562,-0.199951,-5.402344,0.854004,0.083313,0.0,355,309,54,11,15
1,2019-01-20,871f1a160ffffff,871f1a164ffffff,5,7,24,-9.101562,-0.199951,-5.402344,0.854004,0.083313,0.0,355,309,54,11,15


In [11]:
# sanity check: list the columns of the reduced frame after the start-POI merge
trips_aggregated_df_reduced.columns

Index(['datetime_start_floored', 'start_hex_id', 'end_hex_id', 'demand',
       'h3_res', 'time_interval_length', 'min_temperature', 'max_temperature',
       'mean_temperature', 'mean_mean_wind_speed', 'mean_total_cloud_cover',
       'sum_precipitation', 'sustenance_poi_start',
       'public_transport_poi_start', 'education_poi_start',
       'arts_and_culture_poi_start', 'sports_poi_start'],
      dtype='object')

In [12]:
# Add the POI counts of the END hexagon to the reduced (origin-destination) frame.
# The POI table is merged including its "h3_res" column, so pandas suffixes the
# clashing columns: the POI copy ("h3_res_y") is dropped, while the trips' own
# resolution column stays in the frame under the name "h3_res_x".
end_poi_names = {
    poi: f"{poi}_end"
    for poi in (
        "sustenance_poi",
        "public_transport_poi",
        "education_poi",
        "arts_and_culture_poi",
        "sports_poi",
    )
}

trips_aggregated_df_reduced = (
    trips_aggregated_df_reduced
    .merge(hexagons_with_pois_df, left_on="end_hex_id", right_on="hex")
    .drop(columns=["hex", "h3_res_y"])
    .rename(columns=end_poi_names)
)
trips_aggregated_df_reduced.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res_x,time_interval_length,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,...,sustenance_poi_start,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start,sustenance_poi_end,public_transport_poi_end,education_poi_end,arts_and_culture_poi_end,sports_poi_end
0,2019-01-20,871f1a160ffffff,871f1a160ffffff,4,7,24,-9.101562,-0.199951,-5.402344,0.854004,...,355,309,54,11,15,355,309,54,11,15
1,2019-01-21,871f1a160ffffff,871f1a160ffffff,8,7,24,-9.101562,-3.300781,-6.507812,0.916504,...,355,309,54,11,15,355,309,54,11,15


In [13]:
# the POI table has been merged into the trip frames; release its memory
del hexagons_with_pois_df
gc.collect()

0

In [14]:
# Load the per-hexagon land use table and store every column as float16
hexagons_with_land_use_df = pd.read_parquet(HEXAGONS_WITH_LAND_USE_PATH).astype(np.float16)
hexagons_with_land_use_df.head(2)

land_use,land_use_1,land_use_2,land_use_3,land_use_4,land_use_5,land_use_6,land_use_7,land_use_8,land_use_9,land_use_10,...,land_use_13,land_use_14,land_use_15,land_use_16,land_use_17,land_use_18,land_use_19,land_use_20,land_use_21,land_use_22
hexagon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
871f1a140ffffff,0.0,0.0,0.0,0.0,0.006058,0.0,0.019272,0.01548,0.0,0.319092,...,0.0,0.0,0.0,0.007874,0.0,0.0,0.0,0.001923,0.008865,0.0
871f1a144ffffff,0.148926,0.155518,0.016571,0.084351,0.135986,0.058533,0.071045,0.082703,0.004414,0.031219,...,0.000406,0.0,0.0,0.0,0.0,0.004196,0.0,0.070251,0.0,0.0


In [15]:
# Add the land use columns of the START hexagon (prefixed with "start_") to both
# trip frames; the land use table is matched through its "hexagon_id" key.
trips_aggregated_df = trips_aggregated_df.merge(
    hexagons_with_land_use_df.add_prefix("start_"),
    left_on="start_hex_id",
    right_on="hexagon_id",
)
trips_aggregated_df_reduced = trips_aggregated_df_reduced.merge(
    hexagons_with_land_use_df.add_prefix("start_"),
    left_on="start_hex_id",
    right_on="hexagon_id",
)
trips_aggregated_df_reduced.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res_x,time_interval_length,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,...,start_land_use_13,start_land_use_14,start_land_use_15,start_land_use_16,start_land_use_17,start_land_use_18,start_land_use_19,start_land_use_20,start_land_use_21,start_land_use_22
0,2019-01-20,871f1a160ffffff,871f1a160ffffff,4,7,24,-9.101562,-0.199951,-5.402344,0.854004,...,0.001766,0.0,0.0,0.0,0.0,0.0,0.0,0.100342,0.0,0.0
1,2019-01-21,871f1a160ffffff,871f1a160ffffff,8,7,24,-9.101562,-3.300781,-6.507812,0.916504,...,0.001766,0.0,0.0,0.0,0.0,0.0,0.0,0.100342,0.0,0.0


In [16]:
# Add the land use columns of the END hexagon (prefixed with "end_") to the
# reduced (origin-destination) frame only.
trips_aggregated_df_reduced = trips_aggregated_df_reduced.merge(
    hexagons_with_land_use_df.add_prefix("end_"),
    left_on="end_hex_id",
    right_on="hexagon_id",
)
trips_aggregated_df_reduced.head(2)

Unnamed: 0,datetime_start_floored,start_hex_id,end_hex_id,demand,h3_res_x,time_interval_length,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,...,end_land_use_13,end_land_use_14,end_land_use_15,end_land_use_16,end_land_use_17,end_land_use_18,end_land_use_19,end_land_use_20,end_land_use_21,end_land_use_22
0,2019-01-20,871f1a160ffffff,871f1a160ffffff,4,7,24,-9.101562,-0.199951,-5.402344,0.854004,...,0.001766,0.0,0.0,0.0,0.0,0.0,0.0,0.100342,0.0,0.0
1,2019-01-21,871f1a160ffffff,871f1a160ffffff,8,7,24,-9.101562,-3.300781,-6.507812,0.916504,...,0.001766,0.0,0.0,0.0,0.0,0.0,0.0,0.100342,0.0,0.0


In [17]:
# the land use table has been merged into the trip frames; release its memory
del hexagons_with_land_use_df
gc.collect()

0

In [18]:
# memory footprint of both frames in MiB (bytes / 1024**2);
# deep=True also counts the contents of object columns, index=True the index
print(trips_aggregated_df.memory_usage(index=True, deep=True).sum() / 1024**2)
print(trips_aggregated_df_reduced.memory_usage(index=True, deep=True).sum() / 1024**2)

955.1518306732178
13.412040710449219


In [19]:
# Build lookup tables that assign every hexagon id a consecutive integer code,
# numbered in order of first appearance in the respective column.
start_hex_id_map = {
    hex_id: code for code, hex_id in enumerate(trips_aggregated_df.start_hex_id.unique())
}

reduced_start_hex_id_map = {
    hex_id: code for code, hex_id in enumerate(trips_aggregated_df_reduced.start_hex_id.unique())
}
reduced_end_hex_id_map = {
    hex_id: code for code, hex_id in enumerate(trips_aggregated_df_reduced.end_hex_id.unique())
}

In [20]:
# Replace the hexagon ids by their integer codes from the lookup tables.
# NOTE(review): uint16 holds codes up to 65535 - assumes each frame has fewer
# unique hexagons than that; TODO confirm.
# Only start_hex_id is encoded for trips_aggregated_df; its end_hex_id is left as is.
trips_aggregated_df["start_hex_id"] = trips_aggregated_df["start_hex_id"].map(start_hex_id_map).astype(np.uint16)

# the reduced frame uses separate code tables for start and end hexagons
trips_aggregated_df_reduced["start_hex_id"] = trips_aggregated_df_reduced["start_hex_id"].map(reduced_start_hex_id_map).astype(np.uint16)
trips_aggregated_df_reduced["end_hex_id"] = trips_aggregated_df_reduced["end_hex_id"].map(reduced_end_hex_id_map).astype(np.uint16)

In [21]:
# Persist the two merged frames as feather files (one file each): the frame with
# all non origin-destination resolutions / intervals and the reduced frame.
# Both are written before time features and one-hot columns are added.
trips_aggregated_df.to_feather(MODEL_DATA_PATH)
trips_aggregated_df_reduced.to_feather(REDUCED_MODEL_DATA_PATH)

In [22]:
def add_time_features_to_model_data(model_data):
    """Replace ``datetime_start_floored`` by one-hot encoded hour, weekday and month columns.

    The hour, weekday and month are derived from the ``datetime_start_floored``
    column and one-hot encoded with ``pd.get_dummies``. Only values that actually
    occur in ``model_data`` get a column, named ``hour_<h>``, ``weekday_<d>`` and
    ``month_<m>``. The dummy columns are appended after the existing columns in
    that order.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Frame with a datetime-like ``datetime_start_floored`` column.

    Returns
    -------
    pandas.DataFrame
        ``model_data`` without ``datetime_start_floored`` (and without any raw
        ``hour`` / ``weekday`` / ``month`` columns), plus the dummy columns.
    """
    time_parts = model_data['datetime_start_floored'].dt

    # one dummy frame per time feature, in the order they are appended to the result
    dummy_frames = [
        pd.get_dummies(values).add_prefix(f'{feature}_')
        for feature, values in (
            ('hour', time_parts.hour),
            ('weekday', time_parts.weekday),
            ('month', time_parts.month),
        )
    ]

    # The raw time columns are replaced by their dummies; drop them if the caller
    # passed them in (errors='ignore' because they are usually absent).
    base = model_data.drop(columns=['hour', 'weekday', 'month'], errors='ignore')
    # the timestamp itself is not needed once the time features exist
    base = base.drop(columns=['datetime_start_floored'])

    return pd.concat([base, *dummy_frames], axis=1)

In [23]:
def get_invalid_cols(df):
    """Return the labels of all columns of ``df`` whose column sum is infinite.

    The sum of every column is computed with ``df.sum()`` and each total is
    tested with ``np.isinf``; the labels of the columns with an infinite total
    are returned as an index.
    """
    column_totals = df.sum()
    overflow_mask = column_totals.map(lambda total: np.isinf(total))
    return overflow_mask[overflow_mask].index

In [24]:
# make sure the output directory exists (exist_ok avoids a check-then-create race)
os.makedirs(MODEL_DATA_DIR_PATH, exist_ok=True)

# Store the model data of every (h3 resolution, time interval length) combination
# in its own feather file, with the start hexagon one-hot encoded.
for h3_res in CALC_H3_RESOLUTIONS:
    for t in CALC_TIME_INTERVAL_LENGTHS:
        # the origin-destination combination is not part of trips_aggregated_df
        if h3_res == ORIGIN_DESTINATION_H3_RESOLUTION and t == ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH:
            continue

        model_data = trips_aggregated_df[
            (trips_aggregated_df.h3_res == h3_res) & (trips_aggregated_df.time_interval_length == t)
        ].reset_index(drop=True)

        # interval lengths above 5 keep min/max temperature, shorter ones keep the mean
        if t > 5:
            model_data = model_data.drop(columns=["mean_temperature"])
        else:
            model_data = model_data.drop(columns=["min_temperature", "max_temperature"])

        model_data = add_time_features_to_model_data(model_data)

        start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
        model_data = pd.concat([model_data, start_hex_dummies], axis=1)
        model_data = model_data.drop(columns=['start_hex_id', 'end_hex_id', 'h3_res', 'time_interval_length'])

        # Check that the sum of every column can be represented by the column's dtype.
        # If it cannot (the sum is inf), the data cannot be scaled later because the
        # calculated mean would be erroneous. Such columns are expected to be float16
        # and are widened to float32.
        for col in get_invalid_cols(model_data):
            if model_data[col].dtype != np.float16:
                raise ValueError(f"Unexpected dtype for column {col}. Expected float16, got {model_data[col].dtype}")
            model_data[col] = model_data[col].astype(np.float32)

        # after widening, no column sum may be infinite any more
        if len(get_invalid_cols(model_data)) > 0:
            raise ValueError("Invalid columns remaining")

        model_data.to_feather(os.path.join(MODEL_DATA_DIR_PATH, f"{h3_res}_{t}.feather"))

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [25]:
# Prepare and store the origin-destination model data (reduced frame).
#
# The temperature columns are selected from the interval length of THIS dataset,
# using the same rule as for the other datasets (interval lengths above 5 keep
# min/max temperature, shorter ones keep the mean). The previous condition read
# the loop variable `t` left over from the export loop and compared it with the
# origin-destination interval length, so the result depended on hidden state.
if ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH > 5:
    trips_aggregated_df_reduced = trips_aggregated_df_reduced.drop(columns=["mean_temperature"])
else:
    trips_aggregated_df_reduced = trips_aggregated_df_reduced.drop(columns=["min_temperature", "max_temperature"])

trips_aggregated_df_reduced = add_time_features_to_model_data(trips_aggregated_df_reduced)

# one-hot encode both the start and the end hexagon
start_hex_dummies = pd.get_dummies(trips_aggregated_df_reduced.start_hex_id, prefix="start_")
end_hex_dummies = pd.get_dummies(trips_aggregated_df_reduced.end_hex_id, prefix="end_")
trips_aggregated_df_reduced = pd.concat([trips_aggregated_df_reduced, start_hex_dummies, end_hex_dummies], axis=1)
# the resolution column is called "h3_res_x" here because of the end-POI merge
trips_aggregated_df_reduced = trips_aggregated_df_reduced.drop(columns=['start_hex_id', 'end_hex_id', 'h3_res_x', 'time_interval_length'])

# NOTE(review): unlike the per-resolution export, no float16 overflow check
# (get_invalid_cols) is applied to this frame - confirm whether that is intended.
trips_aggregated_df_reduced.to_feather(os.path.join(MODEL_DATA_DIR_PATH, f"{ORIGIN_DESTINATION_H3_RESOLUTION}_{ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH}.feather"))