In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
import sys, os, gc

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

In [3]:
# aggregated trips for all h3 resolutions and all time interval lengths
trips_aggregated_df_init = pd.read_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)
trips_aggregated_df = trips_aggregated_df_init

# subset restricted to the resolution / interval length configured for the
# origin-destination data, re-indexed from 0
trips_aggregated_df_reduced = trips_aggregated_df_init.loc[
    trips_aggregated_df_init["h3_res"].eq(ORIGIN_DESTINATION_H3_RESOLUTION)
    & trips_aggregated_df_init["time_interval_length"].eq(
        ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH
    )
].reset_index(drop=True)

availability_df = pd.read_parquet(AVAILABILITY_PATH)

In [4]:
# move the index into regular columns, then align the column names with the
# trips frames so the same merge helpers can be used on both
# NOTE: not idempotent -- re-running this cell resets the index a second time
availability_df = availability_df.reset_index()
availability_df = availability_df.rename(
    columns={
        "datetime": "datetime_start_floored",
        "hex_id": "start_hex_id",
    }
)


In [5]:
# row count of the unfiltered demand frame
print(f"Total number of entries in the demand dataset for all h3 resolutions and all time intervals: {len(trips_aggregated_df_init)}")

Total number of entries in the demand dataset for all h3 resolutions and all time intervals: 4253147


In [6]:
# (disabled) cyclical sine/cosine encoding of the hour of day
# trips_aggregated_df['hour_sin'] = np.sin(trips_aggregated_df.hour*(2.*np.pi/24))
# trips_aggregated_df['hour_cos'] = np.cos(trips_aggregated_df.hour*(2.*np.pi/24))
# (disabled) cyclical sine/cosine encoding of the month
# trips_aggregated_df['month_sin'] = np.sin((trips_aggregated_df.month-1)*(2.*np.pi/12))
# trips_aggregated_df['month_cos'] = np.cos((trips_aggregated_df.month-1)*(2.*np.pi/12))

In [7]:
# load the weather dataframe and shrink every column to half precision
weather_df = pd.read_parquet(WEATHER_AGGR_TEMPORAL_PATH).astype(np.float16)
# the interval length is an integer quantity, so keep it as a small unsigned int
weather_df["time_interval_length"] = weather_df["time_interval_length"].astype(np.uint8)
weather_df.head(2)

Unnamed: 0_level_0,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_interval_length
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 00:00:00,7.398438,7.398438,7.398438,2.800781,8.0,0.0,1
2019-01-01 01:00:00,7.699219,7.699219,7.699219,2.900391,8.0,0.0,1


In [8]:
def add_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the module-level ``weather_df`` onto ``df``.

    Rows are matched on the timestamp (``datetime_start_floored`` in ``df``,
    ``MESS_DATUM`` in the weather table) together with
    ``time_interval_length``. Because the join is a left join, every row of
    ``df`` is kept even when no weather record matches.

    Args:
        df: frame containing ``datetime_start_floored`` and
            ``time_interval_length``.

    Returns:
        A new frame with the weather columns appended.
    """
    shared_key = "time_interval_length"
    return df.merge(
        weather_df,
        how="left",
        left_on=["datetime_start_floored", shared_key],
        right_on=["MESS_DATUM", shared_key],
    )


In [9]:
# attach the weather features to the demand frames ...
trips_aggregated_df = add_weather_data(df=trips_aggregated_df)
trips_aggregated_df_reduced = add_weather_data(df=trips_aggregated_df_reduced)
# ... and to the availability frame
availability_df = add_weather_data(df=availability_df)

In [10]:
# the weather table is not referenced again below; drop it and run a
# garbage collection pass (the call's return value is echoed as cell output)
del weather_df
gc.collect()

0

In [11]:
# load poi data, discard the neighbourhood column and reduce datatypes
hexagons_with_pois_df = pd.read_parquet(HEXAGON_WITH_POIS_PATH).drop(
    columns=["hex_and_neighbors"]
)
hexagons_with_pois_df = hexagons_with_pois_df.astype(
    {
        column: np.uint16
        for column in (
            "h3_res",
            "sustenance_poi",
            "public_transport_poi",
            "education_poi",
            "arts_and_culture_poi",
            "sports_poi",
        )
    }
)

In [12]:
def add_poi_data(df: pd.DataFrame, on="start") -> pd.DataFrame:
    """Join the POI columns of ``hexagons_with_pois_df`` onto ``df``.

    Every POI column is suffixed with ``_{on}`` before the join; the join key
    is ``{on}_hex_id`` in ``df`` and the (suffixed) ``hex`` column of the POI
    table. The suffixed key column is removed from the result.

    NOTE(review): no ``how`` is passed, so pandas performs its default inner
    join -- rows whose hexagon has no entry in the POI table are dropped.
    Confirm this is intended (``add_weather_data`` uses a left join).

    Args:
        df: frame containing a ``{on}_hex_id`` column.
        on: which hexagon column to enrich, e.g. ``"start"`` or ``"end"``.

    Returns:
        A new frame with the ``*_{on}`` POI columns appended.
    """
    poi_features = hexagons_with_pois_df.drop(columns=["h3_res"]).add_suffix(f"_{on}")
    poi_key = f"hex_{on}"
    merged = pd.merge(df, poi_features, left_on=f"{on}_hex_id", right_on=poi_key)
    return merged.drop(columns=[poi_key])


In [13]:
# POI features of the start hexagon for the full demand frame
trips_aggregated_df = add_poi_data(trips_aggregated_df, on="start")
# the reduced frame gets POI features for both trip ends
for_start = add_poi_data(trips_aggregated_df_reduced, on="start")
trips_aggregated_df_reduced = add_poi_data(for_start, on="end")
del for_start
# POI features of the (start) hexagon for the availability frame
availability_df = add_poi_data(availability_df, on="start")

In [14]:
# sanity check: list the columns of the reduced frame after the POI merges
trips_aggregated_df_reduced.columns

Index(['datetime_start_floored', 'start_hex_id', 'end_hex_id', 'demand',
       'h3_res', 'time_interval_length', 'min_temperature', 'max_temperature',
       'mean_temperature', 'mean_mean_wind_speed', 'mean_total_cloud_cover',
       'sum_precipitation', 'sustenance_poi_start',
       'public_transport_poi_start', 'education_poi_start',
       'arts_and_culture_poi_start', 'sports_poi_start', 'sustenance_poi_end',
       'public_transport_poi_end', 'education_poi_end',
       'arts_and_culture_poi_end', 'sports_poi_end'],
      dtype='object')

In [15]:
# the POI table is not referenced again below; drop it and run a
# garbage collection pass (the call's return value is echoed as cell output)
del hexagons_with_pois_df
gc.collect()

0

In [16]:
# load land use data and store every column in half precision
hexagons_with_land_use_df = pd.read_parquet(HEXAGONS_WITH_LAND_USE_PATH).astype(
    np.float16
)
hexagons_with_land_use_df.head(2)

land_use,land_use_1,land_use_2,land_use_3,land_use_4,land_use_5,land_use_6,land_use_7,land_use_8,land_use_9,land_use_10,...,land_use_13,land_use_14,land_use_15,land_use_16,land_use_17,land_use_18,land_use_19,land_use_20,land_use_21,land_use_22
hexagon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
871f1a140ffffff,0.0,0.0,0.0,0.0,0.006058,0.0,0.019272,0.01548,0.0,0.319092,...,0.0,0.0,0.0,0.007874,0.0,0.0,0.0,0.001923,0.008865,0.0
871f1a144ffffff,0.148926,0.155518,0.016571,0.084351,0.135986,0.058533,0.071045,0.082703,0.004414,0.031219,...,0.000406,0.0,0.0,0.0,0.0,0.004196,0.0,0.070251,0.0,0.0


In [17]:
def add_land_use_data(df: pd.DataFrame, on="start") -> pd.DataFrame:
    """Join the columns of ``hexagons_with_land_use_df`` onto ``df``.

    Every land-use column is suffixed with ``_{on}``; rows are matched between
    ``{on}_hex_id`` in ``df`` and ``hexagon_id`` of the land-use table
    (presumably its index name -- verify against the parquet file).

    NOTE(review): no ``how`` is passed, so pandas performs its default inner
    join and drops rows without a matching hexagon -- confirm this is intended.

    Args:
        df: frame containing a ``{on}_hex_id`` column.
        on: which hexagon column to enrich, e.g. ``"start"`` or ``"end"``.

    Returns:
        A new frame with the ``*_{on}`` land-use columns appended.
    """
    land_use_features = hexagons_with_land_use_df.add_suffix(f"_{on}")
    return pd.merge(
        df,
        land_use_features,
        left_on=f"{on}_hex_id",
        right_on="hexagon_id",
    )

In [18]:
# land use of the start hexagon for the full demand frame
trips_aggregated_df = add_land_use_data(trips_aggregated_df, on="start")
# the reduced frame gets land use for both trip ends
for_start = add_land_use_data(trips_aggregated_df_reduced, on="start")
trips_aggregated_df_reduced = add_land_use_data(for_start, on="end")
del for_start


In [19]:
# sanity check: list the columns of the reduced frame after the land-use merges
trips_aggregated_df_reduced.columns

Index(['datetime_start_floored', 'start_hex_id', 'end_hex_id', 'demand',
       'h3_res', 'time_interval_length', 'min_temperature', 'max_temperature',
       'mean_temperature', 'mean_mean_wind_speed', 'mean_total_cloud_cover',
       'sum_precipitation', 'sustenance_poi_start',
       'public_transport_poi_start', 'education_poi_start',
       'arts_and_culture_poi_start', 'sports_poi_start', 'sustenance_poi_end',
       'public_transport_poi_end', 'education_poi_end',
       'arts_and_culture_poi_end', 'sports_poi_end', 'land_use_1_start',
       'land_use_2_start', 'land_use_3_start', 'land_use_4_start',
       'land_use_5_start', 'land_use_6_start', 'land_use_7_start',
       'land_use_8_start', 'land_use_9_start', 'land_use_10_start',
       'land_use_11_start', 'land_use_12_start', 'land_use_13_start',
       'land_use_14_start', 'land_use_15_start', 'land_use_16_start',
       'land_use_17_start', 'land_use_18_start', 'land_use_19_start',
       'land_use_20_start', 'land_use_2

In [20]:
# the land-use table is not referenced again below; drop it and run a
# garbage collection pass (the call's return value is echoed as cell output)
del hexagons_with_land_use_df
gc.collect()

0

In [21]:
# deep memory footprint (index included) of the three enriched frames, in MiB,
# printed in the order: full demand, reduced demand, availability
print(
    "\n".join(
        f"memory usage: {frame.memory_usage(index=True, deep=True).sum() / 1024**2} MB"
        for frame in (trips_aggregated_df, trips_aggregated_df_reduced, availability_df)
    )
)

memory usage: 966.1413087844849 MB
memory usage: 13.412040710449219 MB
memory usage: 1926.7611694335938 MB


In [22]:
# map every hexagon id that occurs in any frame to a compact integer code;
# np.unique returns the distinct ids sorted, so the code is the sort position
hex_id_map = {
    hex_id: position
    for position, hex_id in enumerate(
        np.unique(
            np.concatenate(
                [
                    trips_aggregated_df["start_hex_id"].unique(),
                    trips_aggregated_df["end_hex_id"].unique(),
                    trips_aggregated_df_reduced["start_hex_id"].unique(),
                    trips_aggregated_df_reduced["end_hex_id"].unique(),
                    availability_df["start_hex_id"].unique(),
                ]
            )
        )
    )
}


In [23]:
def remap_hex_ids(df: pd.DataFrame, on="start", mapping=None) -> pd.Series:
    """Translate the ``{on}_hex_id`` column into compact ``uint16`` codes.

    Args:
        df: frame containing a ``{on}_hex_id`` column.
        on: which hexagon column to translate, e.g. ``"start"`` or ``"end"``.
        mapping: dict from hexagon id to integer code. Defaults to the
            module-level ``hex_id_map``.

    Returns:
        A Series (same index as ``df``) holding the integer codes as
        ``np.uint16``. ``df`` itself is not modified.

    Raises:
        ValueError: if the column contains ids that are missing from the
            mapping (this also happens when the column was already remapped).
    """
    if mapping is None:
        mapping = hex_id_map

    column = f"{on}_hex_id"
    remapped = df[column].map(mapping)

    # ids without a mapping become NaN; report them explicitly instead of
    # failing with an opaque integer-cast error
    unmapped = remapped.isna()
    if unmapped.any():
        examples = df.loc[unmapped, column].unique()[:5].tolist()
        raise ValueError(
            "Column {} contains ids without a mapping, e.g. {}".format(column, examples)
        )

    return remapped.astype(np.uint16)

In [24]:
# replace the hexagon ids by their integer codes
# NOTE: not idempotent -- a second run would look up the codes themselves
trips_aggregated_df["start_hex_id"] = remap_hex_ids(trips_aggregated_df, on="start")

# the reduced frame keeps both trip ends, so both columns are translated
trips_aggregated_df_reduced["start_hex_id"] = remap_hex_ids(
    trips_aggregated_df_reduced, on="start"
)
trips_aggregated_df_reduced["end_hex_id"] = remap_hex_ids(
    trips_aggregated_df_reduced, on="end"
)

availability_df["start_hex_id"] = remap_hex_ids(availability_df, on="start")

In [25]:
# persist the enriched demand frames (full and reduced) in feather format
# before any one-hot encoding is applied
trips_aggregated_df.to_feather(MODEL_DATA_PATH)
trips_aggregated_df_reduced.to_feather(REDUCED_MODEL_DATA_PATH)

In [26]:
def add_time_features_to_model_data(model_data: pd.DataFrame) -> pd.DataFrame:
    """Replace ``datetime_start_floored`` by one-hot encoded time features.

    Hour, weekday and month are derived from ``datetime_start_floored`` and
    one-hot encoded into ``hour_*``, ``weekday_*`` and ``month_*`` columns.
    Only values that actually occur in the data get a column.

    The input frame is not modified; the previous implementation temporarily
    added and dropped columns on the caller's frame in place.

    Args:
        model_data: frame with a datetime column ``datetime_start_floored``.

    Returns:
        A new frame without ``datetime_start_floored`` and with the dummy
        columns appended in the order hour, weekday, month.
    """
    timestamps = model_data["datetime_start_floored"]

    # prefix="hour" with the default "_" separator yields "hour_<value>"
    time_dummies = [
        pd.get_dummies(timestamps.dt.hour, prefix="hour"),
        pd.get_dummies(timestamps.dt.weekday, prefix="weekday"),
        pd.get_dummies(timestamps.dt.month, prefix="month"),
    ]

    # the timestamp is not needed once the dummies exist; raw hour/weekday/
    # month columns were never part of the output, so drop them if present
    remaining = model_data.drop(columns=["datetime_start_floored"]).drop(
        columns=["hour", "weekday", "month"], errors="ignore"
    )

    return pd.concat([remaining, *time_dummies], axis=1)

In [27]:
def get_invalid_cols(df: pd.DataFrame):
    """Return the labels of all columns whose column sum is infinite.

    Args:
        df: frame whose columns are summed with ``DataFrame.sum``.

    Returns:
        An Index with the labels of the columns whose sum is ``inf``/``-inf``.
    """

    def _is_infinite(total):
        return np.isinf(total)

    overflowed = df.sum().apply(_is_infinite)
    return overflowed[overflowed].index

In [28]:
def fix_invalid_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Upcast float16 columns whose sum overflows to float32.

    If the sum of a column cannot be represented in the column's dtype, the
    data cannot be scaled later on because the calculated mean would be
    erroneous. Such columns (as reported by ``get_invalid_cols``) are
    converted to float32.

    Args:
        df: frame to check. It is copied; the caller's frame is not modified.

    Returns:
        A copy of ``df`` in which every overflowing float16 column is float32.

    Raises:
        ValueError: if an overflowing column is not float16, or if columns
            with an infinite sum remain after the conversion.
    """
    df = df.copy()

    for col in get_invalid_cols(df):
        if df[col].dtype != np.float16:
            raise ValueError(
                "Unexpected dtype for column {}. Expected float16, got {}".format(
                    col, df[col].dtype
                )
            )
        # upcast the column of *this* frame; the previous code read the column
        # from the global ``model_data`` and therefore wrote foreign data
        df[col] = df[col].astype(np.float32)

    # an infinite sum that survives the upcast points to inf values in the data
    if len(get_invalid_cols(df)) > 0:
        raise ValueError("Invalid columns remaining")

    return df

In [29]:
def choose_temperature_features(model_data: pd.DataFrame, t: int) -> pd.DataFrame:
    """Keep only one kind of temperature feature, depending on ``t``.

    For ``t > 5`` the mean temperature is removed (min/max are kept);
    otherwise min and max temperature are removed (the mean is kept).

    Args:
        model_data: frame containing ``min_temperature``, ``max_temperature``
            and ``mean_temperature``.
        t: threshold value; presumably the time interval length -- verify
            against the callers.

    Returns:
        A new frame without the superfluous temperature column(s).
    """
    superfluous = (
        ["mean_temperature"] if t > 5 else ["min_temperature", "max_temperature"]
    )
    return model_data.drop(columns=superfluous)

In [30]:
# make sure the output directory exists (no error if it is already there)
os.makedirs(MODEL_DATA_DIR_PATH, exist_ok=True)

# store model data for each time and hex resolution in one file
# additionally create dummies for the start hexagons
for h3_res, t, df_name in itertools.product(
    CALC_H3_RESOLUTIONS,
    CALC_TIME_INTERVAL_LENGTHS,
    ["trips", "availability"],
):
    # pick the frame that belongs to the name used in the output file.
    # The previous check compared against "demand", which never occurs in the
    # list above, so every "trips_*" file was written from availability_df.
    model_data = trips_aggregated_df if df_name == "trips" else availability_df

    # restrict to the current resolution / interval length
    model_data = model_data[
        (model_data.h3_res == h3_res)
        & (model_data.time_interval_length == t)
    ].reset_index(drop=True)

    model_data = choose_temperature_features(model_data, t)

    model_data = add_time_features_to_model_data(model_data)

    # NOTE: get_dummies inserts its own "_" separator, so the resulting
    # columns are named "start__<id>" (double underscore)
    start_hex_dummies = pd.get_dummies(model_data.start_hex_id, prefix="start_")
    model_data = pd.concat([model_data, start_hex_dummies], axis=1)
    model_data = model_data.drop(
        columns=["start_hex_id", "h3_res", "time_interval_length"]
    )
    # the end hexagon is not used as a feature in these data sets
    if "end_hex_id" in model_data.columns:
        model_data = model_data.drop(columns=["end_hex_id"])

    # upcast float16 columns whose sum overflows
    model_data = fix_invalid_cols(model_data)

    model_data.to_feather(
        os.path.join(MODEL_DATA_DIR_PATH, f"{df_name}_{h3_res}_{t}.feather")
    )


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  return umr_sum(a, axis, dtype, out, ke

In [31]:
# select the temperature features, then replace the timestamp by time dummies
trips_aggregated_df_reduced = add_time_features_to_model_data(
    choose_temperature_features(
        trips_aggregated_df_reduced, ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH
    )
)

# one-hot encode both trip ends
start_hex_dummies = pd.get_dummies(
    trips_aggregated_df_reduced["start_hex_id"], prefix="start_"
)
end_hex_dummies = pd.get_dummies(
    trips_aggregated_df_reduced["end_hex_id"], prefix="end_"
)

# append the dummies, remove the encoded / constant columns and upcast
# float16 columns whose sum overflows
trips_aggregated_df_reduced = fix_invalid_cols(
    pd.concat(
        [trips_aggregated_df_reduced, start_hex_dummies, end_hex_dummies], axis=1
    ).drop(columns=["start_hex_id", "end_hex_id", "h3_res", "time_interval_length"])
)

trips_aggregated_df_reduced.to_feather(
    os.path.join(
        MODEL_DATA_DIR_PATH,
        f"demand_{ORIGIN_DESTINATION_H3_RESOLUTION}_{ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH}.feather",
    )
)


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
