In [1]:
import pandas as pd
import numpy as np
import itertools
from tqdm.notebook import tqdm
import warnings

In [2]:
import sys, os, gc

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *
from modules.preprocessing import reindex_on_full_index

In [3]:
# Load the aggregated trip table. The cells below read its h3_res,
# time_interval_length, datetime_start_floored, start_hex_id, end_hex_id and
# demand columns.
trips_aggregated_df_init = pd.read_parquet(TRIPS_GROUPED_SPATIO_TEMPORAL_PATH)

In [4]:
# Origin-only pipeline: starts out as the complete table (same object, no copy).
trips_orig = trips_aggregated_df_init

# Origin-destination pipeline: only the rows of the single resolution and
# interval length configured for origin-destination data.
trips_orig_dest = trips_aggregated_df_init.loc[
    trips_aggregated_df_init["h3_res"].eq(ORIGIN_DESTINATION_H3_RESOLUTION)
    & trips_aggregated_df_init["time_interval_length"].eq(
        ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH
    )
]

In [5]:
# Collapse the rows to origin-only demand: sum the demand of all rows that
# share start hexagon, interval start, resolution and interval length.
#
# h3_res and time_interval_length belong to the grouping key: the frame holds
# several interval lengths at once (it is filtered per length further down),
# and their floored timestamps can coincide. Grouping on timestamp and hexagon
# alone would add such rows together and label the sum with the largest
# interval length.
trips_orig = (
    trips_orig.groupby(
        ["datetime_start_floored", "start_hex_id", "h3_res", "time_interval_length"]
    )
    .agg({"demand": "sum"})
    .reset_index()
    .rename(columns={"datetime_start_floored": "datetime", "start_hex_id": "hex_id"})
)
# Keep the column order the following cells were written against.
trips_orig = trips_orig[
    ["datetime", "hex_id", "demand", "h3_res", "time_interval_length"]
]

In [6]:
# Use the same timestamp column name as the origin-only frame and index the
# origin-destination frame by (interval start, start hexagon, end hexagon).
trips_orig_dest = (
    trips_orig_dest
    .rename(columns={"datetime_start_floored": "datetime"})
    .set_index(["datetime", "start_hex_id", "end_hex_id"])
)

In [7]:
# Pass the indexed origin-destination frame through the project helper
# reindex_on_full_index and turn the index back into columns.
# NOTE(review): presumably the helper adds rows for index combinations that
# have no trips -- confirm against modules.preprocessing.
trips_orig_dest = reindex_on_full_index(
    trips_orig_dest, ORIGIN_DESTINATION_H3_RESOLUTION, ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH, start_and_end=True
).reset_index()


In [8]:
# Index the origin-only frame by (interval start, hexagon), then reindex every
# (resolution, interval length) slice on its own and stack the results again.
trips_orig = trips_orig.set_index(["datetime", "hex_id"])

full_trips = []
for h3_res, time_interval_length in tqdm(
    list(itertools.product(CALC_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS))
):
    trips = trips_orig.loc[
        trips_orig["h3_res"].eq(h3_res)
        & trips_orig["time_interval_length"].eq(time_interval_length)
    ]
    full_trips.append(reindex_on_full_index(trips, h3_res, time_interval_length))

trips_orig = pd.concat(full_trips)
# the per-slice frames are no longer needed once concatenated
del full_trips


  0%|          | 0/12 [00:00<?, ?it/s]

In [9]:
# Turn the index back into regular columns; the later cells access datetime
# and hex_id as columns.
trips_orig = trips_orig.reset_index()

In [10]:
# Load the availability table and expose its index as regular columns.
availability_df = pd.read_parquet(AVAILABILITY_PATH).reset_index()

In [40]:
# Row count of the stacked origin-only demand frame.
print(f"Total number of entries in the demand dataset for all h3 resolutions and all time intervals: {len(trips_orig)}")

Total number of entries in the demand dataset for all h3 resolutions and all time intervals: 13098911


In [12]:
# Load the weather table and shrink its memory footprint: every column is
# stored as float16, except time_interval_length which ends up as uint8.
weather_df = (
    pd.read_parquet(WEATHER_AGGR_TEMPORAL_PATH)
    .astype(np.float16)
    .astype({"time_interval_length": np.uint8})
)
weather_df.head(2)

Unnamed: 0_level_0,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_interval_length
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 00:00:00,7.398438,7.398438,7.398438,2.800781,8.0,0.0,1
2019-01-01 01:00:00,7.699219,7.699219,7.699219,2.900391,8.0,0.0,1


In [13]:
def add_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the module-level ``weather_df`` onto ``df``.

    Rows are matched on the interval start (``datetime`` in ``df``,
    ``MESS_DATUM`` in the weather table) together with
    ``time_interval_length``. Being a left join, every row of ``df`` is kept
    even when no weather record matches.

    Args:
        df: Frame with ``datetime`` and ``time_interval_length`` columns.

    Returns:
        The merged frame.
    """
    left_keys = ["datetime", "time_interval_length"]
    right_keys = ["MESS_DATUM", "time_interval_length"]
    return df.merge(weather_df, how="left", left_on=left_keys, right_on=right_keys)


In [14]:
# Join the weather columns onto all three frames: origin-only demand,
# origin-destination demand and availability.
trips_orig = add_weather_data(trips_orig)
trips_orig_dest = add_weather_data(trips_orig_dest)
availability_df = add_weather_data(availability_df)

In [15]:
# The weather table has been merged into all frames; drop the name so the
# frame can be freed. add_weather_data cannot be called after this point.
del weather_df

In [16]:
# Load the per-hexagon POI table, discard the hex_and_neighbors column and
# store the resolution and the POI count columns as uint16.
hexagons_with_pois_df = (
    pd.read_parquet(HEXAGON_WITH_POIS_PATH)
    .drop(columns=["hex_and_neighbors"])
    .astype(
        dict.fromkeys(
            [
                "h3_res",
                "sustenance_poi",
                "public_transport_poi",
                "education_poi",
                "arts_and_culture_poi",
                "sports_poi",
            ],
            np.uint16,
        )
    )
)

In [17]:
# The frames being enriched already carry an h3_res column, so the POI
# table's own copy is removed before merging.
hexagons_with_pois_df = hexagons_with_pois_df.drop(columns=["h3_res"])

In [18]:
def add_poi_data(df: pd.DataFrame, on="hex_id", suffix: str = None) -> pd.DataFrame:
    """Attach the POI columns of the module-level ``hexagons_with_pois_df``.

    The merge uses pandas' default inner join, so rows of ``df`` whose hexagon
    is missing from the POI table are not part of the result. The POI table's
    key column (``hex`` plus ``suffix``) is removed again after the merge.

    Args:
        df: Frame to enrich.
        on: Column of ``df`` holding the hexagon id to match.
        suffix: Optional text appended to every POI column name, used to tell
            apart start and end hexagon features.

    Returns:
        The merged frame without the POI table's key column.
    """
    poi_key = f"hex{suffix}" if suffix else "hex"
    poi_frame = (
        hexagons_with_pois_df.add_suffix(suffix) if suffix else hexagons_with_pois_df
    )
    return df.merge(poi_frame, left_on=on, right_on=poi_key).drop(columns=[poi_key])


In [19]:
# Attach POI counts: once per hexagon for the origin-only and availability
# frames, and separately (suffixed _start / _end) for the start and end
# hexagon of the origin-destination frame.
trips_orig = add_poi_data(trips_orig)
trips_orig_dest = add_poi_data(trips_orig_dest, on="start_hex_id", suffix="_start")
trips_orig_dest = add_poi_data(trips_orig_dest, on="end_hex_id", suffix="_end")
availability_df = add_poi_data(availability_df)

In [20]:
# Show the origin-destination columns after the weather and POI joins.
trips_orig_dest.columns

Index(['datetime', 'start_hex_id', 'end_hex_id', 'demand', 'h3_res',
       'time_interval_length', 'min_temperature', 'max_temperature',
       'mean_temperature', 'mean_mean_wind_speed', 'mean_total_cloud_cover',
       'sum_precipitation', 'hex_start', 'sustenance_poi_start',
       'public_transport_poi_start', 'education_poi_start',
       'arts_and_culture_poi_start', 'sports_poi_start', 'hex_end',
       'sustenance_poi_end', 'public_transport_poi_end', 'education_poi_end',
       'arts_and_culture_poi_end', 'sports_poi_end'],
      dtype='object')

In [21]:
# The POI table has been merged into all frames; drop the name so the frame
# can be freed. add_poi_data cannot be called after this point.
del hexagons_with_pois_df

In [22]:
# Load the per-hexagon land-use table, store its values as float16 and expose
# the index as a regular column so it can serve as merge key.
hexagons_with_land_use_df = (
    pd.read_parquet(HEXAGONS_WITH_LAND_USE_PATH)
    .astype(np.float16)
    .reset_index()
)
hexagons_with_land_use_df.head(2)

land_use,hexagon_id,land_use_1,land_use_2,land_use_3,land_use_4,land_use_5,land_use_6,land_use_7,land_use_8,land_use_9,...,land_use_13,land_use_14,land_use_15,land_use_16,land_use_17,land_use_18,land_use_19,land_use_20,land_use_21,land_use_22
0,871f1a140ffffff,0.0,0.0,0.0,0.0,0.006058,0.0,0.019272,0.01548,0.0,...,0.0,0.0,0.0,0.007874,0.0,0.0,0.0,0.001923,0.008865,0.0
1,871f1a144ffffff,0.148926,0.155518,0.016571,0.084351,0.135986,0.058533,0.071045,0.082703,0.004414,...,0.000406,0.0,0.0,0.0,0.0,0.004196,0.0,0.070251,0.0,0.0


In [23]:
def add_land_use_data(df: pd.DataFrame, on="hex_id", suffix: str = None) -> pd.DataFrame:
    """Attach the columns of the module-level ``hexagons_with_land_use_df``.

    The merge uses pandas' default inner join, so rows of ``df`` whose hexagon
    is missing from the land-use table are not part of the result. The
    land-use key column (``hexagon_id`` plus ``suffix``) stays in the result.

    Args:
        df: Frame to enrich.
        on: Column of ``df`` holding the hexagon id to match.
        suffix: Optional text appended to every land-use column name, used to
            tell apart start and end hexagon features.

    Returns:
        The merged frame.
    """
    if suffix:
        land_use_frame = hexagons_with_land_use_df.add_suffix(suffix)
        land_use_key = f"hexagon_id{suffix}"
    else:
        land_use_frame = hexagons_with_land_use_df
        land_use_key = "hexagon_id"
    return df.merge(land_use_frame, left_on=on, right_on=land_use_key)

In [24]:
# Add the land-use shares as features: per hexagon for the origin-only frame,
# and separately (suffixed _start / _end) for the start and end hexagon of the
# origin-destination frame. availability_df is not given land-use columns here.
trips_orig = add_land_use_data(trips_orig)
trips_orig_dest = add_land_use_data(trips_orig_dest, on="start_hex_id", suffix="_start")
trips_orig_dest = add_land_use_data(trips_orig_dest, on="end_hex_id", suffix="_end")

In [25]:
# Show the origin-destination columns after the land-use join.
trips_orig_dest.columns

Index(['datetime', 'start_hex_id', 'end_hex_id', 'demand', 'h3_res',
       'time_interval_length', 'min_temperature', 'max_temperature',
       'mean_temperature', 'mean_mean_wind_speed', 'mean_total_cloud_cover',
       'sum_precipitation', 'hex_start', 'sustenance_poi_start',
       'public_transport_poi_start', 'education_poi_start',
       'arts_and_culture_poi_start', 'sports_poi_start', 'hex_end',
       'sustenance_poi_end', 'public_transport_poi_end', 'education_poi_end',
       'arts_and_culture_poi_end', 'sports_poi_end', 'hexagon_id_start',
       'land_use_1_start', 'land_use_2_start', 'land_use_3_start',
       'land_use_4_start', 'land_use_5_start', 'land_use_6_start',
       'land_use_7_start', 'land_use_8_start', 'land_use_9_start',
       'land_use_10_start', 'land_use_11_start', 'land_use_12_start',
       'land_use_13_start', 'land_use_14_start', 'land_use_15_start',
       'land_use_16_start', 'land_use_17_start', 'land_use_18_start',
       'land_use_19_start', 'l

In [26]:
# The land-use table has been merged; drop the name so the frame can be
# freed. add_land_use_data cannot be called after this point.
del hexagons_with_land_use_df

In [27]:
# Deep memory footprint (in MiB) of the three frames before the hexagon ids
# are replaced by integer codes.
for frame in (trips_orig, trips_orig_dest, availability_df):
    print(f"memory usage: {frame.memory_usage(index=True, deep=True).sum() / 1024**2} MB")

memory usage: 4022.4545879364014 MB
memory usage: 516.5524291992188 MB
memory usage: 2962.0358276367188 MB


In [28]:
# Map every hexagon id that occurs in any of the three frames to its position
# in the sorted array of distinct ids.
hex_id_map = {
    hex_id: position
    for position, hex_id in enumerate(
        np.unique(
            np.concatenate(
                [
                    trips_orig["hex_id"].unique(),
                    trips_orig_dest["start_hex_id"].unique(),
                    trips_orig_dest["end_hex_id"].unique(),
                    availability_df["hex_id"].unique(),
                ]
            )
        )
    )
}


In [29]:
def remap_hex_ids(df: pd.DataFrame, on="hex_id", mapping=None) -> pd.DataFrame:
    """Replace the hexagon ids in column ``on`` by their mapped codes.

    The column is overwritten in place, so the caller's frame is modified; the
    same frame is returned for convenience. Ids that are missing from the
    mapping become NaN, as with any ``Series.map`` call.

    Args:
        df: Frame holding the column to remap.
        on: Name of the column to remap. It used to be ignored in favour of a
            hard-coded ``"hex_id"``.
        mapping: Dict from hexagon id to code. Defaults to the module-level
            ``hex_id_map``.

    Returns:
        ``df`` with column ``on`` remapped.
    """
    if mapping is None:
        mapping = hex_id_map
    df[on] = df[on].map(mapping)
    return df

In [30]:
# Replace the hexagon ids by their integer codes from hex_id_map in every
# hexagon column of the three frames.
for frame, column in (
    (trips_orig, "hex_id"),
    (trips_orig_dest, "start_hex_id"),
    (trips_orig_dest, "end_hex_id"),
    (availability_df, "hex_id"),
):
    frame[column] = frame[column].map(hex_id_map)

In [31]:
# Deep memory footprint (in MiB) of the three frames after the hexagon ids
# were replaced by integer codes.
for frame in (trips_orig, trips_orig_dest, availability_df):
    print(f"memory usage: {frame.memory_usage(index=True, deep=True).sum() / 1024**2} MB")

memory usage: 3222.9605083465576 MB
memory usage: 404.86541748046875 MB
memory usage: 2041.7916870117188 MB


In [32]:
# Persist the enriched frames before any one-hot encoding: origin-only demand
# to MODEL_DATA_PATH, origin-destination demand to REDUCED_MODEL_DATA_PATH.
trips_orig.to_feather(MODEL_DATA_PATH)
trips_orig_dest.to_feather(REDUCED_MODEL_DATA_PATH)

In [33]:
def add_time_features_to_model_data(model_data: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``datetime`` column by one-hot encoded time features.

    Hour of day, weekday and month are derived from ``datetime`` and appended
    as dummy columns named ``hour_<h>``, ``weekday_<d>`` and ``month_<m>``.
    Only values that occur in ``model_data`` get a column, so the set of dummy
    columns depends on the data passed in.

    The input frame is left untouched. Earlier this function wrote helper
    columns into the caller's frame and removed them again in place, which
    also destroyed any pre-existing ``hour``/``weekday``/``month`` column.

    Args:
        model_data: Frame with a datetime-typed ``datetime`` column.

    Returns:
        A new frame: the columns of ``model_data`` without ``datetime``,
        followed by the hour, weekday and month dummy columns.
    """
    timestamps = model_data["datetime"]

    # One-hot encode each time component; the prefix yields the same column
    # names as encoding first and prefixing afterwards ("hour_0", ...).
    dummy_frames = [
        pd.get_dummies(timestamps.dt.hour, prefix="hour"),
        pd.get_dummies(timestamps.dt.weekday, prefix="weekday"),
        pd.get_dummies(timestamps.dt.month, prefix="month"),
    ]

    # The raw timestamp is fully represented by the dummies and is not kept.
    return pd.concat([model_data.drop(columns=["datetime"]), *dummy_frames], axis=1)

In [34]:
def get_invalid_cols(df: pd.DataFrame):
    """Return the labels of numeric columns whose sum is infinite.

    An infinite sum means the total cannot be represented in the column's
    dtype (or the column itself contains ``inf``).

    Warnings raised while summing are suppressed inside a
    ``warnings.catch_warnings`` block, which restores the previous filter
    configuration on exit. The earlier implementation replaced the
    process-wide filters with "ignore" and then "default", discarding whatever
    configuration had been active.

    Args:
        df: Frame to inspect; non-numeric columns are skipped.

    Returns:
        Index with the labels of the affected columns (empty if none).
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        column_sums = df.select_dtypes(include=np.number).sum()
        # astype(bool) keeps the mask boolean even when there are no numeric columns
        overflowed = column_sums.apply(lambda total: bool(np.isinf(total))).astype(bool)
    return overflowed[overflowed].index

In [35]:
def fix_invalid_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Widen float16 columns whose sum is not representable to float32.

    The sum of every column has to fit into the column's dtype; otherwise the
    data cannot be scaled later, because the computed mean would be wrong.
    Columns reported by ``get_invalid_cols`` are therefore converted from
    float16 to float32 on a copy of ``df``.

    Args:
        df: Frame to check; it is not modified.

    Returns:
        A copy of ``df`` with the affected columns stored as float32.

    Raises:
        ValueError: If an affected column is not float16, or if columns are
            still reported as invalid after the conversion.
    """
    repaired = df.copy()

    for name in get_invalid_cols(repaired):
        current_dtype = repaired[name].dtype
        if current_dtype != np.float16:
            raise ValueError(
                "Unexpected dtype for column {}. Expected float16, got {}".format(
                    name, current_dtype
                )
            )
        repaired[name] = repaired[name].astype(np.float32)

    # a second pass verifies that widening resolved every reported column
    if len(get_invalid_cols(repaired)) > 0:
        raise ValueError("Invalid columns remaining")

    return repaired

In [36]:
def choose_temperature_features(model_data: pd.DataFrame, t: int) -> pd.DataFrame:
    """Keep only the temperature columns used for interval length ``t``.

    For ``t`` greater than 5 the minimum and maximum temperature are kept and
    ``mean_temperature`` is dropped; otherwise the mean is kept and
    ``min_temperature``/``max_temperature`` are dropped.

    Args:
        model_data: Frame with the three temperature columns.
        t: Time interval length of the data in ``model_data``.

    Returns:
        A new frame without the temperature columns that are not used.
    """
    unused = ["mean_temperature"] if t > 5 else ["min_temperature", "max_temperature"]
    return model_data.drop(columns=unused)

In [38]:
# Create the output directory if it is missing. exist_ok replaces the separate
# existence check, which could fail if the directory appeared in between.
os.makedirs(MODEL_DATA_DIR_PATH, exist_ok=True)

# Store the model data of every (hex resolution, time interval length) pair in
# its own file, once for demand and once for availability, and one-hot encode
# the hexagon of each row.
# NOTE(review): the demand frame still carries the `hexagon_id` key column left
# by the land-use join -- confirm whether consumers of the files expect it.
for h3_res, t, df_name in tqdm(
    list(
        itertools.product(
            CALC_H3_RESOLUTIONS,
            CALC_TIME_INTERVAL_LENGTHS,
            ["demand", "availability"],
        )
    )
):
    model_data = trips_orig if df_name == "demand" else availability_df

    # restrict to the current resolution / interval length
    model_data = model_data[
        (model_data.h3_res == h3_res) & (model_data.time_interval_length == t)
    ].reset_index(drop=True)

    model_data = choose_temperature_features(model_data, t)

    model_data = add_time_features_to_model_data(model_data)

    # get_dummies joins prefix and value with "_", so the columns are named
    # "start__<hex code>" (double underscore)
    hex_dummies = pd.get_dummies(model_data.hex_id, prefix="start_")
    model_data = pd.concat([model_data, hex_dummies], axis=1)
    model_data = model_data.drop(columns=["hex_id", "h3_res", "time_interval_length"])

    # widen float16 columns whose sum is not representable
    model_data = fix_invalid_cols(model_data)

    model_data.to_feather(
        os.path.join(MODEL_DATA_DIR_PATH, f"{df_name}_{h3_res}_{t}.feather")
    )


  0%|          | 0/24 [00:00<?, ?it/s]

In [39]:
# Build and store the model data of the origin-destination frame. Each step
# rebinds trips_orig_dest to its result.

# keep only the temperature columns used for the configured interval length
trips_orig_dest = choose_temperature_features(
    trips_orig_dest, ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH
)

# replace the datetime column by hour / weekday / month dummy columns
trips_orig_dest = add_time_features_to_model_data(
    trips_orig_dest
)

# One-hot encode start and end hexagon. get_dummies joins prefix and value with
# "_", so the columns are named "start__<code>" and "end__<code>".
start_hex_dummies = pd.get_dummies(
    trips_orig_dest.start_hex_id, prefix="start_"
)
end_hex_dummies = pd.get_dummies(trips_orig_dest.end_hex_id, prefix="end_")
trips_orig_dest = pd.concat(
    [trips_orig_dest, start_hex_dummies, end_hex_dummies], axis=1
)
# the encoded id columns and the constant resolution / interval columns are dropped
trips_orig_dest = trips_orig_dest.drop(
    columns=["start_hex_id", "end_hex_id", "h3_res", "time_interval_length"]
)

# widen float16 columns whose sum is not representable
trips_orig_dest = fix_invalid_cols(trips_orig_dest)

# the file name carries the origin-destination resolution and interval length
trips_orig_dest.to_feather(
    os.path.join(
        MODEL_DATA_DIR_PATH,
        f"demand_orig_dest_{ORIGIN_DESTINATION_H3_RESOLUTION}_{ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH}.feather",
    )
)
