# Aggregation of neighbourhood features for each station (train and test datasets)

In [61]:
import pandas as pd
import h3
# Traffic tables for the two splits; their "h3res13" column supplies the
# station cells that every neighbourhood aggregate below is computed around.
traffic_data_train = pd.read_parquet("../for_participants/data_parquet/traffic_train.parquet")
traffic_data_test = pd.read_parquet("../for_participants/data_parquet/traffic_test_without_target.parquet")
# POI table: one "h3res13" cell id column plus one column per POI category.
pois_df = pd.read_parquet("../for_participants/data_parquet/pois.parquet")
# Demography table, aggregated around stations in the demography section.
# NOTE(review): presumably keyed by "h3res13" like the POI table - confirm.
demo_df = pd.read_parquet('../for_participants/data_parquet/demography.parquet')

In [62]:
# Categorisation of POIs: collapse related raw categories into two coarse
# per-cell totals (row-wise sums over the listed category columns).
_FREE_TIME_CATEGORIES = [
    "active_life",
    "arts_and_entertainment",
    "attractions_and_activities",
    "pets",
    "eat_and_drink",
]
_BUSINESS_HOURS_CATEGORIES = [
    "education",
    "business_to_business",
    "private_establishments_and_corporates",
    "professional_services",
    "real_estate",
    "financial_service",
    "health_and_medical",
]
pois_df["free_time_poi"] = pois_df[_FREE_TIME_CATEGORIES].sum(axis=1)
# The column name is misspelled ("bussines"), but it is kept as-is because it
# flows into the aggregated feature names written out later.
pois_df["bussines_hours_poi"] = pois_df[_BUSINESS_HOURS_CATEGORIES].sum(axis=1)


In [63]:
# Inspect the available POI columns, including the two derived group totals.
pois_df.columns

Index(['h3res13', 'accommodation', 'active_life', 'arts_and_entertainment',
       'attractions_and_activities', 'automotive', 'beauty_and_spa',
       'business_to_business', 'eat_and_drink', 'education',
       'financial_service', 'health_and_medical', 'home_service', 'mass_media',
       'pets', 'private_establishments_and_corporates',
       'professional_services', 'public_service_and_government', 'real_estate',
       'religious_organization', 'retail', 'structure_and_geography', 'travel',
       'free_time_poi', 'bussines_hours_poi'],
      dtype='object')

In [64]:
import pandas as pd
import numpy as np

def distance_km(h1, h2):
    """Return the great-circle distance between the centres of two H3 cells.

    Each cell id is converted to its centre (lat, lng) with
    ``h3.cell_to_latlng`` and the pair is passed to
    ``h3.great_circle_distance`` using that function's default unit
    (kilometres according to the h3-py documentation, matching this
    function's name).
    """
    centre_a, centre_b = (h3.cell_to_latlng(cell) for cell in (h1, h2))
    return h3.great_circle_distance(centre_a, centre_b)


def aggregate_by_dist(
    agg_points,
    data,
    dist_metric,
    allowed_distance=2,
    agg_fun=np.mean,
):
    """Aggregate the rows of ``data`` lying close to each point.

    Parameters
    ----------
    agg_points : iterable
        Cell ids to aggregate around.
    data : pandas.DataFrame
        Must contain an "h3res13" column; every other column is aggregated.
    dist_metric : callable
        ``dist_metric(data_cell, point)`` returning a distance comparable
        with ``allowed_distance``.
    allowed_distance : number, default 2
        Rows whose distance is strictly below this value count as nearby.
    agg_fun : callable or str, default ``np.mean``
        Passed to ``DataFrame.agg`` on the nearby feature rows.

    Returns
    -------
    pandas.DataFrame
        One row per point that has at least one nearby row: the aggregated
        feature columns followed by "h3res13" (the point itself), on a
        fresh 0..k-1 index. Points without any nearby row are omitted, so
        results for different radii can have different lengths and must be
        combined by "h3res13", not by row position. When no point has
        neighbours, an empty frame with ``data``'s columns is returned.
    """
    # Loop-invariant pieces are prepared once. Boolean selection below
    # already yields a new frame, so ``data`` is never modified and no
    # per-point copy of the whole table is needed.
    cell_ids = data["h3res13"]
    features = data.drop(columns=["h3res13"])

    results = []
    for point in agg_points:
        distances = cell_ids.apply(
            lambda cell, point=point: dist_metric(cell, point)
        )
        is_nearby = distances < allowed_distance

        if not is_nearby.any():
            continue

        aggregated = features[is_nearby].agg(agg_fun)
        aggregated["h3res13"] = point
        results.append(aggregated)

    if not results:
        return pd.DataFrame(columns=data.columns)

    return pd.DataFrame(results)


In [65]:
import warnings
warnings.filterwarnings("ignore")
def calculate_pois(dataset):
    """Sum POI counts around every station for several radii.

    For each radius in 0.5, 1, 2, 3, 5 and 10 (distance units of
    ``distance_km``) the columns of the module-level ``pois_df`` are summed
    over all cells closer than the radius to a station, and the resulting
    columns are suffixed with ``_<radius>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an "h3res13" column holding the station cells.

    Returns
    -------
    pandas.DataFrame
        One row per station that has POIs within at least one radius:
        "h3res13" first, then the suffixed feature columns. A station with
        no POI inside a given radius has NaN in that radius' columns.
    """
    stations = dataset["h3res13"].unique()
    per_radius = []
    for dist in [0.5, 1, 2, 3, 5, 10]:
        df0 = aggregate_by_dist(
            agg_points=stations,
            data=pois_df,
            dist_metric=distance_km,
            allowed_distance=dist,
            agg_fun=np.sum,
        )
        # aggregate_by_dist omits stations without neighbours, so frames for
        # different radii differ in length. Index them by station id so the
        # column-wise concat below aligns rows by station rather than by
        # position (which would mix features of different stations).
        per_radius.append(df0.set_index("h3res13").add_suffix("_" + str(dist)))
    df_pois_agg = pd.concat(per_radius, axis=1)
    return df_pois_agg.rename_axis("h3res13").reset_index()

In [66]:
# POI neighbourhood features for the stations of each split.
df_pois_train = calculate_pois(traffic_data_train)
df_pois_test = calculate_pois(traffic_data_test)

In [67]:
# A station with no value for some radius is left as NaN by the column-wise
# concat; for summed POI counts a missing value means zero.
df_pois_train = df_pois_train.fillna(value=0)
df_pois_test = df_pois_test.fillna(value=0)

## Demography

In [27]:
def calculate_demography(dataset):
    """Sum demography columns around every station for several radii.

    For each radius in 0.5, 1, 2 and 3 (distance units of ``distance_km``)
    the columns of the module-level ``demo_df`` are summed over all cells
    closer than the radius to a station, and the resulting columns are
    suffixed with ``_<radius>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an "h3res13" column holding the station cells.

    Returns
    -------
    pandas.DataFrame
        One row per station that has demography cells within at least one
        radius: "h3res13" first, then the suffixed feature columns. A
        station with nothing inside a given radius has NaN in that radius'
        columns.
    """
    stations = dataset["h3res13"].unique()
    per_radius = []
    for dist in [0.5, 1, 2, 3]:
        df0 = aggregate_by_dist(
            agg_points=stations,
            data=demo_df,
            dist_metric=distance_km,
            allowed_distance=dist,
            agg_fun=np.sum,
        )
        # aggregate_by_dist omits stations without neighbours, so frames for
        # different radii differ in length. Index them by station id so the
        # column-wise concat below aligns rows by station rather than by
        # position (which would mix features of different stations).
        per_radius.append(df0.set_index("h3res13").add_suffix("_" + str(dist)))
    df_demo_agg = pd.concat(per_radius, axis=1)
    return df_demo_agg.rename_axis("h3res13").reset_index()

# Demography features must come from calculate_demography (which reads
# demo_df); calling calculate_pois here would store POI aggregates under the
# demography names.
df_demo_train = calculate_demography(dataset = traffic_data_train)
df_demo_test = calculate_demography(dataset = traffic_data_test)


In [68]:
# Persist the aggregated features: train outputs go to dfs_processed/, test
# outputs to dfs_processed_test/ (paths relative to the working directory).
# NOTE(review): assumes both directories already exist - confirm.
df_pois_train.to_parquet("dfs_processed/df_pois_agg.parquet")
df_demo_train.to_parquet("dfs_processed/df_demo_agg.parquet")
df_pois_test.to_parquet("dfs_processed_test/df_pois_agg_test.parquet")
df_demo_test.to_parquet("dfs_processed_test/df_demo_agg_test.parquet")

# A slightly faster version of `aggregate_by_dist`, used for buildings and roads

In [None]:
import pandas as pd
import h3
# Reload the raw POI table. Rebinding pois_df here discards the derived
# free_time_poi / bussines_hours_poi columns added earlier in the notebook.
pois_df = pd.read_parquet("../for_participants/data_parquet/pois.parquet")

In [50]:
import pandas as pd
import numpy as np
import h3

def aggregate_by_dist(
    agg_points,
    data,
    allowed_distance=2,
    agg_fun=np.mean,
    dist_metric=None,
):
    """Aggregate the rows of ``data`` lying close to each point.

    Faster variant: when no ``dist_metric`` is given, the centre of every
    distinct H3 cell is resolved once up front and distances are computed
    with ``h3.great_circle_distance`` on the cached centres.

    Parameters
    ----------
    agg_points : sequence
        Cell ids to aggregate around.
    data : pandas.DataFrame
        Must contain an "h3res13" column; every other column is aggregated.
    allowed_distance : number, default 2
        Rows whose distance is strictly below this value count as nearby.
    agg_fun : callable or str, default ``np.mean``
        Passed to ``DataFrame.agg`` on the nearby feature rows.
    dist_metric : callable, optional
        ``dist_metric(data_cell, point)`` returning a numeric distance.
        Accepted so that callers written for the earlier definition (which
        pass ``dist_metric=...`` by keyword) keep working after this cell
        has been run. ``None`` selects the cached great-circle distance.

    Returns
    -------
    pandas.DataFrame
        One row per point that has at least one nearby row: the aggregated
        feature columns followed by "h3res13" (the point itself), on a
        fresh 0..k-1 index. Points without any nearby row are omitted, so
        results for different radii can have different lengths and must be
        combined by "h3res13", not by row position. When no point has
        neighbours, an empty frame with ``data``'s columns is returned.
    """
    cells = data["h3res13"].to_numpy()
    features = data.drop(columns=["h3res13"])

    if dist_metric is None:
        all_h3s = pd.unique(np.concatenate([agg_points, cells]))
        h3_to_coord = {h: h3.cell_to_latlng(h) for h in all_h3s}
        # Per-row centres are looked up once here, not once per point.
        row_keys = [h3_to_coord[h] for h in cells]
        point_key = h3_to_coord.__getitem__
        metric = h3.great_circle_distance
    else:
        row_keys = list(cells)

        def point_key(point):
            return point

        metric = dist_metric

    n_rows = len(row_keys)
    results = []

    for point in agg_points:
        origin = point_key(point)
        # Argument order (data cell first, point second) is preserved.
        distances = np.fromiter(
            (metric(key, origin) for key in row_keys),
            dtype=float,
            count=n_rows,
        )
        is_nearby = distances < allowed_distance

        if not is_nearby.any():
            continue

        aggregated = features[is_nearby].agg(agg_fun)
        aggregated["h3res13"] = point
        results.append(aggregated)

    if not results:
        return pd.DataFrame(columns=data.columns)

    return pd.DataFrame(results)


In [29]:
# Road types grouped into three intensity tiers; each tier column is the
# row-wise sum of its member road-type columns.
_ROAD_TIERS = {
    "roads_intensity_1": ["motorway", "primary", "secondary"],
    "roads_intensity_2": ["tertiary", "residential", "living_street", "service"],
    "roads_intensity_3": ["track", "footway", "cycleway", "bridleway", "path", "steps", "pedestrian"],
}
roads = pd.read_parquet('../for_participants/data_parquet/roads.parquet')
for _tier, _road_types in _ROAD_TIERS.items():
    roads[_tier] = roads[_road_types].sum(axis=1)
# Keep only the cell id and the three tier totals.
roads = roads[["h3res13", *_ROAD_TIERS]]

In [33]:
from tqdm import tqdm
def calculate_roads(dataset):
    """Average the road-intensity tiers around every station for several radii.

    For each radius in 0.1, 0.2, 0.5 and 1 the three ``roads_intensity_*``
    columns of the module-level ``roads`` table are averaged over all cells
    closer than the radius to a station, and the resulting columns are
    suffixed with ``_<radius>``. Progress over the radii is shown with tqdm.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an "h3res13" column holding the station cells.

    Returns
    -------
    pandas.DataFrame
        One row per station that has road cells within at least one radius:
        "h3res13" first, then the suffixed feature columns. A station with
        nothing inside a given radius has NaN in that radius' columns.
    """
    stations = dataset["h3res13"].unique()
    road_features = roads[["h3res13", "roads_intensity_1", "roads_intensity_2", "roads_intensity_3"]]
    per_radius = []
    for dist in tqdm([0.1, 0.2, 0.5, 1]):
        df0 = aggregate_by_dist(
            agg_points=stations,
            data=road_features,
            allowed_distance=dist,
            agg_fun=np.mean,
        )
        # aggregate_by_dist omits stations without neighbours, so frames for
        # different radii differ in length. Index them by station id so the
        # column-wise concat below aligns rows by station rather than by
        # position (which would mix features of different stations).
        per_radius.append(df0.set_index("h3res13").add_suffix("_" + str(dist)))
    df_roads_agg = pd.concat(per_radius, axis=1)
    return df_roads_agg.rename_axis("h3res13").reset_index()

In [34]:
# Road neighbourhood features for the stations of each split; every call
# prints one tqdm progress bar over the radii.
df_roads_train = calculate_roads(dataset=traffic_data_train)
df_roads_test = calculate_roads(dataset=traffic_data_test)

100%|██████████| 4/4 [13:16<00:00, 199.23s/it]
100%|██████████| 4/4 [09:44<00:00, 146.23s/it]


In [35]:
# Persist the road features next to the other aggregated tables.
# NOTE(review): assumes dfs_processed/ and dfs_processed_test/ exist - confirm.
df_roads_train.to_parquet("dfs_processed/df_roads_agg.parquet")
df_roads_test.to_parquet("dfs_processed_test/df_roads_agg_test.parquet")

In [36]:
buildings = pd.read_parquet('../for_participants/data_parquet/buildings.parquet')

# Raw building functions listed under the coarse group each one belongs to.
_FUNCTIONS_BY_GROUP = {
    "Residential": [
        "Single-Family Residence",
        "Multi-Family Residence",
        "Apartment",
        "Collected Dwelling Unit",
        "Hotel",
        "Tourist Accommodation Building",
    ],
    "Service": [
        "Retail and Service Building",
        "Hospitals and Medical Facility",
        "Schools and Research Institute",
        "Museums and Libraries",
        "Cultural Facility",
        "Cultural Public Facility",
        "Place of Worship",
        "Heritage Building",
        "Car Park",
        "Railway and Terminal Building",
    ],
    "Work": [
        "Office Building",
        "Industrial Building",
        "Silo or Warehouse",
        "Agricultural Building",
        "Non-Residential Building",
    ],
}

# Flat lookup: raw building function -> coarse group.
building_function_group = {
    function: group
    for group, functions in _FUNCTIONS_BY_GROUP.items()
    for function in functions
}


def _coarse_group(function_name):
    # Plain indexing on purpose: a function missing from the lookup raises
    # KeyError instead of being mapped to a missing value silently.
    return building_function_group[function_name]


# Replace each raw function with its coarse group, in place.
buildings["building_function"] = buildings["building_function"].apply(_coarse_group)

In [37]:
from tqdm import tqdm
# Reshape to one row per cell and one column per coarse building group,
# holding the summed building area (0 where a cell has none of a group).
buildings = pd.pivot_table(
    buildings,
    values="building_area",
    index="h3res13",
    columns="building_function",
    aggfunc="sum",
    fill_value=0,
)
# Bring "h3res13" back as an ordinary column.
buildings = buildings.reset_index()

In [48]:
def calculate_buildings(dataset):
    """Sum building area per coarse group around every station for several radii.

    For each radius in 0.2, 0.5, 1, 1.5 and 2 the "Residential", "Service"
    and "Work" columns of the module-level ``buildings`` table are summed
    over all cells closer than the radius to a station, and the resulting
    columns are suffixed with ``_<radius>``. Progress over the radii is
    shown with tqdm.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with an "h3res13" column holding the station cells.

    Returns
    -------
    pandas.DataFrame
        One row per station that has building cells within at least one
        radius: "h3res13" first, then the suffixed feature columns. A
        station with nothing inside a given radius has NaN in that radius'
        columns.
    """
    stations = dataset["h3res13"].unique()
    building_features = buildings[["h3res13", "Residential", "Service", "Work"]]
    per_radius = []
    for dist in tqdm([0.2, 0.5, 1, 1.5, 2]):
        df0 = aggregate_by_dist(
            agg_points=stations,
            data=building_features,
            allowed_distance=dist,
            agg_fun=np.sum,
        )
        # aggregate_by_dist omits stations without neighbours, so frames for
        # different radii differ in length. Index them by station id so the
        # column-wise concat below aligns rows by station rather than by
        # position (which would mix features of different stations).
        per_radius.append(df0.set_index("h3res13").add_suffix("_" + str(dist)))
    df_buildings_agg = pd.concat(per_radius, axis=1)
    return df_buildings_agg.rename_axis("h3res13").reset_index()

In [51]:
# Building features are computed and saved for the train split only.
# NOTE(review): the test-split lines are commented out, so
# dfs_processed_test/df_buildings_agg_test.parquet is not produced by this
# cell (unlike the POI and road features) - confirm this is intended.
df_buildings_train = calculate_buildings(traffic_data_train)
#df_buildings_test = calculate_buildings(traffic_data_test)
df_buildings_train.to_parquet("dfs_processed/df_buildings_agg.parquet")
#df_buildings_test.to_parquet("dfs_processed_test/df_buildings_agg_test.parquet")

# Stale line: df_buildings_agg only exists as a local inside calculate_buildings.
#df_buildings_agg.to_parquet("dfs_processed/df_buildings_agg.parquet")

100%|██████████| 5/5 [10:22<00:00, 124.41s/it]
