# Aggregate neighbourhood features (POIs and demography) around each station in the training dataset

In [124]:
import pandas as pd
import h3
# Load the input tables from parquet. Later code reads an "h3res13" column
# from both of them (presumably an H3 cell id per row -- confirm against the data).
traffic_data = pd.read_parquet("../for_participants/data_parquet/traffic_train.parquet")
pois_df = pd.read_parquet("../for_participants/data_parquet/pois.parquet")

In [125]:
# Two sample cell identifiers. They are not referenced anywhere else in the
# visible code (presumably kept for ad-hoc checks of distance_km -- confirm
# before removing).
a = "8d1f5224541113f"
b = "8d1f53d9a592cbf"
def distance_km(h1, h2):
    """Return the great-circle distance between two H3 cells.

    Each cell id is converted to a (lat, lng) pair with ``h3.cell_to_latlng``
    and the two pairs are passed to ``h3.great_circle_distance``. No unit is
    given, so the result is in that function's default unit (kilometres
    according to the h3-py documentation).

    Args:
        h1: H3 cell id of the first location.
        h2: H3 cell id of the second location.

    Returns:
        The distance between the two locations.
    """
    first_latlng, second_latlng = (h3.cell_to_latlng(cell) for cell in (h1, h2))
    return h3.great_circle_distance(first_latlng, second_latlng)


In [126]:
import pandas as pd
import numpy as np

def aggregate_by_dist(
    agg_points,
    data,
    dist_metric,
    allowed_distance=2,
    agg_fun=np.mean,
):
    """Aggregate the rows of ``data`` that lie close to each aggregation point.

    For every point in ``agg_points`` the distance from that point to the
    ``"h3res13"`` value of every row in ``data`` is computed with
    ``dist_metric``. Rows whose distance is strictly less than
    ``allowed_distance`` are reduced column-wise with ``agg_fun``.

    Args:
        agg_points: Iterable of locations to aggregate around.
        data: DataFrame with an ``"h3res13"`` column holding each row's
            location; every other column is aggregated.
        dist_metric: Callable ``dist_metric(row_location, point)`` returning
            a value comparable with ``allowed_distance``.
        allowed_distance: Exclusive upper bound on the distance of the rows
            that contribute to a point.
        agg_fun: Aggregation passed to ``DataFrame.agg``.

    Returns:
        A DataFrame with one row per point that has at least one nearby row:
        the aggregated columns followed by ``"h3res13"`` (the point itself).
        Points without nearby rows are omitted. If no point has any nearby
        row, an empty DataFrame with the columns of ``data`` is returned.
    """
    # Nothing below mutates ``data``, so the per-point copy of the whole frame
    # is unnecessary; the loop-invariant column split is done once up front.
    locations = data["h3res13"]
    features = data.drop(columns=["h3res13"])

    results = []
    for point in agg_points:
        distances = locations.apply(lambda cell: dist_metric(cell, point))
        is_near = distances < allowed_distance

        # No row within range: this point contributes no output row.
        if not is_near.any():
            continue

        aggregated = features[is_near].agg(agg_fun)
        aggregated["h3res13"] = point
        results.append(aggregated)

    if not results:
        return pd.DataFrame(columns=data.columns)

    return pd.DataFrame(results)


In [127]:
import warnings

# Suppress every warning from here on.
warnings.filterwarnings("ignore")

# The distinct cells of the traffic table are the points to aggregate around;
# they do not depend on the radius, so they are computed once.
station_cells = traffic_data["h3res13"].unique()

# One summed POI table per radius; each radius is passed to aggregate_by_dist
# as allowed_distance and measured with distance_km.
dfs0 = []
for dist in [0.5, 1, 2, 3, 5, 10]:
    df0 = aggregate_by_dist(
        agg_points=station_cells,
        data=pois_df,
        dist_metric=distance_km,
        allowed_distance=dist,
        agg_fun=np.sum,
    )

    # Tag every aggregated column with its radius; the key column keeps its name.
    suffix = "_" + str(dist)
    new_columns = {}
    for col in df0.columns:
        new_columns[col] = col if col == "h3res13" else col + suffix

    df0 = df0.rename(columns=new_columns)
    dfs0.append(df0)


In [128]:
# Combine the per-radius tables into one wide table with one row per cell.
#
# Every table in dfs0 has its own 0..n-1 row index, and aggregate_by_dist
# omits points that have no neighbour within the radius, so the tables can
# differ in length and row i need not describe the same cell in each of them.
# Concatenating on row position would therefore attach values to the wrong
# cell. Aligning on the "h3res13" key keeps every value with its own cell;
# a cell that is missing at some radius gets NaN in that radius's columns.
# Note: "h3res13" is now the first column of the result.
df_pois_agg = (
    pd.concat(
        [radius_df.set_index("h3res13") for radius_df in dfs0],
        axis=1,
    )
    .rename_axis("h3res13")
    .reset_index()
)
# Keep only the first occurrence of any repeated column name.
df_pois_agg = df_pois_agg.loc[:, ~df_pois_agg.columns.duplicated()]

In [73]:
# Demography table; it is passed to aggregate_by_dist, which reads its
# "h3res13" column.
demo = pd.read_parquet('../for_participants/data_parquet/demography.parquet')

# The distinct cells of the traffic table are the points to aggregate around;
# they do not depend on the radius, so they are computed once.
station_cells = traffic_data["h3res13"].unique()

# One summed demography table per radius; each radius is passed to
# aggregate_by_dist as allowed_distance and measured with distance_km.
dfs0 = []
for dist in [0.5, 1, 2, 3]:
    df0 = aggregate_by_dist(
        agg_points=station_cells,
        data=demo,
        dist_metric=distance_km,
        allowed_distance=dist,
        agg_fun=np.sum,
    )

    # Tag every aggregated column with its radius; the key column keeps its name.
    suffix = "_" + str(dist)
    new_columns = {}
    for col in df0.columns:
        new_columns[col] = col if col == "h3res13" else col + suffix

    df0 = df0.rename(columns=new_columns)
    dfs0.append(df0)

In [74]:
# Combine the per-radius tables into one wide table with one row per cell.
#
# Every table in dfs0 has its own 0..n-1 row index, and aggregate_by_dist
# omits points that have no neighbour within the radius, so the tables can
# differ in length and row i need not describe the same cell in each of them.
# Concatenating on row position would therefore attach values to the wrong
# cell. Aligning on the "h3res13" key keeps every value with its own cell;
# a cell that is missing at some radius gets NaN in that radius's columns.
# Note: "h3res13" is now the first column of the result.
df_demo_agg = (
    pd.concat(
        [radius_df.set_index("h3res13") for radius_df in dfs0],
        axis=1,
    )
    .rename_axis("h3res13")
    .reset_index()
)
# Keep only the first occurrence of any repeated column name.
df_demo_agg = df_demo_agg.loc[:, ~df_demo_agg.columns.duplicated()]

In [131]:
import os

# Persist both aggregated tables. The output directory is created first so
# that a missing "dfs_processed" folder does not make the writes fail after
# the aggregation has already been computed.
os.makedirs("dfs_processed", exist_ok=True)
df_pois_agg.to_parquet("dfs_processed/df_pois_agg.parquet")
df_demo_agg.to_parquet("dfs_processed/df_demo_agg.parquet")