# aggregation of neighbourhood for each station from train dataset

In [61]:
import pandas as pd
import h3
traffic_data_train = pd.read_parquet("../for_participants/data_parquet/traffic_train.parquet")
traffic_data_test = pd.read_parquet("../for_participants/data_parquet/traffic_test_without_target.parquet")
pois_df = pd.read_parquet("../for_participants/data_parquet/pois.parquet")
demo_df = pd.read_parquet('../for_participants/data_parquet/demography.parquet')

In [62]:
# CATEGORization of POI
pois_df["free_time_poi"] = pois_df[["active_life", "arts_and_entertainment", "attractions_and_activities", "pets", "eat_and_drink"]].sum(axis = 1)
pois_df["bussines_hours_poi"] = pois_df[["education", "business_to_business", "private_establishments_and_corporates", "professional_services", "real_estate", "financial_service", "health_and_medical"]].sum(axis = 1)


In [64]:
import pandas as pd
import numpy as np

def distance_km(h1, h2):
    loc1 = h3.cell_to_latlng(h1)
    loc2 = h3.cell_to_latlng(h2)
    return h3.great_circle_distance(loc1, loc2)


def aggregate_by_dist(
    agg_points,
    data,
    dist_metric,
    allowed_distance=2,
    agg_fun=np.mean,
):
    results = []

    for point in agg_points:
        data_point = data.copy()
        distances = data_point["h3res13"].apply(lambda x: dist_metric(x, point))
        nearby = data_point[distances < allowed_distance]

        if nearby.empty:
            continue

        aggregated = nearby.drop(columns=["h3res13"]).agg(agg_fun)
        aggregated["h3res13"] = point
        results.append(aggregated)

    if not results:
        return pd.DataFrame(columns=data.columns)

    return pd.DataFrame(results)


In [65]:
import warnings
warnings.filterwarnings("ignore")
def calculate_pois(dataset):
    dfs0 = []
    for dist in [0.5, 1, 2, 3, 5, 10]:
        df0 = aggregate_by_dist(agg_points = dataset["h3res13"].unique(), data = pois_df, dist_metric = distance_km, allowed_distance = dist, agg_fun = np.sum)
        new_columns = {
            col: col + "_" + str(dist) if col != "h3res13" else col
            for col in df0.columns
        }
        df0 = df0.rename(columns = new_columns)
        dfs0.append(df0)
    df_pois_agg = pd.concat(dfs0, axis = 1)
    df_pois_agg["h3res13"] = df_pois_agg["h3res13"].bfill(axis = 1).ffill(axis = 1)
    df_pois_agg = df_pois_agg.loc[:,~df_pois_agg.columns.duplicated()]
    return df_pois_agg

In [66]:
df_pois_train = calculate_pois(dataset = traffic_data_train)
df_pois_test = calculate_pois(dataset = traffic_data_test)

In [67]:
df_pois_train = df_pois_train.fillna(0)
df_pois_test = df_pois_test.fillna(0)

## demography

In [27]:
def calculate_demography(dataset):
    dfs0 = []
    for dist in [0.5, 1, 2, 3]:
        df0 = aggregate_by_dist(agg_points = dataset["h3res13"].unique(), data = demo_df, dist_metric = distance_km, allowed_distance = dist, agg_fun = np.sum)
        new_columns = {
            col: col + "_" + str(dist) if col != "h3res13" else col
            for col in df0.columns
        }
        df0 = df0.rename(columns = new_columns)
        dfs0.append(df0)
    df_demo_agg = pd.concat(dfs0, axis = 1)
    df_demo_agg = df_demo_agg.loc[:,~df_demo_agg.columns.duplicated()]
    return df_demo_agg

df_demo_train = calculate_pois(dataset = traffic_data_train)
df_demo_test = calculate_pois(dataset = traffic_data_test)


In [68]:
df_pois_train.to_parquet("dfs_processed/df_pois_agg.parquet")
df_demo_train.to_parquet("dfs_processed/df_demo_agg.parquet")
df_pois_test.to_parquet("dfs_processed_test/df_pois_agg_test.parquet")
df_demo_test.to_parquet("dfs_processed_test/df_demo_agg_test.parquet")

# A bit faster version for buildings and roads

In [None]:
import pandas as pd
import h3
pois_df = pd.read_parquet("../for_participants/data_parquet/pois.parquet")

In [71]:
import pandas as pd
import numpy as np
import h3

def aggregate_by_dist(
    agg_points,
    data,
    allowed_distance=2,
    agg_fun=np.mean,
):
    all_h3s = pd.unique(np.concatenate([agg_points, data["h3res13"].values]))
    h3_to_coord = {h: h3.cell_to_latlng(h) for h in all_h3s}
    def distance_km_cached(h1, h2):
        loc1 = h3_to_coord[h1]
        loc2 = h3_to_coord[h2]
        return h3.great_circle_distance(loc1, loc2)

    results = []

    for point in agg_points:
        distances = data["h3res13"].apply(lambda x: distance_km_cached(x, point))
        nearby = data[distances < allowed_distance]

        if nearby.empty:
            continue

        aggregated = nearby.drop(columns=["h3res13"]).agg(agg_fun)
        aggregated["h3res13"] = point
        results.append(aggregated)

    if not results:
        return pd.DataFrame(columns=data.columns)

    return pd.DataFrame(results)


In [29]:
roads = pd.read_parquet('../for_participants/data_parquet/roads.parquet')
roads["roads_intensity_1"] = roads[["motorway", "primary", "secondary"]].sum(axis = 1)
roads["roads_intensity_2"] = roads[["tertiary", "residential", "living_street", "service"]].sum(axis = 1)
roads["roads_intensity_3"] = roads[["track", "footway", "cycleway", "bridleway", "path", "steps", "pedestrian"]].sum(axis = 1)
roads = roads[["h3res13", "roads_intensity_1", "roads_intensity_2", "roads_intensity_3"]]

In [33]:
from tqdm import tqdm
def calculate_roads(dataset):
    dfs0 = []
    for dist in tqdm([0.1, 0.2, 0.5, 1]):
        df0 = aggregate_by_dist(agg_points = dataset["h3res13"].unique(), data = roads[["h3res13", "roads_intensity_1", "roads_intensity_2", "roads_intensity_3"]], allowed_distance = dist, agg_fun = np.mean)
        new_columns = {
            col: col + "_" + str(dist) if col != "h3res13" else col
            for col in df0.columns
        }
        df0 = df0.rename(columns = new_columns)
        dfs0.append(df0)
    df_roads_agg = pd.concat(dfs0, axis = 1)
    df_roads_agg = df_roads_agg.loc[:,~df_roads_agg.columns.duplicated()]
    return df_roads_agg

In [34]:
df_roads_train = calculate_roads(traffic_data_train)
df_roads_test = calculate_roads(traffic_data_test)

100%|██████████| 4/4 [13:16<00:00, 199.23s/it]
100%|██████████| 4/4 [09:44<00:00, 146.23s/it]


In [35]:
df_roads_train.to_parquet("dfs_processed/df_roads_agg.parquet")
df_roads_test.to_parquet("dfs_processed_test/df_roads_agg_test.parquet")

In [72]:
buildings = pd.read_parquet('../for_participants/data_parquet/buildings.parquet')
building_function_group = {
    "Single-Family Residence": "Residential",
    "Multi-Family Residence": "Residential",
    "Apartment": "Residential",
    "Collected Dwelling Unit": "Residential",
    "Hotel": "Residential",
    "Tourist Accommodation Building": "Residential",
    "Retail and Service Building": "Service",
    "Hospitals and Medical Facility": "Service",
    "Schools and Research Institute": "Service",
    "Museums and Libraries": "Service",
    "Cultural Facility": "Service",
    "Cultural Public Facility": "Service",
    "Place of Worship": "Service",
    "Heritage Building": "Service",
    "Car Park": "Service",
    "Railway and Terminal Building": "Service",
    "Office Building": "Work",
    "Industrial Building": "Work",
    "Silo or Warehouse": "Work",
    "Agricultural Building": "Work",
    "Non-Residential Building": "Work",
}

buildings["building_function"] = buildings["building_function"].apply(lambda x: building_function_group[x]) #[""]

In [73]:
from tqdm import tqdm
buildings = buildings.pivot_table(
    index="h3res13", columns="building_function", values="building_area", aggfunc="sum", fill_value=0).reset_index()

In [74]:
def calculate_buildings(dataset):
    dfs0 = []
    for dist in tqdm([0.2, 0.5, 1, 1.5, 2]):
        df0 = aggregate_by_dist(agg_points = dataset["h3res13"].unique(), data = buildings[["h3res13", "Residential", "Service", "Work"]], allowed_distance = dist, agg_fun = np.sum)
        new_columns = {
            col: col + "_" + str(dist) if col != "h3res13" else col
            for col in df0.columns
        }
        df0 = df0.rename(columns = new_columns)
        dfs0.append(df0)
    df_buildings_agg = pd.concat(dfs0, axis = 1)
    df_buildings_agg["h3res13"] = df_buildings_agg["h3res13"].bfill(axis = 1).ffill(axis = 1)
    df_buildings_agg = df_buildings_agg.loc[:,~df_buildings_agg.columns.duplicated()]
    return df_buildings_agg

In [None]:
df_buildings_train = calculate_buildings(traffic_data_train)
#df_buildings_test = calculate_buildings(traffic_data_test)

#df_buildings_test.to_parquet("dfs_processed_test/df_buildings_agg_test.parquet")

#df_buildings_agg.to_parquet("dfs_processed/df_buildings_agg.parquet")

100%|██████████| 5/5 [05:51<00:00, 70.21s/it]


In [None]:
# tam gdzie nie bylo zadnego budynku tak blisko musze uzupelnic zerami
df_buildings_train = df_buildings_train.fillna(0)
df_buildings_train.to_parquet("dfs_processed/df_buildings_agg.parquet")

In [2]:
import pandas as pd
df_pois_train = pd.read_parquet("dfs_processed/df_pois_agg.parquet")
df_demo_train = pd.read_parquet("dfs_processed/df_demo_agg.parquet")
df_pois_test = pd.read_parquet("dfs_processed_test/df_pois_agg_test.parquet")
df_demo_test = pd.read_parquet("dfs_processed_test/df_demo_agg_test.parquet")
df_roads_train = pd.read_parquet("dfs_processed/df_roads_agg.parquet")
df_roads_test = pd.read_parquet("dfs_processed_test/df_roads_agg_test.parquet")

df_buildings_train = pd.read_parquet("dfs_processed/df_buildings_agg.parquet")

In [4]:
df_roads_train

Unnamed: 0,roads_intensity_1_0.1,roads_intensity_2_0.1,roads_intensity_3_0.1,h3res13,roads_intensity_1_0.2,roads_intensity_2_0.2,roads_intensity_3_0.2,roads_intensity_1_0.5,roads_intensity_2_0.5,roads_intensity_3_0.5,roads_intensity_1_1,roads_intensity_2_1,roads_intensity_3_1
0,0.213058,0.776632,0.219931,8d1f5224541113f,0.176596,0.782979,0.224468,0.300973,0.651639,0.188268,0.193490,0.747847,0.207210
1,0.230366,0.670157,0.256545,8d1f52248b302ff,0.184000,0.629714,0.377143,0.184060,0.627294,0.390291,0.142343,0.625199,0.425966
2,0.244898,0.375510,0.477551,8d1f5224c89b23f,0.145759,0.630824,0.412186,0.142077,0.491148,0.550383,0.105689,0.594520,0.458397
3,0.245791,0.538721,0.414141,8d1f5224dcdd63f,0.161491,0.593168,0.476190,0.117145,0.484562,0.689372,0.081174,0.488548,0.710842
4,0.170213,0.271277,0.742021,8d1f52260d8eaff,0.121409,0.378128,0.711770,0.074903,0.519372,0.677572,0.048039,0.522018,0.710337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0.000000,0.580175,0.647230,8d1f53cb3131cff,0.039514,0.477204,0.754559,0.050090,0.463367,0.776316,0.094776,0.367176,0.821269
66,0.000000,0.389558,0.767068,8d1f53cb38edcff,0.175090,0.341155,0.688628,0.195501,0.261090,0.744455,0.111826,0.308926,0.806789
67,0.156098,0.307317,0.775610,8d1f53cb3ce307f,0.148238,0.408262,0.691373,0.115513,0.422738,0.775046,0.085153,0.391043,0.827986
68,0.240310,0.282946,0.658915,8d1f53cb6b503bf,0.147674,0.289535,0.772093,0.222064,0.222953,0.755019,0.138023,0.341820,0.708822


In [84]:
roads.isna().sum()

roads_intensity_1_0.1    0
roads_intensity_2_0.1    0
roads_intensity_3_0.1    0
h3res13                  0
roads_intensity_1_0.2    0
roads_intensity_2_0.2    0
roads_intensity_3_0.2    0
roads_intensity_1_0.5    0
roads_intensity_2_0.5    0
roads_intensity_3_0.5    0
roads_intensity_1_1      0
roads_intensity_2_1      0
roads_intensity_3_1      0
dtype: int64