This notebook contains code for collecting street intersection data from OSM for the generated grids and then calculating the number of street intersections (nodes where at least 3 edges meet) for each 100 m x 100 m grid cell.

In [None]:
import geopandas as gpd
import os
import glob
import logging
from multiprocess import Pool
import gc
import osmium
import shapely.geometry
from shapely.geometry import box
import subprocess
import networkx as nx

In [None]:
# glob pattern that selects the grid files to process
# (the grids may have been generated here by 'generate_grids.ipynb')
grid_path = 'data/*.parquet'

# collect every parquet grid file matching the pattern into a list
grids_list = list(glob.glob(grid_path))
print(grids_list)

# # configure logging (recommended if you monitor processing over a lot of files)
# log_path = 'logs/street_intersections.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
    
# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

In [None]:
def is_walkable(tags):
    """
    Decides whether an OSM way belongs to the walkable street network.

    Args:
    - tags: Tag container exposing ``.get(key)`` (a plain dict works as well).

    Returns:
    - bool: False when the way has no ``highway`` tag, when its ``highway`` value
      is excluded (including every value starting with ``motor``), or when it is
      tagged ``foot=no``, ``access=private`` or ``area=yes``; True otherwise.
    """
    excluded_highways = {
        "abandoned", "bus_guideway", "construction", "cycleway", "motor",
        "no", "planned", "platform", "proposed", "raceway", "razed", "service"
    }
    # NOTE(review): "service" here drops every highway=service way; presumably
    # this was meant to mirror a service=private filter -- confirm intent.
    hwy = tags.get("highway")
    if not hwy or hwy in excluded_highways:
        return False
    # "motor" is a pattern, not a real highway value: with exact set membership
    # alone, "motorway" and "motorway_link" were never excluded.
    if hwy.startswith("motor"):
        return False
    if tags.get("foot") == "no":
        return False
    if tags.get("access") == "private":
        return False
    if tags.get("area") == "yes":
        return False
    return True


def extract_graph_edges(pbf_path):
    """
    Reads a .pbf file and collects node coordinates and walkable-way edges.

    Args:
    - pbf_path (str): Path to the .pbf file to read.

    Returns:
    - tuple: ``(edges, node_locations)``. ``edges`` is a list of ``(u, v)``
      node-id pairs, one per pair of consecutive nodes of every way accepted
      by ``is_walkable``. ``node_locations`` maps node id -> ``(lon, lat)``
      for every node whose location is valid.
    """
    coords_by_node = {}
    segments = []

    class _WalkableWayCollector(osmium.SimpleHandler):
        def node(self, osm_node):
            # nodes without a valid location are not recorded
            if not osm_node.location.valid():
                return
            coords_by_node[osm_node.id] = (osm_node.location.lon, osm_node.location.lat)

        def way(self, osm_way):
            if not is_walkable(osm_way.tags):
                return
            refs = [member.ref for member in osm_way.nodes]
            # pair each node with its successor along the way
            segments.extend(zip(refs, refs[1:]))

    collector = _WalkableWayCollector()
    collector.apply_file(pbf_path, locations=True)
    return segments, coords_by_node


def build_graph(edges, node_locations):
    """
    Builds an undirected graph from the edges whose endpoints are both known.

    Args:
    - edges: Iterable of ``(u, v)`` node-id pairs.
    - node_locations: Container of known node ids (membership is all that is used).

    Returns:
    - networkx.Graph: Graph containing only the edges for which both ``u`` and
      ``v`` are present in ``node_locations``.
    """
    graph = nx.Graph()
    graph.add_edges_from(
        (u, v)
        for u, v in edges
        if u in node_locations and v in node_locations
    )
    return graph


def count_streets(G, min_street_count=3):
    """
    Selects the graph nodes that qualify as intersections.

    Args:
    - G: Graph exposing ``G.nodes`` and ``G.degree(node)``.
    - min_street_count (int): Minimum degree a node needs to be kept.

    Returns:
    - dict: node -> degree, for every node whose degree is at least
      ``min_street_count``.
    """
    selected = {}
    for node in G.nodes:
        degree = G.degree(node)
        if degree < min_street_count:
            continue
        selected[node] = degree
    return selected


def intersections_to_gdf(intersections, node_locations):
    """
    Converts intersection nodes into a point GeoDataFrame in EPSG:4326.

    Args:
    - intersections (dict): node id -> street count.
    - node_locations (dict): node id -> coordinate tuple passed to ``Point``.

    Returns:
    - GeoDataFrame: One row per intersection with columns ``id`` and
      ``street_count`` and a point geometry.
    """
    node_ids = list(intersections)
    street_counts = [intersections[nid] for nid in node_ids]
    points = [shapely.geometry.Point(*node_locations[nid]) for nid in node_ids]
    attributes = {'id': node_ids, 'street_count': street_counts}
    return gpd.GeoDataFrame(attributes, geometry=points, crs="EPSG:4326")


def extract_intersections_from_pbf(pbf_path, min_street_count=3):
    """
    Runs the full pipeline from a .pbf file to a GeoDataFrame of intersections.

    Args:
    - pbf_path (str): Path to the .pbf file.
    - min_street_count (int): Minimum node degree for a node to count as an intersection.

    Returns:
    - GeoDataFrame: Result of ``intersections_to_gdf`` for the selected nodes.
    """
    walk_edges, coords = extract_graph_edges(pbf_path)
    street_graph = build_graph(walk_edges, coords)
    selected_nodes = count_streets(street_graph, min_street_count)
    return intersections_to_gdf(selected_nodes, coords)

In [None]:
def process_grid(grid_path, osm_pbf_path='/Volumes/ssd1/osm_europe/europe-latest.osm.pbf'):
    """
    Processes each grid to count street intersections within it.

    The grid extent (plus a buffer) is clipped out of the source PBF with the
    ``osmium`` command line tool, intersections are extracted from the clip,
    and the grid file is overwritten with an added ``num_street_intersections``
    column. The intersections themselves are saved next to the grid in
    ``street_intersection_data/street_intersections_<grid_number>.parquet``.
    A grid whose intersections file already exists is skipped.

    Errors are reported with ``print`` and not re-raised, so that one failing
    grid does not abort a batch run.

    Args:
    - grid_path (str): Path to the grid file.
    - osm_pbf_path (str): Path to the source OSM .pbf file the grid extent is
      clipped from.
    """
    # stays None until a temporary clip may exist, so cleanup knows what to remove
    clipped_pbf_path = None
    try:
        # parse the grid number from the file name only, so underscores or dots
        # in directory names cannot end up in it
        grid_number = os.path.basename(grid_path).split('_')[-1].split('.')[0]

        output_dir = os.path.join(os.path.dirname(grid_path), 'street_intersection_data')
        output_file = f'street_intersections_{grid_number}.parquet'
        output_path = os.path.join(output_dir, output_file)

        # check for existing output before paying the cost of loading the grid
        if os.path.exists(output_path):
            logging.info(f'Skipping {output_file} as it already exists')
            print(f'Skipping {output_file} as it already exists')
            return

        grid_gdf = gpd.read_parquet(grid_path)

        # the per-cell counts below are keyed on an 'index' column
        # NOTE(review): reset_index only yields a column called 'index' when the
        # existing index is unnamed -- confirm the grids satisfy this.
        if 'index' not in grid_gdf.columns:
            grid_gdf.reset_index(inplace=True)

        # drop counts from a previous run so the merge below does not duplicate the column
        if 'num_street_intersections' in grid_gdf.columns:
            grid_gdf.drop(columns=['num_street_intersections'], inplace=True)

        grid_gdf_4326 = grid_gdf.to_crs('epsg:4326')
        bounds = grid_gdf_4326.total_bounds
        buffered_box = box(*bounds).buffer(0.01)  # ~1 km buffer in degrees
        # logging.info(f'Started processing grid {grid_number}')
        print(f'Started processing grid {grid_number}')

        # clip .pbf file
        clipped_pbf_path = f'data/{grid_number}.pbf'
        os.makedirs(os.path.dirname(clipped_pbf_path), exist_ok=True)

        bbox_str = ','.join(map(str, buffered_box.bounds))
        subprocess.run([
            'osmium', 'extract',
            '-b', bbox_str,
            osm_pbf_path,
            '-o', clipped_pbf_path,
            '--overwrite'
        ], check=True)
        print(f'clipped PBF for {grid_number}')

        # process with pyosmium
        gdf_nodes = extract_intersections_from_pbf(clipped_pbf_path)
        gdf_nodes.to_crs(grid_gdf.crs, inplace=True)

        # count distinct intersection points per grid cell; cells without any get 0
        joined = gpd.sjoin(grid_gdf, gdf_nodes, how='left', predicate='intersects')
        node_counts_per_grid = (
            joined.groupby('index')['index_right']
            .nunique()
            .reset_index(name='num_street_intersections')
        )
        grid_gdf_f = grid_gdf.merge(node_counts_per_grid, on='index', how='left').fillna(
            {'num_street_intersections': 0}
        )
        grid_gdf_f.to_parquet(grid_path)

        # The intersections file doubles as the "already processed" marker checked
        # above, so it is written last: a failure before this point leaves no
        # marker and the grid is retried on the next run.
        os.makedirs(output_dir, exist_ok=True)
        gdf_nodes.to_parquet(output_path)
        # logging.info(f'Saved intersection data to {output_file}')
        print(f'Saved intersection data to {output_file}')

        # logging.info(f'Successfully processed grid {grid_path}')
        print(f'Successfully processed grid {grid_path}')
    except Exception as e:
        # logging.error(f'Error processing grid {grid_path}: {e}')
        print(f'Error processing grid {grid_path}: {e}')
    finally:
        # clean up temp PBF file on success and on failure alike
        if clipped_pbf_path is not None:
            try:
                if os.path.exists(clipped_pbf_path):
                    os.remove(clipped_pbf_path)
            except OSError as e:
                print(f'Error processing grid {grid_path}: {e}')
        gc.collect()

In [None]:
# sequential: handle the grid files one after another
for grid_file in grids_list:
    process_grid(grid_file)

# # parallel processing if you want to process a lot of files
# num_processes = 5

# with Pool(processes=num_processes) as pool:
#     pool.map(process_grid, grids_list)