This notebook contains code for collecting sidewalk data from OSM for the generated grids and then calculating the length of sidewalks within each 100 m x 100 m grid cell.

In [None]:
import gc
import glob
import os
import subprocess

import geopandas as gpd
import osmium
import shapely.geometry
import shapely.wkb
from shapely.geometry import box

In [None]:
# glob pattern for the grid files to process
# (these may be the grids written by 'generate_grids.ipynb')
grid_path = 'data/*.parquet'

# glob.glob already returns a list of every parquet file matching the pattern
grids_list = glob.glob(grid_path)
print(grids_list)

# # configure logging (recommended if you monitor processing over a lot of files)
# log_path = 'logs/street_walk.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
    
# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

In [None]:
def is_walkable(tags):
    """
    Decides whether a way should be treated as walkable based on its tags.

    Args:
    - tags: Tag collection exposing ``get(key)`` that returns None for a
      missing key (a plain dict works).

    Returns:
    - bool: False when the way has no ``highway`` tag, when its ``highway``
      value is excluded, or when it is tagged ``foot=no``,
      ``service=private`` or ``area=yes``; True otherwise.
    """
    # Membership is an exact match, so "motor" on its own never matches the
    # real OSM values "motorway" / "motorway_link"; they are listed
    # explicitly so motorways are not counted as walkable.
    excluded_highways = {
        "abandoned", "bus_guideway", "construction", "cycleway", "motor",
        "motorway", "motorway_link",
        "no", "planned", "platform", "proposed", "raceway", "razed"
    }
    hwy = tags.get("highway")
    if not hwy or hwy in excluded_highways:
        return False
    if tags.get("foot") == "no":
        return False
    if tags.get("service") == "private":
        return False
    if tags.get("area") == "yes":
        return False
    return True


def get_relevant_tags(tags):
    """
    Picks out the tags that are kept alongside each way.

    Args:
    - tags: Tag collection exposing ``get(key)`` (a plain dict works).

    Returns:
    - dict: The "highway", "foot", "sidewalk", "surface" and "name" entries
      whose value is not None, in that order.
    """
    kept = {}
    for key in ("highway", "foot", "sidewalk", "surface", "name"):
        value = tags.get(key)
        if value is not None:
            kept[key] = value
    return kept

    
def extract_walking_ways(pbf_path):
    """
    Reads a .pbf file and collects every walkable way as a line geometry.

    Args:
    - pbf_path (str): Path to the OSM .pbf file to read.

    Returns:
    - GeoDataFrame: One row per walkable way with the columns "id", "tags"
      (the subset from get_relevant_tags) and "geometry", in EPSG:4326.
      When no way qualifies, the frame is empty but still has a geometry
      column and CRS, so it can be reprojected by the caller.
    """
    wkbfab = osmium.geom.WKBFactory()
    geometries = []

    class Handler(osmium.SimpleHandler):
        def way(self, w):
            if not is_walkable(w.tags):
                return
            try:
                wkb = wkbfab.create_linestring(w)
                geom = shapely.wkb.loads(wkb, hex=True)
                tag_subset = get_relevant_tags(w.tags)
                geometries.append((w.id, geom, tag_subset))
            except Exception:
                # best-effort: skip ways whose geometry cannot be built.
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit still stop a long-running extraction.
                pass

    handler = Handler()
    # locations=True makes node coordinates available when building way geometries
    handler.apply_file(pbf_path, locations=True)

    if not geometries:
        # keep an active geometry column and a CRS on the empty result
        return gpd.GeoDataFrame(
            {"id": [], "tags": []}, geometry=[], crs="EPSG:4326"
        )

    ids, geoms, tags = zip(*geometries)
    return gpd.GeoDataFrame({
        "id": ids,
        "tags": tags,
        "geometry": geoms
    }, crs="EPSG:4326")

In [None]:
def calculate_sidewalk_length(group):
    """
    Computes the sidewalk length that falls inside one grid cell.

    The line strings in the group's 'retain_linestring' column are merged
    into a single geometry, which is then intersected with the 'geometry'
    of the group's first row.

    Args:
    - group (DataFrame): Rows belonging to one grid cell, with the columns
      'geometry' and 'retain_linestring'.

    Returns:
    - float: Length of the intersection, or 0 when the group is empty or
      either the merged lines or the intersection is empty.
    """
    if group.empty:
        return 0

    merged_lines = group['retain_linestring'].union_all()
    if not merged_lines:
        return 0

    # every row of the group is used with the first row's geometry
    cell_geometry = group.iloc[0]['geometry']
    clipped = cell_geometry.intersection(merged_lines)
    if not clipped:
        return 0
    return clipped.length


def process_grid(grid_path, osm_pbf_path='/Volumes/ssd1/osm_europe/europe-latest.osm.pbf'):
    """
    Processes one grid file to calculate sidewalk lengths per grid cell.

    Clips the source .pbf to the (buffered) extent of the grid with the
    ``osmium extract`` command, extracts the walkable ways, saves them next
    to the grid in 'street_walk_data/', and writes a 'street_walk_length'
    column back into the grid file.

    Args:
    - grid_path (str): Path to the grid file.
    - osm_pbf_path (str): Path to the source OSM .pbf file that is clipped
      for each grid. Defaults to the previously hard-coded location.

    Returns:
    - None. Any exception is caught and reported with print, so one failing
      grid does not stop a batch run.
    """
    try:
        # parse the number from the file name only, so underscores or dots
        # in directory names cannot leak into it
        grid_number = os.path.basename(grid_path).split('_')[-1].split('.')[0]

        grid_gdf = gpd.read_parquet(grid_path)
        output_dir = os.path.join(os.path.dirname(grid_path), 'street_walk_data')
        output_file = f'street_walk_{grid_number}.parquet'
        output_path = os.path.join(output_dir, output_file)

        os.makedirs(output_dir, exist_ok=True)

        # already processed: ways file exists and the grid has its length column
        if os.path.exists(output_path) and 'street_walk_length' in grid_gdf.columns:
            return

        if 'index' not in grid_gdf.columns:
            grid_gdf.reset_index(inplace=True)

        if 'street_walk_length' in grid_gdf.columns:
            grid_gdf.drop(columns=['street_walk_length'], inplace=True)

        # convert grid to WGS84 (lat/lon)
        grid_gdf_4326 = grid_gdf.to_crs('EPSG:4326')
        print(f'Started processing grid {grid_number}')

        bounds = grid_gdf_4326.total_bounds
        buffered_box = box(*bounds).buffer(0.01)  # ~1 km buffer in degrees

        # clip .pbf file
        clipped_pbf_path = f'data/{grid_number}.pbf'
        os.makedirs(os.path.dirname(clipped_pbf_path), exist_ok=True)

        bbox_str = ','.join(map(str, buffered_box.bounds))
        subprocess.run([
            'osmium', 'extract',
            '-b', bbox_str,
            osm_pbf_path,
            '-o', clipped_pbf_path,
            '--overwrite'
        ], check=True)
        print(f'clipped PBF for {grid_number}')

        # process with pyosmium
        gdf_edges = extract_walking_ways(clipped_pbf_path)

        gdf_edges.to_crs(grid_gdf.crs, inplace=True)
        # sjoin keeps only the left geometry, so keep a copy of the line
        # geometry in a regular column for the length calculation
        gdf_edges['retain_linestring'] = gdf_edges.geometry

        gdf_edges.to_parquet(output_path)
        # logging.info(f'Saved sidewalks data to {output_file}')
        print(f'Saved sidewalks data to {output_file}')

        joined = gpd.sjoin(grid_gdf, gdf_edges, how="left", predicate='intersects')
        lengths = joined.groupby('index').apply(calculate_sidewalk_length)
        # look the lengths up by the same 'index' key used for grouping
        # instead of relying on the frame's row labels matching that key
        grid_gdf['street_walk_length'] = grid_gdf['index'].map(lengths)

        grid_gdf.to_parquet(grid_path)
        # logging.info(f'Successfully processed grid {grid_path}')
        print(f'Successfully processed grid {grid_path}')

        del gdf_edges, grid_gdf
        gc.collect()
    except Exception as e:
        # logging.error(f'Error processing grid {grid_path}: {e}')
        print(f'Error processing grid {grid_path}: {e}')

In [None]:
# sequential: handle the grid files one after another
for grid_file in grids_list:
    process_grid(grid_file)

# # parallel processing if you want to process a lot of files
# # (requires `from multiprocessing import Pool`)
# num_processes = 5

# with Pool(processes=num_processes) as pool:
#     pool.map(process_grid, grids_list)