This notebook collects sidewalk data from OpenStreetMap (OSM) for the generated grids and then calculates the total length of sidewalks within each 100 m x 100 m grid cell.

In [None]:
import geopandas as gpd
import osmnx as ox
import os
import glob
import logging
from multiprocess import Pool
import gc
from pyrosm import OSM
from shapely.geometry import Polygon, box
import subprocess
import numpy as np
import pandas as pd

In [None]:
# glob pattern that matches the grid files
# (grids produced by 'generate_grids.ipynb' are expected to be found here)
grid_path = 'data/*.parquet'

# gather every parquet grid file that matches the pattern
grids_list = glob.glob(grid_path)
print(grids_list)

# # configure logging (recommended if you monitor processing over a lot of files)
# log_path = 'logs/street_walk.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
    
# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

In [None]:
def calculate_sidewalk_length(group):
    """
    Measures how much sidewalk falls inside a single grid cell.

    Args:
    - group (DataFrame): Rows belonging to one grid cell. The 'geometry' column
      holds the cell shape and 'retain_linestring' holds the sidewalk line
      strings matched to that cell.

    Returns:
    - float: Length of the merged sidewalks clipped to the cell, or 0 when the
      group is empty, has no sidewalk geometry, or the clipped result is empty.
    """
    if group.empty:
        return 0

    # merge all line strings of the group into one geometry before clipping
    merged_lines = group['retain_linestring'].unary_union
    if not merged_lines:
        return 0

    # the cell shape is taken from the first row of the group
    cell = group.iloc[0]['geometry']
    clipped = cell.intersection(merged_lines)
    return clipped.length if clipped else 0

def process_grid(grid_path):
    """
    Processes a single grid file to calculate sidewalk lengths.

    Downloads the OSM walking network covering the grid's bounding box, saves
    the network edges to 'street_walk_data/street_walk_<grid number>.parquet'
    next to the grid file, and writes the grid file back with a
    'street_walk_length' column holding the sidewalk length inside each cell.
    A grid is skipped when both the edges file and the length column already
    exist. Any error is logged (with its traceback) instead of being raised.

    Args:
    - grid_path (str): Path to the grid file.

    Returns:
    - None
    """
    try:
        ox.settings.log_console = True
        ox.settings.use_cache = True

        # parse the grid number from the file name only; splitting the whole
        # path would pick up '_' or '.' characters found in directory names
        grid_number = os.path.basename(grid_path).split('_')[-1].split('.')[0]
        grid_gdf = gpd.read_parquet(grid_path)

        output_dir = os.path.join(os.path.dirname(grid_path), 'street_walk_data')
        output_file = f'street_walk_{grid_number}.parquet'
        output_path = os.path.join(output_dir, output_file)
        os.makedirs(output_dir, exist_ok=True)

        if os.path.exists(output_path) and 'street_walk_length' in grid_gdf.columns:
            # logging.info(f'Skipping {output_file} as it already exists')
            return

        # the 'index' column is the per-cell key used for grouping below
        if 'index' not in grid_gdf.columns:
            grid_gdf.reset_index(inplace=True)

        # drop a previous result so it is recomputed from scratch
        if 'street_walk_length' in grid_gdf.columns:
            grid_gdf.drop(columns=['street_walk_length'], inplace=True)

        # the download bounds are expressed in lon/lat (EPSG:4326)
        grid_gdf_4326 = grid_gdf.to_crs('epsg:4326')
        west, south, east, north = grid_gdf_4326.total_bounds
        logging.info(f'Started processing grid {grid_number}')

        # NOTE(review): the bbox tuple is passed as (north, south, east, west);
        # newer OSMnx releases appear to expect (left, bottom, right, top) --
        # confirm against the installed OSMnx version
        G = ox.graph_from_bbox(bbox=(north, south, east, west), network_type='walk', simplify=False, retain_all=True)
        gdf_edges = ox.convert.graph_to_gdfs(G, nodes=False, edges=True)
        # bring the edges into the grid's CRS so the spatial join and the
        # measured lengths use the grid's units
        gdf_edges.to_crs(grid_gdf.crs, inplace=True)

        if 'name' in gdf_edges.columns:
            gdf_edges = gdf_edges.loc[:, ['name', 'geometry']]
        else:
            gdf_edges = gdf_edges.loc[:, ['geometry']]

        # keep a copy of the line strings in a regular column so they are still
        # available on the joined rows, whose 'geometry' is the grid cell
        gdf_edges['retain_linestring'] = gdf_edges.geometry

        # save the walking-network edges
        gdf_edges.to_parquet(output_path)
        logging.info(f'Saved sidewalks to {output_path}')

        # calculate sidewalk length for each grid
        joined = gpd.sjoin(grid_gdf, gdf_edges, how="left", predicate='intersects')
        lengths_by_cell = joined.groupby('index').apply(calculate_sidewalk_length)
        # look the lengths up through each cell's 'index' value; assigning the
        # grouped result directly would align on the frame's row labels, which
        # are not guaranteed to equal the 'index' column
        grid_gdf['street_walk_length'] = grid_gdf['index'].map(lengths_by_cell)

        grid_gdf.to_parquet(grid_path)

        logging.info(f'Successfully processed grid {grid_path}')

        # free memory
        del G, gdf_edges, joined, grid_gdf
        gc.collect()
    except Exception as e:
        # record the failure together with its traceback instead of raising
        logging.exception(f'Error processing grid {grid_path}: {e}')

In [None]:
# sequential: handle the grid files one after another
for grid_file in grids_list:
    process_grid(grid_file)

# # parallel processing if you want to process a lot of files
# num_processes = 5

# with Pool(processes=num_processes) as pool:
#     pool.map(process_grid, grids_list)