This notebook contains code for collecting public transport data from OSM (OpenStreetMap) for the generated grids and then calculating the number of public transport options in each 100m x 100m grid cell.

In [None]:
import geopandas as gpd
import os
import glob
import overpass
import logging

In [None]:
# Glob pattern that selects the grid files to process
# (the grids generated by 'generate_grids.ipynb' may already be stored here)
grid_path = 'data/*.parquet'

# glob.glob already returns a list holding every parquet grid file that matches the pattern
grids_list = glob.glob(grid_path)
print(grids_list)

# # configure logging (recommended if you monitor processing over a lot of files)
# log_path = 'logs/public_transport.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
    
# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

In [None]:
def geo_to_overpass_poly(bounds):
    """
    Builds an Overpass API polygon filter string from a bounding box.

    The rectangle is written as a closed ring of five "lat lon" pairs
    (the first corner is repeated at the end), all separated by single spaces.

    Args:
    - bounds (tuple): The bounding box as (minx, miny, maxx, maxy), i.e. x = longitude, y = latitude.

    Returns:
    - str: Filter of the form poly:"lat lon lat lon ..." ready to be placed in an Overpass query.
    """
    west, south, east, north = bounds
    # Walk the box corner by corner with latitude (y) ahead of longitude (x),
    # then repeat the first corner so the ring is closed.
    corners = [(south, west), (south, east), (north, east), (north, west)]
    ring = corners + corners[:1]
    vertex_text = ' '.join(f'{lat} {lon}' for lat, lon in ring)
    return 'poly:"' + vertex_text + '"'

def query_osm_transportation(polygon_wkt):
    """
    Fetches named public transport nodes inside an area from the Overpass API.

    The query is a union of node statements, one per tag filter listed below; every
    statement additionally requires a "name" tag and is restricted to the given area.

    Args:
    - polygon_wkt (str): Area filter inserted verbatim between the parentheses of each node
      statement. Despite the parameter name, this is an Overpass filter such as the
      poly:"lat lon ..." string produced by geo_to_overpass_poly, not a WKT geometry.

    Returns:
    - dict: Result of the overpass client's get() call with verbosity 'geom'
      (GeoJSON-style features, which process_grid loads with GeoDataFrame.from_features).
    """
    # Tag filters that identify public transport nodes
    tag_filters = (
        '["highway" = "bus_stop"]',
        '["public_transport" = "station"]',
        '["public_transport" = "platform"]',
        '["tram"]',
        '["metro"]',
    )
    # One node statement per filter, laid out exactly as in a hand-written union block
    statements = ''.join(
        f'            node{tag_filter}["name"]({polygon_wkt});\n'
        for tag_filter in tag_filters
    )
    query = f'\n        (\n{statements}        );\n    '

    client = overpass.API(timeout=600)
    return client.get(query, verbosity='geom')

def process_grid(grid_path):
    """
    Processes one grid file: queries the public transport nodes inside the grid's bounding
    box, saves them, and writes a per-cell count back into the grid file.

    Steps:
    1. Take the grid number from the file name (text after the last '_' and before the first '.').
    2. Skip the grid if 'public_transport_data/public_transport_points_<number>.parquet'
       already exists next to the grid file.
    3. Query Overpass for the grid's total bounds expressed in EPSG:4326.
    4. Save the returned points, reprojected to the grid's CRS, to that points file.
    5. Spatially join the points to the cells, count the distinct points per cell as
       'pub_trans_count' and overwrite the grid file with the result.

    Every exception is caught and printed, so a failure on one grid does not stop a batch run.

    Args:
    - grid_path (str): Path to the grid file.

    Returns:
    - None
    """
    try:
        # Parse the grid number from the file name only. Splitting the full path goes wrong
        # when the file name has no '_': the last '_'-segment then contains directory
        # components (or is empty for paths starting with '.'), giving a broken or
        # colliding output path.
        grid_file_name = os.path.basename(grid_path)
        grid_number = grid_file_name.split('_')[-1].split('.')[0]
        output_dir = os.path.join(os.path.dirname(grid_path), 'public_transport_data')
        output_file = f'public_transport_points_{grid_number}.parquet'
        output_path = os.path.join(output_dir, output_file)
        os.makedirs(output_dir, exist_ok=True)

        # An existing points file marks this grid as already processed
        if os.path.exists(output_path):
            # logging.info(f'Skipping {output_file} as it already exists')
            print(f'Skipping {output_file} as it already exists')
            return

        grid_gdf = gpd.read_parquet(grid_path)
        # A grid processed in an earlier run already carries the count column; merging on
        # top of it would yield 'pub_trans_count_x' / 'pub_trans_count_y' instead of a
        # refreshed 'pub_trans_count', so the stale column is dropped first.
        grid_gdf = grid_gdf.drop(columns='pub_trans_count', errors='ignore')
        # The counting step below groups by an 'index' column that identifies each cell
        if 'index' not in grid_gdf.columns:
            grid_gdf.reset_index(inplace=True)

        # The query area is the bounding box of the whole grid in EPSG:4326
        grid_gdf_4326 = grid_gdf.to_crs('epsg:4326')
        bounds = grid_gdf_4326.total_bounds
        polygon_wkt = geo_to_overpass_poly(bounds)
        data = query_osm_transportation(polygon_wkt)
        data_gdf = gpd.GeoDataFrame.from_features(data)

        if data_gdf.empty:
            # NOTE(review): in this case no points file is written and the grid file is left
            # without a 'pub_trans_count' column, so the grid is queried again on the next
            # run - confirm that downstream code treats the missing column as zero.
            # logging.info(f'No public transport data found for grid {grid_path}')
            print(f'No public transport data found for grid {grid_path}')
            return

        # Tag the query result as EPSG:4326, then bring it into the grid's own CRS
        data_gdf.set_crs('epsg:4326', inplace=True)
        data_gdf.to_crs(grid_gdf.crs, inplace=True)
        data_gdf.to_parquet(output_path)
        # logging.info(f'Saved public transport data to {output_file}')
        print(f'Saved public transport data to {output_file}')

        # Left join keeps every cell; cells without a match have a missing 'index_right',
        # which nunique() does not count, so they end up with a count of 0.
        joined = gpd.sjoin(grid_gdf, data_gdf, how="left", predicate='intersects')
        node_counts_per_grid = joined.groupby('index')['index_right'].nunique().reset_index(name='pub_trans_count')
        grid_gdf_f = grid_gdf.merge(node_counts_per_grid, on='index', how='left')
        # The grid file is overwritten in place with the added count column
        grid_gdf_f.to_parquet(grid_path)
        # logging.info(f'Successfully processed grid {grid_path}')
        print(f'Successfully processed grid {grid_path}')

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the caller continue
        # logging.error(f'Error processing grid {grid_path}: {e}')
        print(f'Error processing grid {grid_path}: {e}')

In [None]:
# Run every discovered grid file through process_grid and print its
# zero-based position in the list once the call has returned
for grid_index, grid_file in enumerate(grids_list):
    process_grid(grid_file)
    print(f'Processed grid {grid_index}')