This notebook contains code on collecting green spaces from OSM for the generated grids and then calculating the area covered by these green spaces for each 100m x 100m grid

In [7]:
import geopandas as gpd
import os
import glob
import logging
import osmium
import shapely.geometry
from shapely.geometry import box
import subprocess
import gc
from multiprocess import Pool

In [8]:
# set path for grid files
# you might have grids generated here from the 'generate_grids.ipynb'
grid_path = 'data/*.parquet'

# create a list of all parquet grid files from the specified directory
grids_list = [parquet for parquet in glob.glob(grid_path)]
print(grids_list)

# configure logging (recommended if you monitor processing over a lot of files)
log_path = 'logs/green_spaces_osm.log'

# ensure log directory exists
log_dir = os.path.dirname(log_path)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
logging.basicConfig(filename=log_path, level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s', force=True)

['/Volumes/ssd1/eurostat_grid/grids_100/grids_100_716.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_982.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_855.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_349.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_103.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_972.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_631.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_976.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_107.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_635.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_986.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_712.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_558.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_851.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_477.parquet', '/Volumes/ssd1/eurostat_grid/grids_100/grids_100_292.p

In [9]:
# Define green area tags
GREEN_TAGS = {
    ("leisure", "park"),
    ("leisure", "garden"),
    ("leisure", "common"),
    ("leisure", "recreation_ground"),
    ("leisure", "village_green"),
    ("landuse", "grass"),
    ("landuse", "forest"),  
    ("natural", "grassland"),
    ("natural", "wood"),   
}

def is_green_area(tags):
    if tags.get("access") == "private":
        return False
    return any(tags.get(k) == v for k, v in GREEN_TAGS)

def extract_green_areas_from_pbf(pbf_path):
    """
    Extract green areas (parks, forests, etc.) from an OSM PBF file.
    Returns a GeoDataFrame of polygons.
    """
    green_areas = []
    wkb_factory = osmium.geom.WKBFactory()

    class GreenAreaHandler(osmium.SimpleHandler):
        def area(self, a):
            try:
                if not is_green_area(a.tags):
                    return
                geom = wkb_factory.create_multipolygon(a)
                green_areas.append({
                    "id": a.id,
                    "name": a.tags.get("name"),
                    "type": next((f"{k}={v}" for k, v in a.tags if (k, v) in GREEN_TAGS), None),
                    "geometry": shapely.wkb.loads(geom, hex=True),
                })
            except Exception:
                pass  # Invalid geometry or tags

    handler = GreenAreaHandler()
    handler.apply_file(pbf_path, locations=True)

    if not green_areas:
        return gpd.GeoDataFrame(columns=["id", "name", "type", "geometry"], geometry="geometry", crs="EPSG:4326")

    return gpd.GeoDataFrame(green_areas, crs="EPSG:4326")

In [10]:
def process_grid(grid_path):
    """
    Processes each grid file to query and save green areas data.
    
    Args:
    - grid_path (str): Path to the grid file.
    """
    try:
        grid_number = grid_path.split('_')[-1].split('.')[0]
        output_dir = os.path.join(os.path.dirname(grid_path), 'green_spaces_osm_data')
        output_file = f'green_areas_{grid_number}.parquet'
        os.makedirs(output_dir, exist_ok=True)

        if os.path.exists(os.path.join(output_dir, output_file)):
            logging.info(f'Skipping {output_file} as it already exists')
            # print(f'Skipping {output_file} as it already exists')
            return

        grid_gdf = gpd.read_parquet(grid_path)
        if 'index' not in grid_gdf.columns:
            grid_gdf.reset_index(inplace=True)

        if 'green_area' in grid_gdf.columns:
            grid_gdf = grid_gdf.drop(columns='green_area')

        grid_gdf_4326 = grid_gdf.to_crs('epsg:4326')
        bounds = grid_gdf_4326.total_bounds
        buffered_box = box(*bounds).buffer(0.01)
        logging.info(f'Started processing grid {grid_number}')
        # print(f'Started processing grid {grid_number}')

        # clip .pbf file
        clipped_pbf_path = f'/Volumes/ssd1/eurostat_grid/grids_100/pbf_extracts/{grid_number}.pbf'
        os.makedirs(os.path.dirname(clipped_pbf_path), exist_ok=True)

        bbox_str = ','.join(map(str, buffered_box.bounds))
        subprocess.run([
            'osmium', 'extract',
            '-b', bbox_str,
            '/Volumes/ssd1/osm_europe/europe-latest.osm.pbf',
            '-o', clipped_pbf_path,
            '--overwrite'
        ], check=True)
        logging.info(f'clipped PBF for {grid_number}')
        # print(f'clipped PBF for {grid_number}')

        # process with pyosmium
        data_gdf = extract_green_areas_from_pbf(clipped_pbf_path)
        
        if data_gdf.empty:
            logging.info(f'No green spaces found for grid {grid_path}')
            # print(f'No green spaces found for grid {grid_path}')
            return

        data_gdf.to_crs(grid_gdf.crs, inplace=True)
        data_gdf.to_parquet(os.path.join(output_dir, output_file))
        logging.info(f'Saved green spaces to {output_file}')
        # print(f'Saved green spaces to {output_file}')

        intersection = gpd.overlay(grid_gdf, data_gdf, how='intersection')
        dissolved = intersection.dissolve(by='index')
        dissolved['green_area'] = dissolved.geometry.area
        grid_gdf_f = grid_gdf.merge(dissolved['green_area'], on='index', how='left').fillna({'green_area': 0})
        grid_gdf_f.to_parquet(grid_path)

        # # clean up temp PBF file
        # if os.path.exists(clipped_pbf_path):
        #     os.remove(clipped_pbf_path)
            
        del data_gdf, grid_gdf, grid_gdf_f
        
        logging.info(f'Successfully processed grid {grid_path}')
        # print(f'Successfully processed grid {grid_path}')
        gc.collect()
    except Exception as e:
        logging.error(f'Error processing grid {grid_path}: {e}')
        # print(f'Error processing grid {grid_path}: {e}')

In [11]:
# sequential
for elem in grids_list:
    if int(elem.split('_')[-1].split('.')[0]) == 228:
        process_grid(elem)

# # parallel processsing if you want to process a lot of files
# num_processes = 3

# with Pool(processes=num_processes) as pool:
#     pool.map(process_grid, grids_list)