# Download the spatial data representing OSM street network

This notebook downloads the boundaries of functional urban areas (FUAs) from GHSL, randomly samples up to 25 FUAs with more than 1 million inhabitants from each continent to capture a range of sizes and geographical variation, and downloads their street networks from OpenStreetMap.

In [5]:
import os

import geopandas
import pandas
import pooch
import osmnx as ox
import dask_geopandas

from tqdm import tqdm

## Download data

Download FUA polygons. We are using _GHS functional urban areas, derived from GHS-UCDB R2019A (2015)_ from [GHSL - Global Human Settlement Layer](https://ghsl.jrc.ec.europa.eu/ghs_fua.php).

In [None]:
# Source archive with the GHS FUA polygons
fua_path = "https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_FUA_UCDB2015_GLOBE_R2019A/V1-0/GHS_FUA_UCDB2015_GLOBE_R2019A_54009_1K_V1_0.zip"

# Fetch the archive through pooch and keep the path of the local copy.
# `known_hash` pins the expected checksum of the downloaded file.
fua_cache = pooch.retrieve(
    url=fua_path,
    known_hash="d54de59b82b8c4d64a710f90ccd554975a3be92233f14115ac154094c3549979",
)

Downloading data from 'https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_FUA_UCDB2015_GLOBE_R2019A/V1-0/GHS_FUA_UCDB2015_GLOBE_R2019A_54009_1K_V1_0.zip' to file '/home/jovyan/.cache/pooch/42ab047cecb0653ef75601013a14e233-GHS_FUA_UCDB2015_GLOBE_R2019A_54009_1K_V1_0.zip'.


Read polygons and continent geometry (built-in dataset in geopandas coming from Natural Earth).

In [None]:
# The GeoPackage is read straight out of the downloaded zip archive
# ("<archive>!<member>" path syntax)
fua = geopandas.read_file(
    f"{fua_cache}!GHS_FUA_UCDB2015_GLOBE_R2019A_54009_1K_V1_0.gpkg"
)

# Country polygons carrying the continent attribute (Natural Earth, bundled with geopandas)
# NOTE(review): `geopandas.datasets` is deprecated/removed in recent GeoPandas
# releases - confirm the pinned version still ships it.
naturalearth_path = geopandas.datasets.get_path("naturalearth_lowres")
continents = geopandas.read_file(naturalearth_path)

Retain only FUAs with population (as of 2015) larger than 1 000 000 inhabitants. That ensures a reasonable size of a street network.

In [None]:
# Keep only FUAs whose 2015 population exceeds one million
population_filter = "FUA_p_2015 > 1000000"
fua = fua.query(population_filter)

Attach continent information to each FUA.

In [None]:
# Lookup table: ISO country code -> continent
continent_lookup = continents[["continent", "iso_a3"]]

# Join on the ISO country code. `merge` defaults to an inner join, so FUAs whose
# `Cntry_ISO` has no counterpart in `iso_a3` are dropped.
# NOTE(review): verify that no FUAs are lost here due to missing or placeholder
# ISO codes in the Natural Earth table.
fua = fua.merge(continent_lookup, left_on="Cntry_ISO", right_on="iso_a3")

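Sample 25 FUAs from each continent using a fixed random seed for reproducibility. Continents with 25 or fewer FUAs contribute all of them.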

In [None]:
# One subset per continent, in order of first appearance. Continents with more
# than 25 FUAs are randomly down-sampled to 25 (fixed seed); smaller ones are
# kept in full.
sample = pandas.concat(
    [
        subset.sample(25, random_state=42) if len(subset) > 25 else subset
        for subset in (
            fua[fua.continent == name] for name in fua.continent.unique()
        )
    ]
)

Reproject geometry to WGS84 (EPSG:4326), as required by OSM, and repair any invalid geometries.

In [None]:
# OSM queries expect geographic coordinates (EPSG:4326)
sample = sample.to_crs(4326)

# A zero-width buffer is applied to all geometries if at least one of them is invalid
has_invalid_geometry = not sample.is_valid.all()
if has_invalid_geometry:
    sample.geometry = sample.buffer(0)

Loop over the sampled FUAs and download their street network from OSM. This step may take some time (~13 hours).

In [3]:
# Define which combination of OSM tags should be used. This covers what would usually be used in a morphological analysis.
type_filter = '["highway"~"living_street|motorway|motorway_link|pedestrian|primary|primary_link|residential|secondary|secondary_link|service|tertiary|tertiary_link|trunk|trunk_link|unclassified"]'

# Loop over all samples
for ix, row in tqdm(sample.iterrows(), total=len(sample)):
    # Each sampled FUA gets its own folder named after its eFUA_ID
    folder = f"../data/{int(row.eFUA_ID)}"
    path = f"{folder}/roads_osm.parquet"

    # The full loop takes many hours. Skip cases that are already on disk so an
    # interrupted run can be resumed and re-running the cell does not overwrite
    # (or fail on) existing outputs. Delete the output to force a fresh download.
    if os.path.exists(path):
        continue

    # Download OSM graph
    streets_graph = ox.graph_from_polygon(
        row.geometry,
        network_type="all_private",
        custom_filter=type_filter,
        retain_all=True,
        simplify=False,
    )
    # Project graph to the local UTM zone (in meters with a relatively small error)
    streets_graph = ox.projection.project_graph(streets_graph)
    # Create an undirected graph to avoid duplicated geometry and convert it to a GeoDataFrame
    gdf = ox.graph_to_gdfs(
        ox.get_undirected(streets_graph),
        nodes=False,
        edges=True,
        node_geometry=False,
        fill_edge_geometry=True,
    )
    # Ensure tags are a string and not different dtype (as list) so we can save it
    gdf.highway = gdf.highway.astype(str)

    # Create a folder for the sample case
    os.makedirs(folder, exist_ok=True)

    # Save the street network as a GeoParquet, using only necessary columns. We are not interested in other.
    # Write to a temporary file first and move it into place afterwards, so an
    # interrupted write never leaves a truncated file that the skip above would trust.
    tmp_path = f"{path}.tmp"
    gdf[["highway", "geometry"]].to_parquet(tmp_path)
    os.replace(tmp_path, path)

  1%|▎                                         | 1/131 [00:20<43:32, 20.10s/it]IOStream.flush timed out
 99%|███████████████████████████████████▋| 130/131 [12:40:07<14:55, 895.29s/it]IOStream.flush timed out
IOStream.flush timed out
100%|████████████████████████████████████| 131/131 [12:53:30<00:00, 354.28s/it]


Save the sample boundaries, including names and continents, for future reference.

In [None]:
# Persist the sampled FUA boundaries together with their attributes
sample_path = "../data/sample.parquet"
sample.to_parquet(sample_path)

Split each street network into two partitions so that the individual files fit within the GitHub file size limit.

In [12]:
for ix, row in tqdm(sample.iterrows(), total=len(sample)):
    path = f"../data/{int(row.eFUA_ID)}/roads_osm.parquet"

    # The partitioned dataset is written as a directory. If `path` already is
    # one, this case has been processed; skipping keeps the cell re-runnable
    # (`os.remove` would otherwise fail on the directory).
    if os.path.isdir(path):
        continue

    # Load the single-file street network and split it into two partitions.
    # `reset_index()` turns the existing index into regular columns first.
    polygons = dask_geopandas.from_geopandas(
        geopandas.read_parquet(path).reset_index(), npartitions=2
    )

    # Write the partitioned copy next to the original and only then swap them,
    # so a failed write cannot destroy the downloaded data.
    partitioned_path = f"{path}.partitioned"
    polygons.to_parquet(partitioned_path)
    os.remove(path)
    os.rename(partitioned_path, path)

100%|████████████████████████████████████████| 131/131 [04:30<00:00,  2.06s/it]
