# Validation

We want to check whether our identification of face artifacts from street network polygons is correct. We postulate that, in accordance with our definitions:

1. **no** face artifacts contain buildings (with a possible small margin of error, e.g. there could be a bus stop tagged as "building" placed in the middle of a multilane street);
2. **some** urban blocks contain **no** buildings (e.g. squares, parks, sports and service areas).

For an automated validation of our findings, we can use building data from OSM to check statement 1:
Polygons classified as face artifacts, but which contain buildings, are wrongly classified and actually represent urban blocks (type 1 error). Note that an automated checking of statement 2 - i.e. actual face artifacts wrongly classified as urban blocks - is not possible since we have no ground truth (e.g. a list of "empty urban blocks") to test against.

Thus, in this notebook we:
* download building data for all FUAs from OSM
* visually check FUA building data for sufficient completeness
* for the subset of FUAs with a sufficient completeness of building data on OSM,
    - import street network face polygons
    - classify them into face artifacts vs. urban blocks according to the face artifact threshold found for the FUA in Notebook 4 (peaks)
    - for each polygon, compute its area of intersection with the building data set
    - generate pseudo-confusion matrices for different tolerance thresholds (the threshold indicates how many square meters of building surface a face artifact is allowed to contain without being classified as an urban block)
    - visualize results of validation

In [None]:
## shapely 2.0.0.!
import shapely
from shapely import strtree
from shapely.geometry import LineString
from shapely.validation import make_valid

import os
import shutil

os.environ["USE_PYGEOS"] = "0"

import geopandas
import pandas
import dask_geopandas

import numpy
from palettable.cartocolors.qualitative import Bold_6

import os
import warnings

import json

from tqdm import tqdm

import osmnx as ox

**Import meta data**

In [None]:
# load the meta data of the sampled FUAs
sample_path = "../data/sample.parquet"
sample = geopandas.read_parquet(sample_path)

**Download building data for each FUA from OSM**

We do this in 2 steps: 

1. first, for all cities except London, Cologne, and Dortmund - where building data for the entire FUA area can be downloaded in one go;
2. and second, for London, Cologne, and Dortmund, where building data is too large to be downloaded in one go - we need to split the request polygon into smaller polygons, and then recombine the building data

In [None]:
# Step 1: For all cities where data can be downloaded in one go
# (rows for London, Cologne, and Dortmund are removed from `sample` in place)
too_big = sample["eFUA_name"].isin(["London", "Cologne", "Dortmund"])
sample.drop(sample[too_big].index, axis=0, inplace=True)

In [None]:
# Filter warnings about GeoParquet implementation.
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

# make sure the output folder exists before the first file is written
os.makedirs("../data/buildings", exist_ok=True)

# Download the OSM building footprints of every FUA left in `sample`,
# one GeoPackage per city; cities already on disk are skipped.
for ix, row in tqdm(sample.iterrows(), total=len(sample)):
    city = row.eFUA_name

    # check if file exists - if it doesn't, download from OSM:
    if os.path.exists(f"../data/buildings/buildings_{city}.gpkg"):
        print(f"{city} building data already downloaded")
    elif os.path.exists(f"../data/buildings/buildings_{city}_clean.gpkg"):
        print(f"{city} building data already downloaded and preprocessed")
    else:
        print(f"dowloading {city} building data")
        # Download OSM buildings inside the FUA polygon.
        # `features_from_polygon` replaces the deprecated
        # `geometries_from_polygon` and is the same call the grid-cell
        # download for the large cities uses.
        buildings = ox.features_from_polygon(
            sample[sample["eFUA_name"] == city].geometry.values[0],
            tags={"building": True},
        )
        # keep only the geometry column (the OSM tags are not needed)
        buildings_small = buildings[["geometry"]]
        buildings_small.to_file(f"../data/buildings/buildings_{city}.gpkg", index=False)
        # free memory before the next (potentially large) download
        del (buildings, buildings_small)
        print(f"{city} building data downloaded")

In [None]:
# Step 2: for London, Cologne, and Dortmund

# reload the sample meta data from disk
sample = geopandas.read_parquet("../data/sample.parquet")

# separate df holding only the cities whose building data sets are too big
# to download in one go
is_big_city = sample["eFUA_name"].isin(["London", "Dortmund", "Cologne"])
sample_big = sample[is_big_city]

# define a function to separate the FUA area into grid cells (to download building data separately)
def create_grid_geometry(gdf, cell_size):
    """
    Create a geodataframe with grid cells covering the area specified by the input gdf.

    The union of all geometries in ``gdf`` is cut into quadrats of width
    ``cell_size`` and every resulting part becomes one row of the output.

    Arguments:
        gdf (gdf): geodataframe with a polygon/polygons defining the study area
        cell_size (numeric): width of the grid cells in units used by gdf crs

    Returns:
        grid (gdf): gdf with one grid cell per row, in the same crs as the
            input data, with a "grid_id" column equal to the row index
    """

    geometry = gdf["geometry"].unary_union

    # osmnx is imported as `ox` in this notebook; the bare name `osmnx` is not
    # defined and would raise a NameError.
    # NOTE(review): `_quadrat_cut_geometry` is a private OSMnx helper - confirm
    # it is available in the installed OSMnx version.
    geometry_cut = ox.utils_geo._quadrat_cut_geometry(geometry, quadrat_width=cell_size)

    grid = geopandas.GeoDataFrame(geometry=[geometry_cut], crs=gdf.crs)

    # split the cut geometry into one row per grid cell
    grid = grid.explode(index_parts=False, ignore_index=True)

    # Create arbitrary grid id col
    grid["grid_id"] = grid.index

    return grid

In [None]:
# Download building data for the large cities grid cell by grid cell.
# Each non-empty cell is written to its own GeoPackage in a temporary
# per-city subfolder.
for ix in sample_big.index:

    # get meta data for this city
    eFUA_ID = int(sample_big.loc[ix, "eFUA_ID"])
    eFUA_name = str(sample_big.loc[ix, "eFUA_name"])

    # check if file exists - if it doesn't, download from OSM:
    if os.path.exists(f"../data/buildings/buildings_{eFUA_name}.gpkg"):
        print(f"{eFUA_name} building data already downloaded")
    elif os.path.exists(f"../data/buildings/buildings_{eFUA_name}_clean.gpkg"):
        print(f"{eFUA_name} building data already downloaded and preprocessed")
    else:
        print(f"dowloading {eFUA_name} building data")

        # get the geometries of all polygons
        gdf = geopandas.read_parquet(f"../data/{eFUA_ID}/polygons/")
        # make a square grid of cell width 10km on top of it
        grid = create_grid_geometry(gdf, 10000)
        # convert grid back to EPSG:4326 (for OSMnx download)
        grid = grid.to_crs("EPSG:4326")

        # make temp subfolder for current city
        os.makedirs(f"../data/buildings/temp_{eFUA_name}/", exist_ok=True)

        # download building data for each grid cell separately
        # (`cell_id` instead of `ix` so the outer loop variable is not shadowed)
        for cell_id, cell in grid.iterrows():
            # some grid cells don't contain any building data, for which osmnx
            # throws an error. osmnx is imported as `ox`: the bare name `osmnx`
            # raised a NameError here that the former bare `except:` swallowed,
            # so every cell looked empty.
            try:
                buildings = ox.features_from_polygon(
                    cell.geometry,
                    tags={"building": True},
                )
            except Exception as err:
                # report the reason instead of skipping silently
                print(f"no building data for grid cell {cell_id} of {eFUA_name}: {err}")
                continue

            # nothing to save for this grid cell
            if buildings.empty:
                continue

            # drop not needed columns
            buildings = buildings.reset_index()
            buildings = buildings[["osmid", "geometry"]]
            # drop points and linestrings in one pass
            not_point_or_line = ~buildings.geometry.geom_type.isin(["Point", "LineString"])
            buildings = buildings[not_point_or_line].reset_index(drop=True)
            # check that we now only have (multi)polygons in the data set
            assert all(buildings.geometry.geom_type.isin(["Polygon", "MultiPolygon"]))
            # save "cleaned" data set for this grid cell
            buildings.to_file(f"../data/buildings/temp_{eFUA_name}/buildings_{cell_id}.gpkg")
            del buildings

        print(f"all buildings for {eFUA_name} downloaded")
        del (grid, gdf, eFUA_name, eFUA_ID)

In [None]:
# Combine grid cell data sets for each city into one (already cleaned) data set

for ix in sample_big.index:
    # get meta data for this city
    eFUA_name = str(sample_big.loc[ix, "eFUA_name"])
    clean_path = f"../data/buildings/buildings_{eFUA_name}_clean.gpkg"

    if os.path.exists(clean_path):
        print(f"Cleaned data exists for {eFUA_name}")
        continue

    temp_dir = f"../data/buildings/temp_{eFUA_name}/"
    # read every grid cell file; sorted so that the duplicate kept by the
    # deduplication below does not depend on the directory listing order
    cell_frames = [
        geopandas.read_file(temp_dir + filename)
        for filename in sorted(os.listdir(temp_dir))
    ]
    if not cell_frames:
        print(f"No grid cell data found for {eFUA_name}")
        continue

    # combine data from all cells into one gdf with a single concat
    # (concatenating inside the loop copies the growing frame every time)
    gdf = pandas.concat(cell_frames, ignore_index=True)
    print(eFUA_name, len(gdf), " before dedup")
    # drop duplicates (the same osmid read from more than one cell file)
    gdf = gdf.drop_duplicates(subset="osmid", keep="first", ignore_index=True)
    print(eFUA_name, len(gdf), " after dedup")
    # drop osmid column (not needed anymore)
    gdf = gdf[["geometry"]]
    # explode multipolygons
    gdf = gdf.explode(index_parts=False)
    # save file
    gdf.to_file(clean_path, index=False)
    print(f"Cleaned data saved for {eFUA_name}")

In [None]:
# Delete the temporary subfolders that contain building data per grid cell.
# Cologne is included because its building data is downloaded per grid cell
# as well; folders that were never created (download skipped) are ignored.
for eFUA_name in ["London", "Dortmund", "Cologne"]:
    temp_dir = f"../data/buildings/temp_{eFUA_name}"
    if os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)

## Visual processing step in QGIS

In this step, for each FUA, we visually checked in QGIS (by plotting and comparing all buildings, the OSM base layer, and the FUA shape) whether its building mapping in OSM is sufficiently complete as to be used for face artifact validation purposes. Next, in the list for each continent, we noted down cities with enough buildings mapped for validation.

**Africa:** Abidjan, Conakry, Douala, Mogadishu

**Asia:** Aleppo, Kabul, Semarang, Seoul

**South America:** Bucaramanga, Porto Alegre

**North America:** Cincinnati, Dallas, Ottawa, Raleigh, Richmond, Rio Piedras [San Juan], Salt Lake City, San José, Washington D.C.

**Europe:** Amsterdam, Belgrade, Chelyabinsk, Cologne, Dortmund, Helsinki, Katowice, Krakow, Liège, London, Nuremberg, Saratov, Vienna, Warsaw

**Oceania:** Auckland

**Discarded cities**, i.e. cities with enough buildings mapped for validation, but no artifact index threshold found: Comilla, Dhaka, Jombang, Monrovia


**Define FUA subsample for validation**

In [None]:
# FUAs used for validation, grouped by continent
cities_for_validation = [
    # Africa
    "Abidjan", "Conakry", "Douala", "Mogadishu",
    # Asia
    "Aleppo", "Kabul", "Semarang", "Seoul",
    # South America
    "Bucaramanga", "Porto Alegre",
    # North America
    "Cincinnati", "Dallas", "Ottawa", "Raleigh", "Richmond",
    "Río Piedras [San Juan]", "Salt Lake City", "San Jose", "Washington D.C.",
    # Oceania
    "Auckland",
    # Europe
    "Amsterdam", "Belgrade", "Chelyabinsk", "Cologne", "Dortmund",
    "Helsinki", "Katowice", "Krakow", "Liège", "London",
    "Nuremberg", "Saratov", "Vienna", "Warsaw",
]


**Preprocessing of building data for validation subsample**

Clean the building data (will only be needed for the cities where we did download the building data in one go)

In [None]:
# restrict the sample meta data to the FUAs selected for validation
in_validation = sample["eFUA_name"].isin(cities_for_validation)
sample_validation = sample[in_validation]

In [None]:
# Filter warnings about GeoParquet implementation.
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

# Clean the raw building download of every validation FUA: reproject to the
# crs of the FUA's polygons, keep geometry only, drop points and linestrings,
# explode multipolygons, save as "*_clean.gpkg" and remove the raw file.
for ix, row in tqdm(sample_validation.iterrows(), total=len(sample_validation)):
    city = row.eFUA_name
    raw_path = f"../data/buildings/buildings_{city}.gpkg"
    clean_path = f"../data/buildings/buildings_{city}_clean.gpkg"

    if os.path.exists(clean_path):
        print(f"{city} building data already preprocessed")
        continue
    if not os.path.exists(raw_path):
        print(f"{city} building data not downloaded yet")
        continue

    # the polygons are read to obtain the target crs for the buildings
    polygons = geopandas.read_parquet(f"../data/{int(row.eFUA_ID)}/polygons/")
    buildings = geopandas.read_file(raw_path)
    buildings = buildings.to_crs(polygons.crs)

    # drop tags (not needed for analysis)
    buildings = buildings[["geometry"]]

    # drop points and linestrings in a single pass
    not_point_or_line = ~buildings.geometry.geom_type.isin(["Point", "LineString"])
    buildings = buildings[not_point_or_line].reset_index(drop=True)

    # explode multipolygons
    buildings = buildings.explode(index_parts=False)

    # check that we now only have polygons in the data set
    assert all(buildings.geometry.geom_type == "Polygon")

    # save "cleaned" data set (without writing the index as a column)
    buildings.to_file(clean_path, index=False)

    # delete original data set
    os.remove(raw_path)

**Read in peak results (face artifact thresholds)**

In [None]:
# read the peak results; the context manager closes the file handle again
with open("../results/04_peaks_results.json", encoding="utf-8") as results_file:
    results = json.load(results_file)

**Classify urban blocks and calculate block/building overlap**

In [None]:
os.makedirs("../data/validation_temp", exist_ok=True)

# Filter warnings about GeoParquet implementation.
warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")

# column holding the shape index that the face artifact threshold refers to
option = "circular_compactness_index"

# For every validation FUA: flag polygons as face artifacts, compute the
# building area inside each polygon, and write the polygons back to disk.
for ix, row in tqdm(sample_validation.iterrows(), total=len(sample_validation)):
    city = row.eFUA_name
    myid = int(row.eFUA_ID)
    # read in buildings and polygons
    buildings = geopandas.read_file(f"../data/buildings/buildings_{city}_clean.gpkg")
    polygons = geopandas.read_parquet(f"../data/{myid}/polygons/")
    buildings = buildings.to_crs(polygons.crs)
    assert buildings.crs == polygons.crs

    # find predicted artifacts: True where the index is at or below the
    # threshold found for this FUA, False otherwise
    threshold = results[city][option]["threshold"]
    polygons["is_artifact"] = polygons[option] <= threshold

    # find overlap of polygons with buildings
    mytree = strtree.STRtree(geoms=buildings.geometry)
    q = mytree.query(polygons.geometry, predicate="intersects")
    # q[0] ...polygon positions (0-based, in row order of `polygons`)
    # q[1] ...building positions (0-based, in row order of `buildings`)

    #### find absolute value of built area in polygon [in units of the polygons crs, squared]

    # building area within each polygon (float, default 0); filled by row
    # position because the query returns positions, not index labels
    intersection_area = numpy.zeros(len(polygons), dtype=float)

    polygon_geoms = polygons.geometry
    building_geoms = buildings.geometry

    # group the building positions by polygon position once instead of
    # scanning the whole query result for every polygon
    for polygon_pos, building_pos in pandas.Series(q[1]).groupby(q[0]):
        # union of all buildings intersecting this polygon, so that
        # overlapping building footprints are not counted twice
        built_up = building_geoms.iloc[building_pos.to_numpy()].unary_union
        intersection_area[polygon_pos] = (
            polygon_geoms.iloc[polygon_pos].intersection(built_up).area
        )

    # add to polygon table
    polygons["intersection_area"] = intersection_area

    # "area" and "intersects_buildings" are not part of the saved output;
    # drop them in case the input data set carries them
    polygons = polygons.drop(columns=["area", "intersects_buildings"], errors="ignore")

    # save polygons to a partitioned GeoParquet, overwriting original polygons
    polygons = dask_geopandas.from_geopandas(polygons, npartitions=10)
    polygons.to_parquet(f"../data/{myid}/polygons/")

    del (buildings, polygons, mytree, q)
