Splits the LCZ (local climate zone) reference data into 5 folds, stratified by class and balanced by the number of enclosed tessellation cells (ETCs) per polygon; used for the S1 morphometric-based scheme.

In [1]:
# module imports
import geopandas as gpd
import numpy as np

In [2]:
# load reference polygons
# (paths use forward slashes so they resolve on Windows, Linux and macOS alike;
#  uncomment exactly one line per group to switch city)
ref = gpd.read_file('ref_data/berlin_ref.gpkg')
#ref = gpd.read_file('ref_data/hongkong_ref.gpkg')
#ref = gpd.read_file('ref_data/paris_ref.gpkg')
#ref = gpd.read_file('ref_data/rome_ref.gpkg')
#ref = gpd.read_file('ref_data/saopaulo_ref.gpkg')

# load tessellation cells with morphometrics
data = gpd.read_parquet('momepy/berlin_morphometrics.parquet')
#data = gpd.read_parquet('momepy/hongkong_morphometrics.parquet')
#data = gpd.read_parquet('momepy/paris_morphometrics.parquet')
#data = gpd.read_parquet('momepy/rome_morphometrics.parquet')
#data = gpd.read_parquet('momepy/saopaulo_morphometrics.parquet')

In [3]:
# keep only the polygons whose class ID ('gridcode') is at most 10 (urban
# classes), renumber the rows from 0 and expose that numbering as an 'id' column
ref_urban = ref.loc[ref['gridcode'].le(10)].reset_index(drop=True)
ref_urban['id'] = ref_urban.index

In [4]:
# human-readable names of the urban LCZ classes, keyed by class ID
# (ID 7 is included so that every ID kept by the `gridcode <= 10` filter
#  resolves to a name instead of NaN)
classes = {
    1: "Compact high-rise",
    2: "Compact mid-rise",
    3: "Compact low-rise",
    4: "Open high-rise",
    5: "Open mid-rise",
    6: "Open low-rise",
    7: "Lightweight low-rise",
    8: "Large low-rise",
    9: "Sparsely built",
    10: "Heavy industry",
}

# attach the class name to every polygon; IDs missing from `classes` map to NaN
ref_urban["class_name"] = ref_urban["gridcode"].map(classes)

In [5]:
def stratified_kfold_cell_split(gdf, tessellation, class_col, n_splits=5, random_seed=0):
    """Assign polygons to folds, stratified by class and balanced by cell count.

    Every tessellation cell that intersects a polygon counts towards that
    polygon. Within each class the polygons are shuffled and then handed out
    one at a time to the fold whose running cell total is currently smallest
    (ties go to the lowest fold number).

    Polygons are tracked by row position rather than by index label, so the
    result does not depend on the index of ``gdf`` being unnamed or unique.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Reference polygons. Must contain ``class_col`` and a ``"geometry"``
        column. The input is not modified.
    tessellation : geopandas.GeoDataFrame
        Cells that are spatially joined (``intersects``) to the polygons.
    class_col : str
        Column of ``gdf`` holding the class used for stratification.
    n_splits : int, default 5
        Number of folds.
    random_seed : int, default 0
        Seed used to shuffle the polygons of each class.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of ``gdf`` with two extra columns: ``cell_count`` (int32, number
        of intersecting cells) and ``fold`` (0 .. n_splits - 1). Rows that are
        not reached by the per-class grouping keep ``fold == -1``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be a positive integer, got {n_splits!r}")

    # NOTE: the shuffling below is driven by `random_state`, not by the global
    # RNG; the global seed is still set so that code running after this call
    # sees the same RNG state as before this rewrite.
    np.random.seed(random_seed)
    gdf = gdf.copy()
    n_polygons = len(gdf)

    # Join against a positionally indexed copy of the polygon geometries: with
    # an unnamed 0..n-1 index the join reports the matched polygon's row
    # position in the "index_right" column.
    polygons = gdf[["geometry"]].reset_index(drop=True)
    joined = gpd.sjoin(tessellation, polygons, how="inner", predicate="intersects")

    # number of intersecting cells per polygon position (0 where nothing matched)
    matches = joined.groupby("index_right").size()
    cell_counts = (
        matches.reindex(np.arange(n_polygons), fill_value=0).to_numpy().astype("int32")
    )
    gdf["cell_count"] = cell_counts

    # fold of every polygon by row position; -1 marks "not assigned"
    folds = np.full(n_polygons, -1, dtype="int64")

    # stratify by class and distribute polygons to folds by balancing cell counts
    class_by_position = gdf[class_col].reset_index(drop=True)
    for _, members in class_by_position.groupby(class_by_position):
        # shuffle the polygons of this class (same seed for every class)
        shuffled_positions = members.sample(frac=1, random_state=random_seed).index

        # running cell total per fold, kept as plain Python ints
        fold_cell_sums = [0] * n_splits
        for position in shuffled_positions.tolist():
            # np.argmin returns the first minimum, i.e. the lowest fold on ties
            target = int(np.argmin(fold_cell_sums))
            folds[position] = target
            fold_cell_sums[target] += int(cell_counts[position])

    gdf["fold"] = folds
    return gdf

In [6]:
# assign every urban reference polygon to one of 5 folds (seed fixed at 0)
split_gdf = stratified_kfold_cell_split(
    gdf=ref_urban,
    tessellation=data,
    class_col="gridcode",
    n_splits=5,
    random_seed=0,
)

In [7]:
# keep only the columns that are written to the output file
split_gdf = split_gdf[
    [
        "gridcode",
        "geometry",
        "class_name",
        "fold",
    ]
]

In [8]:
# save split
# (paths use forward slashes so they resolve on Windows, Linux and macOS alike;
#  keep the active line in sync with the city loaded at the top of the notebook)
split_gdf.to_file("ref_data/berlin_ref_splitS1.gpkg", driver="GPKG")
#split_gdf.to_file("ref_data/hongkong_ref_splitS1.gpkg", driver="GPKG")
#split_gdf.to_file("ref_data/paris_ref_splitS1.gpkg", driver="GPKG")
#split_gdf.to_file("ref_data/rome_ref_splitS1.gpkg", driver="GPKG")
#split_gdf.to_file("ref_data/saopaulo_ref_splitS1.gpkg", driver="GPKG")