## Package and dirs

In [1]:
# %pip install pyarrow
# %pip install --upgrade pandas
# %pip install xlsxwriter


import pyarrow
import pandas as pd
import numpy as np
from pathlib import Path
import geopandas as gpd


import sys, platform
print("=== Environment Info ===")
print(f"Python version : {sys.version.split()[0]}")
print(f"Python env     : {sys.executable}")
print(f"Platform       : {platform.platform()}")
print(f"numpy          : {np.__version__}")
print(f"pandas         : {pd.__version__}")
print(f"geopandas      : {gpd.__version__}")
print("========================")


=== Environment Info ===
Python version : 3.11.13
Python env     : c:\Users\pc\.conda\envs\geo_env\python.exe
Platform       : Windows-10-10.0.22631-SP0
numpy          : 1.26.4
pandas         : 2.3.2
geopandas      : 0.14.4


In [2]:

# Define the file paths used throughout the notebook.
# NOTE(review): `folder` is a hard-coded absolute Windows path; anyone running
# this on another machine has to edit it here before running anything else.
folder = Path(r"E:\London\colouringbritain\data-extract-2025-09-01")
csv_file = folder / "building_attributes.csv"

# Subfolder for cached files (Feather / GeoPackage / CSV outputs are written here)
cache_dir = folder / "cache"
# Creates only the last path component: `parents=True` is not passed, so this
# raises FileNotFoundError if `folder` itself does not exist.
cache_dir.mkdir(exist_ok=True)   # create if not exists



## Merge building polygons with attributes data from points

### 1. reload building attributes - points

In [3]:
import pyarrow.feather as feather

# cache_dir = Path(r"C:\Users\yingjiel\data\colouringbritain")  # --> when using SDSS remote desktop

out_feather = cache_dir / "building_attributes_subset.feather"
df_building_subset = feather.read_feather(out_feather)

In [4]:

# df_building_subset.head()
print(df_building_subset.columns.tolist())

# # Find duplicates based on building_id
# duplicates = df_building_subset[df_building_subset.duplicated(subset="building_id", keep=False)]
# # Optionally, sort for easier inspection
# duplicates = duplicates.sort_values(by="building_id")
# print(duplicates)

cols_pts_selected = ['building_id', 'location_postcode', 
                     'location_latitude', 'location_longitude', 
                     'current_landuse_group', 'current_landuse_order',
                     'sust_breeam_rating', 'sust_dec', 'sust_retrofit_date',
                     ]

building_points = df_building_subset[cols_pts_selected].copy()

['building_id', 'ref_toid', 'ref_osm_id', 'location_name', 'location_town', 'location_postcode', 'location_latitude', 'location_longitude', 'current_landuse_group', 'current_landuse_order', 'building_attachment_form', 'date_change_building_use', 'date_year', 'size_storeys_attic', 'size_storeys_core', 'size_storeys_basement', 'size_height_apex', 'size_floor_area_ground', 'size_floor_area_total', 'sust_breeam_rating', 'sust_dec', 'sust_retrofit_date', 'construction_core_material', 'construction_secondary_materials', 'construction_roof_covering', 'is_domestic', 'context_front_garden', 'context_back_garden', 'context_flats_garden', 'context_green_space_distance', 'context_green_space_distance', 'context_tree_distance', 'age_retrofit_date']


In [5]:


# Drop duplicate rows across all columns
building_points_nodup = building_points.drop_duplicates()               # step 2: drop duplicate rows

print(building_points.shape)
print(building_points_nodup.shape) # --> the same shape, so no duplicate rows


# Auto-rename duplicated columns (if any) by appending _1, _2, etc.
def dedup_columns(df):
    """
    Make the column labels of `df` unique by suffixing repeats with _1, _2, ...

    The first occurrence of every label is kept unchanged. Each later
    occurrence gets the smallest free numeric suffix. A suffix is skipped when
    the resulting name is already an original column label or was already
    handed out, so e.g. ["a", "a", "a_1"] becomes ["a", "a_2", "a_1"] instead
    of producing a second "a_1".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may contain duplicate labels.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its columns are renamed in place.
    """
    reserved = set(df.columns)   # original labels must never be re-issued as a generated name
    taken = set()                # names already assigned in the output
    counters = {}                # last suffix used per original label
    new_cols = []
    for col in df.columns:
        if col not in taken:
            name = col
        else:
            k = counters.get(col, 0) + 1
            name = f"{col}_{k}"
            # Skip suffixes that would clash with a real or already generated column.
            while name in reserved or name in taken:
                k += 1
                name = f"{col}_{k}"
            counters[col] = k
        taken.add(name)
        new_cols.append(name)
    df.columns = new_cols
    return df

building_points = dedup_columns(building_points)

(10180876, 9)
(10180876, 9)


### Convert to a point GeoDataFrame

In [6]:
# Make a point GeoDataFrame from the lat/lon columns.
# Rows missing either coordinate are dropped first, so `building_pts` can have
# fewer rows than `building_points`.
building_points_df = building_points.dropna(subset=["location_latitude", "location_longitude"]).copy()

building_pts = gpd.GeoDataFrame(
    building_points_df,
    geometry=gpd.points_from_xy(
        x=building_points_df["location_longitude"],   # x = longitude
        y=building_points_df["location_latitude"],    # y = latitude
    ),
    crs="EPSG:4326",  # WGS84; the CRS is set on the GeoDataFrame itself
)


# # Save as GeoPackage -- will take 13 mins
# gpkg_path = cache_dir / "building_pts.gpkg"
# building_pts.to_file(gpkg_path, driver="GPKG", index=False)

### 2. reload building polygon  

In [7]:
# NOTE(review): despite its name, `buildings_parquet` points at a Feather file
# and is read with `gpd.read_feather` below; consider renaming it.
buildings_parquet = cache_dir / "buildings_clean.feather"

# Reload the cached building polygons (avoids re-reading the original source)
building_poly = gpd.read_feather(buildings_parquet)


In [8]:
# print(building_poly.head())
print(building_poly.columns.tolist())

print(building_poly.shape)

['fid', 'area', 'perimeter', 'geometry']
(2223481, 4)


### spatial join 

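It is possible for one polygon to contain multiple points. In that case the join returns one row per (polygon, point) pair, so the result has to be aggregated back to one row per polygon (done in the aggregation step further below).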

In [9]:
import geopandas as gpd

print(building_pts.crs)
print(building_poly.crs)


# Reproject both GeoDataFrames
# For London, use a projected CRS in meters. The two best choices:
#   EPSG:27700 — OSGB 1936 / British National Grid (BNG)
#   Standard for Great Britain; great local distance accuracy around London.
building_pts = building_pts.to_crs(epsg=27700)
building_poly = building_poly.to_crs(epsg=27700)

print(building_pts.crs)
print(building_poly.crs)

EPSG:4326
{"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "ProjectedCRS", "name": "OSGB36 / British National Grid", "base_crs": {"name": "OSGB36", "datum": {"type": "GeodeticReferenceFrame", "name": "Ordnance Survey of Great Britain 1936", "ellipsoid": {"name": "Airy 1830", "semi_major_axis": 6377563.396, "inverse_flattening": 299.3249646}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "id": {"authority": "EPSG", "code": 4277}}, "conversion": {"name": "British National Grid", "method": {"name": "Transverse Mercator", "id": {"authority": "EPSG", "code": 9807}}, "parameters": [{"name": "Latitude of natural origin", "value": 49, "unit": "degree", "id": {"authority": "EPSG", "code": 8801}}, {"name": "Longitude of natural origin", "value": -2, "unit": "degree", "id":

In [10]:

# Pick the right predicate:
#    - "contains" : point must be strictly inside polygon (boundary points excluded)
#    - "covers"   : includes points on polygon boundary (often safer)
predicate = "covers"

bld_with_pts = gpd.sjoin(
    building_poly,   # LEFT: keep all buildings
    building_pts,    # RIGHT: bring point attrs in where they fall inside
    how="left",
    predicate=predicate,
    lsuffix="bld",
    rsuffix="pt",
)

print(bld_with_pts.shape)

(2841826, 14)


`sjoin` is a row-wise join.

If one polygon covers multiple points, then the polygon row is **repeated** once for each matching point.

So you’ll get one row per (polygon, point) pair.

In [11]:
# print(bld_with_pts.columns.tolist())
# print(building_poly.columns.tolist())
# print(bld_with_pts.head())

# Identify points that didn't match any building
pts_matched = bld_with_pts[bld_with_pts["building_id"].notna()]  # building_id from points that matched
print(pts_matched.head())


   fid          area  perimeter  \
2  3.0  4.776945e-09   0.000326   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   

                                            geometry   index_pt  building_id  \
2  POLYGON ((530958.680 200117.570, 530962.290 20...  2981433.0    1164790.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  3787818.0     937225.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  4497128.0    3546140.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  2926106.0     956250.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  4040057.0    1003486.0   

  location_postcode  location_latitude  location_longitude  \
2                             51.68447            -0.10702   
3                             51.66841            -0.17639   
3                             51.66847            -0.17635   
3                             51.66848            -0.17594  

#### Identify records that one polygon covers multiple points

In [12]:
# add a count of how many points each polygon covers
bld_with_pts["pts_count"] = bld_with_pts.groupby(bld_with_pts.index)["building_id"].transform("count")

# filter polygons that cover more than 1 point
polys_multi_pts = bld_with_pts[bld_with_pts["pts_count"] > 1]

print(polys_multi_pts.head())

   fid          area  perimeter  \
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   
3  4.0  5.876390e-08   0.002627   

                                            geometry   index_pt  building_id  \
3  POLYGON ((526208.730 198238.610, 526212.740 19...  3787818.0     937225.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  4497128.0    3546140.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  2926106.0     956250.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  4040057.0    1003486.0   
3  POLYGON ((526208.730 198238.610, 526212.740 19...  3937069.0     927964.0   

  location_postcode  location_latitude  location_longitude  \
3                             51.66841            -0.17639   
3                             51.66847            -0.17635   
3                             51.66848            -0.17594   
3                             51.66866            -0.17636  

In [13]:
# 2) Choose a stable polygon key
poly_key = "fid"
pts_key = "building_id"

# helpers
def mode_or_na(s: pd.Series):
    """
    Most frequent non-missing value of a Series.

    Missing values are discarded before counting. When nothing is left the
    result is pandas.NA. When several values tie for the highest frequency,
    the first entry of `Series.mode()` is returned.
    """
    non_missing = s.dropna()
    if non_missing.empty:
        return pd.NA
    return non_missing.mode().iloc[0]


# Alternative mode function that breaks ties by first appearance
def mode_safe(s: pd.Series):
    """
    Most frequent usable value of a Series, with a first-seen tie-break.

    "Usable" means not missing and not an empty / whitespace-only string.
    When several values share the highest count, the one that occurs earliest
    in the cleaned Series wins. Returns pd.NA when no usable value exists.
    """
    # Treat blank strings like missing values, then discard both.
    cleaned = s.dropna().replace(r'^\s*$', pd.NA, regex=True).dropna()
    if cleaned.empty:
        return pd.NA

    freq = cleaned.value_counts()
    winners = set(freq.index[freq.eq(freq.max())])   # every value tied for the top count
    # Walk the values in their original order; the first winner is the answer.
    return next((value for value in cleaned if value in winners), None)
        

def uniques_sorted(s: pd.Series):
    """
    Sorted list of the distinct usable values in a Series.

    Missing values and empty / whitespace-only strings are removed first.
    The remaining distinct values are returned in ascending order so the
    output does not depend on row order. An empty list is returned when no
    usable value is left.
    """
    cleaned = s.dropna().replace(r'^\s*$', pd.NA, regex=True).dropna()
    if cleaned.empty:
        return []
    distinct = list(pd.unique(cleaned))
    distinct.sort()
    return distinct

In [14]:

# 3) aggregate to one row per polygon
# `bld_with_pts` holds one row per (polygon, point) pair from the spatial join.
# Grouping on `poly_key` collapses it back to a single summary row per polygon.
# dropna=False keeps rows whose key is missing as their own group instead of
# silently dropping them.
grp = bld_with_pts.groupby(poly_key, dropna=False)
agg_pts = grp.agg(
    point_count          = (pts_key, "count"),                        # number of matched points
    landuse_mode         = ("current_landuse_order", mode_safe),      # most common land use
    landuse_unique       = ("current_landuse_order", uniques_sorted), # all unique land uses
    breeam_mode          = ("sust_breeam_rating", mode_safe),         # most common BREEAM rating
    breeam_unique        = ("sust_breeam_rating", uniques_sorted),    # all unique BREEAM ratings
).reset_index()

# 4) merge back to polygons so geometry is included
# Work on a copy so `building_poly` stays untouched. If the key is not a
# column, the index is promoted to a column and renamed to `poly_key`.
polys_unique = building_poly.copy()
if poly_key not in polys_unique.columns:
    polys_unique = polys_unique.reset_index().rename(columns={"index": poly_key})




In [15]:
print(agg_pts.head())
print(agg_pts.shape)
print(polys_unique.shape)

   fid  point_count landuse_mode landuse_unique breeam_mode breeam_unique
0  1.0            0         <NA>             []        <NA>            []
1  2.0            0         <NA>             []        <NA>            []
2  3.0            1  Residential  [Residential]        <NA>            []
3  4.0            5  Residential  [Residential]        <NA>            []
4  5.0            0         <NA>             []        <NA>            []
(2223481, 6)
(2223481, 4)


In [16]:

merge1 = polys_unique.merge(agg_pts, on=poly_key, how="left")
merge1 = gpd.GeoDataFrame(merge1, geometry="geometry", crs=building_poly.crs)

# merge1: one row per polygon with counts, modes, and unique lists
print(merge1.head())


   fid          area  perimeter  \
0  1.0  3.503437e-09   0.000266   
1  2.0  3.830134e-09   0.000308   
2  3.0  4.776945e-09   0.000326   
3  4.0  5.876390e-08   0.002627   
4  5.0  2.593630e-09   0.000232   

                                            geometry  point_count  \
0  POLYGON ((530850.030 200467.340, 530853.140 20...            0   
1  POLYGON ((530946.580 200101.170, 530942.780 20...            0   
2  POLYGON ((530958.680 200117.570, 530962.290 20...            1   
3  POLYGON ((526208.730 198238.610, 526212.740 19...            5   
4  POLYGON ((524271.910 197156.219, 524277.781 19...            0   

  landuse_mode landuse_unique breeam_mode breeam_unique  
0         <NA>             []        <NA>            []  
1         <NA>             []        <NA>            []  
2  Residential  [Residential]        <NA>            []  
3  Residential  [Residential]        <NA>            []  
4         <NA>             []        <NA>            []  


In [17]:
print(merge1.shape)

(2223481, 9)


#### Fallback nearest join for unmatched points

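(1) finds which points were already matched by the primary join (used here only to report matched / unmatched counts),

(2) for every polygon that received no point in the primary join, finds the nearest point(s) within a maximum distance (in meters), and

(3) aggregates those fallback matches per polygon and merges them into the polygon table, so each polygon records whether its attributes came from the primary join or from the nearest-point fallback.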

In [18]:
# 1) Points matched by the primary join (some polygons may repeat; we just need the set)
#    `pts_key` values are NaN on polygon rows without any point, hence the dropna().
matched_pts = set(bld_with_pts[pts_key].dropna().unique())
print(f"\n Matched points: {len(matched_pts)}")

# 2) Split points into matched / unmatched
#    NOTE(review): `unmatched_pts` is only counted here. The nearest-neighbour
#    fallback in the later cell searches all of `building_pts`, not this subset.
unmatched_pts = building_pts[~building_pts[pts_key].isin(matched_pts)].copy()
print(f"\n Unmatched points: {len(unmatched_pts)}")


 Matched points: 2632484

 Unmatched points: 5693084


In [19]:
# print(merge1.head())


# -------------------------------------------------------------------
# Assumptions:
# - `merge1` is already built from the PRIMARY polygons-left join ("covers")
#   and contains per-polygon fields like `point_count`, plus mode/unique cols.
# - We now only handle polygons with point_count == 0 via NEAREST fallback.
# -------------------------------------------------------------------


# 1) Find polygons that did not get any matched points in the primary join
no_match_polys = merge1.loc[merge1["point_count"].fillna(0).eq(0), [poly_key, "geometry"]]
attr_cols = ["current_landuse_order", "sust_breeam_rating",
             "sust_dec", "sust_retrofit_date"]
print(no_match_polys.head())

     fid                                           geometry
0    1.0  POLYGON ((530850.030 200467.340, 530853.140 20...
1    2.0  POLYGON ((530946.580 200101.170, 530942.780 20...
4    5.0  POLYGON ((524271.910 197156.219, 524277.781 19...
10  11.0  POLYGON ((523756.723 197059.965, 523761.078 19...
12  13.0  POLYGON ((526317.830 197736.050, 526317.720 19...


In [20]:
import numpy as np  # hoisted from the middle of the cell

# constants for clarity / reruns
BNG_EPSG = 27700       # British National Grid (meters)
MAX_DIST = 5           # meters in BNG; tune as needed

# Per-polygon summary columns that the nearest-point fallback may fill in.
fallback_cols = ["nearest_dist_m",
                 "landuse_mode", "landuse_unique", "breeam_mode", "breeam_unique"]

# Make sure every column the fallback touches exists on `merge1`, so the merge
# below always yields a matching "<col>_fb" column for each of them.
for col in ["point_count"] + fallback_cols:
    if col not in merge1.columns:
        merge1[col] = pd.NA

# Label the source of each polygon's attributes. Polygons that received at
# least one point in the primary ("covers") join are marked "pip"; the others
# stay NA until (and unless) the fallback finds a neighbour.
# The guard keeps the labels stable when the cell is re-run.
if "match_method" not in merge1.columns:
    merge1["match_method"] = pd.NA
    merge1.loc[merge1["point_count"].fillna(0) > 0, "match_method"] = "pip"

if no_match_polys.empty:
    # Every polygon already has a primary match, so there is nothing to fall
    # back on. Still define `merge2` so the cells below keep working.
    merge2 = merge1.copy()
else:
    # 2) Project both layers for the distance-based join (London → EPSG:27700)
    poly_m = no_match_polys.to_crs(BNG_EPSG)   # British National Grid (meters)
    pts_m = building_pts.to_crs(BNG_EPSG)

    # 3) Nearest join: polygons on the left, pulling attributes from the closest point.
    #    `sjoin_nearest` returns one row per equidistant nearest neighbour, so a
    #    polygon can appear more than once when several points tie.
    #    NOTE(review): the search runs over ALL points, including points that
    #    the primary join already assigned to another polygon. Restrict it to
    #    `unmatched_pts` if a point must never feed two polygons.
    nearest = gpd.sjoin_nearest(
        poly_m,
        pts_m[[pts_key, "geometry"] + attr_cols],
        how="left",
        distance_col="nearest_dist_m",
        max_distance=MAX_DIST,
        lsuffix="bld",
        rsuffix="pt",
    ).to_crs(no_match_polys.crs)

    # 4) Aggregate fallback attributes per polygon (ties are summarised together)
    fallback_agg = nearest.groupby(poly_key).agg(
        point_count    = (pts_key, "count"),               # count of tied nearest points
        nearest_dist_m = ("nearest_dist_m", "min"),        # min = actual nearest distance
        landuse_mode   = ("current_landuse_order", mode_safe),
        landuse_unique = ("current_landuse_order", uniques_sorted),
        breeam_mode    = ("sust_breeam_rating", mode_safe),
        breeam_unique  = ("sust_breeam_rating", uniques_sorted),
    ).reset_index()

    # Polygons that truly received a nearest match (within MAX_DIST)
    fb_ids = set(fallback_agg.loc[fallback_agg["point_count"].fillna(0) > 0, poly_key])

    # 5) Attach the fallback summaries as *_fb columns; primary columns keep their names
    merge2 = merge1.merge(
        fallback_agg,
        on=poly_key,
        how="left",
        suffixes=("", "_fb"),
    )

    print(merge2.columns.tolist())

    # Use the fallback only where the primary join found nothing (count 0 or NA)
    # AND the fallback actually found a point (count > 0).
    use_fb = (
        merge2["point_count"].fillna(0).eq(0)
        & merge2["point_count_fb"].fillna(0).gt(0)
    )

    # Row by row: take the *_fb value where `use_fb` is True, keep the primary
    # value everywhere else.
    for col in ["point_count"] + fallback_cols:
        merge2[col] = np.where(use_fb, merge2[col + "_fb"], merge2[col])

    # Drop the temporary *_fb columns so only the unified columns remain.
    merge2 = merge2.drop(columns=[col + "_fb" for col in ["point_count"] + fallback_cols])

    # Polygons filled by the fallback are labelled "nearest"; "pip" is never
    # overwritten (primary wins).
    mask_nearest_fill = merge2[poly_key].isin(fb_ids) & merge2["match_method"].isna()
    merge2.loc[mask_nearest_fill, "match_method"] = "nearest"


# --- final enriched polygons (merge2) ---
print(
    merge2[
        [poly_key, "point_count", "match_method", "nearest_dist_m",
         "landuse_mode", "breeam_mode"]
    ].head()
)

['fid', 'area', 'perimeter', 'geometry', 'point_count', 'landuse_mode', 'landuse_unique', 'breeam_mode', 'breeam_unique', 'nearest_dist_m', 'match_method', 'point_count_fb', 'nearest_dist_m_fb', 'landuse_mode_fb', 'landuse_unique_fb', 'breeam_mode_fb', 'breeam_unique_fb']
   fid  point_count match_method nearest_dist_m landuse_mode breeam_mode
0  1.0          1.0      nearest       0.605724  Residential        <NA>
1  2.0          1.0      nearest       0.707167  Residential        <NA>
2  3.0          1.0          pip           <NA>  Residential        <NA>
3  4.0          5.0          pip           <NA>  Residential        <NA>
4  5.0          1.0      nearest       1.694663  Residential        <NA>


The above code has merged the primary and fallback data

- merge1 → polygons with attributes from the primary join (covers)

- merge2 → same polygons, but with fallback nearest values filled for those that had no points in the primary join → The final complete data

### check data

In [25]:
print(merge1.shape)
print(merge2.shape)


# merge1 = merge1.drop(columns=["area", "perimeter"], errors="ignore")
# merge2 = merge2.drop(columns=["area", "perimeter"], errors="ignore")


# print(merge1.head())
# print(merge2.head())


# print(merge1.columns)
print(merge2.columns)



(2223481, 11)
(2223481, 11)
Index(['fid', 'area', 'perimeter', 'geometry', 'point_count', 'landuse_mode',
       'landuse_unique', 'breeam_mode', 'breeam_unique', 'nearest_dist_m',
       'match_method'],
      dtype='object')


### save data

`.copy()` is important: without it, bld_with_attr = merge2 just creates a second reference to the same object. Any edits to one would also change the other.

With .copy(), you get an independent object that you can modify safely.

In [None]:
bld_with_attr = merge2.copy()

# 1. Save as Feather (very fast reload in Python, good for local caching) -- will take 12 seconds
feather_path = cache_dir / "bld_with_attr.feather"
bld_with_attr.to_feather(feather_path)

### reload data

In [9]:
import geopandas as gpd
from pathlib import Path

# path
feather_path = cache_dir / "bld_with_attr.feather"

# reload
bld_with_attr = gpd.read_feather(feather_path)

print("Loaded", len(bld_with_attr), "features")

Loaded 2223481 features


GeoPackage fields can’t store Python lists/dicts, so writing a column that holds such values (here `landuse_unique` and `breeam_unique`) fails with an error from Fiona. Convert those columns to something GPKG understands (e.g., TEXT via JSON) or explode them before writing.

Here’s a drop-in fixer that:
- detects non-scalar columns (list/dict/set/tuple/ndarray),
- converts them to JSON strings (keeping None as null),
- then returns the sanitized copy, ready to be written to a file.

In [18]:
import json, numpy as np, pandas as pd
import geopandas as gpd
# EDITED: add this import once at the top of your file/cell
from pandas.api.types import CategoricalDtype

def sanitize_for_gpkg(gdf: gpd.GeoDataFrame):
    """
    Return a copy of `gdf` whose attribute columns can be written to a GeoPackage.

    Steps (the input frame is never modified):
      1. Columns named like a reserved primary key ("fid", "ogc_fid", "rowid",
         case-insensitive) are renamed to "<name>_attr".
      2. Columns containing list / dict / tuple / set / ndarray values are
         converted to JSON text. Sets and arrays are serialised as JSON arrays;
         a value that still cannot be serialised falls back to `str(value)`.
      3. Remaining dtypes are normalised: datetimes → formatted text,
         categorical / object → pandas "string", bool → int8,
         integers → int64, floats → float64. Nullable integer / boolean
         columns that contain missing values are cast to nullable "Int64"
         instead, because a plain NumPy integer cannot hold <NA>.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame to sanitise; its active geometry column is left untouched.

    Returns
    -------
    geopandas.GeoDataFrame
        Sanitised copy.

    Raises
    ------
    AssertionError
        If a reserved field name is still present after renaming.
    """
    g = gdf.copy()
    geom = g.geometry.name

    def _is_reserved(name):
        # Only string labels can clash with a reserved name; guards `.lower()`.
        return isinstance(name, str) and name.lower() in reserved

    # 1) Rename reserved PK names so a user column is not mistaken for the
    #    layer's feature-id field.
    reserved = {"fid", "ogc_fid", "rowid"}
    rename_map = {c: f"{c}_attr" for c in g.columns if _is_reserved(c)}
    if rename_map:
        g = g.rename(columns=rename_map)

    # 2) Convert non-scalar values (list/dict/array/tuple/set) to JSON TEXT
    def is_non_scalar(v):
        return isinstance(v, (list, dict, tuple, set, np.ndarray))

    def json_default(obj):
        # Called by json.dumps for objects it cannot encode natively.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (set, frozenset)):
            return sorted(obj, key=str)      # deterministic order for unordered sets
        if isinstance(obj, np.generic):
            return obj.item()                # NumPy scalar → plain Python scalar
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def to_text(v):
        if v is None or (isinstance(v, float) and pd.isna(v)):
            return None
        if is_non_scalar(v):
            try:
                return json.dumps(v, ensure_ascii=False, default=json_default)
            except (TypeError, ValueError, RecursionError):
                # Best effort: keep a readable representation rather than fail.
                return str(v)
        return v

    for c in g.columns:
        if c == geom:
            continue
        if g[c].map(is_non_scalar).any():
            g[c] = g[c].map(to_text).astype("string")

    # 3) Normalize dtypes to OGR-friendly ones
    for c in g.columns:
        if c == geom:
            continue
        s = g[c]
        if pd.api.types.is_datetime64_any_dtype(s):
            g[c] = s.dt.strftime("%Y-%m-%d %H:%M:%S")  # TEXT
        elif isinstance(s.dtype, CategoricalDtype):
            g[c] = s.astype("string")
        elif s.dtype == "object":
            g[c] = s.astype("string")
        elif pd.api.types.is_bool_dtype(s):
            # Nullable booleans with <NA> cannot become int8; keep the missing values.
            g[c] = s.astype("Int64") if s.isna().any() else s.astype("int8")
        elif pd.api.types.is_integer_dtype(s):
            # Nullable integers with <NA> cannot become int64; keep the missing values.
            g[c] = s.astype("Int64") if s.isna().any() else s.astype("int64")
        elif pd.api.types.is_float_dtype(s):
            g[c] = s.astype("float64")

    # Safety check: ensure no reserved names remain
    assert not any(_is_reserved(c) for c in g.columns), "Reserved field name still present."

    return g


In [19]:
# 2. Save as GeoPackage (portable, works with GIS software like QGIS/ArcGIS) -- will take 13 mins
gpkg_path = cache_dir / "bld_with_attr.gpkg"

# ---- use it ---- 
g = sanitize_for_gpkg(bld_with_attr) 
# g.to_file(gpkg_path, driver="GPKG", layer="bld_with_attr", index=False)

### save data - gpkg - smaller 

In [20]:
print(g.columns.tolist())

['fid_attr', 'area', 'perimeter', 'geometry', 'point_count', 'landuse_mode', 'landuse_unique', 'breeam_mode', 'breeam_unique', 'nearest_dist_m', 'match_method']


In [None]:
# %pip install pyogrio
# pyogrio is faster and smaller.

from shapely import set_precision
import pandas as pd
import pyogrio


# Start from the sanitized frame and drop the bulky columns (raw area/perimeter
# and the JSON-text lists of unique values); errors="ignore" tolerates columns
# that are already absent.
g_small = g.copy().drop(columns=["area", "perimeter", 'landuse_unique', 'breeam_unique'], errors="ignore")

print(g_small.columns.tolist())

# code strings → ints + lookups
# Each label column is replaced by a compact "<col>_code" integer column, and a
# CSV lookup (code → label) is written next to the other cached files.
# NOTE(review): pd.factorize assigns code -1 to missing values by default, and
# -1 has no row in the lookup CSV — confirm consumers treat -1 as "unknown".
for col in ["landuse_mode","breeam_mode","match_method"]:
    if col in g_small.columns:  # skip columns that are not present
        codes, cats = pd.factorize(g_small[col], sort=True)   # sort=True → codes follow sorted label order
        g_small[col+"_code"] = pd.Series(codes, index=g_small.index).astype("Int16")
        g_small.drop(columns=[col], inplace=True)
        csv_path = cache_dir / f"bld_with_attr_{col}_lookup.csv"
        pd.DataFrame({col+"_code": range(len(cats)), col: cats}).to_csv(csv_path, index=False)




['fid_attr', 'geometry', 'point_count', 'landuse_mode', 'breeam_mode', 'nearest_dist_m', 'match_method']


#### Simplify geometries to reduce file size

In [37]:
from shapely import set_precision
from shapely.ops import unary_union
from shapely.geometry import Polygon, MultiPolygon

def remove_small_parts(geom, min_area_m2=1.0):
    """
    Filter out polygon parts whose area is below `min_area_m2`.

    - None or empty geometries are returned unchanged.
    - A Polygon is returned as-is when large enough, otherwise None.
    - A MultiPolygon keeps only its large-enough parts: None when no part
      survives, the bare Polygon when exactly one survives, otherwise a new
      MultiPolygon of the survivors.
    - Any other geometry type is returned unchanged.

    Areas are in the units of the geometry's coordinates (square metres only
    when the layer is in a metric CRS).
    """
    if geom is None or geom.is_empty:
        return geom

    if isinstance(geom, MultiPolygon):
        kept = [part for part in geom.geoms if part.area >= min_area_m2]
        if not kept:
            return None
        if len(kept) > 1:
            return MultiPolygon(kept)
        return kept[0]

    if isinstance(geom, Polygon):
        if geom.area >= min_area_m2:
            return geom
        return None

    return geom


# 3) Quantize coordinates (snap to grid) — reduces duplicate/near-duplicate vertices
#    Pick a grid that is safe for your CRS (meters in BNG). 0.5 m is often visually lossless for buildings.
#    (Currently disabled; kept for reference.)
# g_small["geometry"] = g_small.geometry.apply(lambda geom: set_precision(geom, grid_size=0.5))  # meters

# 4) Topology-preserving simplification — removes vertices while keeping shape
#    Use a *small* tolerance first (e.g., 0.2–0.5 m for building outlines).
#    Note: shapely.simplify(preserve_topology=True) keeps topology but is heavier.
#    This overwrites the geometry column in place, so re-running the cell
#    simplifies the already simplified shapes again.
g_small["geometry"] = g_small.geometry.simplify(tolerance=0.5, preserve_topology=True)

# 5) Remove tiny polygon parts that contribute vertices but add no value.
#    `remove_small_parts` returns None for a Polygon below the threshold, so
#    such rows stay in the table but end up without a geometry.
#    It filters polygon parts only; interior rings are not touched.
g_small["geometry"] = g_small.geometry.apply(lambda g: remove_small_parts(g, min_area_m2=0.5))

In [38]:

# quantize nearest distance to centimeters
if "nearest_dist_m" in g_small.columns:  # EDITED: guard if missing
    nd = pd.to_numeric(g_small["nearest_dist_m"], errors="coerce")  # EDITED: robust to non-numeric
    nd = nd.replace([np.inf, -np.inf], np.nan)  # guard against inf

    # Convert meters → centimeters, round to nearest integer, then build a nullable Int32 array
    nearest_cm_ext = pd.array(
        np.rint(nd.to_numpy() * 100),          # integer-valued float array (NaN stays NaN)
        dtype=pd.Int32Dtype()                  # pandas nullable Int32 (supports <NA>)
    )
    # EDITED: use pandas nullable Int32 that supports NA values
    g_small["nearest_dist_cm"] = pd.Series(nearest_cm_ext, index=g_small.index)
    g_small.drop(columns=["nearest_dist_m"], inplace=True)

# counts to Int16
if "point_count" in g_small.columns:  # EDITED: guard + robust cast
    g_small["point_count"] = pd.to_numeric(g_small["point_count"], errors="coerce").astype(pd.Int16Dtype())

# # write compact GPKG without spatial index
# gpkg_path = cache_dir / "bld_with_attr_compact.gpkg"
# pyogrio.write_dataframe(
#     g_small, 
#     gpkg_path, 
#     layer="bld_attr",
#     driver="GPKG", 
#     layer_options={"SPATIAL_INDEX": "NO"},
# )


### prep for urban cooling model 

In [None]:
import pyogrio

print(g_small.columns.tolist())  # ### prep for urban cooling model

# If the column exists, rename it
if "landuse_mode_code" in g_small.columns:
    g_small = g_small.rename(columns={"landuse_mode_code": "type"})

# EDITED: fallback — if "type" still missing but a label exists, use it
if "type" not in g_small.columns and "landuse_mode" in g_small.columns:  # EDITED
    g_small = g_small.rename(columns={"landuse_mode": "type"})           # EDITED

# EDITED: ensure fid_attr exists (common source column is "fid")
if "fid_attr" not in g_small.columns and "fid" in g_small.columns:       # EDITED
    g_small = g_small.rename(columns={"fid": "fid_attr"})                # EDITED

print(g_small.columns.tolist())

# make a copy with only selected columns
keep_cols = ["fid_attr", "type", "geometry"]

g_selected = g_small[[col for col in keep_cols if col in g_small.columns]].copy()

# sanity check to avoid writing an empty/invalid layer
missing = [c for c in keep_cols if c not in g_selected.columns]
if missing:
    print(f"Warning: missing columns in output: {missing}")

# write compact GPKG without spatial index
gpkg_path = cache_dir / "bld_with_attr_compact_ucm2.gpkg"

# ensure output directory exists
gpkg_path.parent.mkdir(parents=True, exist_ok=True)

pyogrio.write_dataframe(
    g_selected, 
    gpkg_path, 
    layer="bld_attr_ucm",                                                # EDITED: distinct layer name
    driver="GPKG", 
    layer_options={"SPATIAL_INDEX": "NO"},
)
print(f"Saved: {gpkg_path}")                                             # EDITED

['fid_attr', 'geometry', 'point_count', 'type', 'breeam_mode_code', 'match_method_code', 'nearest_dist_cm']
['fid_attr', 'geometry', 'point_count', 'type', 'breeam_mode_code', 'match_method_code', 'nearest_dist_cm']
Saved: E:\London\colouringbritain\data-extract-2025-09-01\cache\bld_with_attr_compact_ucm2.gpkg


In [None]:
# import numpy as np
# import pandas as pd

# # --- Make a safe copy so we don’t mutate the original GeoDataFrame ---
# g_out = g_selected.copy()

# # --- Loop through each column and normalize its dtype ---
# for col in g_out.columns:
#     if col == g_out.geometry.name:  # Skip the geometry column (leave as is)
#         continue
#     s = g_out[col]

#     # 1) Integers (including nullable Int16/Int32) → plain int32
#     #    Shapefile/DBF only supports fixed-width numeric fields, no NA.
#     #    So: convert to numeric, replace NaN with -1, then cast to int32.
#     if pd.api.types.is_integer_dtype(s):
#         g_out[col] = pd.to_numeric(s, errors="coerce").fillna(-1).astype(np.int32)

#     # 2) Floats → float64
#     #    Shapefile supports floats, so keep as float64 for maximum compatibility.
#     elif pd.api.types.is_float_dtype(s):
#         g_out[col] = pd.to_numeric(s, errors="coerce").astype(np.float64)

#     # 3) Booleans → int8
#     #    DBF doesn’t have a native boolean type; store as 0/1 integers.
#     elif pd.api.types.is_bool_dtype(s):
#         g_out[col] = s.astype(np.int8)

#     # 4) Datetime → string
#     #    Shapefile DBF doesn’t handle full pandas datetimes well.
#     #    Convert to human-readable text (YYYY-MM-DD HH:MM:SS).
#     elif pd.api.types.is_datetime64_any_dtype(s):
#         g_out[col] = s.dt.strftime("%Y-%m-%d %H:%M:%S")

#     # 5) Everything else (object, string, category, mixed) → string
#     #    Ensure missing values stay as None (DBF null), not "nan" text.
#     else:
#         g_out[col] = s.astype(str).replace("nan", None)

# # --- Save as ESRI Shapefile ---
# # Shapefile consists of .shp/.shx/.dbf/.prj/.cpg, all created automatically.
# shp_path = cache_dir / "bld_with_attr_compact_ucm.shp"
# g_out.to_file(shp_path, driver="ESRI Shapefile", encoding="utf-8")

# print(f"Saved shapefile to: {shp_path}")

Saved shapefile to: E:\London\colouringbritain\data-extract-2025-09-01\cache\bld_with_attr_compact_ucm.shp


### Data summary - total area by building type

In [40]:
# import geopandas as gpd
# import pandas as pd
# from pathlib import Path

# # --- paths ---
# gpkg_path = cache_dir / "bld_with_attr_compact.gpkg"

# # --- load GeoPackage ---
# g_small = gpd.read_file(gpkg_path, layer="bld_attr")
# print("Loaded", len(g_small), "features")
# print("Columns:", g_small.columns.tolist())

# # --- ensure CRS in meters (BNG for London) ---
# if g_small.crs is None or g_small.crs.to_epsg() != 27700:
#     g_small = g_small.to_crs(27700)

# # --- calculate area ---
# g_small["area_m2"] = g_small.geometry.area

# # --- detect land use column ---
# landuse_col = None
# if "landuse_mode" in g_small.columns:
#     landuse_col = "landuse_mode"
# elif "landuse_mode_code" in g_small.columns:
#     landuse_col = "landuse_mode_code"
# else:
#     # create placeholder column if missing
#     landuse_col = "landuse_mode"
#     g_small[landuse_col] = pd.NA

# # --- aggregate total area by land use ---
# area_by_landuse = (
#     g_small.groupby(landuse_col, dropna=True, observed=True, as_index=False)["area_m2"]
#            .sum()
#            .sort_values("area_m2", ascending=False)
# )

# # --- save CSV ---
# out_csv = cache_dir / "bld_with_attr_landuse_total_area_m2.csv"
# area_by_landuse.to_csv(out_csv, index=False)

# print(area_by_landuse.head())
# print(f"Saved totals to: {out_csv}")
