In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd


import sys, platform
# Report the interpreter and the versions of the key libraries so that the
# outputs below can be traced back to a specific environment.
env_rows = [
    ("Python envi", sys.executable),
    ("Python version", sys.version.split()[0]),
    ("Platform", platform.platform()),
    ("geopandas", gpd.__version__),
    ("numpy", np.__version__),
    ("matplotlib", plt.matplotlib.__version__),
]

print("=== Environment Info ===")
for env_label, env_value in env_rows:
    # Labels are left-padded to 15 characters so the colons line up.
    print(f"{env_label:<15}: {env_value}")
print("========================")


=== Environment Info ===
Python envi    : c:\Users\pc\.conda\envs\geo_env\python.exe
Python version : 3.11.13
Platform       : Windows-10-10.0.22631-SP0
geopandas      : 0.14.4
numpy          : 2.2.6
matplotlib     : 3.10.6


### Dir

In [2]:
import os
from pathlib import Path

# Root of the shared project data.
# The default is the original shared-drive location; set the environment
# variable WELLCOME_DATA_DIR to point the notebook at a different copy of the
# data (another machine, a local mirror) without editing this cell.
wd_main     = Path(os.environ.get("WELLCOME_DATA_DIR", r"G:\Shared drives\Wellcome Trust Project Data"))

# AOI boundary files (borough and LSOA level)
wd_shp      = wd_main / "1_preprocess" / "UrbanCoolingModel" / "OfficialWorkingInputs" / "AOIs"
# Urban Cooling Model run outputs (searched recursively further below)
dir_ucm_out = wd_main / "2_postprocess_intermediate" / "UCM_official_runs"
# Destination folder for figures
figures_dir = wd_main / "3_final" / "UCM_figures"



### AOI - select one

In [3]:
# Select ONE administrative level for the AOI.
# Previously both files were read in sequence, so the borough layer was loaded
# and then immediately overwritten by the LSOA layer. Only the selected file is
# read now; the default ("lsoa") reproduces the state the original cell ended in.
AOI_FILES = {
    "borough": wd_shp / "London_Borough_aoi.shp",                   # 1. borough level AOI file
    "lsoa":    wd_shp / "Social_Vulnerability_Index_london_q.gpkg", # 2. LSOA level AOI file
}
AOI_LEVEL = "lsoa"  # the LSOA zonal-statistics cells below read `admin_shapefile` as the LSOA layer

if AOI_LEVEL not in AOI_FILES:
    raise ValueError(f"AOI_LEVEL must be one of {sorted(AOI_FILES)}, got {AOI_LEVEL!r}")

admin_shapefile = AOI_FILES[AOI_LEVEL]  # administrative boundary file
aoi_adm = gpd.read_file(admin_shapefile)

### Load data

Energy results - Building level

In [4]:

import re

var = "energy_sav" # heavy work

# Air temperatures (the "<T>deg" part of the file name) to keep.
TARGET_TEMPS_DEG = (25, 28)

# Recursive: all matching files in subfolders too
files = sorted(dir_ucm_out.rglob("buildings_with_stats_london_scenario*.shp"))

# for p in files:
#     print(p)

# Keep only files whose temperature token is one of TARGET_TEMPS_DEG.
# A plain substring test ('"25" in name') would also match a "25"/"28" that
# appears in another part of the name (humidity, UHI, scenario id), so the
# number must be directly followed by "deg" (optionally with a decimal part,
# e.g. "25.0deg") and must not be the tail of a longer number.
_temp_alternatives = "|".join(str(t) for t in TARGET_TEMPS_DEG)
_temp_token = re.compile(rf"(?<![\d.])(?:{_temp_alternatives})(?:\.\d+)?deg")
fs = [f for f in files if _temp_token.search(f.name)]

if not fs:
    print(f"Warning: no building files matched temperatures {TARGET_TEMPS_DEG} under {dir_ucm_out}")

for p in fs:
    print(p)

G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\scenario0\work_and_energy_runs\buildings_with_stats_london_scenario_25.0deg_2.0uhi_45.0hum_energy_productivity.shp
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\scenario0\work_and_energy_runs\buildings_with_stats_london_scenario_25.0deg_5.0uhi_45.0hum_energy_productivity.shp
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\scenario0\work_and_energy_runs\buildings_with_stats_london_scenario_28deg_2uhi_45hum_energy_productivity.shp
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\scenario0\work_and_energy_runs\buildings_with_stats_london_scenario_28deg_5uhi_45hum_energy_productivity.shp
G:\Shared drives\Wellcome Trust Project Data\2_postprocess_intermediate\UCM_official_runs\scenario1\work_and_energy_runs\buildings_with_stats_london_scenario_25.0deg_2.0uhi_45.0hum_energy_prod

quickly check the columns

In [5]:
# import geopandas as gpd

# # 1. Pick the first file from your list
# sample_file = fs[0]
# print(f"Inspecting file: {sample_file.name}")

# # 2. Read only the first row (much faster than reading the whole file)
# gdf_sample = gpd.read_file(sample_file, rows=1)

# # 3. Print the column names
# print("\nColumn Names:")
# for col in gdf_sample.columns:
#     print(f" - {col}")

# print(gdf_sample.head())

In [None]:
# # Get all parts of the path
# p = fs[0]
# parts = p.parts 

# # Find the index of the base folder
# if "UCM_official_runs" in parts:
#     idx = parts.index("UCM_official_runs")
#     scenario_name = parts[idx + 1] # Take the folder right after it
# else:
#     scenario_name = "unknown_scenario"

# print(f"Processing scenario: {scenario_name}")

Processing scenario: scenario0


### Convert shp to raster -> zonal stats

In [7]:
import geopandas as gpd
import numpy as np
import rasterio
from rasterio import features
from rasterio.transform import from_bounds
from pathlib import Path
import pandas as pd

# --- 1. Configuration ---
# Define paths
# The zone layer is whichever AOI file was selected in the AOI cell; the
# aggregation below needs the LSOA-level layer (checked after loading).
LSOA_PATH = admin_shapefile
OUTPUT_DIR = dir_ucm_out / "zonal_stats_lsoa"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Which column in the building shapefiles contains the value you want to aggregate?
VALUE_COL = "energy_sav" 
RESOLUTION = 30 # 30 meter resolution (referenced by the commented-out raster approach)

# Setup file list (from your snippet) - fs is already defined above


# --- 2. Prepare the Reference Grid (LSOA) ---
print("Loading LSOA base layer...")
lsoa = gpd.read_file(LSOA_PATH, engine="pyogrio")
# Rename 'id' to 'LSOA_CODE'
lsoa = lsoa.rename(columns={'id': 'LSOA_CODE'})
print(lsoa.columns)

# rename() silently does nothing when 'id' is absent (e.g. a different AOI file
# was selected), so fail here with a clear message instead of later in the join.
if 'LSOA_CODE' not in lsoa.columns:
    raise KeyError(
        f"'LSOA_CODE' (or 'id') column not found in {LSOA_PATH}; "
        "this workflow needs the LSOA-level AOI layer."
    )

# 'LSOA_CODE' is the merge key; repeated codes would duplicate rows when the
# statistics are merged back onto the zones.
n_duplicate_codes = int(lsoa['LSOA_CODE'].duplicated().sum())
if n_duplicate_codes:
    print(f"Warning: {n_duplicate_codes} duplicated LSOA_CODE value(s) in {LSOA_PATH}")

# All attribute columns are kept so that they are carried into the saved outputs.
lsoa_base = lsoa.copy() 


Loading LSOA base layer...
Index(['LSOA_CODE', 'Neighborhood_name', 'metric_category',
       'pIncomeDeprivation', 'p5under', 'p75over', 'pNotEnglishProficient',
       'pSocial_housing', 'pBAME', 'pIncomeDeprivation_pct', 'p5under_pct',
       'p75over_pct', 'pNotEnglishProficient_pct', 'pSocial_housing_pct',
       'pBAME_pct', 'mean_Social', 'mean_Household', 'mean_Racial',
       'mean_Housing', 'overall_score', 'overall_pct', 'overall_rank',
       'pIncomeDeprivation_q', 'p5under_q', 'p75over_q',
       'pNotEnglishProficient_q', 'pSocial_housing_q', 'pBAME_q', 'geometry'],
      dtype='object')


#### point approach 

In [None]:
# --- 3. Process Each File ---
# For every selected building file: reduce buildings to centroids, assign each
# centroid to the LSOA polygon containing it, aggregate VALUE_COL per LSOA
# (sum / mean / count) and save one GeoPackage per input file in OUTPUT_DIR.

# Only the key and the geometry are needed to assign buildings to zones; the
# full attribute table is attached once, when the statistics are merged back.
lsoa_zones = lsoa_base[['LSOA_CODE', 'geometry']]

sum_col = f"{VALUE_COL}_sum"
mean_col = f"{VALUE_COL}_mean"
count_col = "bldg_count"

for p in fs:
    # A. Extract Scenario and Clean Name
    # The scenario is the first folder below dir_ucm_out. Taking the folder
    # relative to the base directory also works when a run sits one level
    # deeper (e.g. <scenario>/work_and_energy_runs/<subfolder>/file.shp), where
    # "parent of parent" would return 'work_and_energy_runs' instead.
    try:
        rel_parts = p.relative_to(dir_ucm_out).parts
    except ValueError:
        # p is not located under dir_ucm_out
        rel_parts = ()
    scenario_name = rel_parts[0] if len(rel_parts) > 1 else "unknown_scenario"

    clean_stem = p.stem.replace("buildings_with_stats_london_", "").replace("_energy_productivity", "")

    print(f"Processing: {scenario_name} | {clean_stem}")

    # B. Load Buildings (Only Geometry and Value Column)
    try:
        bldgs = gpd.read_file(p, engine="pyogrio", columns=[VALUE_COL, 'geometry'])
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others.
        print(f"  Error reading file: {e}")
        continue

    if VALUE_COL not in bldgs.columns:
        print(f"  Skipping: column '{VALUE_COL}' not found in {p.name}")
        continue

    # Align CRS
    if bldgs.crs != lsoa_zones.crs:
        bldgs = bldgs.to_crs(lsoa_zones.crs)

    # C. Convert to Centroids (Crucial Step)
    # One point per building, so every building is counted exactly once
    # regardless of its footprint size.
    bldgs['geometry'] = bldgs.geometry.centroid

    # D. Spatial Join (Assign every building to an LSOA)
    # predicate='within' keeps the LSOA polygon the building point is inside;
    # how='inner' drops buildings that fall in no LSOA.
    joined = gpd.sjoin(bldgs, lsoa_zones, how="inner", predicate="within")

    # If zone polygons overlap, one centroid can match several zones and appear
    # more than once; keep the first match so no building is double counted.
    joined = joined[~joined.index.duplicated(keep="first")]

    # E. Calculate Statistics (GroupBy LSOA)
    # 'count' counts non-null values, so buildings without a VALUE_COL value are
    # excluded from bldg_count as well as from the sum and the mean.
    stats = (
        joined.groupby('LSOA_CODE')[VALUE_COL]
        .agg(**{sum_col: 'sum', mean_col: 'mean', count_col: 'count'})
        .reset_index()
    )

    # F. Merge Results back to LSOA Geometry (all LSOA attributes are retained)
    result_gdf = lsoa_base.merge(stats, on='LSOA_CODE', how='left')

    # Fill NaNs (LSOAs that had no buildings) with 0.
    # Note: this also sets the mean to 0 for LSOAs without buildings.
    cols_to_fill = [sum_col, mean_col, count_col]
    result_gdf[cols_to_fill] = result_gdf[cols_to_fill].fillna(0)
    # The left merge turns the count into float (because of NaN); restore integers.
    result_gdf[count_col] = result_gdf[count_col].astype(int)

    # Add Metadata
    result_gdf['scenario'] = scenario_name

    # G. Save
    out_name = f"{scenario_name}_{clean_stem}_energy_stats.gpkg"
    out_path = OUTPUT_DIR / out_name

    print(f"  Saving to {out_path}...")
    result_gdf.to_file(out_path, driver="GPKG")

print("Done.")

#### Raster approach

Rasterizing shp to 30m Resolution
- For "Sum" calculations (energy_sav), rasterizing at 30m is dangerous if your buildings are smaller than 900m² (which most London houses are).
- The Problem: If 3 small row houses fit inside one 30m pixel, the rasterization process will typically only capture the value of one of them (the last one processed). You will lose the data for the other two.
- The Result: Your Total Sum of energy savings will be significantly undercounted.


The Better Solution: Vector "Centroid" Method: 
- Since you ultimately want the result as a GPKG table (LSOA zones), you should skip the raster step entirely.
- Instead, convert the buildings to Points (Centroids) and perform a Spatial Join.
- It is 100% accurate for Sums (no overlapping pixels).
- It is faster than rasterizing at high resolution.
- It handles the "Scenario" extraction you requested.

In [None]:

# # Define 30m Grid based on LSOA bounds
# xmin, ymin, xmax, ymax = lsoa.total_bounds
# width = int((xmax - xmin) / RESOLUTION)
# height = int((ymax - ymin) / RESOLUTION)
# transform = from_bounds(xmin, ymin, xmax, ymax, width, height)

# print(f"Grid dimensions: {width}x{height} (30m resolution)")

# # Rasterize LSOAs to create a "Zone Map" (ID for every pixel)
# print("Rasterizing LSOA zones...")
# lsoa['temp_idx'] = range(len(lsoa))
# lsoa_shapes = ((geom, val) for geom, val in zip(lsoa.geometry, lsoa['temp_idx']))

# # Fill with -1 for areas outside any LSOA
# lsoa_grid = features.rasterize(
#     shapes=lsoa_shapes,
#     out_shape=(height, width),
#     transform=transform,
#     fill=-1, 
#     dtype='int32'
# )

# # Flatten LSOA grid for fast indexing
# flat_lsoa = lsoa_grid.ravel()

# # --- 3. Process Each Scenario File ---
# for p in fs:
#     print(f"Processing: {p.name}")

#     # Get scenario name from path -------
#     parts = p.parts 
#     # Find the index of the base folder
#     if "UCM_official_runs" in parts:
#         idx = parts.index("UCM_official_runs")
#         scenario_name = parts[idx + 1] # Take the folder right after it
#     else:
#         scenario_name = "scenario_unknown"
    
#     # 1. Load Building Data
#     try:
#         # Load only geometry and the value column to save memory
#         bldgs = gpd.read_file(p, engine="pyogrio", columns=[VALUE_COL, 'geometry'])
#     except Exception as e:
#         print(f"  Error reading {p.name}: {e}")
#         continue

#     # Align CRS if necessary
#     if bldgs.crs != lsoa.crs:
#         print("  Reprojecting buildings...")
#         bldgs = bldgs.to_crs(lsoa.crs)

#     # 2. Rasterize the 'energy_sav' values
#     print(f"  Rasterizing '{VALUE_COL}'...")
    
#     # Drop rows where energy_sav is NaN
#     bldgs = bldgs.dropna(subset=[VALUE_COL])
    
#     # Create generator: (geometry, energy_sav_value)
#     bldg_shapes = ((geom, val) for geom, val in zip(bldgs.geometry, bldgs[VALUE_COL]))

#     # Burn values into grid. 
#     # fill=0 assumes areas with no buildings have 0 energy savings.
#     val_grid = features.rasterize(
#         shapes=bldg_shapes,
#         out_shape=(height, width),
#         transform=transform,
#         fill=0, 
#         dtype='float32'
#     )

#     ######################
#     # 3. Zonal Statistics (Numpy) 
#     ######################
#     flat_vals = val_grid.ravel()

#     # Create mask: Pixel must be inside an LSOA (ID != -1) AND have a building (Value != 0)
#     # If you want to calculate the mean over the *entire* LSOA area (including empty space),
#     # change the mask to just: mask = (flat_lsoa != -1)
#     mask = (flat_lsoa != -1) & (flat_vals != 0)
    
#     valid_lsoa_ids = flat_lsoa[mask]
#     valid_values = flat_vals[mask]

#     if len(valid_values) == 0:
#         print(f"  Warning: No overlaps found for {p.name}")
#         continue

#     # Count: How many building pixels per LSOA
#     count_per_lsoa = np.bincount(valid_lsoa_ids, minlength=len(lsoa))
    
#     # Sum: Total energy_sav per LSOA (sum of pixel values)
#     sum_per_lsoa = np.bincount(valid_lsoa_ids, weights=valid_values, minlength=len(lsoa))

#     # Mean: Average energy_sav PER BUILDING PIXEL within the LSOA
#     with np.errstate(divide='ignore', invalid='ignore'):
#         mean_per_lsoa = sum_per_lsoa / count_per_lsoa
#         mean_per_lsoa[~np.isfinite(mean_per_lsoa)] = 0 

#     ######################
#     # 4. Save Results
#     ######################
#     # Attach results to a clean copy of LSOA info
#     # result_df = lsoa[['LSOA_CODE', 'LSOA_NAME', 'geometry']].copy()
#     result_df = lsoa.copy()
    
#     result_df[f'{VALUE_COL}_sum'] = sum_per_lsoa
#     result_df[f'{VALUE_COL}_mean'] = mean_per_lsoa
#     result_df['bldg_pixel_count'] = count_per_lsoa
    
#     # --- CLEAN FILENAME LOGIC ---
#     # Remove the specific unwanted strings from the original filename
#     clean_stem = p.stem.replace("with_stats_london_", "").replace("_energy_productivity", "")
    
#     # Construct final output name
#     out_name = scenario_name + clean_stem + "_energy_stats.gpkg"
#     out_path = OUTPUT_DIR / out_name
    
#     print(f"  Saving to {out_path}...")
#     result_df.to_file(out_path, driver="GPKG")

# print("Done.")

### Load building-level data (takes a long time)

In [None]:
# # Load the shapefile
# f1 = dir_ucm_out / 'current_lulc' / 'work_and_energy_runs' / "buildings_with_stats_london_scenario_22.0deg_2.0uhi_55.0hum_energy_productivity.shp"
# gdf_base = gpd.read_file(f1)
# print(gdf_base.columns.tolist())

# gdf_base = gdf_base.dropna(subset=[var])

# print(gdf_base.columns.tolist())

In [None]:

# # Load the shapefile
# f2 = dir_ucm_out / 'scenario3' / 'work_and_energy_runs' / "buildings_with_stats_london_scenario3_25.0deg_5.0uhi_45hum_energy_productivity.shp"
# # f2 = dir_ucm_out / 'scenario4' / 'work_and_energy_runs' / "tcc_30prc" / "buildings_with_stats_london_scenario4_30prc_22.0deg_2.0uhi_55.0hum_energy_productivity.shp"
# gdf_new  = gpd.read_file(f2)    # Alternative scenario
# gdf_new = gdf_new.dropna(subset=[var])



# # Extract suffix
# suffix1 = make_short_name(Path(f1), Path(dir_ucm_out) )  
# suffix2 = make_short_name(Path(f2), Path(dir_ucm_out) )
# suffix_change = f"{suffix1}_VS_{suffix2}"
# print(suffix_change)


### save a copy 

In [None]:

# # 1. Save as Feather (very fast reload in Python, good for local caching) -- will take 12 seconds
# feather_path = dir_ucm_out / 'current_lulc' / 'work_and_energy_runs' / "buildings_with_stats_22_2.feather"
# gdf_base.to_feather(feather_path)



# feather_path = dir_ucm_out / 'scenario3' / 'work_and_energy_runs' / "buildings_with_stats_25_5.feather"
# gdf_new.to_feather(feather_path)


### reload

In [None]:
# Reload the cached building layers written by the "save a copy" cell.
# NOTE(review): the caches are written with GeoDataFrame.to_feather, but the
# lines below read them with pd.read_feather, which returns a plain pandas
# DataFrame rather than a GeoDataFrame. The later borough aggregation calls
# `.geometry.area` on gdf_base, so gpd.read_feather is presumably what is
# needed here - confirm before re-enabling.
# feather_path1 = dir_ucm_out / 'current_lulc' / 'work_and_energy_runs' / "buildings_with_stats_22_2.feather"
# gdf_base = pd.read_feather(feather_path1)

# feather_path2 = dir_ucm_out / 'scenario3' / 'work_and_energy_runs' / "buildings_with_stats_25_5.feather"
# gdf_new = pd.read_feather(feather_path2)

## Sum by borough  

### function

In [None]:
# make_short_name (disabled): builds a short label "<prefix>_<deg>_<uhi>" for a
# result file, where <prefix> is the first folder of `path` below `base_dir`
# and <deg>/<uhi> are the integer parts of the numbers in the file name.
# NOTE(review): this helper is called in the earlier (also disabled) cell that
# builds `suffix_change`, i.e. before this definition; move the definition
# above its first use before re-enabling either cell.
# import re
# from pathlib import Path

# def make_short_name(path, base_dir) -> str:
#     # --- normalize inputs ---
#     # Unwrap single-element tuple/list like (path,) or [path]
#     if isinstance(path, (tuple, list)):
#         if len(path) == 1:
#             path = path[0]
#         else:
#             raise TypeError(f"Expected a path, got a {type(path).__name__} with len {len(path)}")
#     if isinstance(base_dir, (tuple, list)):
#         if len(base_dir) == 1:
#             base_dir = base_dir[0]
#         else:
#             raise TypeError(f"Expected a base_dir path, got a {type(base_dir).__name__} with len {len(base_dir)}")

#     path = Path(path)
#     base_dir = Path(base_dir)

#     # find prefix (the first folder under base_dir, e.g. 'current_lulc' or 'scenario2')
#     try:
#         prefix = path.relative_to(base_dir).parts[0]
#     except Exception:
#         # fallback: immediate parent if not under base_dir
#         prefix = path.parent.name

#     # extract deg/uhi numbers from filename
#     # NOTE(review): the pattern requires "scenario_" directly before the
#     # temperature, so names such as "..._scenario3_25.0deg_5.0uhi_..." (used in
#     # the scenario3 load cell) do not match and only the prefix is returned.
#     m = re.search(r"scenario_([\d.]+)deg_([\d.]+)uhi", path.name)
#     if not m:
#         return prefix

#     # "25.0" -> "25": values are truncated to their integer part
#     deg = str(int(float(m.group(1))))
#     uhi = str(int(float(m.group(2))))
#     return f"{prefix}_{deg}_{uhi}"


### run

In [None]:
# Area-weighted aggregation (disabled): splits each source polygon by the
# administrative polygons, allocates its energy value in proportion to the
# overlapping area, sums the allocations per administrative unit and saves the
# result as a GeoPackage layer.
# import geopandas as gpd
# import pandas as pd
# import numpy as np

# # Inputs
# # blocks_fp = r"path\to\blockgroups.shp"    # polygons with 'ene_sav' per block group
# tracts_fp = admin_shapefile                 # polygons

# energy_col = "energy_sav"                   # block group total energy column
# # NOTE(review): admin_shapefile ends up pointing at the LSOA layer, whose
# # printed column list contains no "GSS_CODE"; this id presumably belongs to
# # the borough file - confirm which layer is intended.
# tract_id_col = "GSS_CODE"                   # tract id column in the tracts layer

# # 1) Load
# # NOTE(review): `bg` is the same object as gdf_base (no copy), so the columns
# # added/filled in step 3 also modify gdf_base.
# bg = gdf_base
# # bg = gdf_new.copy()

# print(bg.head(6))

# tr = gpd.read_file(tracts_fp)

# # 2) Project to an equal-area CRS for correct area math (US example: EPSG:5070)
# #    Use a suitable local equal-area if you’re outside the US.
# # NOTE(review): the reprojection is disabled, so the areas below are computed
# # in whatever CRS the layers already have, and the overlay assumes both layers
# # share one CRS - verify.
# # ea_crs = "EPSG:27700"
# # bg = bg.to_crs(ea_crs)
# # tr = tr.to_crs(ea_crs)

# # 3) Compute source polygon areas (to guard against weird or zero areas)
# bg["src_area"] = bg.geometry.area
# bg[energy_col] = bg[energy_col].fillna(0)

# # print(bg.head(6))

# # 4) Intersect (overlay) to get pieces of BGs clipped by tracts
# #    This can be memory-heavy for big areas—consider spatial indexing or tiling if needed.
# inter = gpd.overlay(bg, tr[[tract_id_col, "geometry"]], how="intersection")

# # 5) Compute overlap area and proportional share
# inter["overlap_area"] = inter.geometry.area
# # Avoid division by 0
# inter = inter[inter["overlap_area"] > 0].copy()
# inter = inter[inter["src_area"] > 0].copy()

# # Share of the BG’s energy allocated to that tract piece
# inter["ene_sav_alloc"] = inter[energy_col] * (inter["overlap_area"] / inter["src_area"])

# # 6) Sum by tract
# tract_energy = (inter.groupby(tract_id_col, as_index=False)["ene_sav_alloc"]
#                         .sum()
#                         .rename(columns={"ene_sav_alloc": "ene_sav_total"}))

# # 7) Join back to tract geometry
# # Units with no overlapping source polygon get a total of 0.
# tr_out = tr.merge(tract_energy, on=tract_id_col, how="left")
# tr_out["ene_sav_total"] = tr_out["ene_sav_total"].fillna(0)

# # print(tr_out.head())

# # 8) Save
# # NOTE(review): feather_path1 ("buildings_with_stats_22_2.feather") contains
# # neither "energy_productivity" nor ".shp", so both replace() calls are no-ops
# # and `f` is the feather cache path itself; to_file would then write the
# # GeoPackage over the cache. Derive the output name from the original .shp
# # path (f1) or set an explicit .gpkg path before re-enabling.
# f = Path(str(feather_path1).replace("energy_productivity", "ene_sav_total").replace(".shp", ".gpkg"))
# tr_out.to_file(f, layer="tract_energy", driver="GPKG")
