# Packages

In [1]:
import geopandas as gpd
import os
import pandas as pd
from pathlib import Path


# Data

## Load shp data

In [2]:

# --- Source data --------------------------------------------------------------
# Raw GiGL tree shapefiles on the shared drive. They are not read in this cell
# (the optional read at the bottom is left commented out).
file1_path = r"G:/Shared drives/Wellcome Trust Project Data/0_source_data/GiGL land use data/GiGL_Trees_font_point/GiGL_GLATrees_Pre2023.shp"
file2_path = r"G:/Shared drives/Wellcome Trust Project Data/0_source_data/GiGL land use data/GiGL_Trees_font_point/GiGL_GLATrees_2023-24.shp"

# --- Project folders ----------------------------------------------------------
# The project root is taken to be the parent of the kernel's working directory,
# so the result depends on where the notebook is started from.
current_path = os.getcwd()
print("Current working directory:\n", current_path)

parent_folder = os.path.dirname(current_path)
print("Parent folder:\n", parent_folder)

# Shared-drive folder under 1_preprocess/UrbanCoolingModel
wd_shp = r'G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel'

# --- Output folder ------------------------------------------------------------
# `Path` is already imported in the Packages cell; it is not re-imported here.
# Where to save: <parent_folder>/data (created if missing)
out_dir = Path(parent_folder) / "data"
out_dir.mkdir(parents=True, exist_ok=True)

# # Read the spatial data one by one
# d = gpd.read_file(file1_path)


Current working directory:
 d:\natcap\urban-cooling-health\code
Parent folder:
 d:\natcap\urban-cooling-health


In [15]:
## Load the previously saved "*_risk_2050.shp" layers from <parent_folder>/data.
## Each file name is the source shapefile's base name (without ".shp") plus "_risk_2050.shp".

f1 = os.path.join(
    parent_folder,
    "data",
    os.path.basename(file1_path).replace('.shp', '') + "_risk_2050.shp",
)
print(f1)

f2 = os.path.join(
    parent_folder,
    "data",
    os.path.basename(file2_path).replace('.shp', '') + "_risk_2050.shp",
)
print(f2)

# Read both layers with the pyogrio engine (kept as two statements so the first
# layer stays available if the second read is interrupted)
tree_risk_layer1 = gpd.read_file(f1, engine="pyogrio")
tree_risk_layer2 = gpd.read_file(f2, engine="pyogrio")


d:\natcap\urban-cooling-health\data\GiGL_GLATrees_Pre2023_risk_2050.shp
d:\natcap\urban-cooling-health\data\GiGL_GLATrees_2023-24_risk_2050.shp


KeyboardInterrupt: 

## combine shp data

In [None]:
import os
import geopandas as gpd
import pandas as pd
from pathlib import Path

# You already have:
# f1, f2
# tree_risk_layer1 = gpd.read_file(f1, engine="pyogrio")
# tree_risk_layer2 = gpd.read_file(f2, engine="pyogrio")

# Work on tagged copies so the loaded layers stay untouched; `source_file`
# records which file each feature came from.
g1 = tree_risk_layer1.assign(source_file=Path(f1).name)
g2 = tree_risk_layer2.assign(source_file=Path(f2).name)

# Target CRS: the first layer's CRS, falling back to the second layer's
target_crs = g1.crs if g1.crs else g2.crs

# Align CRS
def _align_crs(g, target):
    if target is None:
        return g
    if g.crs is None:
        return g.set_crs(target)
    if g.crs != target:
        return g.to_crs(target)
    return g

# Bring both layers into the common CRS
g1 = _align_crs(g1, target_crs)
g2 = _align_crs(g2, target_crs)

# Union schema (keep geometry last); columns missing from one layer become NaN
all_cols = [c for c in sorted(set(g1.columns) | set(g2.columns)) if c != "geometry"] + ["geometry"]
g1 = gpd.GeoDataFrame(g1.reindex(columns=all_cols), geometry="geometry", crs=target_crs)
g2 = gpd.GeoDataFrame(g2.reindex(columns=all_cols), geometry="geometry", crs=target_crs)


def _repair_invalid_geometries(g):
    """Best-effort repair of invalid geometries in `g` (modified in place and returned).

    Uses `shapely.make_valid` when it can be imported and applied. Otherwise falls
    back to `buffer(0)`, applied ONLY to rows whose geometry is invalid:
    `buffer(0)` turns point and line geometries into empty polygons, so applying
    it to every row would make the "drop empty" step below silently discard
    every valid point/line feature.
    """
    try:
        from shapely import make_valid
        g["geometry"] = make_valid(g.geometry)
    except Exception as exc:
        # Still best effort (no crash), but no longer silent.
        print(f"make_valid unavailable or failed ({exc!r}); using buffer(0) on invalid geometries only")
        invalid = ~g.geometry.is_valid
        if invalid.any():
            g.loc[invalid, "geometry"] = g.geometry[invalid].buffer(0)
    return g


# Fix invalid geometries (best effort)
g1 = _repair_invalid_geometries(g1)
g2 = _repair_invalid_geometries(g2)

# Drop empty/null geoms
g1 = g1[g1.geometry.notnull() & ~g1.geometry.is_empty]
g2 = g2[g2.geometry.notnull() & ~g2.geometry.is_empty]

# Concatenate into a single layer with a fresh 0..n-1 index
combined = gpd.GeoDataFrame(
    pd.concat([g1, g2], ignore_index=True, sort=False),
    geometry="geometry",
    crs=target_crs
)

# (Optional) drop exact duplicates (including geometry)
# combined = combined.drop_duplicates(subset=[c for c in combined.columns if c != "geometry"] + ["geometry"])

# Save as a GeoPackage; the layer is named after the file stem
out_path = os.path.join(parent_folder, "data", "GiGL_GLATrees_at_risk_2050_merged.gpkg")
layer_name = Path(out_path).stem  # "GiGL_GLATrees_at_risk_2050_merged"

combined.to_file(out_path, driver="GPKG", layer=layer_name)
print(f"Wrote {len(combined)} features to {out_path} (layer='{layer_name}')")


Wrote 1050054 features to d:\natcap\urban-cooling-health\data\GiGL_GLATrees_at_risk_2050_merged.gpkg (layer='GiGL_GLATrees_at_risk_2050_merged')


In [None]:
import pandas as pd

# --- normalise 'year' to a nullable integer column (overwrites combined['year']) ---
if 'year' not in combined.columns:
    raise KeyError("'year' column not found in combined")

# datetime-like values contribute their calendar year; anything else is coerced
# to numeric, with unparseable entries becoming <NA>
combined['year'] = (
    combined['year'].dt.year
    if pd.api.types.is_datetime64_any_dtype(combined['year'])
    else pd.to_numeric(combined['year'], errors='coerce')
).astype('Int64')

# --- feature count per year; missing years are kept as their own group, listed last ---
summary = (
    combined
    .groupby('year', dropna=False)
    .size()
    .reset_index(name='count')
    .sort_values('year', na_position='last')
)

# share of all features, in percent (2 decimals)
total_n = len(combined)
summary['percent'] = summary['count'].div(total_n).mul(100).round(2)

# display label: missing years are shown as 'Unknown'; numeric 'year' is kept alongside
summary['year_label'] = summary['year'].astype(str).replace({'<NA>': 'Unknown'})

print(summary[['year', 'year_label', 'count', 'percent']])


   year year_label   count  percent
0  2050       2050  657622    62.63
1  <NA>    Unknown  392432    37.37


## 1m grid shp to 10m grid

Convert the 1 m × 1 m grid into a 10 m × 10 m grid: snap each cell to a 10 m lattice and aggregate. This produces one square per 10 m tile instead of up to 100 overlapping squares.

* **Warning**: this cell takes a long time to run! Only re-run it when the 10 m tiles need to be regenerated.

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import box

gdf = combined.copy()

# 0) Ensure CRS is metric (meters)
# NOTE: `is_projected` only rules out geographic (degree-based) CRSs; the 10 m cell
# size below additionally assumes the projected CRS uses metres.
if gdf.crs is None or not gdf.crs.is_projected:
    # EPSG:27700 (British National Grid) is the usual metric CRS for London data.
    raise ValueError("Project to a metric CRS first (e.g., EPSG:27700 British National Grid, or UTM).")

# 1) Define 10 m grid origin (snap to a multiple of 10 m)
cell = 10.0
xmin, ymin = gdf.total_bounds[:2]  # only the lower-left corner of the extent is needed
x0 = np.floor(xmin / cell) * cell
y0 = np.floor(ymin / cell) * cell

# 2) Compute grid indices for each feature from the lower-left corner of its bounds
b = gdf.geometry.bounds  # DataFrame with minx, miny, maxx, maxy
i = np.floor((b["minx"] - x0) / cell).astype("int64")
j = np.floor((b["miny"] - y0) / cell).astype("int64")
gdf["i"] = i
gdf["j"] = j

# 3) Build one square polygon per distinct (i, j) tile
keys = gdf[["i", "j"]].drop_duplicates().reset_index(drop=True)
xs = x0 + keys["i"].to_numpy() * cell
ys = y0 + keys["j"].to_numpy() * cell
geoms = [box(x, y, x + cell, y + cell) for x, y in zip(xs, ys)]
tiles10 = gpd.GeoDataFrame(keys, geometry=geoms, crs=gdf.crs)

# 4) (Optional) carry aggregations to 10 m grid
# n_cells = number of input features whose lower-left corner fell into each tile
counts = gdf.groupby(["i", "j"]).size().rename("n_cells").reset_index()
tiles10 = tiles10.merge(counts, on=["i", "j"], how="left")

# Example: majority year per 10 m tile (if you have a 'year' column).
# groupby drops rows with a missing year, so a tile whose features all lack a
# year ends up with a missing 'year' after the left merge.
if "year" in gdf.columns:
    per_year = gdf.groupby(["i", "j", "year"]).size().rename("n").reset_index()
    idx = per_year.groupby(["i", "j"])["n"].idxmax()
    majority_year = per_year.loc[idx, ["i", "j", "year"]]
    tiles10 = tiles10.merge(majority_year, on=["i", "j"], how="left")


# --- Best option: GeoPackage ---
gpkg_path = out_dir / "GiGL_GLATrees_at_risk_2050_merged_tiles10.gpkg"
layer_name = "tiles10"

# keep grid keys as ints
t10 = tiles10.copy()
for c in ("i", "j"):
    if c in t10.columns:
        t10[c] = t10[c].astype("int64")

t10.to_file(gpkg_path, driver="GPKG", layer=layer_name)  # engine="pyogrio" also OK
print(f"Wrote {len(t10)} tiles to {gpkg_path} (layer='{layer_name}')")

# --- (Optional) Shapefile (use short field names, 32-bit ints) ---
# shp_path = out_dir / "tiles10_shp" / "tiles10.shp"
# (out_dir / "tiles10_shp").mkdir(exist_ok=True)
# t_shp = t10.copy()
# for c in ("i", "j"):
#     if c in t_shp.columns:
#         t_shp[c] = t_shp[c].astype("int32")
# t_shp.to_file(shp_path, driver="ESRI Shapefile")
# print(f"Also wrote Shapefile to {shp_path}")


Wrote 777007 tiles to d:\natcap\urban-cooling-health\data\GiGL_GLATrees_at_risk_2050_merged_tiles10.gpkg (layer='tiles10')


## generate lc scenario - tree at risk

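* **Start from here** if you only need to update the land cover scenario raster (run the *Packages* cell and the path-definition cell first, since `out_dir` and `wd_shp` are used below).
* Pick one of the land cover input datasets below.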

The scenario only changes land cover pixels that (a) overlap the risk polygons and (b) are currently one of the tree cover codes.

In [4]:
import os
from pathlib import Path

import numpy as np
import geopandas as gpd
import rasterio
from rasterio.features import rasterize


## Path to the 10 m tiled layer (GeoPackage) of tree cover that will be at risk or lost
shp_tree_risk = out_dir.joinpath("GiGL_GLATrees_at_risk_2050_merged_tiles10.gpkg")



#### land cover option 1

In [5]:

## Land cover raster and the class codes used by the scenario
lc_input = r"G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel\EP_preliminary_tests\clipped_lulc\UKECH\LCM2021_london.tif"
tree_cover_code = [1, 2]    # classes treated as tree cover (the two woodland classes)
after_risk_lc_code = 4      # class assigned to at-risk tree pixels (4: 'Improved Grassland')

# Class code -> label lookup; codes run consecutively from 1 to 21
land_cover_labels = dict(enumerate(
    [
        'Deciduous woodland',      # 1
        'Coniferous woodland',     # 2
        'Arable',                  # 3
        'Improved Grassland',      # 4
        'Neutral Grassland',       # 5
        'Calcareous Grassland',    # 6
        'Acid grassland',          # 7
        'Fen, Marsh, and Swamp',   # 8
        'Heather',                 # 9
        'Heather grassland',       # 10
        'Bog',                     # 11
        'Inland Rock',             # 12
        'Saltwater',               # 13
        'Freshwater',              # 14
        'Supralittoral Rock',      # 15
        'Supralittoral Sediment',  # 16
        'Littoral Rock',           # 17
        'Littoral Sediment',       # 18
        'Saltmarsh',               # 19
        'Urban',                   # 20
        'Suburban',                # 21
    ],
    start=1,
))


## Clip the raster to the AOI ------------------------------------------------------------
# project-local helper module (not part of this notebook)
from function_clip_raster_to_aoi import clip_raster_to_aoi

# Administrative boundaries; the helper below is given the file path, while the
# GeoDataFrame is kept available as `aoi`
aoi_shapefile = os.path.join(wd_shp, "London_Ward_aoi_prj.shp")
aoi = gpd.read_file(aoi_shapefile)

# The clipped raster is written next to the input, with a "_clip2aoi" suffix
lc_clipped_tif = lc_input.replace(".tif", "_clip2aoi.tif")

filled_arr, filled_transform, filled_profile = clip_raster_to_aoi(
    raster_path=lc_input,
    aoi=aoi_shapefile,          # or pass your GeoDataFrame
    out_path=lc_clipped_tif,
    replace_nodata_with=0,      # convert 255/nodata -> 0
    keep_nodata_tag=False,      # keep False so 0 is treated as a real class
)

print("Done. Saved to:\n\t", lc_clipped_tif)
# Sanity check: the class codes present after clipping (NaNs excluded)
print(np.unique(filled_arr[~np.isnan(filled_arr)]))


## From here on, the clipped raster is the scenario input ---------------------------------
lc_input = lc_clipped_tif
lc_output = lc_clipped_tif.replace(".tif", "_scenario3_TreeRisk.tif")


Done. Saved to:
	 G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel\EP_preliminary_tests\clipped_lulc\UKECH\LCM2021_london_clip2aoi.tif
[ 0.  1.  2.  3.  4.  5.  6.  8.  9. 10. 12. 13. 14. 16. 18. 19. 20. 21.]


#### land cover option 2

In [6]:
# # --- File Paths ---
# lc_input = r"G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel\ESA_WorldCover_10m_2021_v200_Mosaic_Mask_proj.tif"
# lc_output = lc_input.replace(".tif", "_scenario3_TreeRisk.tif")

# tree_cover_code = 1      # ESA WorldCover: Tree cover
# after_risk_lc_code = 3   # 3: 'Grassland',

# # Define labels
# land_cover_labels = {
#     1: 'Tree cover',
#     2: 'Shrubland',
#     3: 'Grassland',
#     4: 'Cropland',
#     5: 'Built-up',
#     6: "Bare / sparse vegetation",
#     7: "Snow and ice",
#     8: "Permanent water bodies",
#     9: "Herbaceous wetland",
#     10: "Moss and lichen"
# }

#### generate lc scenario 

In [7]:

# Ensure output folder exists
Path(os.path.dirname(lc_output)).mkdir(parents=True, exist_ok=True)

# --- Load and filter polygons ---
tree_cover_at_risk = gpd.read_file(shp_tree_risk)
## Keep only features with year == 2050, i.e. tree cover flagged as at risk by 2050
tree_cover_at_risk = tree_cover_at_risk[tree_cover_at_risk["year"].isin([2050])]
## Drop missing/empty geometries, which cannot be rasterized
tree_cover_at_risk = tree_cover_at_risk[
    tree_cover_at_risk.geometry.notna() & ~tree_cover_at_risk.geometry.is_empty
]

# --- Read raster ---
with rasterio.open(lc_input) as src:
    meta      = src.meta.copy()
    transform = src.transform
    lc_crs    = src.crs
    src_nodata = src.nodata
    arr       = src.read(1)  # 2D array of land cover codes


# Reproject polygons to raster CRS if needed
if tree_cover_at_risk.crs is None:
    # Without a CRS the polygons cannot be reprojected; they are burned in as-is.
    print("WARNING: risk polygons have no CRS; assuming they already match the raster CRS.")
elif tree_cover_at_risk.crs != lc_crs:
    tree_cover_at_risk = tree_cover_at_risk.to_crs(lc_crs)


# --- Rasterize risk polygons to a mask (1 = at risk) ---
if tree_cover_at_risk.empty:
    # rasterize() raises ValueError when given no geometries; with nothing at
    # risk the mask is all zeros and the raster is written unchanged.
    print("WARNING: no at-risk polygons after filtering; land cover is left unchanged.")
    shape_mask = np.zeros(arr.shape, dtype="uint8")
else:
    shape_mask = rasterize(
        [(geom, 1) for geom in tree_cover_at_risk.geometry],
        out_shape=arr.shape,
        transform=transform,
        fill=0,
        dtype="uint8",
        # all_touched=True,  # uncomment if you want a slightly more inclusive burn-in
    )

# --- Build masks ---
valid_mask = np.ones_like(arr, dtype=bool)
if src_nodata is not None:
    valid_mask &= (arr != src_nodata)

# If LC==0 is background you want to preserve, keep it out of edits:
valid_mask &= (arr != 0)

# Only modify cells that are both "at risk" and currently a tree class
tree_mask = np.isin(arr, tree_cover_code)
target_mask = (shape_mask == 1) & tree_mask & valid_mask

# --- Apply scenario (on a copy, so `arr` keeps the original classes) ---
remapped = arr.copy()
remapped[target_mask] = after_risk_lc_code

# --- Save updated raster ---
meta_out = meta.copy()
# Keep original dtype unless you explicitly want uint8
meta_out.update(
    dtype=remapped.dtype,
    compress="lzw",
    nodata=0                # <-- change from src_nodata to 0
)

with rasterio.open(lc_output, "w", **meta_out) as dst:
    dst.write(remapped, 1)

print(f"Updated land cover raster saved at: {lc_output}")


Updated land cover raster saved at: G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel\EP_preliminary_tests\clipped_lulc\UKECH\LCM2021_london_clip2aoi_scenario3_TreeRisk.tif


## lc change summary 

In [8]:
# load function to use in notebook
from function_summarize_lc_classes import summarize_lc_classes




# ---- Pixel area, computed only when the raster CRS is projected (else None) ----
# rasterio Affine: transform.a = pixel width, transform.e = pixel height (negative)
px_area_m2 = (
    abs(transform.a) * abs(transform.e)
    if (lc_crs and lc_crs.is_projected)
    else None
)

# Labels for the AFTER table: use `land_cover_labels_scenario` if it has been
# defined in this session, otherwise reuse `land_cover_labels`
labels_after = globals().get("land_cover_labels_scenario", land_cover_labels)

# ---- Summaries: BEFORE (original array) and AFTER (remapped array) ----
# 0 is passed as the nodata value for both tables
summary_before = summarize_lc_classes(
    arr, land_cover_labels,
    nodata=0, px_area_m2=px_area_m2, sort_by="class",
)
summary_after = summarize_lc_classes(
    remapped, labels_after,
    nodata=0, px_area_m2=px_area_m2, sort_by="class",
)

print("\n=== Class proportions BEFORE scenario ===")
print(summary_before.to_string(index=False))

print("\n=== Class proportions AFTER scenario ===")
print(summary_after.to_string(index=False))

# ---- Save both tables as CSV next to the scenario raster ----
out_csv_before = lc_output.replace(".tif", "_class_summary_BEFORE.csv")
out_csv_after  = lc_output.replace(".tif", "_class_summary_AFTER.csv")
summary_before.to_csv(out_csv_before, index=False)
summary_after.to_csv(out_csv_after, index=False)
print(f"\nSaved summaries:\n- {out_csv_before}\n- {out_csv_after}")





=== Class proportions BEFORE scenario ===
 class_code                  label   count  proportion  percent     area_m2  area_ha  area_km2
        1.0     Deciduous woodland  867944    0.054427    5.443  86794400.0  8679.44     86.79
        2.0    Coniferous woodland   65124    0.004084    0.408   6512400.0   651.24      6.51
        3.0                 Arable  597008    0.037437    3.744  59700800.0  5970.08     59.70
        4.0     Improved Grassland 3559567    0.223214   22.321 355956700.0 35595.67    355.96
        5.0      Neutral Grassland      11    0.000001    0.000      1100.0     0.11      0.00
        6.0   Calcareous Grassland   50726    0.003181    0.318   5072600.0   507.26      5.07
        8.0  Fen, Marsh, and Swamp   49809    0.003123    0.312   4980900.0   498.09      4.98
        9.0                Heather    1220    0.000077    0.008    122000.0    12.20      0.12
       10.0      Heather grassland    8112    0.000509    0.051    811200.0    81.12      0.81
       

In [9]:

## compute the change in area_km2 ------------------------------------------------------------------ 
import pandas as pd

# Reload the BEFORE / AFTER class summaries saved as CSV
df_before = pd.read_csv(out_csv_before)
df_after  = pd.read_csv(out_csv_after)

# Reduce each table to (class_code, area) and give the area column a distinct name
b = df_before.loc[:, ['class_code', 'area_km2']].rename(columns={'area_km2': 'area_km2_before'})
a = df_after.loc[:, ['class_code', 'area_km2']].rename(columns={'area_km2': 'area_km2_after'})

# Outer join on class_code so a class present in only one table is kept
merged = pd.merge(b, a, on='class_code', how='outer')

# Coerce both area columns to numeric (unparseable values become NaN)
merged[['area_km2_before', 'area_km2_after']] = (
    merged[['area_km2_before', 'area_km2_after']].apply(pd.to_numeric, errors='coerce')
)

# Change = after - before, counting a missing area as 0
merged['area_km2_change'] = (
    merged['area_km2_after'].fillna(0).sub(merged['area_km2_before'].fillna(0))
)

# Save next to the scenario raster
out_csv_change = lc_output.replace(".tif", "_class_summary_CHANGE.csv")
merged.to_csv(out_csv_change, index=False)
print(f"Saved: {out_csv_change}")

print("\n=== Class proportions changed ===")
print(merged.to_string(index=False))

Saved: G:\Shared drives\Wellcome Trust Project Data\1_preprocess\UrbanCoolingModel\EP_preliminary_tests\clipped_lulc\UKECH\LCM2021_london_clip2aoi_scenario3_TreeRisk_class_summary_CHANGE.csv

=== Class proportions changed ===
 class_code  area_km2_before  area_km2_after  area_km2_change
        1.0            86.79           83.40            -3.39
        2.0             6.51            6.21            -0.30
        3.0            59.70           59.70             0.00
        4.0           355.96          359.65             3.69
        5.0             0.00            0.00             0.00
        6.0             5.07            5.07             0.00
        8.0             4.98            4.98             0.00
        9.0             0.12            0.12             0.00
       10.0             0.81            0.81             0.00
       12.0             1.29            1.29             0.00
       13.0             1.10            1.10             0.00
       14.0            33.59  