In [25]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely import wkt

# Repository root, expressed relative to the notebook's working directory (two levels up).
ROOT = Path('../..')
# Display the absolute location as the cell output so the reader can check it.
# The result is not assigned, so ROOT itself stays a relative path.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

Load the brownfield data and the local authority district (May 2024) GeoJSON file. We took the BGC version of the local authority district boundaries and put it into [Map Shaper](https://mapshaper.org/) to simplify the shapes. We then used our [GeoJSON minifier](https://open-innovations.github.io/geojson-minify/) to reduce the file size.

In [26]:
# Raw brownfield land register: one row per site.
brownfield_sites = pd.read_csv(ROOT / "raw/brownfield/brownfield-land.csv")
# The simplified boundary file parsed as plain JSON (not as geometries).
# NOTE(review): `lad_geojson` is not referenced by the other cells visible here; the same
# file is read again with geopandas for the spatial join. Confirm it is still needed.
lad_geojson = pd.read_json(ROOT / "src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json", orient='columns')

The function below wraps `wkt.loads` and catches any error it raises. If a value cannot be parsed, we return `None` instead, so that the rest of our code still runs.

In [27]:
def wkt_loads(x):
    """Parse a WKT value with ``wkt.loads``, falling back to ``None``.

    Args:
        x: The value to parse, normally a WKT string.

    Returns:
        Whatever ``wkt.loads(x)`` returns, or ``None`` if that call raises
        any ``Exception``.
    """
    geometry = None
    try:
        geometry = wkt.loads(x)
    except Exception:
        # Deliberately best-effort: a value that cannot be parsed must not stop
        # the whole column from being processed, so the None default is kept.
        pass
    return geometry

In [28]:
# Parse every site's WKT `point` into a geometry (None where parsing fails) and
# wrap the table in a GeoDataFrame. This adds a `geometry` column to
# `brownfield_sites` itself.
brownfield_sites['geometry'] = brownfield_sites['point'].apply(wkt_loads)
points_gdf = gpd.GeoDataFrame(brownfield_sites, geometry='geometry')

# Read the simplified local authority district boundaries as polygons
polygons_gdf = gpd.read_file(ROOT / 'src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json')

# Give the points the same CRS as the polygons.
# set_crs only labels the coordinates with that CRS; it does not reproject them.
points_gdf = points_gdf.set_crs(polygons_gdf.crs, allow_override=True)

# Left spatial join: keep every site and attach the attributes ('LAD24CD',
# 'LAD24NM', ...) of the local authority polygon that the point lies within
joined_gdf = points_gdf.sjoin(polygons_gdf, how="left", predicate="within")

# Number of sites per local authority, as a series named 'sites'
name_counts = joined_gdf[['LAD24NM', 'LAD24CD']].value_counts().rename('sites')

# Keep only the columns we need, sum the numeric ones per local authority,
# then attach the site counts
data = (
    joined_gdf[['LAD24CD', 'LAD24NM', 'hectares', 'maximum-net-dwellings', 'minimum-net-dwellings']]
    .groupby(['LAD24CD', 'LAD24NM'])
    .sum(numeric_only=True)
    .reset_index()
    .join(name_counts, on=['LAD24NM', 'LAD24CD'], how='inner')
)

# Write the per-authority summary to disk
data.to_csv(ROOT / 'data/brownfield/sites_by_local_authority.csv', index=False)

In [29]:
# Report how many sites ended up without a local authority. After the left
# spatial join an unmatched site appears exactly once with a missing 'LAD24CD',
# so the null rows are counted directly. Subtracting the matched rows from
# len(brownfield_sites) would under-report if any point matched more than one
# polygon, because the join then emits one row per match.
print(f"There are {joined_gdf['LAD24CD'].isna().sum()} sites that had invalid points and/or were not matched to a local authority.")

There are 2011 sites that had invalid points and/or were not matched to a local authority.
