In [18]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely import wkt
import duckdb as db
import re

# Repository root, written relative to the current working directory (two levels up)
# so that no absolute local path is hard-coded in the notebook.
ROOT = Path('../..')
# Bare last expression: displays the resolved absolute path as the cell output.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

Load the brownfield data and the local authority district (May 2024) GeoJSON file. We took the BGC version of the local authority district boundaries and put it into [Map Shaper](https://mapshaper.org/) to simplify the shapes. We then used our [GeoJSON minifier](https://open-innovations.github.io/geojson-minify/) to reduce the file size.

In [19]:
# Brownfield land data. Later cells read its 'point', 'site-address', 'hectares',
# 'maximum-net-dwellings' and 'minimum-net-dwellings' columns.
brownfield_sites = pd.read_csv(ROOT / "raw/brownfield/brownfield-land.csv")
# NOTE(review): lad_geojson is not referenced again in the visible cells —
# find_local_authority re-reads the same file with gpd.read_file. Confirm whether
# this load is still needed.
lad_geojson = pd.read_json(ROOT / "src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json", orient='columns')

The `wkt_loads` helper below wraps `wkt.loads` and catches any error it raises. If a value cannot be parsed, we return `None` instead, so that the rest of our code still runs.

In [20]:
def wkt_loads(x):
    """Parse ``x`` with ``wkt.loads``, returning ``None`` if parsing raises.

    Intended for use with ``Series.apply`` so that a single unparseable value
    does not abort the conversion of a whole column.
    """
    geometry = None
    try:
        geometry = wkt.loads(x)
    except Exception:
        # Deliberate best-effort: any failure simply yields no geometry.
        pass
    return geometry

def extract_postcode(x):
    """Return the first UK-postcode-shaped substring of ``x``, or ``None``.

    ``x`` is converted with ``str`` before searching, so non-string values
    (for example a NaN address) are searched as their string form and
    normally produce ``None``.
    """
    postcode_pattern = r"([Gg][Ii][Rr] 0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9][A-Za-z]?))))\s?[0-9][A-Za-z]{2})"
    # str() hands back a plain string unchanged and stringifies everything else.
    found = re.search(postcode_pattern, str(x))
    return found.group() if found else None

def postcode_to_latlong(postcode):
    """Look up the latitude and longitude of a UK postcode.

    The leading one or two letters of the postcode select a CSV file in the
    odileeds/Postcodes2LatLon repository, which DuckDB queries over HTTPS.
    The postcode is tried as given (upper-cased) and then with its spaces
    removed.

    Parameters
    ----------
    postcode : str
        The postcode to look up, with or without an internal space.

    Returns
    -------
    tuple
        ``(lat, long)`` taken from the first matching row, or ``(None, None)``
        when no row matches or the query fails (e.g. no file for that area).
    """
    # Postcodes are conventionally upper-case; normalising lets lower-case input
    # resolve to the same file and row as its upper-case form.
    postcode = postcode.strip().upper()
    # [A-Za-z] rather than [A-z]: the latter also matches the punctuation
    # characters that sit between 'Z' and 'a' in ASCII.
    pcd_start = re.search(r"^[A-Za-z]{0,2}", postcode).group()
    # Create the URL for the lookup
    url = f'https://github.com/odileeds/Postcodes2LatLon/raw/main/postcodes/{pcd_start}.csv'
    # The postcode is bound as a parameter instead of being pasted into the SQL.
    query = f"SELECT lat, long FROM '{url}' WHERE Postcode = ?"

    candidates = [postcode]
    stripped_pcd = postcode.replace(" ", '')
    if stripped_pcd != postcode:
        candidates.append(stripped_pcd)

    con = db.connect()
    try:
        for candidate in candidates:
            try:
                rows = con.execute(query, [candidate]).fetch_df().values
            except Exception:
                # Best effort: a missing file or network failure means "not found".
                continue
            if len(rows) > 0:
                return rows[0][0], rows[0][1]
    finally:
        # Always release the connection, even if a lookup raised.
        con.close()

    # No candidate produced a row.
    return None, None

def fix_points(df):
    """Fill in missing 'point' values using a postcode found in 'site-address'.

    Rows whose 'point' is already a string are left untouched. For every other
    row, a postcode is extracted from 'site-address' and looked up; the result
    is stored as a ``POINT(long lat)`` string. When no postcode is found, or the
    lookup yields no coordinates, 'point' is set to ``None`` so that the row is
    counted as missing rather than holding unparseable text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'point' and 'site-address' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the 'point' column repaired; ``df`` is unchanged.
    """
    df_copy = df.copy()
    for i, row in df.iterrows():
        point = row['point']
        if isinstance(point, str):
            # Existing text is kept as-is, whether or not it starts with 'POINT'.
            # NOTE(review): confirm whether non-POINT strings should also be repaired.
            continue

        site_address = row['site-address']
        postcode = extract_postcode(site_address)
        if not postcode:
            print(f"No postcode found in {site_address}")
            df_copy.loc[i, 'point'] = None
            continue

        lat, long = postcode_to_latlong(postcode)
        if pd.isna(lat) or pd.isna(long):
            # Lookup failed: leave the point missing instead of writing
            # the literal text "POINT(None None)".
            df_copy.loc[i, 'point'] = None
        else:
            df_copy.loc[i, 'point'] = f"POINT({long} {lat})"

    return df_copy

def find_local_authority(data):
    """Summarise brownfield sites by the local authority district containing them.

    Each row's 'point' WKT string is parsed into a geometry and spatially joined
    ("within") to the local authority district polygons. The matched rows are
    then grouped by district.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'point', 'hectares', 'maximum-net-dwellings' and
        'minimum-net-dwellings' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per matched district with 'LAD24CD', 'LAD24NM', the summed
        numeric columns, and 'sites' (the number of matched rows). Rows whose
        point is missing, unparseable or outside every polygon are excluded.
    """
    # Build the geometries on a copy so the caller's frame does not silently
    # gain a 'geometry' column.
    points = data.assign(geometry=data['point'].apply(wkt_loads))

    points_gdf = gpd.GeoDataFrame(points, geometry='geometry')

    # Load the polygons GeoJSON
    polygons_gdf = gpd.read_file(ROOT / 'src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json')

    # Label the points with the polygons' CRS. This does not reproject them.
    # NOTE(review): assumes the points are already in the polygons' CRS — confirm.
    points_gdf = points_gdf.set_crs(polygons_gdf.crs, allow_override=True)

    # Perform spatial join, checking if "points" are within any of the local authority polygons
    joined_gdf = gpd.sjoin(points_gdf, polygons_gdf, how="left", predicate="within")

    # joined_gdf now carries 'LAD24CD' and 'LAD24NM' from the polygons;
    # count the rows per district (unmatched rows have NaN here and are not counted).
    name_counts = joined_gdf[['LAD24NM', 'LAD24CD']].value_counts()

    # Name the series so it becomes the 'sites' column after the join below
    name_counts.name = 'sites'

    # Filter only the columns we need
    out_frame = joined_gdf[['LAD24CD', 'LAD24NM', 'hectares', 'maximum-net-dwellings', 'minimum-net-dwellings']]

    # Group by LAD24CD and LAD24NM and sum the numeric columns
    out_frame = out_frame.groupby(['LAD24CD', 'LAD24NM']).sum(numeric_only=True).reset_index()

    # Attach the per-district site counts
    out_frame = out_frame.join(name_counts, on=['LAD24NM', 'LAD24CD'], how='inner')

    return out_frame

In [21]:
# Fill in missing 'point' values using postcodes found in the site addresses.
# fix_points returns a repaired copy, which replaces the frame loaded above;
# addresses without a recognisable postcode are printed below.
brownfield_sites = fix_points(brownfield_sites)

No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in nan
No postcode found in TIPNER WEST
No postcode found in TIPNER FIRING RANGE
No postcode found in LAND OFF AND BETWEEN M275 SOUTH OF TIPNER LAKE INCLUDING GREYHOUND STADIUM TWYFORD AVENUE
No postcode found in PORT SOLENT, HMS EXCELLENT
No postcode found in GARAGES AT EXMOUTH ROAD
No postcode found in LAND SOUTH OF MARINA KEEP
No postcode found in 108 - 112 ELM GROVE
No postcode found in ROYAL MARINE MUSEUM
No postcode found in CITY RECORDS OFFICE
No postcode found in WINGFIELD HOUSE
No postcode found in 32 WESTERN PARADE
No postcode found in 140-142 KINGSTON ROAD (BINGO HALL)
No postcode found in GARAGES AT EXMOUTH ROAD
No postcode found in FORMER AQUATICS CENTRE, 201 HIGHLAND ROAD AND 197 AND LAND AT REAR
No postcode found in LAND SOUTH OF MARINA KEEP
No postcode found in NORTH END KWIKSAVE


In [22]:
# Count the sites whose 'point' is still missing after the postcode fallback.
no_point_data = int(brownfield_sites['point'].isna().sum())
print(f"There were {no_point_data} sites with no point data after using postcodes to try to fill in gaps.")

There were 633 sites wth no point data after using postcodes to try to fill in gaps.


In [24]:
# Summarise the sites per local authority district: a 'sites' count plus the
# summed numeric columns returned by find_local_authority.
data = find_local_authority(brownfield_sites)

# Write the per-authority summary to CSV, without the index column
data.to_csv(ROOT / 'data/brownfield/sites_by_local_authority.csv', index=False)

In [25]:
# joined_gdf only exists inside find_local_authority, so referencing it here fails
# on a fresh kernel. The per-authority 'sites' counts in `data` add up to the
# number of sites that were matched to a local authority.
matched_sites = int(data['sites'].sum())
print(f"There are {len(brownfield_sites) - matched_sites} sites that had invalid points and/or were not matched to a local authority.")

There are 1889 sites that had invalid points and/or were not matched to a local authority.
