In [1]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
from shapely import wkt
import duckdb as db
import requests
import re

ROOT = Path('../..')
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

Load the brownfield data and the local authority district 2023 geojson file. We used the BGC version of the local authority district 2023 geojson file and put it into [Map Shaper](https://mapshaper.org/) to simplify the shapes. We then used our [GeoJSON minifier](https://open-innovations.github.io/geojson-minify/) to reduce the file size.

In [2]:
brownfield_sites = pd.read_csv(ROOT / "raw/brownfield/brownfield-land.csv")
lad_geojson = pd.read_json(ROOT / "src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json", orient='columns')

  brownfield_sites = pd.read_csv(ROOT / "raw/brownfield/brownfield-land.csv")


In [3]:
len(brownfield_sites[brownfield_sites['point'].isna()])

2030

In [4]:
def wkt_loads(x):
    '''Add error handling for WKT loads'''
    try:
        return wkt.loads(x)
    except Exception:
        return None

def extract_postcode(x):
    '''Search an address string for a postcode using regex.'''
    if type(x) != str:
        x  = str(x)
    match = re.search(r"([Gg][Ii][Rr] 0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9][A-Za-z]?))))\s?[0-9][A-Za-z]{2})", x)
    if match:
        return match.group()
    else:
        return None

def postcode_to_latlong(postcode):
    '''Convert a postcode to a latitude and longitude using it's centroid coordinates.'''
    con = db.connect()
    pcd_start = re.search(r"^[A-z]{0,2}", postcode).group()
    # Create the URL for the lookup
    url = f'https://github.com/odileeds/Postcodes2LatLon/raw/main/postcodes/{pcd_start}.csv'
    try:
        # Use duckDB to execute the query to get lat/long for the postcode.
        coords = con.execute(f"SELECT lat, long FROM '{url}' WHERE Postcode=='{postcode}'").fetch_df().values[0]
    except Exception:
        try:
            stripped_pcd = postcode.replace(" ", '')
            # print('Stripped pcd:', stripped_pcd)
            coords = con.execute(f"SELECT lat, long FROM '{url}' WHERE Postcode=='{stripped_pcd}'").fetch_df().values[0]
        except Exception:
            coords = None

    if type(coords) != None:
        # If coords has been defined by a successful query, use the lat and long.
        lat, long = coords[0], coords[1]
    else:
        # If the query didn't work
        lat, long = None, None

    return lat, long

def fix_points(df):
    '''Try to get a postcode and use the postcode centroid point if a POINT not given in the data.'''
    df_copy = df.copy()
    for i, row in df.iterrows():
        point = row['point']
        if type(point) == str:
            if point[0:5] == 'POINT':
                # Everything is ok. Continue to next row.
                continue
        else:
            site_address = row['site-address']
            postcode = extract_postcode(site_address)
            if postcode:
                lat, long = postcode_to_latlong(postcode)
                df_copy.loc[i, 'point'] = f"POINT({long} {lat})"
            else:
                print(f"Didn't find a postcode for: {site_address}.")
                df_copy.loc[i, 'point'] = None
        
    return df_copy

def find_local_authority(data):
    '''
        Use points to do a spatial join with the UK LAD geojson.
        If that provides a null result, use the organisation entity LAD
    '''
    # Load the points as geometry
    data['geometry'] = data['point'].apply(wkt_loads)
    
    points_gdf = gpd.GeoDataFrame(data, geometry='geometry')

    # Load the polygons GeoJSON
    polygons_gdf = gpd.read_file(ROOT / 'src/_data/geojson/Local_Authority_Districts_May_2024_Boundaries_UK_BGC_Simplified.json')
   
    # Ensure the same CRS
    points_gdf = points_gdf.set_crs(polygons_gdf.crs, allow_override=True)

    # Perform spatial join, checking if "points" are within any of the local authority polygons
    joined_gdf = gpd.sjoin(points_gdf, polygons_gdf, how="left", predicate="within")

    return joined_gdf

lookup = pd.read_csv(ROOT / "metadata/lookups/organisation_entity_to_LAD.csv")

def check_local_authority(org):
    '''Lookup an organisation entity 3 digit code and return a GSS geography code'''
    try:
        org = int(org)
        LAD24CD = lookup[lookup['organisation-entity'] == org]['LAD24CD'].values[0]
    except:
        return 
    return LAD24CD


In [5]:
# Use the organisation entity to find an LAD code, if it exists.
brownfield_sites['org_entity_lad'] = brownfield_sites['organisation-entity'].apply(check_local_authority)

# Use postcodes to add additional points
brownfield_sites = fix_points(brownfield_sites)

# Use geopandas to match the points to a local authority.
brownfield_sites = find_local_authority(brownfield_sites)

# If the "LAD24CD" column is null after the spatial join, fill it with values from 'org_entity_lad'
brownfield_sites['LAD24CD'] = brownfield_sites['LAD24CD'].combine_first(brownfield_sites['org_entity_lad'])

no_LAD = len(brownfield_sites[brownfield_sites['LAD24CD'].isna()])
print(f"There were {no_LAD} sites wth no local authority assigned after using postcodes and organisation entity to try to fill in gaps.")

Didn't find a postcode for: 392a and 394 Camden Road.
Didn't find a postcode for: 100 Hornsey Road.
Didn't find a postcode for: 1 Kingsland Passage and the Telephone Exchange, Kingsland Green, Dalston.
Didn't find a postcode for: Dixon Clark Court.
Didn't find a postcode for: Hathersage and Besant Courts.
Didn't find a postcode for: Catherine Dalley & Silverdale, Scalford Road, Melton Mowbray.
Didn't find a postcode for: Beebys Yard, Burton Street, Melton Mowbray.
Didn't find a postcode for: Land at Thorpe Road, Melton Mowbray.
Didn't find a postcode for: Birleys Garage, Waltham Lane, Long Clawson.
Didn't find a postcode for: Former Millway Foods Premises, Colston Lane, Harby.
Didn't find a postcode for: The Gollings; Main Street; Wymondham.
Didn't find a postcode for: Land Adjacent 7, Ashby Road, Gaddesby.
Didn't find a postcode for: nan.
Didn't find a postcode for: nan.
Didn't find a postcode for: nan.
Didn't find a postcode for: nan.
Didn't find a postcode for: nan.
Didn't find a po

TypeError: 'NoneType' object is not subscriptable

In [6]:
# Now, joined_gdf will have a 'LAD24CD' and 'LAD24NM' column from the polygons, so we can perform some value counts
LAD_counts = brownfield_sites[['LAD24NM', 'LAD24CD']].value_counts()

# Rename the series
LAD_counts.name = 'sites'

# Filter only the columns we need
totals = brownfield_sites[['LAD24CD', 'LAD24NM', 'hectares', 'maximum-net-dwellings', 'minimum-net-dwellings']]

# Group by LAD24CD and LAD24NM and sum the numeric columns
totals = totals.groupby(['LAD24CD', 'LAD24NM']).sum(numeric_only=True).reset_index()
totals = totals.round(1)
# Join frames
totals = totals.join(LAD_counts, on=['LAD24NM', 'LAD24CD'], how='inner')

In [7]:
# Write file
totals.to_csv(ROOT / 'data/brownfield/sites_by_local_authority.csv', index=False)