- [x] The geography tree is as follows: LADs/Met counties/counties/combined authorities => Regions => England/Wales/Scotland/NI. 
- [x] Lookups for the most recent names and codes of these geographies have been downloaded into `metadata/lookups`.
- [x] To generate a list of active geographies we combined all of these lookups into a single file with the column titles `geography_code` and `geography_name`. This is temporarily stored in `metadata/temp`.
- [x] Each dataset may contain some, all, or none of these geographies. Per dataset, we take each file's unique code/name combinations and check whether they appear in the list of active geographies. If they do not, we add them to a list of inactive geographies stored in `metadata/temp`.
- [ ] For all geographies we determine when data was first and last published per dataset.
- [ ] This is stored in a `JSON` file in `src/data/areas/place-page/_data/metadata.json` and used to generate the site.

Import modules and set up paths for reading and writing files

In [1]:
from pathlib import Path
import pandas as pd
# Base directory for every read/write below, expressed relative to the current
# working directory (two levels up).
ROOT = Path('../..')
# Displayed as a sanity check only: the resolved path is not assigned, so ROOT
# itself remains the relative path above.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

In [2]:
# "Names and Codes" lookup files, one per geography level.
paths = [
    "metadata/lookups/Local_Authority_Districts_(April_2023)_Names_and_Codes_in_the_United_Kingdom.csv",
    "metadata/lookups/Metropolitan_Counties_(December_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Regions_(December_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Combined_Authorities_(May_2024)_Names_and_Codes_in_England.csv",
    "metadata/lookups/Counties_(April_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Countries_(December_2023)_Names_and_Codes_in_the_UK.csv",
]

# Reduce every lookup to the two standardised columns. The frames are collected
# in a list and concatenated once, instead of growing a DataFrame inside the loop.
lookup_frames = []
for path in paths:
    data = pd.read_csv(ROOT / path)
    # The code/name columns are found by their 'CD' / 'NM' suffix; the first
    # match of each is used (an IndexError here means the lookup has neither).
    code_name = data.columns[data.columns.str.endswith('CD')][0]
    geo_name = data.columns[data.columns.str.endswith('NM')][0]
    standardised = data.rename(
        columns={code_name: 'geography_code', geo_name: 'geography_name'}
    )
    lookup_frames.append(standardised[['geography_code', 'geography_name']])

all_geographies = pd.concat(lookup_frames, ignore_index=True)

# Exclude every code beginning with W, S, N or K.
# NOTE(review): presumably the Welsh, Scottish, Northern Irish and UK-wide
# codes, leaving England only - confirm.
is_excluded = all_geographies['geography_code'].str.startswith(('W', 'S', 'N', 'K'))

# Flag the remaining rows as active (stored as the string 'true') and key the
# table by geography code, which is what orient='index' serialises on.
active_geographies = (
    all_geographies.loc[~is_excluded]
    .assign(active='true')
    .set_index('geography_code')
)
active_geographies.to_json(ROOT / 'metadata/temp/active_geographies.json', orient='index', indent=4)

# Number of distinct active geography names (NaN, if present, counts as one).
active_geographies['geography_name'].nunique(dropna=False)

337

Inactive geographies

In [3]:
# Datasets whose geographies are checked against the active list.
files = [
    ROOT / 'data/vacant-homes/AllCombined_Cleaned_2024.csv',
    ROOT / 'data/house-prices/median_house_prices.csv',
    ROOT / 'data/affordable-homes/by_tenure.csv',
]
key_columns = ['geography_code', 'geography_name']

# Active (code, name) pairs to compare against. This does not depend on the
# file being processed, so it is built once outside the loop.
active_pairs = active_geographies.reset_index().drop(columns='active')

unmatched_frames = []
for file in files:
    dataset = pd.read_csv(file)

    columns = dataset.columns.to_list()
    assert 'geography_code' in columns, 'No column geography_code'
    assert 'geography_name' in columns, 'No column geography_name'

    dataset_pairs = (
        # Unique (code, name) combinations; the group-size column (named 0 after
        # reset_index) is only a by-product and is dropped.
        dataset.groupby(key_columns).size().reset_index().drop(columns=0)
        # Fix some known naming bugs so names line up with the lookups.
        .assign(
            geography_name=lambda frame: (
                frame['geography_name']
                .str.replace('&', 'and', regex=False)
                .str.replace('St Edmundsbury', 'St. Edmundsbury', regex=False)
            )
        )
        # The name fixes can make previously distinct rows identical.
        .drop_duplicates()
    )

    # Left-merge on both code and name; rows flagged 'left_only' have no exact
    # (code, name) match among the active geographies.
    merged = dataset_pairs.merge(active_pairs, how='left', on=key_columns, indicator=True)
    unmatched_frames.append(merged.loc[merged['_merge'] == 'left_only', key_columns])

if unmatched_frames:
    # Later files first, matching the row order of the previously written JSON.
    inactive_geographies = pd.concat(unmatched_frames[::-1])
else:
    inactive_geographies = pd.DataFrame(columns=key_columns)

inactive_geographies = (
    inactive_geographies
    # Set the active status of these geographies to false (stored as a string).
    .assign(active='false')
    # Drop duplicates that came from multiple files. This must happen BEFORE
    # set_index: drop_duplicates ignores the index, so deduplicating afterwards
    # would compare names only and silently discard distinct codes that share
    # a name.
    .drop_duplicates()
    .set_index('geography_code')
)
# NOTE: orient='index' needs unique codes; a code that still appears with two
# different names will make to_json raise a ValueError.
inactive_geographies.to_json(ROOT / 'metadata/temp/inactive_geographies.json', orient='index', indent=4)

In [4]:
# Stack the active and inactive tables; both are indexed by geography code.
combined = pd.concat([active_geographies, inactive_geographies])

# Codes that occur more than once in the combined index (second and later
# occurrences only).
has_repeated_code = combined.index.duplicated()
dupes = combined.loc[has_repeated_code]

if not dupes.empty:
    # Show the clashing rows and skip the write.
    print(dupes)
else:
    print('Contains no duplicates...\n writing to JSON file.')
    combined.to_json(ROOT / "metadata/UK_geo_activity_status.json", orient='index', indent=4)

Contains no duplicates...
 writing to JSON file.
