In [1]:
from pathlib import Path
import pandas as pd
# Base directory for every path used below: two levels up from the working directory.
ROOT = Path('../..')
# Show the absolute location so the reader can check where ROOT points (result is not stored).
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

In [2]:
# Build the table of "active" geographies from the names-and-codes lookup CSVs
# listed below and write it to metadata/temp/active_geographies.json.
# Result: `active_geographies`, indexed by geography_code, with columns
# geography_name and active (always True).
paths = [
    "metadata/lookups/Local_Authority_Districts_(April_2023)_Names_and_Codes_in_the_United_Kingdom.csv",
    "metadata/lookups/Metropolitan_Counties_(December_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Regions_(December_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Combined_Authorities_(May_2024)_Names_and_Codes_in_England.csv",
    "metadata/lookups/Counties_(April_2023)_Names_and_Codes_in_EN.csv",
    "metadata/lookups/Countries_(December_2023)_Names_and_Codes_in_the_UK.csv",
]

# One two-column frame per lookup file. They are concatenated once after the
# loop instead of growing a DataFrame with pd.concat on every iteration.
lookup_frames = []
for path in paths:
    lookup = pd.read_csv(ROOT / path)
    # Column names differ per lookup: take the first column ending in 'CD' as
    # the code and the first ending in 'NM' as the name.
    code_columns = lookup.columns[lookup.columns.str.endswith('CD')]
    name_columns = lookup.columns[lookup.columns.str.endswith('NM')]
    if code_columns.empty or name_columns.empty:
        raise ValueError(f"{path}: expected one column ending in 'CD' and one ending in 'NM'")
    lookup_frames.append(
        lookup[[code_columns[0], name_columns[0]]].rename(
            columns={code_columns[0]: 'geography_code', name_columns[0]: 'geography_name'}
        )
    )

active_geographies = pd.concat(lookup_frames, ignore_index=True)

# Drop every code whose first letter is W, S, N or K.
excluded_prefix = active_geographies['geography_code'].str.startswith(('W', 'S', 'N', 'K'))
active_geographies = (
    active_geographies[~excluded_prefix]
    .assign(active=True)
    .set_index('geography_code')
)

# to_json(orient='index') needs unique keys; fail here with the offending codes instead.
duplicated_codes = active_geographies.index[active_geographies.index.duplicated()].tolist()
assert not duplicated_codes, f'Duplicate geography codes in lookups: {duplicated_codes}'

active_geographies.to_json(ROOT / 'metadata/temp/active_geographies.json', orient='index', indent=4)

In [3]:
# Find geographies that appear in the standard data files but are not in
# `active_geographies`, flag them active=False, and write them to
# metadata/temp/inactive_geographies.json.
# Result: `inactive_geographies`, indexed by geography_code, with columns
# geography_name and active (always False).
files = [
    ROOT / 'data/vacant-homes/standard/AllCombined_Cleaned_2024.csv',
    ROOT / 'data/house-prices/standard/median_house_prices.csv',
    ROOT / 'data/affordable-homes/standard/by_tenure.csv',
]

# The active (code, name) pairs do not change inside the loop, so build them once.
active_pairs = active_geographies.reset_index().drop(columns='active')

inactive_frames = []
for file in files:
    # Read the file
    dataset = pd.read_csv(file)

    columns = dataset.columns.to_list()
    assert 'geography_code' in columns, 'No column geography_code'
    assert 'geography_name' in columns, 'No column geography_name'

    # Group the names and codes to get unique combinations; the size column
    # that groupby adds (named 0 after reset_index) is not needed.
    pairs = dataset.groupby(['geography_code', 'geography_name']).size().reset_index().drop(columns=0)

    # Fix some known naming bugs. regex=False keeps these as literal replacements.
    pairs['geography_name'] = (
        pairs['geography_name']
        .str.replace('&', 'and', regex=False)
        .str.replace('St Edmundsbury', 'St. Edmundsbury', regex=False)
    )

    # For now, we only want places in England as this is the data we have.
    # The renames above can make two rows identical, so de-duplicate again.
    pairs = pairs[pairs['geography_code'].str.startswith('E')].drop_duplicates()

    # Left-merge on both code and name; rows marked 'left_only' have no exact
    # (code, name) match among the active geographies.
    merged = pairs.merge(
        active_pairs,
        how='left',
        on=['geography_code', 'geography_name'],
        indicator=True,
    )
    inactive_frames.append(merged.loc[merged['_merge'] == 'left_only'].drop(columns='_merge'))

# Later files go first, matching the row order the previous prepend-in-loop produced.
# Duplicates from multiple files are dropped on (code, name) BEFORE the code
# becomes the index: drop_duplicates ignores the index, so running it after
# set_index merged distinct codes that share a name and lost a code.
inactive_geographies = (
    pd.concat(inactive_frames[::-1], ignore_index=True)
    .drop_duplicates(subset=['geography_code', 'geography_name'])
    .assign(active=False)  # Set the active status of the remaining geographies to False
    .set_index('geography_code')
)

# A code seen with two different names would make the JSON keys ambiguous
# (to_json(orient='index') rejects a non-unique index); name the culprits.
clashing_codes = inactive_geographies.index[inactive_geographies.index.duplicated()].tolist()
assert not clashing_codes, f'Codes with more than one name: {clashing_codes}'

inactive_geographies.to_json(ROOT / 'metadata/temp/inactive_geographies.json', orient='index', indent=4)

Combine the frames and write to file.

In [4]:
# Stack active and inactive geographies into one table keyed by geography code.
combined = pd.concat([active_geographies, inactive_geographies])

# keep=False flags every row that shares a code, not just the later ones, so the
# report shows both conflicting entries; sorting puts them next to each other.
dupes = combined[combined.index.duplicated(keep=False)].sort_index()

if dupes.empty:
    print('Contains no duplicates...\n writing to JSON file.')
    combined.to_json(ROOT / "src/data/areas/place-page/_data/areas.json", orient='index', indent=4)
else:
    # Nothing is written while any code appears more than once; show the clashes instead.
    print(dupes)

Contains no duplicates...
 writing to JSON file.


Read the standard files, find the first and last published dates, and write them to JSON.

In [5]:
# NOTE(review): every line of this cell is commented out, so running it does nothing.
# It looks like an unfinished draft of the step described above: for each file it
# groups by geography and 'Measure', takes the min/max of 'date' as fP/lP, pivots
# them to one column per measure and dumps the result to blah{i}.json.
# `joined` is never filled (the merge loop further down is itself disabled), so the
# final lookup would run against an empty frame. TODO: finish or remove.
# files = [ROOT / 'data/vacant-homes/standard/AllCombined_Cleaned_2024.csv', ROOT / 'data/house-prices/standard/median_house_prices.csv', ROOT / 'data/affordable-homes/standard/by_tenure.csv']
# joined = pd.DataFrame()
# i = 0
# for file in files:
#     d = pd.read_csv(file)
#     assert 'Measure' in d.columns

#     group = d.groupby(['geography_code', 'geography_name', 'Measure'])
        
#     min_dates = group['date'].min()
#     max_dates = group['date'].max()
#     g = group.size().reset_index().drop(columns=0)
#     unique_min_dates = min_dates.unique()
#     unique_max_dates = max_dates.unique()

#     if len(unique_min_dates) == 1:
#         min_date = unique_min_dates[0]
#         max_date = unique_max_dates[0]
#         g['fP'] = min_date
#         g['lP'] = max_date
#     else:
#         g['lP'] = max_dates.reset_index()['date']
#         g['fP'] = min_dates.reset_index()['date']

#     g = g.pivot(index=['geography_code', 'geography_name'], columns='Measure', values=['fP', 'lP'])
#     g.columns = g.columns.map('_'.join)
#     g = g.reset_index().set_index('geography_code')
#     # for i, row in g.iterrows():
#     #     if i in joined.index:
#     #         print(i)
#     #         joined = pd.merge(joined, g, how='inner')
#     #     else:
#     #         joined = pd.concat([joined, g])
#     g.to_json(f'blah{i}.json', orient='index', indent=4)
#     i += 1
# joined[joined.index == 'E06000001']
