In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
# Repository root, expressed relative to the notebook's working directory (two levels up).
ROOT = Path('../..')
# Bare expression so the cell displays the resolved absolute path as a sanity check.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

In [2]:
def fill_new_la_codes(data, old_codes, new_code, new_name):
    '''Use the old LADs to backfill data for a new LAD.

    Rows whose ``geography_code`` is in ``old_codes`` are removed from ``data``
    and replaced by one row per (date, Measure) carrying the summed ``value``,
    labelled with ``new_code`` / ``new_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with at least the columns ``geography_code``,
        ``date``, ``Measure`` and ``value``.
    old_codes : list-like
        Codes of the areas being merged.
    new_code, new_name : str
        Code and name given to the merged rows.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified. Row labels are kept from
        both pieces, so the index may contain duplicate labels.
    '''
    is_old = data.geography_code.isin(old_codes)
    remaining = data[~is_old]

    # Treat the '[x]' marker as missing, then discard any old-code row that has
    # a missing value in any column.
    old_rows = data[is_old].replace('[x]', np.nan).dropna()

    # Nothing usable to merge (codes absent, or every row incomplete): return the
    # data without the old codes instead of concatenating an empty frame, which
    # newer pandas versions deprecate.
    if old_rows.empty:
        return remaining

    # One row per date and measure, summing 'value' across the old areas.
    merged = (
        old_rows
        .groupby(['date', 'Measure'])['value']
        .sum()
        .reset_index()
        .assign(geography_code=new_code, geography_name=new_name)
    )

    return pd.concat([remaining, merged])

In [3]:
# Load the raw affordable homes open data file for 202223.
# NOTE(review): the path has no file extension — confirm it matches the file on disk.
data = pd.read_csv(ROOT / 'raw/affordable-homes/affordable_homes_open_data_202223')

In [4]:
# Discard the columns that are not used anywhere later in the notebook
data = data.drop(columns=[
    'LA code',
    'LA name',
    'District name',
    'Metropolitan code',
    'Metropolitan name',
    'Region code',
    'Region name',
    'LA type 202223',
])

In [5]:
# Map the source headers onto the column names used by the rest of the notebook
data = data.rename(columns={
    'LA code 202223': 'geography_code',
    'LA name 202223': 'geography_name',
    'Year': 'date',
    'Tenure': 'Measure',
    'Units': 'value',
})

In [6]:
# For now keep only the rows recorded as completions
data = data[data['Completions'].eq('Completion')]

In [7]:
# Sum values per year, per local authority, per tenure type
local_authority = data.groupby(['geography_code', 'geography_name', 'date', 'Measure']).sum(numeric_only=True).reset_index()

# Backfill mapping: the codes in old_codes[n] are summed into new_codes[n] / new_names[n].
old_codes = [
    ['E07000026', 'E07000028', 'E07000029'],
    ['E07000163', 'E07000164', 'E07000165', 'E07000166', 'E07000167', 'E07000168', 'E07000169'],
    ['E07000187', 'E07000188', 'E07000246', 'E07000189'],
    ['E07000027', 'E07000030', 'E07000031'],
]
# The North Yorkshire code was previously mistyped as 'E066000065' (ten characters).
new_codes = ['E06000063', 'E06000065', 'E06000066', 'E06000064']
new_names = ['Cumberland', 'North Yorkshire', 'Somerset', 'Westmorland and Furness']

# Cheap sanity checks so a mistyped or missing entry fails loudly here rather
# than ending up in the published files.
assert len(old_codes) == len(new_codes) == len(new_names), 'backfill lists must line up'
assert all(len(code) == 9 for code in new_codes), 'geography codes are 9 characters long'

for predecessor_codes, successor_code, successor_name in zip(old_codes, new_codes, new_names):
    local_authority = fill_new_la_codes(local_authority, predecessor_codes, successor_code, successor_name)

# One row per area and year, one column per tenure type
local_authority_wide = local_authority.pivot(index=['geography_code', 'geography_name', 'date'], columns='Measure', values='value')

# Add a column for all affordable homes, which is the sum of all the tenure types.
# NOTE: the column name is spelled 'All afforable' (sic). The England table uses the
# same spelling so the two frames line up when concatenated; rename both together or not at all.
local_authority_wide['All afforable'] = local_authority_wide.sum(axis=1)

In [8]:
# England totals: add up every area for each year and tenure type
all_england = (
    local_authority
    .groupby(['date', 'Measure'])
    .sum(numeric_only=True)
    .reset_index()
)

# Reshape to one row per year with one column per tenure type
all_england_wide = all_england.pivot(index='date', columns='Measure', values='value')

# Total across all tenure types (same column name as in the local authority table)
all_england_wide['All afforable'] = all_england_wide.sum(axis=1)

# Label the rows with the England code and name, then move those labels into the
# index in the order geography_code, geography_name, date.
all_england_wide = (
    all_england_wide
    .assign(geography_code='E92000001', geography_name='England')
    .set_index(['geography_code', 'geography_name'], append=True)
    .reorder_levels(['geography_code', 'geography_name', 'date'])
)

In [9]:
# Stack the local authority rows and the England totals into one wide table.
# Both frames are indexed by geography_code, geography_name and date.
combined_wide = pd.concat([local_authority_wide, all_england_wide])

In [10]:
# Label the long-format England totals, then stack them beneath the local authority rows
all_england = all_england.assign(geography_code='E92000001', geography_name='England')
combined_long = pd.concat([local_authority, all_england])

In [11]:
# Write the outputs: the wide table goes to parquet under site/ for the site, and the
# long table goes to CSV (without the row index) under standard/ for viewing and metadata.
combined_wide.to_parquet(ROOT / 'data/affordable-homes/site/by_tenure.parquet')
combined_long.to_csv(ROOT / 'data/affordable-homes/standard/by_tenure.csv', index=False)

In [12]:
# Display the long table as a visual check of what was written out.
combined_long

Unnamed: 0,geography_code,geography_name,date,Measure,value
0,E06000001,Hartlepool,1991-92,Affordable Home Ownership,13
1,E06000001,Hartlepool,1991-92,Social Rent,134
2,E06000001,Hartlepool,1992-93,Affordable Home Ownership,29
3,E06000001,Hartlepool,1992-93,Social Rent,204
4,E06000001,Hartlepool,1993-94,Affordable Home Ownership,20
...,...,...,...,...,...
114,E92000001,England,2022-23,Intermediate Rent,2701
115,E92000001,England,2022-23,London Affordable Rent,4296
116,E92000001,England,2022-23,Shared Ownership,20517
117,E92000001,England,2022-23,Social Rent,9535


# TODO 
Use a lookup table to aggregate local authorities (LAs) up to metropolitan counties and regions.