In [1]:
from pathlib import Path
import pandas as pd
ROOT = Path('../..')
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

In [2]:
d = pd.read_excel(ROOT / 'raw/additional-dwellings/LT122.ods', engine='odf', sheet_name='LT122', header=3, na_values=['[p]', '[r]', '[x]'])

In [3]:
import re
# Drop all the rows that don't have a current ONS code
d = d[~d['CurrentONS code'].isna()]

# drop unused columns
d.drop(columns=['DCLG code', 'FormerONS code', 'Authority Data'], inplace=True)

# Rename geography column code and set as index
d.rename(columns={'CurrentONS code': 'geography_code'}, inplace=True)
d.set_index('geography_code', inplace=True)

# Remove anything that isn't actually a date from the date column titles
d.columns = [re.match(r'\d{4}-\d{2}', col).group() for col in d.columns]
d

Unnamed: 0_level_0,2001-02,2002-03,2003-04,2004-05,2005-06,2006-07,2007-08,2008-09,2009-10,2010-11,...,2013-14,2014-15,2015-16,2016-17,2017-18,2018-19,2019-20,2020-21,2021-22,2022-23
geography_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E06000022,270.400000,264.400000,301.400000,151.400000,172.400000,258.400000,497.400000,378.400000,346.400000,241.400000,...,557.000000,640.000000,818.000000,880,1254,1049,1176.0,773.0,552.0,490.0
E06000055,,,,,,,,,507.300000,675.300000,...,1030.500000,861.500000,997.500000,1288.5,1383.5,1392.5,1059.5,1232.5,1019.0,1465.0
E06000008,-73.800000,263.200000,639.200000,-93.800000,594.200000,230.200000,619.200000,82.200000,336.200000,372.200000,...,246.100000,235.100000,115.100000,159.1,299.1,505.1,323.1,461.1,440.0,553.0
E06000009,159.700000,54.700000,241.700000,240.700000,239.700000,173.700000,356.700000,310.700000,-1.300000,180.700000,...,144.000000,294.000000,447.000000,51,473,565,532.0,358.0,267.0,233.0
E06000028,764.300000,920.300000,1103.300000,1036.300000,900.300000,1029.300000,1474.300000,1173.300000,562.300000,432.300000,...,392.487666,962.487666,721.487666,579.487666,633.487666,657.487666,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E07000235,121.800000,369.800000,201.800000,209.800000,179.800000,298.800000,320.800000,270.800000,236.800000,150.800000,...,169.600000,243.600000,308.600000,348.6,440.6,562.6,377.6,448.6,407.0,227.0
E07000236,248.000000,290.000000,422.000000,325.000000,292.000000,482.000000,245.000000,109.000000,178.000000,130.000000,...,173.800000,340.800000,186.800000,188.8,389.8,395.8,284.8,204.8,127.0,103.0
E07000237,3.371618,250.371618,219.371618,284.371618,476.371618,522.371618,432.371618,476.371618,386.371618,239.655434,...,287.900000,467.900000,616.900000,477.9,255.9,239.9,221.9,351.9,414.0,273.0
E07000238,555.000000,427.000000,426.000000,481.000000,358.000000,349.000000,309.000000,271.000000,236.000000,256.000000,...,851.800000,870.800000,902.800000,712.8,1102.8,1296.8,802.8,599.8,861.0,663.0


In [4]:
d = d.reset_index().melt(id_vars='geography_code', value_vars=d.columns, var_name='date', value_name='value')

In [5]:
england_and_regions = pd.read_excel(ROOT / "raw/additional-dwellings/LT118.ods", sheet_name='LT118_(unrounded)', header=4, nrows=24)

In [6]:
england_and_regions.rename(columns={'ONS Code': 'date'}, inplace=True) # Seems slightly sadistic but the column is not named what it should be.

# Drop a row we don't need
england_and_regions = england_and_regions[england_and_regions.date != 'Total net additional dwellings [note 1]']

# Set the date as the index ready to transpose
england_and_regions.set_index('date', inplace=True)
england_and_regions = england_and_regions.T

# Now geography_code is the index so name it correctly
england_and_regions.index.rename('geography_code', inplace=True)

# Make sure only dates are actually in the date column headings
england_and_regions.columns = [re.match(r'\d{4}-\d{2}', col).group() for col in england_and_regions.columns]

# Unpivot the data
england_and_regions = england_and_regions.reset_index().melt(id_vars='geography_code', value_vars=england_and_regions.columns, var_name='date', value_name='value')

england_and_regions

Unnamed: 0,geography_code,date,value
0,E12000001,2000-01,2890.0
1,E12000002,2000-01,10720.0
2,E12000003,2000-01,10800.0
3,E12000004,2000-01,14830.0
4,E12000005,2000-01,13790.0
...,...,...,...
225,E12000006,2022-23,30185.0
226,E12000007,2022-23,35305.0
227,E12000008,2022-23,42140.0
228,E12000009,2022-23,25551.0


In [7]:
# Concatenate the data
combined = pd.concat([d, england_and_regions])
# Round the figures
combined['value'] = combined['value'].astype(float).round(0)
combined

Unnamed: 0,geography_code,date,value
0,E06000022,2001-02,270.0
1,E06000055,2001-02,
2,E06000008,2001-02,-74.0
3,E06000009,2001-02,160.0
4,E06000028,2001-02,764.0
...,...,...,...
225,E12000006,2022-23,30185.0
226,E12000007,2022-23,35305.0
227,E12000008,2022-23,42140.0
228,E12000009,2022-23,25551.0


In [8]:
# Write to parquet
combined.to_parquet(ROOT / 'data/additional-dwellings/parquet/by_local_authority.parquet', index=False)
combined['Measure'] = 'net_additional_dwellings'
combined.to_csv(ROOT / 'data/additional-dwellings/csv/by_local_authority.csv', index=False)