In [None]:
import re
from pathlib import Path

import pandas as pd
# Base directory that the raw/ and data/ paths in this notebook are built from,
# given relative to the notebook's working directory.
ROOT = Path('../..')
# Trailing bare expression: shows the absolute path ROOT points at so the
# location can be checked; the result is not assigned, so ROOT stays relative.
ROOT.resolve()

In [None]:
# Read sheet 'LT122' of the LT122 workbook.
# header=3: the fourth row of the sheet supplies the column headings.
# Cells containing the markers [p], [r] or [x] are loaded as missing values.
d = pd.read_excel(
    ROOT / 'raw/additional-dwellings/LT122.ods',
    sheet_name='LT122',
    header=3,
    na_values=['[p]', '[r]', '[x]'],
    engine='odf',
)

In [None]:
# Keep only the rows that carry a current ONS code, discard the identifier
# columns that are not used, and index the frame by geography_code.
# Written as one chain (no inplace=True) so each step returns a new frame.
d = (
    d.dropna(subset=['CurrentONS code'])
    .drop(columns=['DCLG code', 'FormerONS code', 'Authority Data'])
    .rename(columns={'CurrentONS code': 'geography_code'})
    .set_index('geography_code')
)

# Every remaining heading is reduced to its leading YYYY-YY period label,
# stripping anything that follows it. Headings that do not start with such a
# label are reported explicitly instead of failing with an AttributeError on
# the None returned by a failed match.
period_pattern = re.compile(r'\d{4}-\d{2}')
headings_without_period = [col for col in d.columns if not period_pattern.match(str(col))]
if headings_without_period:
    raise ValueError(
        f'Column headings without a leading YYYY-YY period: {headings_without_period!r}'
    )
d.columns = [period_pattern.match(str(col)).group() for col in d.columns]
d

In [None]:
# Unpivot from wide to long form: geography_code stays as the identifier,
# each former column heading goes into 'date' and its figure into 'value'.
d = d.reset_index().melt(
    id_vars='geography_code',
    value_vars=list(d.columns),
    var_name='date',
    value_name='value',
)

In [None]:
# Read sheet 'LT118_(unrounded)' of the LT118 workbook.
# header=3: the fourth row of the sheet supplies the column headings.
# nrows=24: only the first 24 data rows are read
#   (presumably to stop before trailing rows/notes - TODO confirm).
# engine='odf' is passed explicitly, matching the LT122 load, rather than
# relying on pandas to infer the reader for the .ods file.
england_and_regions = pd.read_excel(
    ROOT / "raw/additional-dwellings/LT118.ods",
    engine='odf',
    sheet_name='LT118_(unrounded)',
    header=3,
    nrows=24,
)

In [None]:
# Sheet headings for England and the regions, mapped to geography codes.
region_codes = {
    "North East": "E12000001",
    "North West": "E12000002",
    "Yorkshire and The Humber": "E12000003",
    "East Midlands": "E12000004",
    "West Midlands": "E12000005",
    "East of England": "E12000006",
    "London": "E12000007",
    "South East": "E12000008",
    "South West": "E12000009",
    "England": "E92000001",
}

# Note from the original author: a column in the sheet is not named what it
# should be, hence the explicit renames ("Year" becomes "date").
# The chain then drops the unused descriptive column, indexes by date and
# transposes, so the geography codes end up as the index (named accordingly)
# and the dates as the column headings.
england_and_regions = (
    england_and_regions
    .rename(columns={"Year": "date", **region_codes})
    .drop(columns='Components of net housing supply')
    .set_index('date')
    .T
    .rename_axis(index='geography_code')
)

# Reduce each date heading to its leading YYYY-YY period label. Values that do
# not start with such a label are reported explicitly instead of failing with
# an AttributeError on the None returned by a failed match.
period_pattern = re.compile(r'\d{4}-\d{2}')
dates_without_period = [
    col for col in england_and_regions.columns if not period_pattern.match(str(col))
]
if dates_without_period:
    raise ValueError(
        f'LT118 date values without a leading YYYY-YY period: {dates_without_period!r}'
    )
england_and_regions.columns = [
    period_pattern.match(str(col)).group() for col in england_and_regions.columns
]

# Unpivot to long form: one row per geography_code / date with the figure in 'value'.
england_and_regions = england_and_regions.reset_index().melt(
    id_vars='geography_code',
    value_vars=england_and_regions.columns,
    var_name='date',
    value_name='value',
)

england_and_regions

In [None]:
# Stack the two long-form frames (d first, then england_and_regions) and
# store 'value' as a float rounded to zero decimal places.
combined = (
    pd.concat([d, england_and_regions])
    .assign(value=lambda frame: frame['value'].astype(float).round(0))
)
combined

In [None]:
# Write the combined figures to parquet and CSV.
parquet_path = ROOT / 'data/additional-dwellings/parquet/by_local_authority.parquet'
csv_path = ROOT / 'data/additional-dwellings/csv/by_local_authority.csv'

# Create the output directories up front so the writes do not fail on a
# checkout where they do not exist yet.
for output_path in (parquet_path, csv_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# On a fresh run the parquet file is written before 'Measure' is added below,
# so it never contains that column. Dropping it here (if present) keeps the
# parquet output the same when this cell is re-run in a live kernel.
# NOTE(review): confirm that parquet is meant to omit 'Measure' while the CSV has it.
combined.drop(columns='Measure', errors='ignore').to_parquet(parquet_path, index=False)

# The CSV carries an extra constant 'Measure' column.
combined['Measure'] = 'net_additional_dwellings'
combined.to_csv(csv_path, index=False)