In [1]:
from pathlib import Path
import pandas as pd
# Project root, expressed relative to the current working directory
# (two directory levels up). All input/output paths below hang off this.
ROOT = Path('../..')
# Display the absolute form so the reader can confirm it points where expected.
ROOT.resolve()

PosixPath('/Users/lukestrange/Code/housing')

In [2]:
def clean_data(file, sheet_name, header, skiprows=None, value_name='Median'):
    """Read one sheet of the workbook and return it in long (melted) form.

    Parameters
    ----------
    file : path-like or file object
        Excel workbook, passed straight to ``pd.read_excel``.
    sheet_name : str
        Sheet to read. Sheet ``'2a'`` is expected to carry
        ``'Region/Country code'`` / ``'Region/Country name'`` columns (which
        are dropped) and ``'Local authority code'`` / ``'Local authority name'``
        identifier columns; every other sheet is expected to use ``'Code'`` /
        ``'Name'``.
    header : int
        Row to use for the column labels, forwarded to ``pd.read_excel``.
    skiprows : optional
        Forwarded to ``pd.read_excel``.
    value_name : str, default ``'Median'``
        Name given to the melted value column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``geography_code`` with columns ``geography_name``,
        ``date`` (the original column label with surrounding whitespace and
        any ``'Year ending '`` text removed) and ``value_name``.

    Raises
    ------
    KeyError
        If the columns to drop for sheet ``'2a'`` are absent, or if the
        identifier columns cannot be found when melting.
    """
    id_columns = ['geography_code', 'geography_name']

    def _tidy_label(label):
        # Header cells holding numbers or dates come back as non-string
        # labels, so coerce to str before using string methods on them.
        return str(label).strip().replace('Year ending ', '')

    # Read the excel sheet from file.
    data = pd.read_excel(file, sheet_name=sheet_name, header=header, skiprows=skiprows)
    # Strip whitespace and simplify the column names in a single pass.
    data = data.rename(columns=_tidy_label)

    if sheet_name == '2a':
        # Drop un-used columns that only this sheet carries.
        data = data.drop(columns=['Region/Country code', 'Region/Country name'])
        old_code_title = 'Local authority code'
        old_name_title = 'Local authority name'
    else:
        old_code_title = 'Code'
        old_name_title = 'Name'

    # Standardise the geography identifier column names.
    data = data.rename(columns={old_code_title: 'geography_code', old_name_title: 'geography_name'})

    # Melt the frame (unpivot): every column that is not an identifier becomes
    # a (date, value) pair. Then index the result by the geography code.
    return (
        data
        .melt(id_vars=id_columns, var_name='date', value_name=value_name)
        .set_index('geography_code')
    )

In [3]:
# Location of the source workbook, relative to the project root.
fpath = ROOT.joinpath('raw/house-prices/medianhousepricesforadministrativegeographies.xlsx')

In [4]:
# Clean each sheet with the same header row (2); the frames are unpacked in
# the same order as the sheet names are listed.
la_data, ca_data, reg_nat_data = (
    clean_data(fpath, sheet, 2) for sheet in ('2a', '4a', '1a')
)

In [5]:
# Stack the three cleaned frames row-wise into a single table.
combined_wide = pd.concat(
    objs=[la_data, ca_data, reg_nat_data],
)

In [6]:
# Write the combined table to Parquet under the project's data directory.
parquet_path = ROOT / 'data/house-prices/site/median_house_prices.parquet'
combined_wide.to_parquet(parquet_path)

In [7]:
# Derive the CSV table from a new frame instead of adding the column to
# `combined_wide` in place: that frame has already been exported as Parquet,
# and mutating it here would change what that export writes if its cell were
# re-run afterwards. The resulting columns and their order are unchanged.
combined_long = (
    combined_wide
    .assign(Measure='Median')
    .rename(columns={'Median': 'value'})
)
combined_long.to_csv(ROOT / 'data/house-prices/standard/median_house_prices.csv')