In [1]:
import pandas as pd
import os 
# Work from the directory two levels above the one the kernel started in, so
# the relative 'raw/...' and 'data/...' paths used later in this notebook
# resolve. The starting directory is captured only once per kernel session:
# a bare os.chdir('../..') would climb two MORE levels every time this cell
# is re-run, silently breaking every relative path below.
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, '..', '..'))

In [2]:
def clean_data(file, sheet_name, header, skiprows=None):
    """Read one sheet of an Excel workbook and reshape it to long format.

    The sheet is read with ``pandas.read_excel``, its column labels are
    tidied, the descriptive (non-code) columns are dropped, and the remaining
    columns are unpivoted so that each row holds one geography code, one
    column label (stored as ``date``) and the corresponding value.

    Parameters
    ----------
    file : str or path-like
        Workbook to read; forwarded to ``pandas.read_excel``.
    sheet_name : str
        Sheet to read. Sheet ``'2a'`` is expected to carry the columns
        'Region/Country code', 'Region/Country name', 'Local authority code'
        and 'Local authority name'; every other sheet is expected to carry
        'Code' and 'Name'.
    header : int
        Row used for the column labels; forwarded to ``pandas.read_excel``.
    skiprows : list-like, int or None, optional
        Rows to skip; forwarded to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``geography_code`` with two columns: ``date`` (the
        original column label with surrounding whitespace and any
        'Year ending ' text removed) and ``Median (£)`` (the cell value).

    Raises
    ------
    KeyError
        If the columns expected for the sheet are not present.
    """
    # Read the excel sheet from file.
    wide = pd.read_excel(file, sheet_name=sheet_name, header=header, skiprows=skiprows)

    def _tidy_label(label):
        # Excel headers are not guaranteed to be text (they can come back as
        # numbers or timestamps); only strings can be stripped and shortened,
        # so leave any other label untouched instead of raising AttributeError.
        if not isinstance(label, str):
            return label
        # Strip whitespace, then simplify the column name.
        return label.strip().replace('Year ending ', '')

    # Pick the un-used columns and the geo code column for this sheet layout.
    if sheet_name == '2a':
        unused_columns = ['Region/Country code', 'Region/Country name', 'Local authority name']
        old_code_name = 'Local authority code'
    else:
        unused_columns = ['Name']
        old_code_name = 'Code'

    # Build the tidy wide frame without mutating anything in place.
    tidy = (
        wide
        .rename(columns=_tidy_label)
        .drop(columns=unused_columns)
        # Standardise the geo code column name
        .rename(columns={old_code_name: 'geography_code'})
    )

    # Get all columns apart from the geo code
    value_columns = [col for col in tidy.columns if col != 'geography_code']

    # Melt the frame (unpivot) and set the index as the geo code
    return tidy.melt(
        id_vars='geography_code',
        value_vars=value_columns,
        var_name='date',
        value_name='Median (£)',
    ).set_index('geography_code')

In [3]:
# Excel workbook that every clean_data call in this notebook reads from,
# given relative to the current working directory.
fpath = (
    'raw/house-prices/'
    'hpssadataset9medianpricepaidforadministrativegeographies.xls'
)

In [4]:
# Load the three sheets into long-format frames. All of them take their column
# labels from header row 6; sheets '4a' and '1a' additionally skip rows 8 and
# 10 (presumably non-data spacer rows - TODO confirm against the workbook).
la_data = clean_data(
    fpath,
    sheet_name='2a',
    header=6,
)
ca_data = clean_data(
    fpath,
    sheet_name='4a',
    header=6,
    skiprows=[8, 10],
)
reg_nat_data = clean_data(
    fpath,
    sheet_name='1a',
    header=6,
    skiprows=[8, 10],
)

In [5]:
# Stack the three long-format frames row-wise into a single table; the
# geography_code index of each frame is kept as-is.
combined = pd.concat(
    objs=[la_data, ca_data, reg_nat_data],
    axis=0,
)

In [6]:
# Persist the combined table as CSV (index included), relative to the current
# working directory.
combined.to_csv(
    path_or_buf='data/house-prices/median_house_prices.csv',
    index=True,
)