In [None]:
import os

import pandas
%matplotlib inline

In [None]:
# Input and output directories, relative to the notebook's working directory.
data_path = os.path.join('.', 'data_as_provided')
output_path = os.path.join('.', 'data_processed')

# One projection workbook per scenario, all located under `data_path`.
baseline_file, expansion_file, settlements_file, unplanned_file = [
    os.path.join(data_path, workbook)
    for workbook in (
        'Scenario Baseline - Dwelling+Employment projections.xlsx',
        'Scenario Expansion - Dwelling+Employment projections.xlsx',
        'Scenario New Settlements - Dwelling+Employment projections.xlsx',
        'Scenario Unplanned Development - Dwelling+Employment projections.xlsx',
    )
]

In [None]:
def read_excel_into_df(filename, sheet_name):
    """Read one projection sheet into a wide DataFrame.

    The first sheet column becomes the index, labelled 'Area Name'; the
    remaining columns are labelled with the years '2004' to '2049' (as
    strings) and read as integers.

    Args:
        filename: path of the Excel workbook to open.
        sheet_name: name of the worksheet to read.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.

    NOTE(review): the labels assume the sheet has exactly one label column
    followed by 46 yearly columns - TODO confirm against the workbooks.
    """
    years = [str(year) for year in range(2004, 2050)]

    return pandas.read_excel(
        filename,
        sheet_name=sheet_name,
        header=3,  # zero-based: the fourth sheet row is the header row
        index_col=0,
        # Build the label list by concatenation: list.extend() mutates in
        # place and returns None, which would leave the labels unset.
        names=['Area Name'] + years,
        nrows=22,
        # int cannot represent missing values, so a blank cell in a year
        # column within the rows read makes the read fail - presumably the
        # sheets have none; TODO confirm.
        dtype={year: int for year in years},
    )

def process_df(df):
    """Reshape a wide per-area table into long form.

    Rows containing any missing value are discarded, the index is turned
    back into a column, and every remaining column is unpivoted so that
    its label lands in 'timestep' and its value in 'dwellings'.

    Args:
        df: wide DataFrame whose index is named 'Area Name'.

    Returns:
        DataFrame indexed by ('Area Name', 'timestep') with a single
        'dwellings' column.
    """
    complete_rows = df.dropna()
    long_form = pandas.melt(
        complete_rows.reset_index(),
        id_vars='Area Name',
        var_name='timestep',
        value_name='dwellings',
    )
    return long_form.set_index(['Area Name', 'timestep'])

# Lookup table of LAD names and codes, restricted to the four lad11/lad18
# name and code columns used in this notebook.
lad_nmcds = pandas.read_csv(
    os.path.join(data_path, 'lad_nmcd_changes.csv')
)[['lad11nm', 'lad11cd', 'lad18nm', 'lad18cd']]

# Every distinct lad11nm value present in the lookup.
all_lad_nms = set(lad_nmcds['lad11nm'])

def add_lad_codes(df):
    """Attach LAD names and codes from ``lad_nmcds`` to a projection table.

    The index of ``df`` is reset and the result is inner-joined to the
    module-level ``lad_nmcds`` lookup on 'Area Name' == 'lad11nm'; the
    'Area Name' column is then dropped.

    Because the join is an inner join, rows whose 'Area Name' has no
    matching 'lad11nm' are absent from the result. A warning naming those
    areas is emitted so the loss is visible; the returned frame itself is
    not affected by the check.

    Args:
        df: DataFrame with 'Area Name' as a column or as (part of) its index.

    Returns:
        The merged DataFrame, without the 'Area Name' column.
    """
    import warnings

    flat = df.reset_index()

    # Surface area names the inner join below is about to discard.
    area_names = flat['Area Name']
    unmatched = area_names[~area_names.isin(lad_nmcds['lad11nm'])].unique()
    if len(unmatched):
        warnings.warn(
            'Area names without a matching lad11nm are dropped: {}'.format(
                ', '.join(map(str, unmatched))),
            stacklevel=2,
        )

    return flat.merge(
        lad_nmcds,
        left_on='Area Name',
        right_on='lad11nm',
    ).drop(columns='Area Name')

def write_out(df, filename):
    """Write ``df`` as CSV to ``filename`` inside ``output_path``.

    The output directory is created first if it is missing, so the write
    does not fail on a fresh checkout. The DataFrame index is not written.

    Args:
        df: DataFrame to save.
        filename: name of the CSV file, relative to ``output_path``.
    """
    os.makedirs(output_path, exist_ok=True)
    df.to_csv(os.path.join(output_path, filename), index=False)

In [None]:
# Read, reshape and attach LAD codes, one scenario at a time.
baseline = add_lad_codes(process_df(
    read_excel_into_df(baseline_file, 'Projection dwelling baseline')))
expansion = add_lad_codes(process_df(
    read_excel_into_df(expansion_file, 'Projection Expansion data')))
settlements = add_lad_codes(process_df(
    read_excel_into_df(settlements_file, 'Projection New Settlem data')))
unplanned = add_lad_codes(process_df(
    read_excel_into_df(unplanned_file, 'Projection UnplannedDev')))

# Save each scenario to its own CSV file.
for frame, csv_name in (
        (baseline, 'baseline.csv'),
        (expansion, 'expansion.csv'),
        (settlements, 'settlements.csv'),
        (unplanned, 'unplanned.csv')):
    write_out(frame, csv_name)

### LADs whose name or code differs between `lad11` and `lad18`

In [None]:
# Unique lad18nm values for rows whose name or code differs between the
# lad11 and lad18 columns. Uses `baseline`, which holds the frame returned by
# add_lad_codes; `baseline_wlad` is not defined in the cells above and would
# raise NameError on a fresh kernel.
baseline[
    (baseline.lad11nm != baseline.lad18nm) |
    (baseline.lad11cd != baseline.lad18cd)].lad18nm.unique()