In [60]:
import pandas as pd
import xlrd
import os

Get a list of sheets

In [61]:
# List the sheet names in the raw workbook; the bare expression on the last
# line displays them as the cell output.
sheets = (
    xlrd
    .open_workbook(filename='../../data/raw/a06-latest.xls')
    .sheet_names()
)
sheets

['Note', 'People', 'Men', 'Women']

Find the row with the IDs

In [62]:
# Read only the first column of the 'People' sheet, without treating any row
# as a header, so that the index is the zero-based row position in the sheet.
test = pd.read_excel(
    '../../data/raw/a06-latest.xls',
    sheet_name='People',
    header=None,
    usecols=[0],
)[0]
# Row position of the (last) cell holding the identifier label; raises
# IndexError if the label is not present in the column.
id_row = test.loc[test.eq('Dataset identifier code')].index.to_list()[-1]

Get the headers, based on the ID row

In [80]:
def get_headers(sheet_name, id_row, path='../../data/raw/a06-latest.xls'):
    """Build a table of column metadata for one sheet of the workbook.

    The rows from the top of the sheet down to and including ``id_row`` are
    read and transposed, so each spreadsheet column becomes one record
    indexed by its 'Dataset identifier code' value. Empty header cells are
    forward-filled from the preceding spreadsheet column (presumably these
    are merged header cells - TODO confirm against the workbook).

    Parameters
    ----------
    sheet_name : str
        Name of the sheet to read.
    id_row : int
        Zero-based position of the row whose first cell contains
        'Dataset identifier code'.
    path : str, optional
        Workbook to read. Defaults to the raw A06 file used by this notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by dataset identifier code, with the columns
        'age', 'variable' and 'measure'.

    Raises
    ------
    KeyError
        If no row within the first ``id_row + 1`` rows is labelled
        'Dataset identifier code' in the first column.
    ValueError
        If the number of unlabelled header rows is not exactly three
        (the column rename below would not fit).
    """
    sheet = pd.read_excel(path, sheet_name=sheet_name, index_col=0, header=None)
    headers = (
        sheet
        .head(id_row + 1)
        .T
        .set_index('Dataset identifier code')
        .ffill()
    )
    # Keep only the header rows that had no label in the first sheet column;
    # labelled rows above the ID row are discarded.
    headers = headers.loc[:, headers.columns.isna()]
    headers.columns = ['age', 'variable', 'measure']
    return headers
  
  
def add_group(data, value):
    """Return a copy of ``data`` with a constant ``group`` column.

    The input frame is left untouched, so the function can be re-run or used
    in a ``.pipe`` chain without side effects on the caller's data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to tag.
    value : scalar
        Value stored in every row of the ``group`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``group`` appended (or replaced if already present).
    """
    return data.assign(group=value)

# Build one header table per sheet after the first, tag each with the name of
# the sheet it came from, then stack them and order by identifier code.
headers = pd.concat(
    [
        add_group(get_headers(sheet, id_row=id_row), sheet)
        for sheet in sheets[1:]
    ]
).sort_index()

Save metadata file

In [81]:
METADATA_FILE = '../../data/metadata/labour-market/a06-codes.csv'
# Make sure the target directory exists before writing the lookup table.
os.makedirs(os.path.split(METADATA_FILE)[0], exist_ok=True)
headers.to_csv(path_or_buf=METADATA_FILE)

# Processing functions

The following functions are used to process data from the sheets.

Load the data

In [64]:
def load_data(sheet_name, id_row, path='../../data/raw/a06-latest.xls'):
    """Load the data rows of one sheet, using the ID row as the header.

    The first ``id_row`` rows are skipped so that the row containing
    'Dataset identifier code' becomes the column header. That first column is
    renamed to ``date_name``, and rows with no value in the second column are
    dropped.

    Parameters
    ----------
    sheet_name : str
        Name of the sheet to read.
    id_row : int
        Zero-based position of the row holding the dataset identifier codes.
    path : str, optional
        Workbook to read. Defaults to the raw A06 file used by this notebook.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the filtered rows (original row index preserved),
        safe to modify without a ``SettingWithCopyWarning``.
    """
    data = pd.read_excel(path, sheet_name=sheet_name, skiprows=id_row).rename(
        columns={'Dataset identifier code': 'date_name'}
    )
    # Rows without a value in the first data column are treated as non-data
    # rows (presumably blank lines and footnotes - TODO confirm).
    has_value = data.iloc[:, 1].notna()
    # Copy so downstream steps can add columns to the result freely.
    return data[has_value].copy()

Construct the date - this is marked as the centre month of the rolling period

In [65]:
def create_date_column(data):
    """Return a copy of ``data`` with a ``date`` column derived from ``date_name``.

    The first four characters of each ``date_name`` are dropped and the rest
    is parsed as abbreviated month name plus year (``"%b %Y"``). One month is
    then subtracted from the parsed date.
    NOTE(review): this assumes labels shaped like 'Dec-Feb 1992', so that the
    parsed month is the last month of the period - confirm against the sheet.

    The input frame is not modified. This avoids a ``SettingWithCopyWarning``
    when ``data`` is a filtered slice of another frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column ``date_name``.

    Returns
    -------
    pandas.DataFrame
        New frame with the ``date`` column appended (or replaced).

    Raises
    ------
    ValueError
        If a label does not match the ``"%b %Y"`` format after slicing.
    """
    period_end = pd.to_datetime(data.date_name.str.slice(4), format="%b %Y")
    return data.assign(date=period_end - pd.DateOffset(months=1))

Normalise by melting the wide columns into a long (narrow) format

In [66]:
def make_long(data):
    """Reshape a wide frame to long form, one row per date and variable.

    ``date`` and ``date_name`` are kept as identifier columns. Every other
    column is unpivoted into a ``variable_name`` / ``value`` pair.
    """
    key_columns = ['date', 'date_name']
    return pd.melt(data, id_vars=key_columns, var_name='variable_name')

In [74]:
# Run the load -> date -> long-form pipeline on every sheet after the first,
# stack the results, and order them by date and variable.
data = pd.concat(
    [
        make_long(create_date_column(load_data(sheet, id_row)))
        for sheet in sheets[1:]
    ]
).sort_values(['date', 'variable_name'])

In [75]:
DATA_FILE = '../../data/processed/labour-market/a06.csv'
# Make sure the target directory exists before writing the processed data.
os.makedirs(os.path.dirname(DATA_FILE), exist_ok=True)
# `index` is documented as a bool; pass False explicitly to omit the row index
# rather than relying on None being treated as falsy.
data.to_csv(DATA_FILE, index=False)