Pipeline to pull Households Below Average Income (HBAI) data from Stat-Xplore

In [1]:
import os
import json

from api import run_statxplore_query

In [2]:
# Folder that receives the exported CSV files; create it (and any missing
# parents) before anything is written to it.
OUTPUT_DIR = "../../data/hbai"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Path to the JSON file holding the base query; it is loaded as the starting
# point for every query sent to Stat-Xplore.
HBAI_JSON = "json/data/HBAI.json"

In [4]:
# Every view adds exactly one field from the HBAI view to the base query.
# Keys double as the stem of the CSV file written for that view.
_HBAI_FIELD_PREFIX = "str:field:HBAI:V_F_HBAI:"

views = {
    view_name: [f"{_HBAI_FIELD_PREFIX}{field_id}"]
    for view_name, field_id in (
        ("by_age_category", "TYPE_AGECAT"),
        ("by_number_of_children", "NUMBKIDS"),
        ("by_age_of_youngest_child", "YOUNGCH"),
        ("by_tenure_type", "TENHBAI"),
        ("by_savings_and_investments", "CAPITAL"),
        ("by_ethnic_group", "ETHGRPHHPUB"),
    )
}

In [5]:
# Name of the location column that extract_location_code expects in its input.
location = "Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)"


def extract_location_code(data):
    """Replace the verbose location column with a ``geography_code`` column.

    Each location label is split on bracket characters and the second piece
    is kept, stripped of surrounding whitespace. For a label such as
    ``"North East (E12000001)"`` this yields ``"E12000001"``.

    Args:
        data: DataFrame containing the column named by ``location``.

    Returns:
        A new DataFrame without the ``location`` column and with
        ``geography_code`` appended as the last column. The frame passed in
        is not modified.
    """
    codes = (
        data[location].str.split(r"[()]", expand=True).iloc[:, 1].str.strip()
    )
    # Build the result from a dropped copy rather than assigning into `data`,
    # so the caller's frame does not silently gain a column.
    return data.drop(columns=location).assign(geography_code=codes)

In [6]:
def run_query(dim):
    """Run the base HBAI query with one extra dimension appended.

    The query template is re-read from ``HBAI_JSON`` on every call, so the
    dimension added here never carries over to a later call.

    Args:
        dim: Dimension specification appended to the template's
            ``"dimensions"`` list.

    Returns:
        Whatever ``run_statxplore_query`` returns for the extended query.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(HBAI_JSON, encoding="utf-8") as json_file:
        query = json.load(json_file)
    query["dimensions"].append(dim)
    return run_statxplore_query(query)


def reshape_statxplore_result(data):
    """Turn a wide query result into a tidy long-format frame.

    The input is expected to carry a MultiIndex with at least three levels:
    level 1 holds location labels of the form ``"Name (CODE)"`` and level 2
    is the breakdown dimension, whose name may carry a bracketed note.

    Args:
        data: Wide DataFrame with one column per income status.

    Returns:
        A long DataFrame with ``income_status``, ``value`` and
        ``variable_name`` columns. Index level 1 is reduced to the bracketed
        code and renamed ``geography_code``; index level 2 keeps its values
        but its name is cut at the first ``(`` and stripped.
    """
    # Wide -> long, keeping the original index on every melted row
    long_form = data.melt(ignore_index=False, var_name="income_status")
    # Constant label identifying the measure
    long_form["variable_name"] = "households_in_low_income"

    index = long_form.index
    # "Name (CODE)" -> "CODE" for each distinct location label
    geography_codes = [
        label.strip(")").split("(")[1] for label in index.levels[1]
    ]
    # Drop the bracketed note from the name of the breakdown dimension
    dimension_name = index.names[2].split("(")[0].strip()

    long_form.index = index.set_levels(geography_codes, level=1).set_names(
        ["geography_code", dimension_name], level=[1, 2]
    )
    return long_form


# Fetch, reshape and export each view to its own CSV file in OUTPUT_DIR.
for name, dim in views.items():
    data = reshape_statxplore_result(run_query(dim))
    data.to_csv(f"{OUTPUT_DIR}/{name}.csv")