Pipeline to pull Households Below Average Income (HBAI) data from Stat-Xplore

In [1]:
import os
import json

from api import run_statxplore_query

In [2]:
# Folder that receives the exported CSV files. It is created up front
# (including missing parents) so the export step can write to it directly;
# exist_ok keeps re-running this cell harmless.
OUTPUT_DIR = "../../data/hbai"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

In [3]:
# Path to the JSON file holding the base Stat-Xplore query; the export loop
# loads it and appends one extra dimension per request. The path is relative,
# so it is resolved against the notebook's working directory.
HBAI_JSON = "json/data/HBAI.json"

In [4]:
# Maps each breakdown's display label to the single-element list of Stat-Xplore
# field ids that is appended to the base query. Every field id shares the same
# "str:field:HBAI:V_F_HBAI:" prefix, so only the trailing field code is listed.
dimensions = {
    label: [f"str:field:HBAI:V_F_HBAI:{field_code}"]
    for label, field_code in (
        ("Type of Individual by Age Category", "TYPE_AGECAT"),
        ("Number of Children in the Family of the Individual", "NUMBKIDS"),
        ("Age of the Youngest Child in the Family of the Individual", "YOUNGCH"),
        ("Tenure Type of the Household of the Individual", "TENHBAI"),
        (
            "Savings and Investments of Adults in the Family of the Individual",
            "CAPITAL",
        ),
        (
            "Ethnic Group of the Head of the Household (please calculate three-year averages - click on i for the correct method)",
            "ETHGRPHHPUB",
        ),
    )
}

In [5]:
# Header of the combined location column that split_location_code replaces.
location = "Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)"


def split_location_code(data):
    """Replace the combined location column with separate name and code columns.

    The text before the first bracket becomes ``geography_name`` and the text
    between the first bracket and the next bracket (or the end of the string)
    becomes ``geography_code``; both are stripped of surrounding whitespace.
    Values are expected to look like ``"Name (code)"`` - a value without any
    bracket keeps its full text as the name and gets a missing code.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column named by ``location``. It is modified in
        place: the two new columns are added and the ``location`` column is
        dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the function can be chained with
        ``DataFrame.pipe``. Callers that ignore the return value keep working.
    """
    # Group 1 is everything before the first bracket; the optional group 2 is
    # the text after that bracket up to the next bracket. extract always yields
    # both columns, even when no value in the frame contains a bracket.
    parts = data[location].str.extract(r"^([^()]*)(?:[()]([^()]*))?")
    data["geography_name"] = parts[0].str.strip()
    data["geography_code"] = parts[1].str.strip()
    # Dropped in place on purpose: existing callers rely on the mutation.
    data.drop(location, axis=1, inplace=True)
    return data

In [6]:
# Read the query template from disk once. It is re-parsed for every dimension
# so each request starts from a fresh copy that earlier iterations cannot have
# modified.
with open(HBAI_JSON, encoding="utf-8") as json_file:
    query_template = json_file.read()

# One Stat-Xplore request and one CSV file per entry in `dimensions`.
for name, dim in dimensions.items():
    query = json.loads(query_template)
    # Add this breakdown as an extra dimension on top of the template's own.
    query["dimensions"] += [dim]

    # NOTE(review): run_statxplore_query is assumed to return a DataFrame
    # whose index levels become ordinary columns here - confirm in api.py.
    HBAI = run_statxplore_query(query).reset_index()

    # Replace the combined location column with geography_name /
    # geography_code. The helper edits the frame in place, so it is called
    # directly and its return value is not used.
    split_location_code(HBAI)

    # Long format: one row per identifier combination and measure column.
    HBAI = HBAI.melt(
        id_vars=["Financial Year", "geography_name", "geography_code", name],
        var_name="variable_name",
    ).set_index("Financial Year")

    # regex=False is required: the suffix contains parentheses, which would be
    # read as a regex group (and so never match the literal text) on pandas
    # versions where str.replace treats the pattern as a regex by default.
    HBAI["variable_name"] = HBAI["variable_name"].str.replace(
        " (at or above threshold)", "", regex=False
    )
    HBAI.to_csv(os.path.join(OUTPUT_DIR, f"{name}.csv"))