Pipeline to pull Households Below Average Income (HBAI) data from Stat-Xplore

In [26]:
import os
import json

from api import run_statxplore_query

In [27]:
# Folder (relative to the working directory) for the raw HBAI extracts.
# It is created up front; an already-existing folder is not an error.
OUTPUT_DIR = "../../data/hbai"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [28]:
# Path (relative to the working directory) of the JSON file holding the base
# Stat-Xplore query; run_query() loads it and appends one extra dimension per call.
HBAI_JSON = "json/data/HBAI.json"

In [29]:
# Maps an output name to the extra dimension added to the base HBAI query.
# Each dimension is a one-element list holding a Stat-Xplore field id.
# The key doubles as the CSV file stem when the view is exported.
views = {
    "by_age_category": ["str:field:HBAI:V_F_HBAI:TYPE_AGECAT"],
    "by_number_of_children": ["str:field:HBAI:V_F_HBAI:NUMBKIDS"],
    "by_age_of_youngest_child": ["str:field:HBAI:V_F_HBAI:YOUNGCH"],
    "by_tenure_type": ["str:field:HBAI:V_F_HBAI:TENHBAI"],
    "by_savings_and_investments": ["str:field:HBAI:V_F_HBAI:CAPITAL"],
    "by_ethnic_group": ["str:field:HBAI:V_F_HBAI:ETHGRPHHPUB"],
    "by_marital_status": ["str:field:HBAI:V_F_HBAI:COUPLE"],
}

In [30]:
# Exact header of the Stat-Xplore location column. Its values are expected to
# carry the geography code inside parentheses.
location = "Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)"


def extract_location_code(data):
    """Swap the verbose location column for a ``geography_code`` column.

    The code is the text between the first pair of parentheses in each
    location value, with surrounding whitespace stripped.

    Note: the ``geography_code`` column is also added to the frame passed in;
    the returned frame is a new one without the location column.
    """
    # Splitting on either bracket puts the bracketed text in the second piece.
    pieces = data[location].str.split(r"[()]", expand=True)
    data["geography_code"] = pieces.iloc[:, 1].str.strip()
    return data.drop(columns=location)

In [31]:
def run_query(dim):
    """Run the base HBAI query with one extra dimension.

    The query template is re-read from ``HBAI_JSON`` on every call, so the
    appended dimension never carries over to the next call. ``dim`` is added
    as a single new entry of the query's ``"dimensions"`` list, and the
    result of ``run_statxplore_query`` is returned unchanged.
    """
    with open(HBAI_JSON) as query_file:
        statxplore_query = json.load(query_file)
    statxplore_query["dimensions"].append(dim)
    return run_statxplore_query(statxplore_query)


def reshape_statxplore_result(data):
    """Turn a wide Stat-Xplore result into a long table with tidy index labels.

    ``data`` must have a row MultiIndex with at least three levels. Each
    column is melted into an ``income_status`` / ``value`` pair, and a
    constant ``variable_name`` column ('population') is added. Index level 1
    is reduced to the code found inside its parentheses and renamed
    ``geography_code``. The name of index level 2 is cut at its first '('.
    """
    # Wide -> long; the original row index is kept (repeated once per column).
    long_data = data.melt(ignore_index=False, var_name='income_status')
    long_data['variable_name'] = 'population'

    index = long_data.index
    # "Some place (CODE)" -> "CODE" for every distinct label of level 1.
    geography_codes = (
        index.levels[1]
        .str.strip(')')
        .str.split('(')
        .map(lambda parts: parts[1])
        .values
    )
    # Drop the bracketed note from the breakdown column's name.
    breakdown_name = index.names[2].split('(')[0].strip()

    index = index.set_levels(geography_codes, level=1)
    index = index.set_names('geography_code', level=1)
    index = index.set_names(breakdown_name, level=2)
    long_data.index = index
    return long_data


# Query Stat-Xplore once per view, reshape the result to long format and save
# it as "<view name>.csv" under OUTPUT_DIR. After the loop, `data` still holds
# the frame of the final view.
for name, dim in views.items():
    data = reshape_statxplore_result(run_query(dim))
    data.to_csv(f"{OUTPUT_DIR}/{name}.csv")

In [32]:
# Sanity check: print the whole of `data` as CSV text. If the export loop has
# just run, `data` is the frame from its last iteration (the final entry of `views`).
print(data.to_csv())

Financial Year,geography_code,Marital Status of Adults and Type of Couple in the Family of the Individual,income_status,value,variable_name
1994-95,E12000001,Couple,Not in low income (at or above threshold),1971185.0,population
1994-95,E12000001,Single,Not in low income (at or above threshold),0.0,population
1994-95,E12000002,Couple,Not in low income (at or above threshold),5288744.0,population
1994-95,E12000002,Single,Not in low income (at or above threshold),0.0,population
1994-95,E12000003,Couple,Not in low income (at or above threshold),3743033.0,population
1994-95,E12000003,Single,Not in low income (at or above threshold),0.0,population
1995-96,E12000001,Couple,Not in low income (at or above threshold),1906464.0,population
1995-96,E12000001,Single,Not in low income (at or above threshold),0.0,population
1995-96,E12000002,Couple,Not in low income (at or above threshold),5453457.0,population
1995-96,E12000002,Single,Not in low income (at or above threshold),0.0,population
1995-96,E1

### Processing HBAI data

In [33]:
import pandas as pd
def three_year_average(data, date_name, groupby, round=False):
    '''
    Rolling multi-year average of every numeric column, per group.

    Rows for 2020-21 are discarded (treated as missing data). Each remaining
    year except the last one starts a window made of that year and the next
    two years present in the data. Following the HBAI data guidance, the two
    windows that would have contained 2020-21 are two-year averages:
    2018-19 with 2019-20, and 2019-20 with 2021-22.

    Parameters
    ----------
    data : DataFrame with a year column and one or more numeric columns.
    date_name : name of the year column, e.g. 'Financial Year'. Years are
        taken in order of first appearance, so the input is assumed to list
        them chronologically.
    groupby : column name(s) to group by before summing.
    round : if True, round numeric results to the nearest 100,000.

    Returns
    -------
    DataFrame with one row per group and window. It holds the group keys, the
    averaged numeric columns and a 'date' column labelled
    '<first year> - <last year>' of the window. The row index restarts at 0
    for every window.

    Raises
    ------
    ValueError (from pandas.concat) when fewer than two distinct years remain
    after dropping 2020-21, because no window can be formed.
    '''
    # 2020-21 is treated as missing and never enters an average.
    data = data[data[date_name] != "2020-21"]

    # Candidate window starts, in order of first appearance.
    dates = data[date_name].unique()

    frames = []
    # The final year never starts a window, so the last window covers the
    # last two years present.
    for i in range(len(dates) - 1):
        start = dates[i]
        # Accounting for missing data in accordance with HBAI data guidance.
        # The check must look at the current window start: comparing the
        # first year of the whole series meant these branches never applied.
        if start == '2018-19':
            most_recent_dates = ['2018-19', '2019-20']
        elif start == '2019-20':
            most_recent_dates = ['2019-20', '2021-22']
        else:
            most_recent_dates = list(dates[i:i + 3])

        window = data.loc[data[date_name].isin(most_recent_dates)]
        # Mean over the window = group sum divided by the number of years in it.
        averaged = window.groupby(groupby).sum(numeric_only=True) / len(most_recent_dates)
        averaged["date"] = f'{most_recent_dates[0]} - {most_recent_dates[-1]}'
        # Rounding to nearest 0.1mil, according to user guidance.
        if round:
            averaged = averaged.round(-5)
        frames.append(averaged.reset_index())

    return pd.concat(frames, axis=0)

def get_percentages(data):
    """Add ``population`` and ``percent`` columns to ``data`` and return it.

    ``population`` is the sum of ``value`` over all rows sharing the same
    ``date`` and ``geography_code``. ``percent`` is each row's ``value`` as a
    share of that total, times 100 and rounded to a whole number.

    The frame is modified in place; the same object is returned.
    """
    group_totals = data.groupby(['date', 'geography_code'])['value'].transform('sum')
    data['population'] = group_totals
    share = data['value'] / data['population'] * 100
    data['percent'] = share.round(0)
    return data

In [34]:
# Ethnic group of the head of household: load the raw extract, smooth it with
# the rolling multi-year average, add percentage shares, then save the result.
hbai_ethnic_group = pd.read_csv('../../data/hbai/by_ethnic_group.csv')

processed_data = (
    hbai_ethnic_group
    .pipe(
        three_year_average,
        date_name='Financial Year',
        groupby=[
            'geography_code',
            'Ethnic Group of the Head of the Household',
            'income_status',
            'variable_name',
        ],
    )
    .pipe(get_percentages)
)

processed_data.to_csv('../../data/interim/hbai_by_ethnic_group.csv')

In [35]:
# Marital status: same processing as the other breakdowns, except that years
# before 1997-98 are dropped first (no data on single people before then) and
# the long breakdown column is renamed to `marital_status`.
hbai_marital_status = pd.read_csv('../../data/hbai/by_marital_status.csv')

processed_data = (
    hbai_marital_status
    # Financial-year labels such as '1997-98' compare correctly as strings.
    .loc[hbai_marital_status['Financial Year'] >= '1997-98']
    .copy()
    .rename(columns={
        'Marital Status of Adults and Type of Couple in the Family of the Individual': 'marital_status',
    })
    .pipe(
        three_year_average,
        date_name='Financial Year',
        groupby=['geography_code', 'income_status', 'variable_name', 'marital_status'],
    )
    .pipe(get_percentages)
    .set_index('geography_code')
)

processed_data.to_csv('../../data/interim/hbai_by_marital_status.csv')

In [36]:
# Age category: rolling multi-year average, then percentage shares, saved to
# the interim folder.
hbai_age_cat = (
    pd.read_csv('../../data/hbai/by_age_category.csv')
    .pipe(
        three_year_average,
        'Financial Year',
        [
            'geography_code',
            'income_status',
            'Type of Individual by Age Category',
            'variable_name',
        ],
    )
    .pipe(get_percentages)
)
hbai_age_cat.to_csv('../../data/interim/hbai_by_age_category.csv')

In [37]:
# Savings and investments: percentages are worked out over every income status
# first; only the low-income rows are kept for the saved table.
savings = (
    pd.read_csv('../../data/hbai/by_savings_and_investments.csv')
    .pipe(
        three_year_average,
        'Financial Year',
        [
            'geography_code',
            'income_status',
            'Savings and Investments of Adults in the Family of the Individual',
            'variable_name',
        ],
    )
    .pipe(get_percentages)
    .loc[lambda frame: frame['income_status'] == 'In low income (below threshold)']
)
savings.to_csv('../../data/interim/hbai_savings_investments.csv')