Pipeline to pull data from Households Below Average Income data from Stat-Xplore

In [1]:
import os
import json
import pandas as pd
from api import run_statxplore_query

In [2]:
OUTPUT_DIR="../../data/hbai"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
HBAI_JSON = "json/data/HBAI.json"

In [4]:
views = {
    "by_age_category": [
        "str:field:HBAI:V_F_HBAI:TYPE_AGECAT"
    ],
    "by_number_of_children": [
        "str:field:HBAI:V_F_HBAI:NUMBKIDS"
    ],
    "by_age_of_youngest_child": [
        "str:field:HBAI:V_F_HBAI:YOUNGCH"
    ],
    "by_tenure_type": [
        "str:field:HBAI:V_F_HBAI:TENHBAI"
    ],
    "by_savings_and_investments": [
        "str:field:HBAI:V_F_HBAI:CAPITAL"
    ],
    "by_ethnic_group": [
        "str:field:HBAI:V_F_HBAI:ETHGRPHHPUB"
    ],
    "by_marital_status": [
        "str:field:HBAI:V_F_HBAI:COUPLE"
    ]
}

In [5]:
location = "Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)"

def extract_location_code(data):
    # Extract geography code from location
    data["geography_code"] = (
        data[location].str.split(r"[()]", expand=True).iloc[:,1].str.strip()
    )
    data = data.drop(location, axis=1)
    return data

In [6]:
def run_query(dim):
    with open(HBAI_JSON) as json_file:
        query = json.load(json_file)
    query["dimensions"] += [dim]
    return run_statxplore_query(query)


def reshape_statxplore_result(data):
    # Convert wide to long format
    data = data.melt(ignore_index=False, 
                     value_vars=['Not in low income (at or above threshold)', 'In low income (below threshold)'], 
                     var_name='bhc_income_status')
    # Add in a variable name
    data['variable_name'] = 'population'
    print(data.index.names)
    #data = extract_location_code(data)
    # Split geo codes and remove bracketed statements from column names
    data.index = data.index.set_levels(
            data.index.levels[2].str.strip(')').str.split('(').map(lambda l: l[1]).values, level=2
        ).set_names(
            'geography_code', level=2
        ).set_names(
            data.index.names[3].split('(')[0].strip(),
            level=3
        )
    data.index.set_names('ahc_income_status', level=1, inplace=True)
    return data

def three_year_average(data, date_name='Financial Year', round=False):
    '''
    Three year rolling average
    '''
    # Dynamically get a list of variables to group by - everything apart from date and value.
    data.reset_index(inplace=True)
    colnames = data.columns.to_list()
    if "Financial Year" not in colnames:
        return print("Check the date name of the input dataframe - should be financial year", colnames)
    else: 
        colnames.remove('Financial Year')
    if "value" in colnames:
        colnames.remove('value')
        groupby = colnames
    else: 
        return print('no value column in the data')
    #removing missing data
    data = data[data[date_name] != "2020-21"]
    
    #getting a list of unique dates
    dates = data[date_name].unique()
    frames = []
    for i in range(len(dates)-1):
        #accounting for missing data in accordance with HBAI data guidance.
        if dates[0] == '2018-19':
            most_recent_dates = ['2018-19', '2019-20']
        elif dates[0] == '2019-20':
            most_recent_dates = ['2019-20', '2021-22']
        elif i != len(dates)-1:
            most_recent_dates = list(dates[i:(i+3)])

        new_data = data.loc[data[date_name].isin(most_recent_dates)].copy()
        new_data = new_data.groupby(groupby).sum(numeric_only=True) / len(most_recent_dates)
        new_data["date"] = f'{most_recent_dates[0]} - {most_recent_dates[len(most_recent_dates)-1]}'
        #new_data["variable_name"] = variable_name
        # rounding to nearest 0.1mil, according to user guidance.
        if round:
            new_data = new_data.round(-5)
        frames.append(new_data.reset_index())
        #i+=1
        #print(i)
    #concatenating the frames
    return pd.concat(frames, axis=0)

def aggregate_to_north(data):
    gb = data.columns.to_list()
    gb.remove('value')
    gb.remove('geography_code')
    north_data = data.groupby(gb)['value'].sum().reset_index()
    north_data['geography_code'] = 'E12999901'
    print(north_data)
    return pd.concat([data, north_data]) 

def get_percentages(data):
    additional_metric = data.columns.to_list()
    for item  in ['date', 'geography_code', 'ahc_income_status', 'bhc_income_status', 'variable_name', 'value']:
        additional_metric.remove(item)
    #print(additional_metric)
    groupby = ['date', 'geography_code']
    groupby.append(additional_metric[0])
    data['population_for_group_for_year'] = data.groupby(groupby)['value'].transform('sum')
    data['percent'] = (data['value'] / data['population_for_group_for_year'] * 100).round(0)
    data.set_index('geography_code', inplace=True)
    return data

for name, dim in views.items():
    data = run_query(dim).pipe(reshape_statxplore_result).pipe(three_year_average).pipe(aggregate_to_north).pipe(get_percentages)
    #print(data.to_csv('test.csv'))
    data.to_csv(f"{OUTPUT_DIR}/{name}.csv")
    
    PARQUET_FILE = f"../../data-mart/hbai/{name}.parquet"
    os.makedirs(os.path.dirname(PARQUET_FILE), exist_ok=True)
    data.to_parquet(PARQUET_FILE)

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Type of Individual by Age Category']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
307  Not in low income (at or above threshold)   
308  Not in low income (at or above threshold)   
309  Not in low income (at or above threshold)   
310  Not in low income (at or above threshold)   
311  Not in low income (at or above threshold)   

    Type of Individual by Age Category  \
0                                Child   
1                                Child   
2  

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Number of Children in the Family of the Individual']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
411  Not in low income (at or above threshold)   
412  Not in low income (at or above threshold)   
413  Not in low income (at or above threshold)   
414  Not in low income (at or above threshold)   
415  Not in low income (at or above threshold)   

    Number of Children in the Family of the Individual  \
0                                          No children 

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Age of the Youngest Child in the Family of the Individual']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
515  Not in low income (at or above threshold)   
516  Not in low income (at or above threshold)   
517  Not in low income (at or above threshold)   
518  Not in low income (at or above threshold)   
519  Not in low income (at or above threshold)   

    Age of the Youngest Child in the Family of the Individual  \
0                                        

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Tenure Type of the Household of the Individual']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
307  Not in low income (at or above threshold)   
308  Not in low income (at or above threshold)   
309  Not in low income (at or above threshold)   
310  Not in low income (at or above threshold)   
311  Not in low income (at or above threshold)   

    Tenure Type of the Household of the Individual  \
0                             All rented privately   
1        

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Savings and Investments of Adults in the Family of the Individual']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
827  Not in low income (at or above threshold)   
828  Not in low income (at or above threshold)   
829  Not in low income (at or above threshold)   
830  Not in low income (at or above threshold)   
831  Not in low income (at or above threshold)   

    Savings and Investments of Adults in the Family of the Individual  \
0                        

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Ethnic Group of the Head of the Household (please calculate three-year averages - click on i for the correct method)']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
619  Not in low income (at or above threshold)   
620  Not in low income (at or above threshold)   
621  Not in low income (at or above threshold)   
622  Not in low income (at or above threshold)   
623  Not in low income (at or above threshold)   

    Ethnic Group of the Head of the Household  

['Financial Year', '60 per cent of median net household income (AHC) in latest prices', 'Location in the United Kingdom of the Household of the Individual (please calculate three-year averages - click on i for the correct method)', 'Marital Status of Adults and Type of Couple in the Family of the Individual']
                             ahc_income_status  \
0              In low income (below threshold)   
1              In low income (below threshold)   
2              In low income (below threshold)   
3              In low income (below threshold)   
4              In low income (below threshold)   
..                                         ...   
203  Not in low income (at or above threshold)   
204  Not in low income (at or above threshold)   
205  Not in low income (at or above threshold)   
206  Not in low income (at or above threshold)   
207  Not in low income (at or above threshold)   

    Marital Status of Adults and Type of Couple in the Family of the Individual  \
0    

In [7]:
print(data.to_csv())

geography_code,ahc_income_status,Marital Status of Adults and Type of Couple in the Family of the Individual,bhc_income_status,variable_name,value,date,population_for_group_for_year,percent
E12000001,In low income (below threshold),Couple,In low income (below threshold),population,570947.6666666666,1994-95 - 1996-97,2543594.0,22.0
E12000001,In low income (below threshold),Couple,Not in low income (at or above threshold),population,172774.33333333334,1994-95 - 1996-97,2543594.0,7.0
E12000001,In low income (below threshold),Single,In low income (below threshold),population,0.0,1994-95 - 1996-97,0.0,
E12000001,In low income (below threshold),Single,Not in low income (at or above threshold),population,0.0,1994-95 - 1996-97,0.0,
E12000002,In low income (below threshold),Couple,In low income (below threshold),population,1285379.6666666667,1994-95 - 1996-97,6719032.333333334,19.0
E12000002,In low income (below threshold),Couple,Not in low income (at or above threshold),population,450426.33333