# Transform place data

In [1]:
import os
import pandas as pd
from geo import get_place_list, get_place_data
from params import DATA_DIR, SRC_DATA_DIR

## Data load

Load the data into a series of variables

### `population_data`

Current population estimates for geographies

In [2]:
population_data = pd.read_csv(f'{DATA_DIR}/population-estimates/population-estimates.csv')
population_data = population_data[population_data.age_name == "All Ages"]
group = population_data
#population_data = population_data[population_data.date == max(population_data['date'])]
population_data = population_data.loc[population_data.groupby('geography_code')['date'].idxmax()]
#population_data = population_data.query()
#print(group)
population_data

Unnamed: 0,date,geography_code,geography_name,geography_type,gender_code,gender_name,age_code,age_name,variable_code,variable_name,value
87696,2020,E05000026,Abbey,wd22,0,Total,200,All Ages,n_persons,Number of persons,16149
87714,2020,E05000027,Alibon,wd22,0,Total,200,All Ages,n_persons,Number of persons,10907
87732,2020,E05000028,Becontree,wd22,0,Total,200,All Ages,n_persons,Number of persons,15182
87750,2020,E05000029,Chadwell Heath,wd22,0,Total,200,All Ages,n_persons,Number of persons,11463
87768,2020,E05000030,Eastbrook,wd22,0,Total,200,All Ages,n_persons,Number of persons,11557
...,...,...,...,...,...,...,...,...,...,...,...
6264,2021,E47000007,West Midlands,cauth22,0,Total,200,All Ages,n_persons,Number of persons,2916132
6282,2021,E47000008,Cambridgeshire and Peterborough,cauth22,0,Total,200,All Ages,n_persons,Number of persons,896756
6300,2021,E47000009,West of England,cauth22,0,Total,200,All Ages,n_persons,Number of persons,954276
6318,2021,E47000010,North East,cauth22,0,Total,200,All Ages,n_persons,Number of persons,1139626


### `gva`

In [3]:
gva = pd.read_csv(f'{DATA_DIR}/gva/gva.csv')
#gva['date'].astype(float)
#gva[gva['geography_code'].str.startswith('E09')]
gva['date'] = pd.to_datetime(gva['date'])
gva['date'] = gva['date'].dt.year
gva = gva.loc[gva.groupby('geography_code')['date'].idxmax()]
gva[gva['geography_code'].str.startswith('E05')]

Unnamed: 0,date,geography_code,variable_name,variable_unit,value
172260,2020,E05000026,GVA,£m,487.120255
172261,2020,E05000027,GVA,£m,118.668199
172262,2020,E05000028,GVA,£m,80.469218
172263,2020,E05000029,GVA,£m,107.323509
172264,2020,E05000030,GVA,£m,201.595050
...,...,...,...,...,...
179242,2020,E05013859,GVA,£m,108.915558
179243,2020,E05013860,GVA,£m,82.712088
179244,2020,E05013861,GVA,£m,189.998642
179245,2020,E05013862,GVA,£m,58.629043


### `area_of_place`

Geographic area of the place

In [4]:
area_of_place = pd.read_csv(f'{DATA_DIR}/geo/area_of_places.csv')
#@TODO temporary filter to remove
area_of_place.drop_duplicates(subset=['geography_code'], inplace=True)
# len(area_of_place.geography_code)
# l1 = area_of_place.geography_code.to_list()
# l2 = area_of_place.geography_code.unique()
# seen = set()
# dupes = [x for x in l1 if x in seen or seen.add(x)]
# print(dupes)
area_of_place

Unnamed: 0,geography_code,variable_name,value
0,E05000650,Area in sq km,6.556496
1,E05000651,Area in sq km,8.998946
2,E05000652,Area in sq km,3.719582
3,E05000653,Area in sq km,7.340085
4,E05000654,Area in sq km,3.497711
...,...,...,...
1646,E11000007,Area in sq km,538.992682
1647,E12000001,Area in sq km,8563.803423
1648,E12000002,Area in sq km,14103.955722
1649,E12000003,Area in sq km,15369.599209


### `households`

Number of households in the geographic area

In [5]:
households = pd.read_csv(f'{DATA_DIR}/households/households.csv')
households = households.loc[households.groupby('geography_code')['date'].idxmax()]
households

Unnamed: 0,date,geography_code,variable_name,value
0,2022,E05000026,Number of households,5800
1,2022,E05000027,Number of households,4090
2,2022,E05000028,Number of households,5380
3,2022,E05000029,Number of households,4300
4,2022,E05000030,Number of households,4100
...,...,...,...,...
7825,2022,W05001034,Number of households,940
7826,2022,W05001035,Number of households,2310
7827,2022,W05001036,Number of households,1910
7828,2022,W05001037,Number of households,860


### `council_tax_data`

In [6]:
council_tax_data = pd.read_csv(f'{DATA_DIR}/council-tax-support/council-tax-support.csv')
#council_tax_data = council_tax_data[council_tax_data.date == max(council_tax_data.date)]
council_tax_data['date'] = pd.to_datetime(council_tax_data['date'])
council_tax_data['date'] = council_tax_data['date'].dt.year
council_tax_data = council_tax_data.loc[council_tax_data.groupby(['geography_code', 'variable_name'])['date'].idxmax()]
council_tax_data.replace("pensioners", "council_tax_pensioners", inplace=True)
council_tax_data.replace("working_age", "council_tax_working_age", inplace=True)
council_tax_data[council_tax_data.variable_name == "council_tax_working_age"]
#council_tax_data[council_tax_data.geography_code.str.startswith('E05')]

Unnamed: 0,date,geography_code.ba,geography_code,geography_name,geography_type,variable_code,variable_name,value
14876,2022,E0701,E06000001,Hartlepool,UA,council_tax_working_age,council_tax_working_age,8712.0
16302,2022,E0702,E06000002,Middlesbrough,UA,council_tax_working_age,council_tax_working_age,12738.0
17480,2022,E0703,E06000003,Redcar & Cleveland,UA,council_tax_working_age,council_tax_working_age,8028.0
19185,2022,E0704,E06000004,Stockton-on-Tees,UA,council_tax_working_age,council_tax_working_age,11143.0
13202,2022,E1301,E06000005,Darlington,UA,council_tax_working_age,council_tax_working_age,6044.0
...,...,...,...,...,...,...,...,...
10691,2022,,E12000006,East of England,,council_tax_working_age,council_tax_working_age,211975.0
10722,2022,,E12000007,London,,council_tax_working_age,council_tax_working_age,445550.0
10815,2022,,E12000008,South East England,,council_tax_working_age,council_tax_working_age,284443.0
10846,2022,,E12000009,South West England,,council_tax_working_age,council_tax_working_age,207054.0


### `clif_data`

Children in low income families

In [7]:
clif_data = pd.read_csv(f'{DATA_DIR}/clif/clif_REL.csv')
clif_data = clif_data[
    (clif_data['Age of Child (years and bands)'] == 'Total') &
    (clif_data['Gender of Child'] == 'Total') &
    (clif_data['Family Type'] == 'Total') &
    (clif_data['Work Status'] == 'Total') &
    (clif_data.Year == max(clif_data.Year))
]
clif_data[clif_data.geography_code == 'E08000035']

Unnamed: 0,Year,Age of Child (years and bands),Gender of Child,Family Type,Work Status,variable_name,geography_code,value
11474,2021/22,Total,Total,Total,Total,children_in_low_income,E08000035,39995.0


### `number_of_children`

In [8]:
number_of_persons = pd.read_csv(f'{DATA_DIR}/population-estimates/population-estimates-ages.csv')
number_of_children = number_of_persons[number_of_persons.age_band.isin(['0-4', '5-10', '11-15', '16-19'])]
number_of_children = number_of_children.groupby('geography_code').sum(numeric_only=True).reset_index()
number_of_children['variable_name'] = 'number_of_children'
number_of_children

Unnamed: 0,geography_code,date,value,variable_name
0,E05000650,8080,3390,number_of_children
1,E05000651,8080,2530,number_of_children
2,E05000652,8080,3722,number_of_children
3,E05000653,8080,2813,number_of_children
4,E05000654,8080,4875,number_of_children
...,...,...,...,...
1315,E47000003,8084,586078,number_of_children
1316,E47000004,8084,346827,number_of_children
1317,E47000006,8084,159599,number_of_children
1318,E47000010,8084,247569,number_of_children


### `number of pensioners`

In [9]:
number_of_pensioners = number_of_persons[number_of_persons.age_band == "65+"]
number_of_pensioners = number_of_pensioners.groupby('geography_code').sum(numeric_only=True).reset_index()
number_of_pensioners['variable_name'] = 'number_of_pensioners'
number_of_pensioners

Unnamed: 0,geography_code,date,value,variable_name
0,E05000650,2020,2637,number_of_pensioners
1,E05000651,2020,3155,number_of_pensioners
2,E05000652,2020,2572,number_of_pensioners
3,E05000653,2020,3190,number_of_pensioners
4,E05000654,2020,2000,number_of_pensioners
...,...,...,...,...
1315,E47000003,2021,394715,number_of_pensioners
1316,E47000004,2021,299278,number_of_pensioners
1317,E47000006,2021,134431,number_of_pensioners
1318,E47000010,2021,239672,number_of_pensioners


### `number_of_working_age`

In [10]:
number_of_working_age = number_of_persons[number_of_persons.age_band == "20-64"]
number_of_working_age = number_of_working_age.groupby('geography_code').sum(numeric_only=True).reset_index()
number_of_working_age['variable_name'] = 'number_of_working_age'
number_of_working_age

Unnamed: 0,geography_code,date,value,variable_name
0,E05000650,2020,8106,number_of_working_age
1,E05000651,2020,5646,number_of_working_age
2,E05000652,2020,7784,number_of_working_age
3,E05000653,2020,7500,number_of_working_age
4,E05000654,2020,9953,number_of_working_age
...,...,...,...,...
1315,E47000003,2021,1369194,number_of_working_age
1316,E47000004,2021,905617,number_of_working_age
1317,E47000006,2021,384143,number_of_working_age
1318,E47000010,2021,652385,number_of_working_age


### `households_in_poverty by age category`

In [11]:
# TODO Luke to look at this!
hbai_age_cat = pd.read_csv(f'{DATA_DIR}/hbai/by_age_category.csv')
hbai_total_low_income = hbai_age_cat.loc[
    (hbai_age_cat.ahc_income_status == 'In low income (below threshold)') &
    (hbai_age_cat.date == max(hbai_age_cat.date))
].groupby(['geography_code'])['value'].sum().to_frame()
hbai_total_low_income['variable_name'] = 'total_low_income'
hbai_total_low_income.reset_index(inplace=True)

hbai_percent = hbai_age_cat.loc[
    (hbai_age_cat.date == max(hbai_age_cat.date)) &
    (hbai_age_cat.ahc_income_status == 'In low income (below threshold)')
].groupby('geography_code').sum(numeric_only=True)
hbai_percent = pd.DataFrame(data={'value': hbai_percent['percent']})
hbai_percent['variable_name'] = 'percent_in_low_income'
hbai_percent.reset_index(inplace=True)
hbai_total_low_income


Unnamed: 0,geography_code,value,variable_name
0,E12000001,664279.0,total_low_income
1,E12000002,1639662.5,total_low_income
2,E12000003,1262334.0,total_low_income


### `savings`

In [12]:
savings = pd.read_csv(f'{DATA_DIR}/hbai/by_savings_and_investments.csv')
savings = savings[savings.ahc_income_status == 'In low income (below threshold)']
savings = savings[savings['Savings and Investments of Adults in the Family of the Individual'] == 'No savings']
savings = savings[savings.date == max(savings.date)].round(-5)
savings['variable_name'] = "households_low_income_no_savings"
savings

Unnamed: 0,geography_code,ahc_income_status,Savings and Investments of Adults in the Family of the Individual,bhc_income_status,variable_name,value,date,population,percent
2402,E12000001,In low income (below threshold),No savings,In low income (below threshold),households_low_income_no_savings,100000.0,2019-20 - 2021-22,500000.0,0.0
2403,E12000001,In low income (below threshold),No savings,Not in low income (at or above threshold),households_low_income_no_savings,0.0,2019-20 - 2021-22,500000.0,0.0
2418,E12000002,In low income (below threshold),No savings,In low income (below threshold),households_low_income_no_savings,400000.0,2019-20 - 2021-22,1300000.0,0.0
2419,E12000002,In low income (below threshold),No savings,Not in low income (at or above threshold),households_low_income_no_savings,100000.0,2019-20 - 2021-22,1300000.0,0.0
2434,E12000003,In low income (below threshold),No savings,In low income (below threshold),households_low_income_no_savings,300000.0,2019-20 - 2021-22,900000.0,0.0
2435,E12000003,In low income (below threshold),No savings,Not in low income (at or above threshold),households_low_income_no_savings,100000.0,2019-20 - 2021-22,900000.0,0.0


### `imd_older_people` and `imd_children`

In [13]:
# Index of Multiple deprivation
data = pd.read_csv(f'{DATA_DIR}/imd/imd.csv')
data  = data[data.variable_name == 'Average score'] #adding this as in future there will be measures for each age cat.#
imd = data.loc[data.dataset == 'IMD'].copy()
imd['variable_name'] = 'imd_average_score'

imd_older_people = data.loc[data.dataset == 'IDAOPI'].copy()
imd_older_people['variable_name'] = 'imd_older_people'

imd_children = data.loc[data.dataset == "IDACI"].copy()
imd_children['variable_name'] = 'imd_children'


### `unemployment` and `economic_inactivity` 

In [14]:
labour_market = pd.read_csv(f'{DATA_DIR}/labour-market/labour-market.csv')

unemployment = labour_market.loc[labour_market.variable_name == "Unemployment rate - aged 16-64"]
unemployment = unemployment[unemployment.date == max(unemployment.date)]
unemployment['variable_name'] = 'unemployment_rate_16_64'

economic_inactivity = labour_market.loc[labour_market.variable_name == "% who are economically inactive - aged 16-64"]
economic_inactivity = economic_inactivity[economic_inactivity.date == max(economic_inactivity.date)]
economic_inactivity = economic_inactivity.replace("% who are economically inactive - aged 16-64","economic_inactivity_16_64")

economic_inactivity

Unnamed: 0,date,geography_code,geography_name,variable_code,variable_name,value
17084,2023-03-01,E06000005,Darlington,111,economic_inactivity_16_64,19.8
17087,2023-03-01,E06000047,County Durham,111,economic_inactivity_16_64,25.3
17090,2023-03-01,E06000001,Hartlepool,111,economic_inactivity_16_64,26.1
17093,2023-03-01,E06000002,Middlesbrough,111,economic_inactivity_16_64,28.1
17096,2023-03-01,E06000057,Northumberland,111,economic_inactivity_16_64,26.5
...,...,...,...,...,...,...
17303,2023-03-01,E10000017,Lancashire,111,economic_inactivity_16_64,24.3
17306,2023-03-01,E10000023,North Yorkshire,111,economic_inactivity_16_64,18.6
17309,2023-03-01,E12000001,North East,111,economic_inactivity_16_64,26.0
17312,2023-03-01,E12000002,North West,111,economic_inactivity_16_64,23.5


### `housing_benefit`

In [15]:
housing_benefit = pd.read_csv(f'{DATA_DIR}/HB/claimants.csv')
housing_benefit['Month'] = pd.to_datetime(housing_benefit['Month'])
housing_benefit.rename(columns={'Unnamed: 0': 'dt_idx'}, inplace=True)
housing_benefit = housing_benefit.loc[housing_benefit.groupby('geography_code')['dt_idx'].idxmax()]
housing_benefit[housing_benefit.geography_code.str.startswith('E08')]

Unnamed: 0,dt_idx,Month,geography_code,value,variable_name
1651,4896,2023-02-01,E08000001,12115.0,HB_claimants
1710,5073,2023-02-01,E08000002,6472.0,HB_claimants
1769,5250,2023-02-01,E08000003,30631.0,HB_claimants
1828,5427,2023-02-01,E08000004,8589.0,HB_claimants
1887,5604,2023-02-01,E08000005,10194.0,HB_claimants
1946,5781,2023-02-01,E08000006,14051.0,HB_claimants
2005,5958,2023-02-01,E08000007,8977.0,HB_claimants
2064,6135,2023-02-01,E08000008,10009.0,HB_claimants
2123,6312,2023-02-01,E08000009,6830.0,HB_claimants
2182,6489,2023-02-01,E08000010,11410.0,HB_claimants


### `smi`

In [16]:
smi = pd.read_csv(f'{DATA_DIR}/smi/smi_loans_in_payment_households.csv')
smi['Quarter'] = pd.to_datetime(smi['Quarter'])
smi.rename(columns={'Unnamed: 0': 'dt_idx'}, inplace=True)
smi = smi.loc[smi.groupby('geography_code')['Quarter'].idxmax()]
smi

Unnamed: 0,dt_idx,Quarter,geography_code,value,variable_name
9,9,2023-02-01,E12000001,537.0,smi_loans_in_payment_households
29,29,2023-02-01,E12000002,1752.0,smi_loans_in_payment_households
49,49,2023-02-01,E12000003,963.0,smi_loans_in_payment_households


### `statutory-homelessness`

In [17]:
homelessness = pd.read_csv(f'{DATA_DIR}/statutory-homelessness/statutory-homelessness.csv')
homelessness['date'] = homelessness['date'].str[:4]
homelessness['date'] = pd.to_numeric(homelessness['date'])
homelessness = homelessness.loc[homelessness.groupby('geography_code')['date'].idxmax()]
homelessness

Unnamed: 0,date,geography_code,geography_name,variable_name,value
129,2023,E06000001,Hartlepool,Total households assessed as owed a duty,174.0
174,2023,E06000002,Middlesbrough,Total households assessed as owed a duty,382.0
208,2023,E06000003,Redcar & Cleveland,Total households assessed as owed a duty,103.0
260,2023,E06000004,Stockton-on-Tees,Total households assessed as owed a duty,838.0
79,2023,E06000005,Darlington,Total households assessed as owed a duty,321.0
...,...,...,...,...,...
7,2023,E12000006,East of England,Total households assessed as owed a duty,7640.0
1,2023,E12000007,London,Total households assessed as owed a duty,14320.0
9,2023,E12000008,South East,Total households assessed as owed a duty,10930.0
10,2023,E12000009,South West,Total households assessed as owed a duty,7520.0


In [18]:
# claimants = pd.read_csv(f'{DATA_DIR}/claimant-count/claimant-count.csv')
# claimants = claimants.loc[claimants.variable_name=='Claimants as a proportion of residents aged 16-64']
# #claimants = claimants.loc[claimants.groupby('geography_code')['date'].idxmax()]
# claimants = claimants[claimants.date == max(claimants.date)]
# claimants.drop_duplicates(subset='geography_code', inplace=True)
# claimants.reset_index(drop=True)
# claimants[claimants.geography_code.duplicated()]

### `house_prices`

In [19]:
house_prices = pd.read_csv(f'{DATA_DIR}/house-prices/house-prices.csv')
house_prices['date'] = pd.to_datetime(house_prices['date'])
house_prices = house_prices.loc[house_prices.groupby(['geography_code'])['date'].idxmax()]
print(len(house_prices.geography_code.unique()))
geo = pd.read_csv(f"{DATA_DIR}/geo/geography_lookup.csv")

tot = len(geo.LAD22CD.unique()) + len(geo.WD22CD.unique())


1314


In [20]:
weekly_earnings = pd.read_csv(f'{DATA_DIR}/ashe/weekly-earnings.csv')
weekly_earnings['value'] = pd.to_numeric(weekly_earnings['value'], errors='coerce')
weekly_earnings.round({'value':1})
mean_weekly_earnings = weekly_earnings[weekly_earnings.variable_name == 'mean_weekly_wage']
median_weekly_earnings = weekly_earnings[weekly_earnings.variable_name == 'median_weekly_wage']
median_weekly_earnings

Unnamed: 0,date,geography_code,geography_name,variable_name,value
0,2022,E12000001,North East,median_weekly_wage,490.6
3,2022,E06000005,Darlington UA,median_weekly_wage,482.2
6,2022,E06000001,Hartlepool UA,median_weekly_wage,498.3
9,2022,E06000002,Middlesbrough UA,median_weekly_wage,454.5
12,2022,E06000003,Redcar and Cleveland UA,median_weekly_wage,440.7
...,...,...,...,...,...
234,2022,E08000032,Bradford,median_weekly_wage,476.9
237,2022,E08000033,Calderdale,median_weekly_wage,494.4
240,2022,E08000034,Kirklees,median_weekly_wage,513.5
243,2022,E08000035,Leeds,median_weekly_wage,536.6


In [21]:
# fuel_poverty = pd.read_csv(f'{DATA_DIR}/fuel-poverty/fuel-poverty.csv')

# fuel_poverty = fuel_poverty[fuel_poverty.variable_name == 'Proportion of households fuel poor (%)']

# fuel_poverty

Concatenate the loaded data, then pivot into a table with a line per geography code. Filter this to only include geographies that are in the canonical list of areas.

In [22]:
place_data = pd.concat([
    population_data,
    area_of_place,
    gva,
    households,
    council_tax_data,
    clif_data,
    number_of_children,
    savings,
    imd,
    house_prices,
    hbai_total_low_income,
    hbai_percent,
    number_of_pensioners,
    number_of_working_age,
    unemployment,
    economic_inactivity,
    imd_children,
    imd_older_people,
    housing_benefit,
    smi,
    homelessness,
    # claimants,
    median_weekly_earnings,
    mean_weekly_earnings
]).pivot(index='geography_code', columns='variable_name', values='value')
place_data = place_data.loc[place_data.index.isin(get_place_list())]
place_data = place_data.merge(get_place_data(), left_index=True, right_index=True, how='outer')
place_data.index.name = 'geography_code'
place_data

ValueError: Index contains duplicate entries, cannot reshape

Create some additional metrics based on the data in place_data

In [None]:
place_data['Population density'] = place_data['Number of persons'] / place_data['Area in sq km']

Finally, write the data to an interim parquet and json file for later usage.

In [None]:
INTERIM_DIR=f'{DATA_DIR}/interim/'

os.makedirs(INTERIM_DIR, exist_ok=True)
place_data.to_parquet(f'{INTERIM_DIR}/place_data.parquet')
place_data.reset_index().to_json(f"{INTERIM_DIR}/place_data.json", orient="records")