In [1]:
import os
import pandas as pd
from geo import get_place_list, get_place_data
from params import DATA_DIR

Load some data

* `all_geos` is a list of all geographies mentioned in the the geography tree csv file
* `population_data` is population estimates for a (some of) the geographies
* `council_tax_data` is estimates of people in receipt of council tax support
* `clif_data` is data bout children living in poverty 

In [2]:
population_data = pd.read_csv(f'{DATA_DIR}/population-estimates/population-estimates.csv')

In [3]:
gva = pd.read_csv(f'{DATA_DIR}/gva/gva.csv')

In [4]:
area_of_place =pd.read_csv(f'{DATA_DIR}/geo/area_of_places.csv')

In [5]:
households = pd.read_csv(f'{DATA_DIR}/households/households.csv')

In [6]:
council_tax_data = pd.read_csv(f'{DATA_DIR}/council-tax-support/council-tax-support.csv')
council_tax_data = council_tax_data[council_tax_data.date == max(council_tax_data.date)]

In [7]:
clif_data = pd.read_csv(f'{DATA_DIR}/clif/clif_REL.csv')
clif_data = clif_data[
    (clif_data['Age of Child (years and bands)'] == 'Total') &
    (clif_data['Gender of Child'] == 'Total') &
    (clif_data['Family Type'] == 'Total') &
    (clif_data['Work Status'] == 'Total') &
    (clif_data.Year == max(clif_data.Year))
]
clif_data

Unnamed: 0,Year,Age of Child (years and bands),Gender of Child,Family Type,Work Status,variable_name,geography_code,value
134,2021/22,Total,Total,Total,Total,children_in_low_income,E06000047,27178.0
296,2021/22,Total,Total,Total,Total,children_in_low_income,E06000005,6070.0
458,2021/22,Total,Total,Total,Total,children_in_low_income,E06000001,5870.0
620,2021/22,Total,Total,Total,Total,children_in_low_income,E06000002,13432.0
782,2021/22,Total,Total,Total,Total,children_in_low_income,E06000057,13562.0
...,...,...,...,...,...,...,...,...
60236,2021/22,Total,Total,Total,Total,children_in_low_income,Mid and East Antrim,5687.0
60398,2021/22,Total,Total,Total,Total,children_in_low_income,Mid Ulster,8079.0
60560,2021/22,Total,Total,Total,Total,children_in_low_income,"Newry, Mourne and Down",10990.0
60722,2021/22,Total,Total,Total,Total,children_in_low_income,Unknown,0.0


In [8]:
number_of_persons = pd.read_csv(f'{DATA_DIR}/population-estimates/population-estimates-ages.csv')
number_of_children = number_of_persons[number_of_persons.age_band.isin(['0-4', '5-10', '11-15', '16-19'])]
number_of_children = number_of_children.groupby('geography_code').sum(numeric_only=True).reset_index()
number_of_children['variable_name'] = 'number_of_children'
number_of_children

Unnamed: 0,geography_code,date,value,variable_name
0,E05000650,8080,3390,number_of_children
1,E05000651,8080,2530,number_of_children
2,E05000652,8080,3722,number_of_children
3,E05000653,8080,2813,number_of_children
4,E05000654,8080,4875,number_of_children
...,...,...,...,...
1315,E47000003,8084,586078,number_of_children
1316,E47000004,8084,346827,number_of_children
1317,E47000006,8084,159599,number_of_children
1318,E47000010,8084,247569,number_of_children


In [9]:
savings = pd.read_csv(f'{DATA_DIR}/hbai/Savings and Investments of Adults in the Family of the Individual.csv')
savings = savings[savings['Financial Year'] == max(savings['Financial Year'])]
savings = savings[savings.variable_name == 'In low income (below threshold)']
savings = savings[savings['Savings and Investments of Adults in the Family of the Individual'] == 'No savings']
savings.variable_name = 'households_low_income_no_savings'
savings
#save each geog_code into 'src/place/EXXXXXX/_data/' and then can build the visualisations in place.njk on a conidtional statement if that data exists.

Unnamed: 0,Financial Year,geography_name,geography_code,Savings and Investments of Adults in the Family of the Individual,variable_name,value
1321,2021-22,North East,E12000001,No savings,households_low_income_no_savings,103723.0
1329,2021-22,North West,E12000002,No savings,households_low_income_no_savings,403448.0
1337,2021-22,Yorkshire and The Humber,E12000003,No savings,households_low_income_no_savings,255022.0


In [10]:
#number of households in poverty
households_in_poverty = pd.read_csv(f'{DATA_DIR}/hbai/Type of Individual by Age Category.csv')
households_in_poverty = households_in_poverty[households_in_poverty.variable_name == "In low income (below threshold)"]

#filtering out 2020-21 as no data was collected
households_in_poverty = households_in_poverty[households_in_poverty["Financial Year"] != "2020-21"]
dates = households_in_poverty["Financial Year"].unique()
most_recent = list(dates[-2:])
print(most_recent)
households_in_poverty = households_in_poverty[households_in_poverty["Financial Year"].isin(most_recent)]

#calculating a 2 year average according to user guidance
households_in_poverty = households_in_poverty.groupby('geography_code').sum(numeric_only=True) / len(most_recent)

#rounding to nearest 0.1mil, according to user guidance.
households_in_poverty = households_in_poverty.round(-5).reset_index()
households_in_poverty['variable_name'] = 'Households in poverty'
households_in_poverty

['2019-20', '2021-22']


Unnamed: 0,geography_code,value,variable_name
0,E12000001,500000.0,Households in poverty
1,E12000002,1300000.0,Households in poverty
2,E12000003,1100000.0,Households in poverty


In [11]:
 # Index of Multiple deprivation
imd = pd.read_csv(f'{DATA_DIR}/imd/imd.csv')
imd  = imd[imd.variable_name == 'IMD - average score'] #adding this as in future there will be measures for each age cat.

In [19]:
unemployment = pd.read_csv(f'{DATA_DIR}/labour-market/labour-market.csv')
unemployment = unemployment[unemployment.variable_name == "Unemployment rate - aged 16-64"]
unemployment = unemployment[unemployment.date == max(unemployment.date)]
unemployment['variable_name'] = 'unemployment_rate_16_64'
unemployment

Unnamed: 0,date,geography_code,geography_name,variable_code,variable_name,value
1,2022-12-01,E06000005,Darlington,84,Unemployment rate - aged 16-64,3.1
4,2022-12-01,E06000047,County Durham,84,Unemployment rate - aged 16-64,4.6
7,2022-12-01,E06000001,Hartlepool,84,Unemployment rate - aged 16-64,6.2
10,2022-12-01,E06000002,Middlesbrough,84,Unemployment rate - aged 16-64,6.8
13,2022-12-01,E06000057,Northumberland,84,Unemployment rate - aged 16-64,5.0
...,...,...,...,...,...,...
220,2022-12-01,E10000017,Lancashire,84,Unemployment rate - aged 16-64,3.9
223,2022-12-01,E10000023,North Yorkshire,84,Unemployment rate - aged 16-64,2.0
226,2022-12-01,E12000001,North East,84,Unemployment rate - aged 16-64,4.7
229,2022-12-01,E12000002,North West,84,Unemployment rate - aged 16-64,4.1


Concatenate the loaded data, then pivot into a table with a line per geography code. Filter this to only include geographies that are in the canonical list of areas.

In [18]:
place_data = pd.concat([
    population_data,
    area_of_place,
    gva,
    households,
    council_tax_data,
    clif_data,
    number_of_children,
    savings,
    imd,
    households_in_poverty,
    unemployment
]).pivot(index='geography_code', columns='variable_name', values='value')
place_data = place_data.loc[place_data.index.isin(get_place_list())]
place_data = place_data.merge(get_place_data(), left_index=True, right_index=True, how='outer')
place_data.index.name = 'geography_code'
place_data

Unnamed: 0_level_0,Area in sq km,GVA,Households in poverty,Number of households,Number of persons,Unemployment rate - aged 16-64,children_in_low_income,households_low_income_no_savings,number_of_children,pensioners,working_age,ancestors,parents,children,name,type
geography_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
E05000650,,82.047782,,,14133.0,,,,3390.0,,,"[E08000001, E47000001, E12000002, E12999901]",[E08000001],[],Astley Bridge,WD22
E05000651,,79.731727,,,11331.0,,,,2530.0,,,"[E08000001, E47000001, E12000002, E12999901]",[E08000001],[],Bradshaw,WD22
E05000652,,178.319099,,,14078.0,,,,3722.0,,,"[E08000001, E47000001, E12000002, E12999901]",[E08000001],[],Breightmet,WD22
E05000653,,110.950950,,,13503.0,,,,2813.0,,,"[E08000001, E47000001, E12000002, E12999901]",[E08000001],[],Bromley Cross,WD22
E05000654,,275.878141,,,16828.0,,,,4875.0,,,"[E08000001, E47000001, E12000002, E12999901]",[E08000001],[],Crompton,WD22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E47000003,2022.032423,57909.000000,,1035190.0,2349987.0,,,,586078.0,,,"[E12000003, E12999901]",[E12000003],"[E08000032, E08000033, E08000034, E08000035, E...",West Yorkshire,CAUTH22
E47000004,722.669297,33598.000000,,729960.0,1551722.0,,,,346827.0,,,"[E12000002, E12999901]","[E12000002, E12000002]","[E08000011, E08000012, E08000013, E08000014, E...",Liverpool City Region,CAUTH22
E47000006,801.149717,13951.000000,,317080.0,678173.0,,,,159599.0,,,"[E12000001, E12999901]",[E12000001],"[E06000001, E06000002, E06000003, E06000004, E...",Tees Valley,CAUTH22
E47000010,2567.223988,22244.000000,,550480.0,1139626.0,,,,247569.0,,,"[E12000001, E12999901]",[E12000001],"[E08000023, E08000024, E08000037, E06000047]",North East,CAUTH22


Create some additional metrics based on the data in place_data

In [13]:
place_data['Population density'] = place_data['Number of persons'] / place_data['Area in sq km']
place_data.loc['E12000003', :]

Area in sq km                                                            15369.745447
GVA                                                                          128050.0
Households in poverty                                                       1100000.0
Number of households                                                        2492850.0
Number of persons                                                           5481431.0
children_in_low_income                                                            NaN
households_low_income_no_savings                                             255022.0
number_of_children                                                          1276610.0
pensioners                                                                   152150.0
working_age                                                                  260248.0
ancestors                                                                 [E12999901]
parents                                               

Finally, write the data to an interim parquet and json file for later usage.

In [14]:
os.makedirs(f'{DATA_DIR}/interim/', exist_ok=True)
place_data.to_parquet(f'{DATA_DIR}/interim/place_data.parquet')
place_data.reset_index().to_json(f"{DATA_DIR}/interim/place_data.json", orient="records")

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.