In [1]:
import os
import pandas as pd
import numpy as np
from prepare import DATA_DIR
from geo import get_all_codes, get_place_data

Load some data

* `all_geos` is a list of all geographies mentioned in the geography tree CSV file
* `population_data` is population estimates for (some of) the geographies
* `council_tax_data` is estimates of people in receipt of council tax support
* `clif_data` is data about children living in poverty

In [2]:
# Population estimates, read as-is with no filtering at this stage.
population_data = pd.read_csv(
    f'{DATA_DIR}/population-estimates/population-estimates.csv'
)

In [3]:
# Council tax support estimates, restricted to the most recent `date` in the file.
#
# `Series.max()` replaces the builtin `max()`: it is vectorised rather than
# iterating in Python and, for numeric/datetime columns, skips missing values
# instead of giving a result that depends on where a NaN sits in the column.
# The callable passed to `.loc` receives the full, unfiltered frame, so the
# latest date is still computed over every row that was read.
council_tax_data = (
    pd.read_csv(f'{DATA_DIR}/council-tax-support/council-tax-support.csv')
    .loc[lambda df: df['date'] == df['date'].max()]
)

In [4]:
# Children-in-low-income-families data, reduced to the headline rows: the
# 'Total' category on every breakdown dimension, for the latest year only.
#
# `Series.max()` replaces the builtin `max()`: it is vectorised and skips
# missing values in a numeric column instead of giving a result that depends
# on where a NaN sits. The callable passed to `.loc` receives the full,
# unfiltered frame, so the latest year is taken over the whole file (as
# before), not over the 'Total' rows only.
clif_data = (
    pd.read_csv(f'{DATA_DIR}/clif/clif_REL.csv')
    .loc[lambda df: (
        (df['Age of Child (years and bands)'] == 'Total') &
        (df['Gender of Child'] == 'Total') &
        (df['Family Type'] == 'Total') &
        (df['Work Status'] == 'Total') &
        (df['Year'] == df['Year'].max())
    )]
)

Concatenate the loaded data, then pivot into a table with a line per geography code. Filter this to only include geographies that are in the canonical list of areas.

In [11]:
# Build the per-geography table in a single chain:
#   1. stack the three source frames on top of each other,
#   2. pivot so each geography code is a row and each variable a column
#      (pivot raises if a code/variable pair appears more than once),
#   3. keep only the codes returned by get_all_codes(),
#   4. outer-join the place data on the index, so rows present on either
#      side are retained.
place_data = (
    pd.concat([population_data, council_tax_data, clif_data])
    .pivot(index='geography_code', columns='variable_name', values='value')
    .loc[lambda wide: wide.index.isin(get_all_codes())]
    .merge(get_place_data(), left_index=True, right_index=True, how='outer')
)

Finally, write the data to an interim parquet and json file for later usage.

In [6]:
# Make sure the interim output directory exists; a no-op if it already does.
os.makedirs(f'{DATA_DIR}/interim/', exist_ok=True)

# Parquet copy of the table for later processing steps.
place_data.to_parquet(f'{DATA_DIR}/interim/place_data.parquet')

# JSON copy as a list of row records. The index is moved into an ordinary
# column first so that each record carries its own geography identifier.
place_records = place_data.reset_index()
place_records.to_json(f"{DATA_DIR}/interim/place_data.json", orient="records")