# Transform place data

In [None]:
import os
import pandas as pd
from geo import get_place_list, get_place_data
from params import DATA_DIR, SRC_DATA_DIR

## Data load

Load the data into a series of variables

### `population_data`

Current population estimates for geographies

In [None]:
# Load the population estimates and keep only the rows whose age_name is "All Ages".
population_data = pd.read_csv(
    f'{DATA_DIR}/population-estimates/population-estimates.csv'
).loc[lambda frame: frame['age_name'] == "All Ages"]
# Alias of the all-ages rows for every date, taken before the reduction below.
group = population_data
# For each geography_code keep the single row holding the largest `date`.
population_data = population_data.loc[
    lambda frame: frame.groupby('geography_code')['date'].idxmax()
]
population_data

### `gva`

In [None]:
# Load GVA figures and reduce the date column to its calendar year.
gva = pd.read_csv(f'{DATA_DIR}/gva/gva.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date']).dt.year
)
# Keep, per geography_code, the row with the highest year.
gva = gva.loc[lambda frame: frame.groupby('geography_code')['date'].idxmax()]
# Display the rows whose geography code starts with 'E05'.
gva[gva['geography_code'].str.startswith('E05')]

### `area_of_place`

Geographic area of the place

In [None]:
# Load the area of each place.
# @TODO temporary filter to remove: only the first row per geography_code is kept.
area_of_place = pd.read_csv(f'{DATA_DIR}/geo/area_of_places.csv').drop_duplicates(
    subset=['geography_code']
)
area_of_place

### `households`

Number of households in the geographic area

In [None]:
# Load household counts, then keep the row with the largest `date` for each geography_code.
households = pd.read_csv(f'{DATA_DIR}/households/households.csv')
households = households.loc[
    lambda frame: frame.groupby('geography_code')['date'].idxmax()
]
households

### `council_tax_data`

In [None]:
# Load council tax support figures and reduce the date column to its calendar year.
council_tax_data = pd.read_csv(
    f'{DATA_DIR}/council-tax-support/council-tax-support.csv'
).assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
# Keep the highest-year row for every (geography_code, variable_name) pair.
council_tax_data = council_tax_data.loc[
    lambda frame: frame.groupby(['geography_code', 'variable_name'])['date'].idxmax()
]
# Rename the two labels; the replacement is applied to matching cells in any column.
council_tax_data = (
    council_tax_data
    .replace("pensioners", "council_tax_pensioners")
    .replace("working_age", "council_tax_working_age")
)
# Display the working-age rows.
council_tax_data[council_tax_data['variable_name'] == "council_tax_working_age"]

### `clif_data`

Children in low income families

In [None]:
# Load the children-in-low-income-families table.
clif_data = pd.read_csv(f'{DATA_DIR}/clif/clif_REL.csv')
# Keep rows where every breakdown column is 'Total' and the Year equals the
# largest Year found in the unfiltered table.
clif_data = clif_data[
    clif_data[[
        'Age of Child (years and bands)',
        'Gender of Child',
        'Family Type',
        'Work Status',
    ]].eq('Total').all(axis=1)
    & (clif_data['Year'] == max(clif_data['Year']))
]
# Display the row(s) for geography code E08000035.
clif_data[clif_data['geography_code'] == 'E08000035']

### `number_of_children`

In [None]:
# Load the population estimates broken down by age band.
number_of_persons = pd.read_csv(f'{DATA_DIR}/population-estimates/population-estimates-ages.csv')
# Sum the numeric columns of the listed age bands per geography_code and
# label the result with its variable name.
number_of_children = (
    number_of_persons
    .loc[lambda frame: frame['age_band'].isin(['0-4', '5-10', '11-15', '16-19'])]
    .groupby('geography_code')
    .sum(numeric_only=True)
    .reset_index()
    .assign(variable_name='number_of_children')
)
number_of_children

### `households_in_poverty`

In [None]:
# Number of households in poverty: load the HBAI table and keep only the
# rows labelled "In low income (below threshold)".
households_in_poverty = pd.read_csv(
    f'{DATA_DIR}/hbai/Type of Individual by Age Category.csv'
).loc[lambda frame: frame['variable_name'] == "In low income (below threshold)"]

def three_year_average(data, n_years=2, excluded_years=("2020-21",)):
    """Average the numeric columns per geography over the most recent financial years.

    Despite the name, the default window is the two most recent financial
    years, as the original implementation did (citing user guidance).

    Args:
        data: DataFrame with "Financial Year" and "geography_code" columns
            plus the numeric columns to average.
        n_years: Number of most recent financial years to average over.
        excluded_years: Financial years to drop before choosing the window.
            Defaults to "2020-21", which the original code filtered out
            because no data was collected.

    Returns:
        DataFrame with one row per geography_code. Each numeric column is the
        sum over the selected years divided by the number of selected years,
        rounded to the nearest 100,000.

    Raises:
        ValueError: If n_years is less than 1.
    """
    if n_years < 1:
        raise ValueError("n_years must be at least 1")
    # Drop the years that must not take part in the average.
    data = data[~data["Financial Year"].isin(list(excluded_years))]
    # Sort the labels so "most recent" does not depend on the row order of the input.
    financial_years = sorted(data["Financial Year"].dropna().unique())
    most_recent_years = financial_years[-n_years:]
    data = data[data["Financial Year"].isin(most_recent_years)]
    # Sum per geography and divide by the number of years in the window; a
    # geography missing one of those years is still divided by the full count.
    data = data.groupby('geography_code').sum(numeric_only=True) / len(most_recent_years)
    # Round to the nearest 0.1 million, according to user guidance.
    return data.round(-5).reset_index()

# Average the low-income figures per geography, then label the result.
households_in_poverty = three_year_average(households_in_poverty).assign(
    variable_name='Households in poverty'
)
households_in_poverty

### `savings`

In [None]:
# Load the HBAI savings table.
savings = pd.read_csv(f'{DATA_DIR}/hbai/Savings and Investments of Adults in the Family of the Individual.csv')
# Keep rows for the largest financial year that are in low income and have no savings.
savings = savings[
    (savings['Financial Year'] == max(savings['Financial Year']))
    & (savings['variable_name'] == 'In low income (below threshold)')
    & (savings['Savings and Investments of Adults in the Family of the Individual'] == 'No savings')
]
# NOTE(review): only one financial year survives the filter above, so the
# averaging helper has a single year to work with here - confirm this is intended.
savings = three_year_average(savings).assign(
    variable_name='households_low_income_no_savings'
)
savings
# TODO: save each geog_code into 'src/place/EXXXXXX/_data/'; the visualisations in
# place.njk can then be built on a conditional statement if that data exists.

### `imd_older_people` and `imd_children`

In [None]:
# Index of Multiple Deprivation: load once, then split by dataset.
data = pd.read_csv(f'{DATA_DIR}/imd/imd.csv')
# Keep only the 'Average score' measure (the original author expects
# per-age-category measures to be added in future).
data = data.loc[data['variable_name'] == 'Average score']

# One frame per dataset, each relabelled with the variable name used downstream.
imd = data[data['dataset'] == 'IMD'].assign(variable_name='imd_average_score')
imd_older_people = data[data['dataset'] == 'IDAOPI'].assign(variable_name='imd_older_people')
imd_children = data[data['dataset'] == "IDACI"].assign(variable_name='imd_children')


### `unemployment` and `economic_inactivity` 

In [None]:
# Load the labour market table; both indicators below are slices of it.
labour_market = pd.read_csv(f'{DATA_DIR}/labour-market/labour-market.csv')

# Unemployment rate (16-64): keep the rows at the largest date in this slice.
unemployment = labour_market.loc[labour_market.variable_name == "Unemployment rate - aged 16-64"]
# Take an explicit copy so the column assignment below writes to an independent
# frame instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
unemployment = unemployment[unemployment.date == max(unemployment.date)].copy()
unemployment['variable_name'] = 'unemployment_rate_16_64'

# Economic inactivity (16-64): same selection; the label is then rewritten via replace().
economic_inactivity = labour_market.loc[labour_market.variable_name == "% who are economically inactive - aged 16-64"]
economic_inactivity = economic_inactivity[economic_inactivity.date == max(economic_inactivity.date)]
economic_inactivity = economic_inactivity.replace("% who are economically inactive - aged 16-64","economic_inactivity_16_64")

economic_inactivity

### `housing_benefit`

In [None]:
# Load housing benefit claimants, parse Month as a datetime and give the
# unnamed first CSV column the name dt_idx.
housing_benefit = (
    pd.read_csv(f'{DATA_DIR}/HB/claimants.csv')
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .rename(columns={'Unnamed: 0': 'dt_idx'})
)
# Keep, per geography_code, the row with the largest dt_idx.
# NOTE(review): the selection uses dt_idx rather than the parsed Month column -
# presumably dt_idx increases with time; confirm against the source file.
housing_benefit = housing_benefit.loc[
    lambda frame: frame.groupby('geography_code')['dt_idx'].idxmax()
]
# Display the rows whose geography code starts with 'E08'.
housing_benefit[housing_benefit['geography_code'].str.startswith('E08')]

### `smi`

In [None]:
# Load the SMI loans table, parse Quarter as a datetime and give the unnamed
# first CSV column the name dt_idx.
smi = (
    pd.read_csv(f'{DATA_DIR}/smi/smi_loans_in_payment_households.csv')
    .assign(Quarter=lambda frame: pd.to_datetime(frame['Quarter']))
    .rename(columns={'Unnamed: 0': 'dt_idx'})
)
# Keep, per geography_code, the row with the latest Quarter.
smi = smi.loc[lambda frame: frame.groupby('geography_code')['Quarter'].idxmax()]
smi

### `homelessness`

In [None]:
# Load statutory homelessness figures and turn the first four characters of
# each date string into a number.
homelessness = pd.read_csv(
    f'{DATA_DIR}/statutory-homelessness/statutory-homelessness.csv'
).assign(date=lambda frame: pd.to_numeric(frame['date'].str[:4]))
# Keep, per geography_code, the row with the largest numeric date.
homelessness = homelessness.loc[
    lambda frame: frame.groupby('geography_code')['date'].idxmax()
]
homelessness

### `house_prices`

In [None]:
# Load house prices with the date column parsed as datetimes.
house_prices = pd.read_csv(f'{DATA_DIR}/house-prices/house-prices.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
# Keep, per geography_code, the row with the latest date.
house_prices = house_prices.loc[
    lambda frame: frame.groupby(['geography_code'])['date'].idxmax()
]
# Report how many distinct geography codes have a house price.
print(len(house_prices['geography_code'].unique()))

# Load the geography lookup and add up its distinct LAD22CD and WD22CD codes.
geo = pd.read_csv(f"{DATA_DIR}/geo/geography_lookup.csv")
tot = len(geo['LAD22CD'].unique()) + len(geo['WD22CD'].unique())


Concatenate the loaded data, then pivot into a table with a line per geography code. Filter this to only include geographies that are in the canonical list of areas.

In [None]:
# Stack every prepared frame into one long table (order as in the original cell).
place_data = pd.concat([
    population_data, area_of_place, gva, households,
    council_tax_data, clif_data, number_of_children, savings,
    imd, house_prices, households_in_poverty, unemployment,
    economic_inactivity, imd_children, imd_older_people, housing_benefit,
    smi, homelessness,
])
# Reshape to one row per geography_code and one column per variable_name.
place_data = place_data.pivot(index='geography_code', columns='variable_name', values='value')
# Drop geographies that are not in the list returned by get_place_list().
place_data = place_data[place_data.index.isin(get_place_list())]
# Outer-join the frame returned by get_place_data() on the index.
place_data = place_data.merge(
    get_place_data(), how='outer', left_index=True, right_index=True
).rename_axis('geography_code')
place_data

Create some additional metrics based on the data in place_data

In [None]:
# Population density: number of persons divided by area in square kilometres.
place_data = place_data.assign(**{
    'Population density': place_data['Number of persons'] / place_data['Area in sq km'],
})

Finally, write the data to an interim parquet and json file for later usage.

In [None]:
# Directory that receives the interim outputs (value unchanged, trailing slash kept).
INTERIM_DIR=f'{DATA_DIR}/interim/'

os.makedirs(INTERIM_DIR, exist_ok=True)
# Build the file paths with os.path.join, as the later cells do. Formatting
# INTERIM_DIR into an f-string with another '/' produced 'interim//place_data.*'.
place_data.to_parquet(os.path.join(INTERIM_DIR, 'place_data.parquet'))
# reset_index() turns geography_code back into a column so it appears in every JSON record.
place_data.reset_index().to_json(os.path.join(INTERIM_DIR, 'place_data.json'), orient="records")

## Current rental prices

In [None]:
# Load rental prices and turn the first four characters of each date string into a number.
rental_prices = pd.read_csv(f'{DATA_DIR}/rental-prices/rental-prices.csv').assign(
    date=lambda frame: pd.to_numeric(frame['date'].str[:4])
)
# Keep only the rows whose variable_code is 'Mean'.
rental_prices = rental_prices.loc[lambda frame: frame['variable_code'] == 'Mean']
# Keep the row with the largest date for every (geography_code, property_name) pair.
# NOTE(review): grouping uses property_name while the pivot below uses
# property_code - presumably the two map one-to-one; confirm.
rental_prices = rental_prices.loc[
    lambda frame: frame.groupby(['geography_code', 'property_name'])['date'].idxmax()
]
# One row per property_code, one column per geography_code.
rental_prices_pivot = rental_prices.pivot(
    index='property_code', columns='geography_code', values='value'
)
# TODO fix missing data Ticket #67 (missing cells are written as 0 for now)
current_rental_prices_path = os.path.join(INTERIM_DIR, 'current_rental_prices.csv')
rental_prices_pivot.fillna(0).to_csv(current_rental_prices_path)

### Time series data

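Generate a house price time series with one row per date and one column per geography code, and write it to the interim directory.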

In [None]:
# Reload the full house price history (every date, not just the latest one)
# and reshape it to one row per date and one column per geography_code.
house_prices = (
    pd.read_csv(f'{DATA_DIR}/house-prices/house-prices.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .pivot(index='date', columns="geography_code", values="value")
)
house_prices.to_csv(os.path.join(INTERIM_DIR, 'house_prices.csv'))
# Display the maximum value in each geography column.
house_prices.max()