In [1]:
import os
import sys
import pandas as pd

# Directory two levels above the working directory; it is put on the import
# path so that the later `scripts.*` import can be resolved.
LIB_DIR = '../../'
# Only register the entry once, so re-running the cell does not stack duplicates.
if sys.path.count(LIB_DIR) == 0:
    sys.path.append(LIB_DIR)

In [2]:
from scripts.util.date import quarter_to_date

In [3]:
# Input files (under the upstream working directory).
NEET_DATA, ALL_METADATA = (
    '../../working/upstream/neet.csv',
    '../../working/upstream/metadata.csv',
)
# Output files written by this notebook.
NEET_16_24, NEET_METADATA = (
    '../../data/neet/neet.csv',
    '../../data/neet/metadata.json',
)

# Create the folder that will hold the output CSV if it is not there yet;
# an existing folder is left untouched.
neet_output_dir = os.path.dirname(NEET_16_24)
os.makedirs(neet_output_dir, exist_ok=True)

In [4]:
# Load the NEET extract. The location comes from the NEET_DATA constant
# defined with the other paths, so there is a single place to change it.
data = pd.read_csv(NEET_DATA)

In [5]:
# Lookup from the long measure labels in the source data to the short
# snake_case names used for the output columns.
variable_mapper = dict([
    ('Young people who were NEET_Total',
     'age_16_to_24_neet_total_sa'),
    ('Young people who were NEET_Unemployed',
     'age_16_to_24_neet_unemployed_sa'),
    ('Young people who were NEET_Economically inactive',
     'age_16_to_24_neet_economically_inactive_sa'),
    ('Total people in relevant population group',
     'age_16_to_24_population'),
    ('People who were NEET as a percentage of people in relevant population group',
     'age_16_to_24_neet_total_rate_sa'),
])

In [6]:
# Keep only the rows whose age band is 'Aged 16-24'.
data = data.loc[data['age'].eq('Aged 16-24')]

In [7]:
# Swap the long measure labels for their short names; the nested mapping
# restricts the replacement to the `measure` column.
data = data.replace(to_replace={'measure': variable_mapper})

In [8]:
# `quarter_to_date` is a project helper; presumably it turns the quarter label
# in `date` into a date (TODO confirm). Wrapping the result in pd.Index means
# the values are assigned by position rather than aligned on index labels.
data['quarter_start'] = pd.Index(quarter_to_date(data['date']))

# Output column name: the part of `sheet` before the first ' - ', lower-cased,
# joined to the (already renamed) measure with an underscore.
data['variable'] = (
    data['sheet']
    .str.split(' - ')
    .apply(lambda parts: parts[0])
    .str.lower()
    + '_'
    + data['measure']
)

In [9]:
# Reshape from long to wide: one row per quarter_start, one column per
# variable. pivot_table's default aggregation (mean) applies if the same
# quarter/variable pair appears more than once.
data = pd.pivot_table(
    data,
    values='value',
    index='quarter_start',
    columns='variable',
)

In [10]:
def calculate_rates(data, prefix):
    """Add unemployed and economically-inactive NEET rates for one group.

    Each rate is the matching `<prefix>_age_16_to_24_neet_*_sa` column divided
    by `<prefix>_age_16_to_24_population`, multiplied by 100.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the `<prefix>_age_16_to_24_neet_unemployed_sa`,
        `<prefix>_age_16_to_24_neet_economically_inactive_sa` and
        `<prefix>_age_16_to_24_population` columns.
    prefix : str
        Group prefix of the columns to use (the notebook passes 'people',
        'men' and 'women').

    Returns
    -------
    pandas.DataFrame
        A new frame with `<prefix>_age_16_to_24_neet_unemployed_rate_sa` and
        `<prefix>_age_16_to_24_neet_economically_inactive_rate_sa` appended.
        The frame passed in is left unmodified, so the function is safe to use
        with `.pipe` and to re-run.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    population = data[prefix + '_age_16_to_24_population']
    # assign() works on a copy, so the caller's frame is never written to.
    return data.assign(**{
        prefix + '_age_16_to_24_neet_unemployed_rate_sa':
            data[prefix + '_age_16_to_24_neet_unemployed_sa'] / population * 100,
        prefix + '_age_16_to_24_neet_economically_inactive_rate_sa':
            data[prefix + '_age_16_to_24_neet_economically_inactive_sa'] / population * 100,
    })
  
# Add the derived rate columns for each group in turn.
for group in ('people', 'men', 'women'):
    data = calculate_rates(data, group)


In [11]:
# Column layout of the output file: for each group (people, men, women) the
# same seven measures, in the same order.
column_order = [
    group + '_age_16_to_24_' + measure
    for group in ('people', 'men', 'women')
    for measure in (
        'neet_total_sa',
        'neet_unemployed_sa',
        'neet_economically_inactive_sa',
        'population',
        'neet_total_rate_sa',
        'neet_unemployed_rate_sa',
        'neet_economically_inactive_rate_sa',
    )
]

# Write the selected columns, in the agreed order, to the output CSV.
# Selecting by name raises KeyError if an expected column is missing.
data[column_order].to_csv(path_or_buf=NEET_16_24)

In [12]:
# Read the upstream metadata CSV, parsing both update columns as datetimes.
metadata = pd.read_csv(ALL_METADATA, parse_dates=['last_update', 'next_update'])

# Export the entry whose id is 'NEET' as JSON, exposing `last_update` under
# the name `published` and writing dates in ISO format.
(
    metadata
    .rename(columns={'last_update': 'published'})
    .set_index('id')
    .loc['NEET']
    .to_json(NEET_METADATA, date_format='iso')
)

# Display the column types as a check that the dates were parsed.
metadata.dtypes


id                     object
last_update    datetime64[ns]
next_update    datetime64[ns]
dtype: object