# APS

Code to process, work with and plot APS data to create indicators about skills supply in the UK. 

We are interested in the following indicators:

* Percentage of the population with tertiary education
* Percentage of population employed in professional occupations

Raw data collected from https://www.nomisweb.co.uk/articles/676.aspx

See [this table](https://docs.google.com/spreadsheets/d/1V2fAQcvuLsoImwo6uLdyIK3x80pBNoX97CxsxkjvRP4/edit?usp=sharing) for more information.

## Preamble

In [None]:
import requests 

import numpy as np 
import pandas as pd 
import json
import seaborn as sns

import os
cwd = os.getcwd()

import matplotlib.pyplot as plt
%matplotlib inline

## Data Processing & Transformation

Raw data is downloaded via the command line using the `get_aps_nomis_data` module in the `beis-indicators/data` directory.
Note: `get_nomis_data` uses `nomis`

### Processing for 'Percentage of population employed in professional occupations' data

#### NUTS 2 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_occupations_n10 = '../../data/raw/pro_occs/nomis_percent_pro_occs_12_14_n10-0-25000.json'
data_occupations_json_n13 = '../../data/raw/pro_occs/nomis_percent_pro_occs_15_17_n13-0-25000.json'
data_occupations_json_n16 = '../../data/raw/pro_occs/nomis_percent_pro_occs_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_occupations_n10, data_occupations_json_n13, data_occupations_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
df_occupations_n10, df_occupations_n13, df_occupations_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
df_occupations_n10, df_occupations_n13, df_occupations_n16 = (
    _df[_df['measures_name'] == 'Variable'].reset_index(drop=True)
    for _df in (df_occupations_n10, df_occupations_n13, df_occupations_n16)
)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
    for _df in (df_occupations_n10, df_occupations_n13, df_occupations_n16)
)

In [None]:
# Turn the pivot index levels back into ordinary columns, in place.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame.reset_index(inplace=True)

In [None]:
# Index every pivot by geography name, in place.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame.set_index('geography_name', inplace=True)

In [None]:
# df_occupations_pivot.columns

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally and tag each frame with its
# nuts_year_spec value (2010 / 2013 / 2016).
_column_names = ['nuts_id', 'year', 'aps_pro_occupations_data']
for _frame, _spec in (
    (df_occupations_pivot_n10, 2010),
    (df_occupations_pivot_n13, 2013),
    (df_occupations_pivot_n16, 2016),
):
    _frame.columns = list(_column_names)
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16])
df_occupations_pivot = _stacked.sort_index()

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_pro_occupations_data'
df_occupations_pivot.dropna(subset=[_indicator], inplace=True)

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'nuts_id', 'nuts_year_spec', 'aps_pro_occupations_data']
df_occupations_pivot = df_occupations_pivot.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_pro_occupations_data.nuts2.csv'
df_occupations_pivot.to_csv(_out_path, index=False)

#### NUTS 3 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_occupations_n10 = '../../data/raw/pro_occs/nomis_percent_pro_occs_3_12_14_n10-0-25000.json'
data_occupations_json_n13 = '../../data/raw/pro_occs/nomis_percent_pro_occs_3_15_17_n13-0-25000.json'
data_occupations_json_n16 = '../../data/raw/pro_occs/nomis_percent_pro_occs_3_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_occupations_n10, data_occupations_json_n13, data_occupations_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
df_occupations_n10, df_occupations_n13, df_occupations_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
df_occupations_n10, df_occupations_n13, df_occupations_n16 = (
    _df[_df['measures_name'] == 'Variable'].reset_index(drop=True)
    for _df in (df_occupations_n10, df_occupations_n13, df_occupations_n16)
)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
    for _df in (df_occupations_n10, df_occupations_n13, df_occupations_n16)
)

In [None]:
# Turn the pivot index levels back into ordinary columns, in place.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame.reset_index(inplace=True)

In [None]:
# Index every pivot by geography name, in place.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame.set_index('geography_name', inplace=True)

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally and tag each frame with its
# nuts_year_spec value (2010 / 2013 / 2016).
_column_names = ['nuts_id', 'year', 'aps_pro_occupations_data']
for _frame, _spec in (
    (df_occupations_pivot_n10, 2010),
    (df_occupations_pivot_n13, 2013),
    (df_occupations_pivot_n16, 2016),
):
    _frame.columns = list(_column_names)
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_occupations_pivot_n10, df_occupations_pivot_n13, df_occupations_pivot_n16])
df_occupations_pivot = _stacked.sort_index()

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_pro_occupations_data'
df_occupations_pivot.dropna(subset=[_indicator], inplace=True)

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'nuts_id', 'nuts_year_spec', 'aps_pro_occupations_data']
df_occupations_pivot = df_occupations_pivot.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_pro_occupations_data.nuts3.csv'
df_occupations_pivot.to_csv(_out_path, index=False)

#### LEP level

In [None]:
# Read the raw LEP-level JSON extract and parse it.
data_occupations_lep = '../../data/raw/pro_occs/nomis_percent_pro_occs_lep-0-25000.json'
with open(data_occupations_lep) as f:
    _raw_text = f.read()
data_lep = json.loads(_raw_text)

In [None]:
# Build a DataFrame from the parsed JSON payload.
df_occupations_lep = pd.DataFrame.from_records(data=data_lep)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
_is_variable = df_occupations_lep['measures_name'] == 'Variable'
df_occupations_lep = df_occupations_lep[_is_variable].reset_index(drop=True)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
df_occupations_pivot_lep = df_occupations_lep.pivot_table(
    index=['geography_name', 'geography_code', 'date_code'],
    columns='variable_name',
    values='obs_value',
    aggfunc='mean',
)

In [None]:
# Turn the pivot index levels back into ordinary columns, in place.
df_occupations_pivot_lep.reset_index(drop=False, inplace=True)

In [None]:
# Index the pivot by geography name, in place.
df_occupations_pivot_lep.set_index(keys='geography_name', inplace=True)

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
_date_codes = df_occupations_pivot_lep['date_code']
df_occupations_pivot_lep['date_code'] = _date_codes.apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally, then add a constant lep_year_spec column.
_lep_columns = ['lep_id', 'year', 'aps_pro_occupations_data']
df_occupations_pivot_lep.columns = _lep_columns
_row_count = len(df_occupations_pivot_lep)
df_occupations_pivot_lep['lep_year_spec'] = [2017] * _row_count

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_pro_occupations_data'
df_occupations_pivot_lep.dropna(subset=[_indicator], inplace=True)

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'lep_id', 'lep_year_spec', 'aps_pro_occupations_data']
df_occupations_pivot_lep = df_occupations_pivot_lep.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_pro_occupations_data.lep.csv'
df_occupations_pivot_lep.to_csv(_out_path, index=False)

### Processing for 'Economically active with NVQ4+ (graduates)' data

#### NUTS 2 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_edu_json_n10 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_12_14_n10-0-25000.json'
data_edu_json_n13 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_15_17_n13-0-25000.json'
data_edu_json_n16 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_edu_json_n10, data_edu_json_n13, data_edu_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
df_edu_n10, df_edu_n13, df_edu_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
df_edu_n10, df_edu_n13, df_edu_n16 = (
    _df[_df['measures_name'] == 'Variable'].reset_index(drop=True)
    for _df in (df_edu_n10, df_edu_n13, df_edu_n16)
)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
    for _df in (df_edu_n10, df_edu_n13, df_edu_n16)
)

In [None]:
# Flatten the pivot index into columns, then index each frame by geography name.
for _frame in (df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16):
    _frame.reset_index(inplace=True)
    _frame.set_index('geography_name', inplace=True)

In [None]:
# df_edu_pivot.columns

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally and tag each frame with its
# nuts_year_spec value (2010 / 2013 / 2016).
_column_names = ['nuts_id', 'year', 'aps_nvq4_education_data']
for _frame, _spec in (
    (df_edu_pivot_n10, 2010),
    (df_edu_pivot_n13, 2013),
    (df_edu_pivot_n16, 2016),
):
    _frame.columns = list(_column_names)
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16])
df_edu_pivot = _stacked.sort_index()

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_nvq4_education_data'
df_edu_pivot.dropna(subset=[_indicator], inplace=True)

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'nuts_id', 'nuts_year_spec', 'aps_nvq4_education_data']
df_edu_pivot = df_edu_pivot.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_nvq4_education_data.nuts2.csv'
df_edu_pivot.to_csv(_out_path, index=False)

#### NUTS 3 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_edu_json_n10 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_3_12_14_n10-0-25000.json'
data_edu_json_n13 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_3_15_17_n13-0-25000.json'
data_edu_json_n16 = '../../data/raw/nvq4_tert/nomis_nvq4_grads_3_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_edu_json_n10, data_edu_json_n13, data_edu_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
df_edu_n10, df_edu_n13, df_edu_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
df_edu_n10, df_edu_n13, df_edu_n16 = (
    _df[_df['measures_name'] == 'Variable'].reset_index(drop=True)
    for _df in (df_edu_n10, df_edu_n13, df_edu_n16)
)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
    for _df in (df_edu_n10, df_edu_n13, df_edu_n16)
)

In [None]:
# Flatten the pivot index into columns, then index each frame by geography name.
for _frame in (df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16):
    _frame.reset_index(inplace=True)
    _frame.set_index('geography_name', inplace=True)

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally and tag each frame with its
# nuts_year_spec value (2010 / 2013 / 2016).
_column_names = ['nuts_id', 'year', 'aps_nvq4_education_data']
for _frame, _spec in (
    (df_edu_pivot_n10, 2010),
    (df_edu_pivot_n13, 2013),
    (df_edu_pivot_n16, 2016),
):
    _frame.columns = list(_column_names)
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_edu_pivot_n10, df_edu_pivot_n13, df_edu_pivot_n16])
df_edu_pivot = _stacked.sort_index()

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'nuts_id', 'nuts_year_spec', 'aps_nvq4_education_data']
df_edu_pivot = df_edu_pivot.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_nvq4_education_data'
df_edu_pivot.dropna(subset=[_indicator], inplace=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_nvq4_education_data.nuts3.csv'
df_edu_pivot.to_csv(_out_path, index=False)

#### LEP level

In [None]:
# Read the raw LEP-level JSON extract and parse it.
data_edu_lep = '../../data/raw/nvq4_tert/nomis_nvq4_grads_lep-0-25000.json'
with open(data_edu_lep) as f:
    _raw_text = f.read()
data_lep = json.loads(_raw_text)

In [None]:
# Build a DataFrame from the parsed JSON payload.
df_edu_lep = pd.DataFrame.from_records(data=data_lep)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
_is_variable = df_edu_lep['measures_name'] == 'Variable'
df_edu_lep = df_edu_lep[_is_variable].reset_index(drop=True)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
df_edu_pivot_lep = df_edu_lep.pivot_table(
    index=['geography_name', 'geography_code', 'date_code'],
    columns='variable_name',
    values='obs_value',
    aggfunc='mean',
)

In [None]:
# Turn the pivot index levels back into ordinary columns, in place.
df_edu_pivot_lep.reset_index(drop=False, inplace=True)

In [None]:
# Index the pivot by geography name, in place.
df_edu_pivot_lep.set_index(keys='geography_name', inplace=True)

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
_date_codes = df_edu_pivot_lep['date_code']
df_edu_pivot_lep['date_code'] = _date_codes.apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally, then add a constant lep_year_spec column.
_lep_columns = ['lep_id', 'year', 'aps_nvq4_education_data']
df_edu_pivot_lep.columns = _lep_columns
_row_count = len(df_edu_pivot_lep)
df_edu_pivot_lep['lep_year_spec'] = [2017] * _row_count

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_nvq4_education_data'
df_edu_pivot_lep.dropna(subset=[_indicator], inplace=True)

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'lep_id', 'lep_year_spec', 'aps_nvq4_education_data']
df_edu_pivot_lep = df_edu_pivot_lep.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Write the processed indicator to CSV, leaving out the index column.
_out_path = '../../data/processed/aps/aps_nvq4_education_data.lep.csv'
df_edu_pivot_lep.to_csv(_out_path, index=False)

### Processing for 'Economically active as science, research, engineering and technology professionals or associate professionals' data

#### NUTS 2 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_stem_json_n10 = '../../data/raw/stem/nomis_stem_12_14_n10-0-25000.json'
data_stem_json_n13 = '../../data/raw/stem/nomis_stem_15_17_n13-0-25000.json'
data_stem_json_n16 = '../../data/raw/stem/nomis_stem_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_stem_json_n10, data_stem_json_n13, data_stem_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
data_stem_n10, data_stem_n13, data_stem_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Value', renumbering the index from 0.
data_stem_n10, data_stem_n13, data_stem_n16 = (
    _df[_df['measures_name'] == 'Value'].reset_index(drop=True)
    for _df in (data_stem_n10, data_stem_n13, data_stem_n16)
)

# Pivot so that every `cell_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='cell_name', values='obs_value', aggfunc='mean')
    for _df in (data_stem_n10, data_stem_n13, data_stem_n16)
)

In [None]:
# Flatten the pivot index into columns, then index each frame by geography name.
for _frame in (df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16):
    _frame.reset_index(inplace=True)
    _frame.set_index('geography_name', inplace=True)

In [None]:
# df_stem_pivot.columns

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
def _int_or_missing(value):
    """Truncate *value* to int; pass missing values (NaN/None) through unchanged."""
    return int(value) if pd.notnull(value) else value


# Rename the four columns positionally.
# NOTE(review): this assumes the pivot emits the associate-professionals
# column before the professionals column — confirm against the raw cell names.
_stem_columns = [
    'nuts_id',
    'year',
    'aps_econ_active_stem_associate_profs_data',
    'aps_econ_active_stem_profs_data',
]
_count_columns = (
    'aps_econ_active_stem_profs_data',
    'aps_econ_active_stem_associate_profs_data',
)

for _frame, _spec in (
    (df_stem_pivot_n10, 2010),
    (df_stem_pivot_n13, 2013),
    (df_stem_pivot_n16, 2016),
):
    _frame.columns = list(_stem_columns)
    # The pivot leaves NaN where a geography/date lacks one of the two cells;
    # a bare int() would raise on those, so missing values are kept as-is
    # (same guard as the NUTS 3 cell) and dropped later.
    for _col in _count_columns:
        _frame[_col] = _frame[_col].apply(_int_or_missing)
    # Tag each frame with its nuts_year_spec value (2010 / 2013 / 2016).
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16])
df_stem_pivot = _stacked.sort_index()

In [None]:
# Split the combined frame into one output frame per indicator, each with a fresh 0..n-1 index.
_id_cols = ['year', 'nuts_id', 'nuts_year_spec']
df_stem_pivot_prof = df_stem_pivot.loc[:, _id_cols + ['aps_econ_active_stem_profs_data']].reset_index(drop=True)
df_stem_pivot_aprof = df_stem_pivot.loc[:, _id_cols + ['aps_econ_active_stem_associate_profs_data']].reset_index(drop=True)

In [None]:
# Discard, in place, the rows that have no value for each frame's own indicator.
for _frame, _indicator in (
    (df_stem_pivot_prof, 'aps_econ_active_stem_profs_data'),
    (df_stem_pivot_aprof, 'aps_econ_active_stem_associate_profs_data'),
):
    _frame.dropna(subset=[_indicator], inplace=True)

In [None]:
# Write both processed indicators to CSV, leaving out the index column.
for _frame, _out_path in (
    (df_stem_pivot_prof, '../../data/processed/aps/aps_econ_active_stem_profs_data.nuts2.csv'),
    (df_stem_pivot_aprof, '../../data/processed/aps/aps_econ_active_stem_associate_profs_data.nuts2.csv'),
):
    _frame.to_csv(_out_path, index=False)

#### NUTS 3 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_stem_json_n10 = '../../data/raw/stem/nomis_stem_3_12_14_n10-0-25000.json'
data_stem_json_n13 = '../../data/raw/stem/nomis_stem_3_15_17_n13-0-25000.json'
data_stem_json_n16 = '../../data/raw/stem/nomis_stem_3_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_stem_json_n10, data_stem_json_n13, data_stem_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
data_stem_n10, data_stem_n13, data_stem_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Value', renumbering the index from 0.
data_stem_n10, data_stem_n13, data_stem_n16 = (
    _df[_df['measures_name'] == 'Value'].reset_index(drop=True)
    for _df in (data_stem_n10, data_stem_n13, data_stem_n16)
)

# Pivot so that every `cell_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='cell_name', values='obs_value', aggfunc='mean')
    for _df in (data_stem_n10, data_stem_n13, data_stem_n16)
)

In [None]:
# Flatten the pivot index into columns, then index each frame by geography name.
for _frame in (df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16):
    _frame.reset_index(inplace=True)
    _frame.set_index('geography_name', inplace=True)

In [None]:
# df_stem_pivot.columns

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
def _int_or_missing(value):
    """Truncate *value* to int; pass missing values (NaN/None) through unchanged."""
    return int(value) if pd.notnull(value) else value


# Rename the four columns positionally.
# NOTE(review): this assumes the pivot emits the associate-professionals
# column before the professionals column — confirm against the raw cell names.
_stem_columns = [
    'nuts_id',
    'year',
    'aps_econ_active_stem_associate_profs_data',
    'aps_econ_active_stem_profs_data',
]
_count_columns = (
    'aps_econ_active_stem_profs_data',
    'aps_econ_active_stem_associate_profs_data',
)

for _frame, _spec in (
    (df_stem_pivot_n10, 2010),
    (df_stem_pivot_n13, 2013),
    (df_stem_pivot_n16, 2016),
):
    _frame.columns = list(_stem_columns)
    # Convert present values to int while leaving missing ones untouched.
    for _col in _count_columns:
        _frame[_col] = _frame[_col].apply(_int_or_missing)
    # Tag each frame with its nuts_year_spec value (2010 / 2013 / 2016).
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_stem_pivot_n10, df_stem_pivot_n13, df_stem_pivot_n16])
df_stem_pivot = _stacked.sort_index()

In [None]:
# Split the combined frame into one output frame per indicator, each with a fresh 0..n-1 index.
_id_cols = ['year', 'nuts_id', 'nuts_year_spec']
df_stem_pivot_prof = df_stem_pivot.loc[:, _id_cols + ['aps_econ_active_stem_profs_data']].reset_index(drop=True)
df_stem_pivot_aprof = df_stem_pivot.loc[:, _id_cols + ['aps_econ_active_stem_associate_profs_data']].reset_index(drop=True)

In [None]:
# Discard, in place, the rows that have no value for each frame's own indicator.
for _frame, _indicator in (
    (df_stem_pivot_prof, 'aps_econ_active_stem_profs_data'),
    (df_stem_pivot_aprof, 'aps_econ_active_stem_associate_profs_data'),
):
    _frame.dropna(subset=[_indicator], inplace=True)

In [None]:
# A column that held NaN before the dropna is stored as float, so cast the
# remaining values back to int. This used to be done for the
# associate-professionals frame only, which could leave the professionals
# CSV with float values (e.g. 1200.0) unlike its NUTS 2 counterpart; both
# frames are now treated the same way. Missing values, if any, pass through.
for _frame, _col in (
    (df_stem_pivot_prof, 'aps_econ_active_stem_profs_data'),
    (df_stem_pivot_aprof, 'aps_econ_active_stem_associate_profs_data'),
):
    _frame[_col] = _frame[_col].apply(lambda x: int(x) if pd.notnull(x) else x)

In [None]:
# Write both processed indicators to CSV, leaving out the index column.
for _frame, _out_path in (
    (df_stem_pivot_prof, '../../data/processed/aps/aps_econ_active_stem_profs_data.nuts3.csv'),
    (df_stem_pivot_aprof, '../../data/processed/aps/aps_econ_active_stem_associate_profs_data.nuts3.csv'),
):
    _frame.to_csv(_out_path, index=False)

#### LEP level

In [None]:
# Read the raw LEP-level JSON extract and parse it.
data_stem_json = '../../data/raw/stem/nomis_stem_lep-0-25000.json'
with open(data_stem_json) as f:
    _raw_text = f.read()
data_lep = json.loads(_raw_text)

In [None]:
# Build a DataFrame from the parsed JSON payload.
df_stem_lep = pd.DataFrame.from_records(data=data_lep)

In [None]:
# Keep only the rows whose measure is 'Value', renumbering the index from 0.
_is_value = df_stem_lep['measures_name'] == 'Value'
df_stem_lep = df_stem_lep[_is_value].reset_index(drop=True)

# Pivot so that every `cell_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
df_stem_pivot_lep = df_stem_lep.pivot_table(
    index=['geography_name', 'geography_code', 'date_code'],
    columns='cell_name',
    values='obs_value',
    aggfunc='mean',
)

In [None]:
# Turn the pivot index levels back into ordinary columns, in place.
df_stem_pivot_lep.reset_index(drop=False, inplace=True)

In [None]:
# Index the pivot by geography name, in place.
df_stem_pivot_lep.set_index(keys='geography_name', inplace=True)

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
_date_codes = df_stem_pivot_lep['date_code']
df_stem_pivot_lep['date_code'] = _date_codes.apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the four columns positionally, then add a constant lep_year_spec column.
# NOTE(review): this assumes the pivot emits the associate-professionals
# column before the professionals column — confirm against the raw cell names.
_lep_columns = [
    'lep_id',
    'year',
    'aps_econ_active_stem_associate_profs_data',
    'aps_econ_active_stem_profs_data',
]
df_stem_pivot_lep.columns = _lep_columns
_row_count = len(df_stem_pivot_lep)
df_stem_pivot_lep['lep_year_spec'] = [2017] * _row_count

In [None]:
# Split the pivot into one output frame per indicator, each with a fresh 0..n-1 index.
_id_cols = ['year', 'lep_id', 'lep_year_spec']
df_stem_pivot_lep_prof = df_stem_pivot_lep.loc[:, _id_cols + ['aps_econ_active_stem_profs_data']].reset_index(drop=True)
df_stem_pivot_lep_aprof = df_stem_pivot_lep.loc[:, _id_cols + ['aps_econ_active_stem_associate_profs_data']].reset_index(drop=True)

In [None]:
# Discard, in place, the rows that have no value for each frame's own indicator.
for _frame, _indicator in (
    (df_stem_pivot_lep_prof, 'aps_econ_active_stem_profs_data'),
    (df_stem_pivot_lep_aprof, 'aps_econ_active_stem_associate_profs_data'),
):
    _frame.dropna(subset=[_indicator], inplace=True)

In [None]:
# Store both STEM counts as integers. Only the associate-professionals frame
# used to be converted, so the LEP professionals CSV carried float values
# while the NUTS 2 output carries ints; both frames are now treated the same
# way. Missing values, if any, pass through unchanged.
for _frame, _col in (
    (df_stem_pivot_lep_prof, 'aps_econ_active_stem_profs_data'),
    (df_stem_pivot_lep_aprof, 'aps_econ_active_stem_associate_profs_data'),
):
    _frame[_col] = _frame[_col].apply(lambda x: int(x) if pd.notnull(x) else x)

In [None]:
# Write both processed indicators to CSV, leaving out the index column.
for _frame, _out_path in (
    (df_stem_pivot_lep_prof, '../../data/processed/aps/aps_econ_active_stem_profs_data.lep.csv'),
    (df_stem_pivot_lep_aprof, '../../data/processed/aps/aps_econ_active_stem_associate_profs_data.lep.csv'),
):
    _frame.to_csv(_out_path, index=False)

### Processing for 'STEM employee density' data

#### NUTS 2 level

In [None]:
# Locations of the three raw JSON extracts (file names carry the n10 / n13 / n16 suffixes).
data_stem_dens_json_n10 = '../../data/raw/stem_dens/nomis_stem_dens_12_14_n10-0-25000.json'
data_stem_dens_json_n13 = '../../data/raw/stem_dens/nomis_stem_dens_15_17_n13-0-25000.json'
data_stem_dens_json_n16 = '../../data/raw/stem_dens/nomis_stem_dens_18_n16-0-25000.json'

# Parse the files one after another; each handle is closed once its payload is read.
_parsed = []
for _path in (data_stem_dens_json_n10, data_stem_dens_json_n13, data_stem_dens_json_n16):
    with open(_path) as f:
        _parsed.append(json.load(f))
data_n10, data_n13, data_n16 = _parsed

In [None]:
# Build one DataFrame per parsed JSON payload.
data_stem_dens_n10, data_stem_dens_n13, data_stem_dens_n16 = (
    pd.DataFrame.from_records(_records)
    for _records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measure is 'Variable', renumbering the index from 0.
data_stem_dens_n10, data_stem_dens_n13, data_stem_dens_n16 = (
    _df[_df['measures_name'] == 'Variable'].reset_index(drop=True)
    for _df in (data_stem_dens_n10, data_stem_dens_n13, data_stem_dens_n16)
)

# Pivot so that every `variable_name` becomes a column holding the mean
# `obs_value` per (geography_name, geography_code, date_code).
# `values` is explicit so that only `obs_value` is averaged: aggregating all
# remaining columns raises on the text columns in recent pandas (2.0+).
# The index is given as column names rather than as a DataFrame slice.
_pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16 = (
    _df.pivot_table(index=_pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
    for _df in (data_stem_dens_n10, data_stem_dens_n13, data_stem_dens_n16)
)


In [None]:
# Flatten the pivot index into columns, then index each frame by geography name.
for _frame in (df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16):
    _frame.reset_index(inplace=True)
    _frame.set_index('geography_name', inplace=True)

In [None]:
# data_stem_dens_pivot.columns

In [None]:
# Keep only the integer that precedes the first '-' of every date code.
for _frame in (df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16):
    _frame['date_code'] = _frame['date_code'].apply(lambda code: int(code.partition('-')[0]))

In [None]:
# Rename the three columns positionally and tag each frame with its
# nuts_year_spec value (2010 / 2013 / 2016).
_column_names = ['nuts_id', 'year', 'aps_econ_active_stem_density_data']
for _frame, _spec in (
    (df_stem_dens_pivot_n10, 2010),
    (df_stem_dens_pivot_n13, 2013),
    (df_stem_dens_pivot_n16, 2016),
):
    _frame.columns = list(_column_names)
    _frame['nuts_year_spec'] = [_spec] * len(_frame)

In [None]:
# Stack the three frames, then order the rows by index label.
_stacked = pd.concat([df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16])
data_stem_dens_pivot = _stacked.sort_index()

In [None]:
# Put the output columns in their final order and replace the index with 0..n-1.
_output_cols = ['year', 'nuts_id', 'nuts_year_spec', 'aps_econ_active_stem_density_data']
data_stem_dens_pivot = data_stem_dens_pivot.loc[:, _output_cols].reset_index(drop=True)

In [None]:
# Discard, in place, the rows that have no value for the indicator.
_indicator = 'aps_econ_active_stem_density_data'
data_stem_dens_pivot.dropna(subset=[_indicator], inplace=True)

In [None]:
# Write the NUTS 2 table to CSV, leaving the row index out of the file.
data_stem_dens_pivot.to_csv(
    path_or_buf='../../data/processed/aps/aps_econ_active_stem_density_data.nuts2.csv',
    index=False,
)

#### NUTS 3 level

In [None]:
# Paths of the raw NUTS 3 STEM-density JSON files, one per NUTS variant (n10, n13, n16).
data_stem_dens_json_n10 = '../../data/raw/stem_dens/nomis_stem_dens_3_12_14_n10-0-25000.json'
data_stem_dens_json_n13 = '../../data/raw/stem_dens/nomis_stem_dens_3_15_17_n13-0-25000.json'
data_stem_dens_json_n16 = '../../data/raw/stem_dens/nomis_stem_dens_3_18_n16-0-25000.json'

# Decode explicitly as UTF-8 (the encoding JSON is specified in) rather than
# relying on the platform-dependent default text encoding of open().
with open(data_stem_dens_json_n10, encoding='utf-8') as f:
    data_n10 = json.load(f)
with open(data_stem_dens_json_n13, encoding='utf-8') as f:
    data_n13 = json.load(f)
with open(data_stem_dens_json_n16, encoding='utf-8') as f:
    data_n16 = json.load(f)

In [None]:
# Build one DataFrame from each of the three parsed JSON payloads.
data_stem_dens_n10, data_stem_dens_n13, data_stem_dens_n16 = (
    pd.DataFrame.from_records(records)
    for records in (data_n10, data_n13, data_n16)
)

In [None]:
# Keep only the rows whose measures_name is 'Variable'; rows for any other
# measure are discarded.
data_stem_dens_n10 = data_stem_dens_n10[(data_stem_dens_n10['measures_name']=='Variable')].reset_index(drop=True)
data_stem_dens_n13 = data_stem_dens_n13[(data_stem_dens_n13['measures_name']=='Variable')].reset_index(drop=True)
data_stem_dens_n16 = data_stem_dens_n16[(data_stem_dens_n16['measures_name']=='Variable')].reset_index(drop=True)

# Pivot so that each (geography_name, geography_code, date_code) combination is
# one row and each variable_name becomes a column holding the mean obs_value.
# `values='obs_value'` limits the aggregation to the observation column: without
# it pandas tries to average every remaining column (including the string column
# measures_name), which newer pandas releases reject with a TypeError.
pivot_keys = ['geography_name', 'geography_code', 'date_code']
df_stem_dens_pivot_n10 = data_stem_dens_n10.pivot_table(index=pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
df_stem_dens_pivot_n13 = data_stem_dens_n13.pivot_table(index=pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')
df_stem_dens_pivot_n16 = data_stem_dens_n16.pivot_table(index=pivot_keys, columns='variable_name', values='obs_value', aggfunc='mean')


In [None]:
# Turn the three pivot index levels back into ordinary columns, then key each
# frame by geography_name only (geography_code and date_code stay as columns).
for pivot_frame in (df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16):
    pivot_frame.reset_index(inplace=True)
    pivot_frame.set_index('geography_name', inplace=True)

In [None]:
# data_stem_dens_pivot.columns

In [None]:
# Replace each date_code string with the integer that precedes its first '-'.
for pivot_frame in (df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16):
    pivot_frame['date_code'] = pivot_frame['date_code'].apply(
        lambda code: int(code.partition('-')[0])
    )

In [None]:
# Positional rename: geography_code -> nuts_id, date_code -> year, and the
# pivoted variable column -> the indicator name (assumes the pivot produced
# exactly one variable column; the assignment raises otherwise). Every row is
# then tagged with the year paired with its frame.
# NOTE(review): nuts_year_spec is presumably the NUTS classification version - confirm.
indicator_columns = ['nuts_id', 'year', 'aps_econ_active_stem_density_data']
for pivot_frame, spec_year in (
    (df_stem_dens_pivot_n10, 2010),
    (df_stem_dens_pivot_n13, 2013),
    (df_stem_dens_pivot_n16, 2016),
):
    pivot_frame.columns = list(indicator_columns)
    pivot_frame['nuts_year_spec'] = [spec_year] * len(pivot_frame)

In [None]:
# Stack the three frames on top of each other and order the rows by their
# index, which is geography_name at this point.
data_stem_dens_pivot = pd.concat(
    [df_stem_dens_pivot_n10, df_stem_dens_pivot_n13, df_stem_dens_pivot_n16]
)
data_stem_dens_pivot.sort_index(inplace=True)

In [None]:
# Fix the output column order and swap the current index for a fresh 0..n-1 one.
output_columns = ['year', 'nuts_id', 'nuts_year_spec', 'aps_econ_active_stem_density_data']
data_stem_dens_pivot = data_stem_dens_pivot.loc[:, output_columns].reset_index(drop=True)

In [None]:
# Discard rows that have no value for the indicator column.
data_stem_dens_pivot = data_stem_dens_pivot.dropna(
    subset=['aps_econ_active_stem_density_data']
)

In [None]:
# Write the NUTS 3 table to CSV, leaving the row index out of the file.
data_stem_dens_pivot.to_csv(
    path_or_buf='../../data/processed/aps/aps_econ_active_stem_density_data.nuts3.csv',
    index=False,
)

#### LEP level

In [None]:
# Path of the raw LEP-level STEM-density JSON file.
data_stem_dens_json_lep = '../../data/raw/stem_dens/nomis_stem_dens_lep-0-25000.json'

# Decode explicitly as UTF-8 (the encoding JSON is specified in) rather than
# relying on the platform-dependent default text encoding of open().
with open(data_stem_dens_json_lep, encoding='utf-8') as f:
    data_stem_dens = json.load(f)

In [None]:
# Build a DataFrame from the parsed LEP JSON payload.
data_stem_dens_lep = pd.DataFrame.from_records(data=data_stem_dens)

In [None]:
# Keep only the rows whose measures_name is 'Variable'; rows for any other
# measure are discarded.
data_stem_dens_lep = data_stem_dens_lep[(data_stem_dens_lep['measures_name']=='Variable')].reset_index(drop=True)

# Pivot so that each (geography_name, geography_code, date_code) combination is
# one row and each variable_name becomes a column holding the mean obs_value.
# `values='obs_value'` limits the aggregation to the observation column: without
# it pandas tries to average every remaining column (including the string column
# measures_name), which newer pandas releases reject with a TypeError.
df_stem_dens_lep_pivot = data_stem_dens_lep.pivot_table(
    index=['geography_name', 'geography_code', 'date_code'],
    columns='variable_name',
    values='obs_value',
    aggfunc='mean',
)


In [None]:
# Turn the three pivot index levels back into ordinary columns, then key the
# frame by geography_name only (geography_code and date_code stay as columns).
df_stem_dens_lep_pivot = (
    df_stem_dens_lep_pivot
    .reset_index()
    .set_index('geography_name')
)

In [None]:
# Replace each date_code string with the integer that precedes its first '-'.
df_stem_dens_lep_pivot['date_code'] = df_stem_dens_lep_pivot['date_code'].apply(
    lambda code: int(code.partition('-')[0])
)

In [None]:
# Positional rename: geography_code -> nuts_id, date_code -> year, and the
# pivoted variable column -> the indicator name (assumes the pivot produced
# exactly one variable column; the assignment raises otherwise).
df_stem_dens_lep_pivot.columns = [
    'nuts_id',
    'year',
    'aps_econ_active_stem_density_data',
]

# Tag every row with 2017.
# NOTE(review): presumably the year of the LEP boundary definition - confirm.
df_stem_dens_lep_pivot['nuts_year_spec'] = [2017] * df_stem_dens_lep_pivot.shape[0]

In [None]:
# Fix the output column order and swap the current index for a fresh 0..n-1 one.
output_columns = ['year', 'nuts_id', 'nuts_year_spec', 'aps_econ_active_stem_density_data']
df_stem_dens_lep_pivot = df_stem_dens_lep_pivot.loc[:, output_columns].reset_index(drop=True)

In [None]:
# Discard rows that have no value for the indicator column.
df_stem_dens_lep_pivot = df_stem_dens_lep_pivot.dropna(
    subset=['aps_econ_active_stem_density_data']
)

In [None]:
# Write the LEP table to CSV, leaving the row index out of the file.
df_stem_dens_lep_pivot.to_csv(
    path_or_buf='../../data/processed/aps/aps_econ_active_stem_density_data.lep.csv',
    index=False,
)

## (Processed) Data Collection

This section shows an example of how to work with the processed datasets when only a single year (e.g. 2018) is of interest.

In [None]:
# Locations of the processed APS tables used in the example below.
# NOTE(review): the 11_11_2019 prefix is presumably the export date - confirm.
# Professional-occupations table:
data_occupations = '../../data/processed/aps/11_11_2019_aps_pro_occupations_data.csv'
# Tertiary-education table:
data_edu = '../../data/processed/aps/11_11_2019_aps_tertiary_education_data.csv'

In [None]:
# Load the education table and key it by geography_name; the trailing bare
# expression displays the frame in the notebook.
df_edu = pd.read_csv(data_edu).set_index('geography_name')
df_edu

In [None]:
# Keep only the rows whose date_code is '2018-12', then display them.
edu_is_2018 = df_edu['date_code'] == '2018-12'
df_edu_2018 = df_edu.loc[edu_is_2018]
df_edu_2018

In [None]:
# Horizontal bar chart of the NVQ4+ column for 2018, sorted in ascending order,
# with one bar per index entry (geography_name).
nvq4_share = df_edu_2018['% with NVQ4+ - aged 16-64'].sort_values(ascending=True)
ax = nvq4_share.plot(kind='barh', figsize=(10,8))
ax.set_ylabel('NUTS2 Region', fontsize=12)
ax.set_xlabel('% of NUTS2 Region Population', fontsize=12)
ax.set_title('Percentage of population in NUTS2 regions with NVQ4+: 2018')

In [None]:
# Load the occupations table and key it by geography_name; the trailing bare
# expression displays the frame in the notebook.
df_occ = pd.read_csv(data_occupations).set_index('geography_name')
df_occ

In [None]:
# Keep only the rows whose date_code is '2018-12' (the result is not displayed).
occ_is_2018 = df_occ['date_code'] == '2018-12'
df_occ_2018 = df_occ.loc[occ_is_2018]

In [None]:
# Stacked horizontal bar chart of the 2018 occupations table, one bar per index
# entry (geography_name).
ax = df_occ_2018.plot(kind='barh', figsize=(10,8), stacked=True)
# Faint dashed guide line at x = 100.
ax.axvline(x=100, linestyle='--', color='grey', alpha=0.3)
ax.set_xlabel('% of NUTS2 Region Population')
ax.set_ylabel('NUTS2 Region')
ax.set_title('Percentage of population in NUTS2 regions in given employment categories: 2018')
# Anchor the legend just beyond the top-right corner of the axes.
ax.legend(bbox_to_anchor=(1.05, 1.05))