# Eurostat Processing

### Tasks

1. Read and process Eurostat data
    1. Write functions to filter tables at the levels we need
    2. Produce coverage figures (country x year x indicator)
    3. Analyse geography and trends
    4. Analyse correlations

## Preamble

In [None]:
%run ../notebook_preamble.ipy

import seaborn as sn
import yaml
import altair as alt
from altair_saver import save
from scipy.stats import zscore
from eis.utils.data_processing import *


In [None]:
# Folder that every figure saved by this notebook is written to
material_outputs = f"{project_dir}/reports/figures/exploratory_paper_v1"


### Visualise indicator dimensions

In [None]:
# Load the indicator dimensions table
dim = pd.read_csv(f"{project_dir}/data/aux/indicator_dimensions.csv")

# The last four characters of 'Temporal coverage' are parsed as the latest year covered
dim['latest_year'] = dim['Temporal coverage'].map(lambda coverage: int(coverage[-4:]))

In [None]:
# Preview the first rows of the indicator dimensions table
dim.head()

In [None]:
# Geographical resolution: number of indicators at each resolution,
# with one facet row per type of data.

a = alt.Chart(dim).mark_bar().encode(
    y=alt.Y('Highest geographical resolution', title=None),
    x=alt.X('Highest geographical resolution', aggregate='count'),
    color='Highest geographical resolution',
).facet(
    # Use the same capitalised labels as the other 'Type of data' facets in this
    # notebook; sort entries that do not match a data value are ignored.
    row=alt.Row('Type of data:O', title=None,
                sort=['Official', 'Institutional', 'Web']))

save(a, f"{material_outputs}/fig_0_geo_dimensions.pdf")

In [None]:
# Display chart `a` as the cell output
a

In [None]:
# Filtering options: number of indicators offering each option,
# with one facet column per type of data.

b = (
    alt.Chart(dim, width=100)
    .mark_bar()
    .encode(
        y=alt.Y('Filtering options'),
        x=alt.X('Filtering options', aggregate='count'),
        color='Filtering options',
    )
    .facet(column=alt.Column('Type of data',
                             sort=['Official', 'Institutional', 'Web']))
)

save(b, f"{material_outputs}/fig_1_sect_dimensions.pdf")

In [None]:
# Display chart `b` as the cell output
b

In [None]:
# Mean of the latest covered year for each type of data
latest_year_groups = dim.groupby('Type of data')['latest_year']
latest_year_groups.mean()

In [None]:
# Mean 'Trustworthiness' and 'Complexity' of the indicators, one facet row per
# type of data. The two columns are folded into long form ('Dimension', 'value')
# so both can be drawn as bars in the same chart.
c = (
    alt.Chart(dim, width=200)
    .transform_fold(['Trustworthiness', 'Complexity'], ['Dimension', 'value'])
    .mark_bar()
    .encode(
        # alt.Y is the channel class for the y position (alt.Key is the `key` channel)
        y=alt.Y('Dimension:N', title=None),
        x=alt.X('value:Q', aggregate='mean'),
        color='Dimension:N',
    )
    .facet(row=alt.Row('Type of data',
                       sort=['Official', 'Institutional', 'Web']))
)

save(c, f"{material_outputs}/fig_3_other_dimensions.pdf")

c

### Eurostat indicator analysis

In [None]:
# Locate the raw Eurostat tables and read which of them were selected

indicator_location = f"{project_dir}/data/raw/eurostat/selected_tables"

# The selection is stored under the 'eurostat_inventory' key of the model config
with open(f"{project_dir}/model_config.yaml", 'r') as config_file:
    model_config = yaml.safe_load(config_file)

selected_indicators = model_config['eurostat_inventory']

In [None]:
# Read every csv file whose name contains one of the selected indicator codes,
# keyed by the part of the file name before the first dot.
indicator_store = {}

for file_name in os.listdir(indicator_location):

    if 'csv' not in file_name:
        continue

    if not any(code in file_name for code in selected_indicators):
        continue

    table_name = file_name.split('.')[0]
    indicator_store[table_name] = pd.read_csv(f"{indicator_location}/{file_name}")

In [None]:
# Filters to apply to each indicator table
with open(f'{project_dir}/data/aux/eis_filters.yaml', 'r') as filters_file:
    all_filters = yaml.safe_load(filters_file)

# Indicator code -> readable name
with open(f'{project_dir}/data/aux/eurostat_clean_names.json', 'r') as names_file:
    es_clean_names = json.load(names_file)

# Indicator code -> category
with open(f'{project_dir}/data/aux/ind_to_category.json', 'r') as categories_file:
    ind_to_cat = json.load(categories_file)



In [None]:
def filter_df(df, filter_dict, var_name, make_concise=True):
    '''
    Filters a df with the keys and values of a filter_dict

    Rows are kept only if, for every key of filter_dict, the value in that
    column is one of the allowed values. The input df is not modified.

    Args:
        df (pandas dataframe) is the dataframe to filter
        filter_dict (dict) is a dict where the keys are filter variables (column names)
            and the values are the allowed values (lists). A single value that is
            not wrapped in a list is treated as a one-element list.
        var_name (str) is the name of the column holding the variable of interest
        make_concise (bool) if True, only returns the 'geo\\time', 'time' and
            var_name columns; otherwise returns all columns

    Returns:
        A new dataframe with the matching rows and a fresh 0..n-1 index

    Raises:
        KeyError if a filter variable (or, when make_concise is True, one of the
        concise columns) is not a column of df

    '''

    filtered = df

    for column, allowed in filter_dict.items():
        # A bare string or scalar would otherwise be iterated character by
        # character (or fail), so wrap it to compare it as a whole value.
        if isinstance(allowed, str) or not hasattr(allowed, '__iter__'):
            allowed = [allowed]

        # Boolean indexing returns a new frame, so df itself is never touched
        filtered = filtered.loc[filtered[column].isin(list(allowed))]

    filtered = filtered.reset_index(drop=True)

    if make_concise:
        return filtered[['geo\\time', 'time', var_name]]

    return filtered

In [None]:
#Build a dict mapping every indicator code to its processed country x year table
filtered_dfs = {}

#Substrings that identify the EU / euro-area aggregate rows
aggregate_markers = ['EU', 'EA']

for indicator in es_clean_names:

    concise = filter_df(indicator_store[indicator], all_filters[indicator], indicator)

    #Countries as rows and years as columns; duplicates are summed and
    #zeros are then turned into missing values
    country_by_year = concise.pivot_table(
        index='geo\\time', columns='time', values=indicator, aggfunc='sum')
    country_by_year = country_by_year.replace(0, np.nan)

    #Drop the EU aggregates
    aggregate_rows = [geo for geo in country_by_year.index
                      if any(marker in geo for marker in aggregate_markers)]
    countries_only = country_by_year.drop(aggregate_rows)

    #Order countries by their mean across years, highest first
    ranking = countries_only.mean(axis=1).sort_values(ascending=False)

    filtered_dfs[indicator] = countries_only.loc[ranking.index]

In [None]:
#Reshape every indicator table to long format and label it

out = []

for indicator, table in filtered_dfs.items():

    #One row per country and year
    long_table = table.reset_index(drop=False).melt(id_vars='geo\\time')
    long_table = long_table.rename(columns={'geo\\time': 'country'})

    #An earlier variant put the parenthesised part of the clean name on its own line:
    #'\n ('.join(es_clean_names[indicator].split(' ('))
    long_table = long_table.assign(
        source=indicator,
        clean_name=es_clean_names[indicator],
        category=ind_to_cat[indicator],
    )

    out.append(long_table)


In [None]:
# Stack all indicators into a single long table
comb = pd.concat(out)

# Keep only the columns that are exported, under their published names
export_columns = ['country', 'time', 'value', 'source']
export_names = {'time': 'year', 'source': 'variable'}
comb_2 = comb[export_columns].rename(columns=export_names)

comb_2.to_csv(f"{project_dir}/data/processed/official_indicators.csv", index=False)

### Visualise results

In [None]:

# Chart heights, one per category, indexed by position in the plotting loop
h = [1000,700,700]

In [None]:
# One faceted heatmap (country x year, coloured by value) per indicator category
for n, cat in enumerate(['At work', 'In education', 'In society']):

    print(cat)

    d = comb.loc[comb['category'] == cat].reset_index(drop=False)

    c = alt.Chart(d).mark_rect().encode(
        # Countries ordered by their mean value, highest first
        y=alt.Y('country:O',
                sort=alt.EncodingSortField('value', op='mean', order='descending')),
        x='time:O',
        color='value:Q',
        facet=alt.Facet(
            'clean_name:N', columns=2, align='none',
            # EncodingSortField expects a plain field name, not a 'field:type'
            # shorthand, so the type suffix is dropped here
            sort=alt.EncodingSortField('time', op='count', order='descending'),
            title=cat),
    ).properties(height=h[n])

    # Give every facet its own x, y and colour scale
    t = c.resolve_scale(x='independent', y='independent', color='independent')

    save(t, f"{material_outputs}/fig_{n+3}_eurostat_{'_'.join(cat.lower().split(' '))}.pdf")

### UN skills data

In [None]:
# Read the comma-separated list of ISO3 country codes
with open(f'{project_dir}/data/aux/eu_iso_3.txt', 'r') as infile:
    raw_codes = infile.read()

# Strip surrounding whitespace so that a trailing newline or uneven spacing
# after the commas cannot leave a code that never matches; skip empty entries.
iso_3 = {code.strip() for code in raw_codes.split(',') if code.strip()}


In [None]:
# Download the UN skills table
un = pd.read_csv('https://opendata.arcgis.com/datasets/43e7742875004583b5eeff0bd01c5a56_0.csv')

# Build both conditions as boolean Series: combining a plain Python list with a
# Series through `&` is a deprecated logical operation in recent pandas.
in_iso_list = un['ISO3'].isin(iso_3)
both_sexes = un['sexDesc'] == 'Both sexes'

# Keep the listed countries and the rows for both sexes combined
un_eu = un.loc[in_iso_list & both_sexes].reset_index(drop=False)

In [None]:
# Yearly value columns, 'value_2010' to 'value_2018'
y_var = [f'value_{year}' for year in range(2010, 2019)]

# Long format: one row per country, skill type and yearly value column
id_columns = ['ISO3', 'typeOfSkillDesc']
un_eu_melt = un_eu[id_columns + y_var].melt(id_vars=id_columns)

In [None]:
# The year is the integer after the last underscore of the column name
un_eu_melt['year'] = [int(column_name.split('_')[-1]) for column_name in un_eu_melt['variable']]

In [None]:
# Left panel: number of non-missing values per country and year
count_sort = alt.EncodingSortField('value', 'count', order='descending')

a = alt.Chart(un_eu_melt.dropna(axis=0, subset=['value']), height=400).mark_rect().encode(
    y=alt.Y('ISO3', sort=count_sort),
    x='year:O',
    color=alt.Color('value', 'count', title='Available indicators in year'),
)

# Right panel: value per country and skill for 2017, both axes ordered by mean value
mean_sort = alt.EncodingSortField('value', 'mean', order='descending')

b = alt.Chart(un_eu_melt, height=400).transform_filter(
    alt.datum.year == 2017
).mark_rect().encode(
    y=alt.Y('ISO3', sort=mean_sort),
    x=alt.X('typeOfSkillDesc', sort=mean_sort, title='Skill'),
    color=alt.Color('value', title='% with skill'),
)

# Place the panels side by side, each with its own colour scale
c = alt.hconcat(a, b).resolve_scale(color='independent')

save(c, f"{material_outputs}/fig_6_un_indicators.pdf")

c