# Eurostat EDA

### Tasks

1. Load indicator spreadsheet and output a couple of tables/figures for the report
    1. Table with key dimensions
    2. Figure comparing various dimensions of the indicators
    3. Table with Eurostat indicators
2. Read and process Eurostat data
    1. Write functions to filter tables at the levels we need
    2. Produce coverage figures (country x year x indicator)
    3. Analyse geography and trends
    4. Analyse correlations

## Preamble

In [None]:
%run ../notebook_preamble.ipy

import seaborn as sn
import yaml
from scipy.stats import zscore
from eis.utils.data_processing import *


In [None]:
# Folder where the tables and figures produced by this notebook are written
material_outputs = f"{project_dir}/reports/figures/exploratory_paper"

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this
# installation provides.
muted_style = ('seaborn-v0_8-muted' if 'seaborn-v0_8-muted' in plt.style.available
               else 'seaborn-muted')
plt.style.use(muted_style)
plt.rc('font', size=12)

## 1. Indicator spreadsheet task

In [None]:
# Load the indicator inventory; cells containing 'TBC' are read as missing
inventory_path = f"{project_dir}/data/aux/eis_indicator_inventory.csv"
ind = pd.read_csv(inventory_path, na_values='TBC')

In [None]:
# Display the column names of the indicator inventory
ind.columns

In [None]:
# Table 1: one row per key dimension, with an empty 'observations' column
my_columns = [
    'category',
    'method_type',
    'temporal_coverage',
    'geographical_coverage',
    'geographical_resolution',
    'trustworthiness',
    'complexity',
]
table = pd.DataFrame({
    'dimension': my_columns,
    'observations': ['' for _ in my_columns],
})
table.to_csv(f'{material_outputs}/table_1.csv', index=False)

### Comparison of indicator dimensions

In [None]:
# Share of each geographical resolution within every method type
# (rows are normalised, so each method type sums to one)
geo_shares = pd.crosstab(ind['method_type'], ind['geographical_resolution'],
                         normalize='index')
ax = geo_shares.plot.barh(figsize=(6, 4))
ax.set_xlabel('% of indicators in category')
ax.legend(bbox_to_anchor=(1, 1))

save_fig('fig_1_geo_resolution.pdf', material_outputs)

In [None]:
# Share of each sectoral resolution within every method type
# (rows are normalised, so each method type sums to one)
sector_shares = pd.crosstab(ind['method_type'], ind['sectoral_resolution'],
                            normalize='index')
ax = sector_shares.plot.barh(figsize=(8, 4))
ax.set_xlabel('% of indicators in category')
ax.legend(bbox_to_anchor=(1.1, 1), title='Sectoral resolution')

save_fig('fig_2_sectoral_resolution.pdf', material_outputs)

In [None]:
def _last_covered_year(coverage):
    '''
    Returns the last year named in a temporal coverage entry

    Args:
        coverage is a single year or a comma-separated list of years; it may be
            a string, a number or missing

    Returns:
        The final year as an int, or NaN when the entry is missing
    '''
    # Missing entries (e.g. 'TBC' cells read as NaN) stay missing instead of
    # raising when tested for a comma
    if pd.isnull(coverage):
        return float('nan')

    # Take the final comma-separated entry; going through float() also
    # accepts years that pandas parsed as floats (e.g. 2019.0)
    final_entry = str(coverage).split(',')[-1].strip()
    return int(float(final_entry))


# Table 2: mean last covered year by method type (missing years are ignored by the mean)
ind_last_year = ind.assign(last_year=ind['temporal_coverage'].map(_last_covered_year))
mean_last_year = ind_last_year.groupby('method_type')['last_year'].mean()
pd.DataFrame(mean_last_year).T.to_csv(f"{material_outputs}/table_2_mean_last_year.csv")

In [None]:
# Mean trustworthiness and complexity scores for each method type
score_columns = ['trustworthiness', 'complexity']
mean_scores = ind.groupby('method_type')[score_columns].mean()

ax = mean_scores.plot.barh()
ax.set_xlabel('Average score')
ax.legend(bbox_to_anchor=(1, 1))
plt.tight_layout()

save_fig('fig_3_complexity_trustworthiness.pdf', material_outputs)

### Eurostat indicator analysis

In [None]:
# Use the reversed 'Purples' palette as seaborn's default colour palette

sn.set_palette('Purples_r')

In [None]:
# Table 3: indicators whose source is Eurostat, ordered by category
from_eurostat = ind['source'] == 'Eurostat'
es_indicator_table = ind.loc[from_eurostat, ['category', 'indicator', 'description']]
es_indicator_table.sort_values('category').to_csv(
    f"{material_outputs}/table_3_es_indicators.csv", index=False)

In [None]:
# Read every file whose name contains 'csv' in the selected-tables folder.
# Each table is stored under its file name up to the first dot.
indicator_location = f"{project_dir}/data/raw/eurostat/selected_tables"

indicator_store = {
    file_name.split('.')[0]: pd.read_csv(f"{indicator_location}/{file_name}")
    for file_name in os.listdir(indicator_location)
    if 'csv' in file_name
}

In [None]:
# Filters to apply to each Eurostat table (YAML) and the readable names
# used to label each table (JSON)
filters_path = f'{project_dir}/data/aux/eis_filters.yaml'
clean_names_path = f'{project_dir}/data/aux/eurostat_clean_names.json'

with open(filters_path, 'r') as filters_file:
    all_filters = yaml.safe_load(filters_file)

with open(clean_names_path, 'r') as names_file:
    es_clean_names = json.load(names_file)


In [None]:
def filter_df(df,filter_dict,var_name,make_concise=True):
    '''
    Keeps the rows of df whose values match every entry of filter_dict

    Args:
        df (pandas dataframe) is the table to filter; it is not modified
        filter_dict (dict) is a dict where the keys are the columns to filter on and
            the values are the values to keep in that column. Values are normally
            lists; a single scalar (e.g. one string) is treated as a one-element list
        var_name (str) is the name of the column that holds the variable of interest
        make_concise (bool) if true, only returns the 'geo\\time', 'time' and
            var_name columns

    Returns:
        The filtered dataframe, with its index reset to 0..n-1

    '''

    filtered = df

    for column,allowed in filter_dict.items():
        # A bare string is iterable, so isin would reject it and an `in` test
        # would do substring matching; wrap scalars so they mean "exactly this value"
        if isinstance(allowed,(str,bytes)) or not hasattr(allowed,'__iter__'):
            allowed = [allowed]

        # Boolean indexing returns a new frame, so the caller's df stays untouched
        filtered = filtered.loc[filtered[column].isin(list(allowed))]

    filtered = filtered.reset_index(drop=True)

    if make_concise:
        return filtered[['geo\\time','time',var_name]]

    return filtered

In [None]:
# Framework category of each Eurostat table, given in the same order as the
# tables appear in es_clean_names
framework_category = [
    "supply",
    "supply",
    "supply",
    "industrial_skill_base",
    "demand",
    "demand",
    "demand",
    "supply",
    "industrial_skill_base",
    "industrial_skill_base",
    "industrial_skill_base",
]

In [None]:
# Build one country x year table per Eurostat indicator, keyed by table code
filtered_dfs = {}

for table_id in es_clean_names:

    # Keep only the rows selected by this table's filters
    concise = filter_df(indicator_store[table_id], all_filters[table_id], table_id)

    # Countries as rows and years as columns; zeros are recoded as missing
    wide = concise.pivot_table(index='geo\\time', columns='time',
                               values=table_id, aggfunc='sum')
    wide = wide.replace(0, np.nan)

    # Drop aggregates: every geo code that contains 'EU' or 'EA'
    aggregate_codes = [code for code in wide.index if 'EU' in code or 'EA' in code]
    countries_only = wide.drop(aggregate_codes)

    # Order countries by their mean value across years, highest first
    ranking = countries_only.mean(axis=1).sort_values(ascending=False).index

    filtered_dfs[table_id] = countries_only.loc[ranking]

In [None]:
def multiplot(rows,cols,dfs,titles,figsize=(12,16)):
    '''
    Draws one heatmap per dataframe on a rows x cols grid of subplots

    Args:
        rows (int) is the number of subplot rows
        cols (int) is the number of subplot columns
        dfs (list) is a list of dataframes, drawn left to right and then top to bottom
        titles (list) is a list with one title per dataframe. A line break is
            inserted before every '(' in a title
        figsize (tuple) is the figure size passed to plt.subplots

    Raises:
        ValueError if there are more dataframes than panels in the grid

    '''

    if len(dfs)>rows*cols:
        raise ValueError(
            f'{len(dfs)} dataframes do not fit in a {rows}x{cols} grid')

    # squeeze=False always returns a 2-d array of axes, so grids with a single
    # row or column work the same way as larger ones
    fig,ax = plt.subplots(figsize=figsize,nrows=rows,ncols=cols,squeeze=False)

    # Flattening walks the grid row by row, using every column
    panels = list(ax.flat)

    for n,(panel,df) in enumerate(zip(panels,dfs)):
        sn.heatmap(df,ax=panel,cmap='Purples')
        panel.set_title('\n ('.join(titles[n].split('(')))

        # One tick per row, centred on its heatmap cell
        panel.set_yticks(np.arange(len(df))+0.5)
        panel.set_yticklabels(df.index,rotation=0,size=10)

        panel.set_ylabel('')

    # Hide every panel that was left without a heatmap
    for panel in panels[len(dfs):]:
        panel.set_axis_off()

In [None]:
# Group the indicator codes by framework category. framework_category is
# matched by position with the keys of filtered_dfs. Categories are compared
# for equality rather than with a substring test, so one category name
# contained in another cannot be matched by mistake.
classified_indicators = [
    [code for code, category in zip(filtered_dfs.keys(), framework_category)
     if category == target]
    for target in ['supply', 'demand', 'industrial_skill_base']
]

In [None]:
st = 0

# One figure per group of indicators: a 2x2 grid of country x year heatmaps
for group_number, codes in enumerate(classified_indicators):

    group_tables = []
    group_titles = []
    for code in codes:
        group_tables.append(filtered_dfs[code])
        group_titles.append(es_clean_names[code])

    multiplot(2, 2, group_tables, group_titles, figsize=(10, 15))
    plt.tight_layout()

    save_fig(f"fig_4_{group_number}_time_country_coverage.pdf", material_outputs)

#### Correlation analysis

In [None]:
# Reshape every indicator table to long form indexed by (country, year) ...
df_long_list = [
    table.reset_index(drop=False)
         .melt(id_vars=['geo\\time'], var_name='year', value_name=code)
         .set_index(['geo\\time', 'year'])
    for code, table in filtered_dfs.items()
]

# ... then align them side by side so that each indicator becomes one column
es_merged = pd.concat(df_long_list, axis=1).reset_index(drop=False)

In [None]:
# Yearly correlation matrices for 2010-2019:
#   geo_corrs   - between indicators, computed across countries
#   count_corrs - between countries (Spearman), computed across z-scored indicators
geo_corrs = {}
count_corrs = {}

for year in np.arange(2010, 2020):

    # Countries as rows and indicators as columns for this year
    yearly = (es_merged.loc[es_merged['year'] == year]
              .drop('year', axis=1)
              .set_index('geo\\time'))

    geo_corrs[year] = yearly.corr()

    # Standardise each indicator, ignoring missing values, before comparing countries
    standardised = yearly.apply(lambda column: zscore(column, nan_policy='omit'))
    count_corrs[year] = standardised.T.corr(method='spearman')

In [None]:
# Empty indicator x indicator table that will hold the correlations averaged
# over years; the 'isoc_ske_fct' table is left out
indicator_focus = [code for code in filtered_dfs if code != 'isoc_ske_fct']

indicator_corr = pd.DataFrame(index=indicator_focus, columns=indicator_focus)

In [None]:
# Fill the table with the mean, over years, of every pairwise correlation
for v in indicator_focus:

    for w in indicator_focus:

        if v == w:
            # An indicator is perfectly correlated with itself
            indicator_corr.loc[v,w]=1
            continue

        # Correlation of this pair in each year, dropping years where it is undefined
        yearly_values = [float(corr.loc[v,w]) for corr in geo_corrs.values()]
        valid_values = [value for value in yearly_values if not pd.isnull(value)]

        # Built-in float replaces the np.float alias, which no longer exists in
        # recent NumPy; the mean is now computed once per pair
        indicator_corr.loc[v,w] = float(np.mean(valid_values)) if valid_values else np.nan
    

In [None]:
# Cast the table to floats and label its rows and columns with readable names
indicator_corr = indicator_corr.astype(float)

indicator_corr.columns = [es_clean_names[code] for code in indicator_corr.columns]
indicator_corr.index = [es_clean_names[code] for code in indicator_corr.index]

In [None]:
# Clustered heatmap of the averaged correlations between indicators
sn.clustermap(data=indicator_corr, cmap='coolwarm')
plt.tight_layout()

save_fig('fig_5_indicator_correlation.pdf', material_outputs)

In [None]:
# Stack all indicators into a single long table with one row per
# country, year and variable
long_frames = []
for indicator_long in df_long_list:
    flat = indicator_long.reset_index(drop=False)
    long_frames.append(flat.melt(id_vars=['geo\\time', 'year']))

melt = pd.concat(long_frames).rename(columns={'geo\\time': 'country'})

In [None]:
# Write the long table of official indicators to the interim data folder
official_indicators_path = f"{project_dir}/data/interim/official_indicators.csv"
melt.to_csv(official_indicators_path, index=False)