# EDA categorical on PWDB

## Import libraries

In [583]:
import pathlib as path
import pandas as pd
import plotly.express as px
from collections import  Counter
from IPython.display import display, Markdown
import statsmodels.api as sm
import numpy as np
import scipy.stats as stats

from tqdm import tqdm

## Define constants

In [584]:
LOCAL_FOLDER = path.Path("/mnt/c/Users/Professional/Desktop/Works/MEANING/Sem-Covid19/data/")

SERVER_FOLDER  = path.Path("/home/jovyan/data/")

# Pick the first data folder that exists, preferring the server one.
# WORK_DIR stays an empty string when neither folder is available.
WORK_DIR = ""
for _candidate, _message in (
    (SERVER_FOLDER, "Work with distant directory."),
    (LOCAL_FOLDER, "Work with local directory."),
):
    if _candidate.exists():
        WORK_DIR = _candidate
        print(_message)
        break
else:
    print("ERROR: Invalid directory!")

Work with distant directory.


In [585]:
# Names of the two available dataset files inside WORK_DIR.
SRC_JSON_FILE_NAME = "covid19db.json"
SRC_PICKLE_FILE_NAME = "pwdb_prepared.pkl"

# File that is actually loaded by this notebook (the JSON one).
SRC_FILE_NAME = SRC_JSON_FILE_NAME

# Full path of the source file; uses the `/` operator on WORK_DIR.
SRC_FILE_PATH = WORK_DIR / SRC_FILE_NAME

# Columns analysed as categorical data.
CATEGORICAL_COLUMNS = [
    "country",
    "category",
    "subcategory",
    "actors",
    "target_groups",
    "Businesses",
    "Citizens",
    "Workers",
    "funding",
]

# Target groups aggregated under the "Businesses" label.
BUSINESSES = {
    "Companies providing essential services",
    "Contractors of a company",
    "Larger corporations",
    "One person or microenterprises",
    "Other businesses",
    "SMEs",
    "Sector specific set of companies",
    "Solo-self-employed",
    "Start-ups",
}

# Target groups aggregated under the "Citizens" label.
CITIZENS = {
    "Children (minors)",
    "Disabled",
    "Migrants",
    "Older citizens",
    "Other groups of citizens",
    "Parents",
    "People in care facilities",
    "Refugees",
    "Single parents",
    "The COVID-19 risk group",
    "Women",
    "Youth (18-25)",
}

# Target groups aggregated under the "Workers" label.
WORKERS = {
    "Cross-border commuters",
    "Disabled workers",
    "Employees in standard employment",
    "Female workers",
    "Migrants in employment",
    "Older people in employment (aged 55+)",
    "Other groups of workers",
    "Parents in employment",
    "Particular professions",
    "Platform workers",
    "Posted workers",
    "Refugees in employment",
    "Seasonal workers",
    "Self-employed",
    "Single parents in employment",
    "The COVID-19 risk group at the workplace",
    "Undeclared workers",
    "Unemployed",
    "Workers in care facilities",
    "Workers in essential services",
    "Workers in non-standard forms of employment",
    "Youth (18-25) in employment",
}

## Load PWDB dataset

In [586]:
# Load the PWDB dataset from the JSON source file; when the file is missing
# only a message is printed and `df` is left undefined.
# (For the pickle source, pd.read_pickle(SRC_FILE_PATH) would be used instead.)
if not SRC_FILE_PATH.exists():
    print("Source path is invalid!")
else:
    df = pd.read_json(SRC_FILE_PATH)


## Function to transform target_group

In [587]:
def target_group_refactoring(pwdb_dataframe: pd.DataFrame,target_group_column_name: str = 'target_groups') -> pd.DataFrame:
    """
        The target group available in the original dataset is very granular. For the purpose of this exercise
        we would benefit from aggregating the target groups into a more generic sets. As a result we will obtain
        target groups on two levels: L1, L2.
        L1: workers, businesses, citizens
        L2: the original set of categories

        For every L1 group a column named 'Businesses', 'Citizens' or 'Workers' is added. Its values are
        the strings '<column>_True' / '<column>_False', telling whether at least one of the row's target
        groups belongs to that L1 group.

        :param pwdb_dataframe: the dataset to extend; it is modified in place
        :param target_group_column_name: name of the column holding the collection of L2 target groups
        :return: the given dataset (same object) with the three extra L1 columns
    """
    def _targets_any(groups, class_set) -> bool:
        # A missing cell (None or a NaN float) lists no target group at all.
        if groups is None or isinstance(groups, float):
            return False
        return any(item in class_set for item in groups)

    new_columns = {'Businesses': BUSINESSES, 'Citizens': CITIZENS, 'Workers': WORKERS}
    target_groups = pwdb_dataframe[target_group_column_name]
    for column, class_set in new_columns.items():
        membership = target_groups.apply(lambda groups: _targets_any(groups, class_set))
        # Assign the labelled column directly: an in-place replace() on
        # `pwdb_dataframe[column]` works on a temporary object and is not
        # guaranteed to reach the dataframe (it does not under Copy-on-Write).
        pwdb_dataframe[column] = membership.map({True: column + "_True", False: column + "_False"})
    return pwdb_dataframe

## Transform DataFrame to target_group with L1/L2

In [588]:
# Add the 'Businesses', 'Citizens' and 'Workers' indicator columns to the loaded dataset.
df = target_group_refactoring(df)


## Function for plotting a bar chart of the observations

In [589]:
def plot_bar_chart(observations: pd.DataFrame,chart_title: str):
    """
        Build a horizontal-style bar chart from a two-column observation table.

        :param observations: table whose first column holds the labels and second column the values
        :param chart_title: title shown on the chart
        :return: the figure produced by px.bar (second column on x, first column on y)
    """
    value_column = observations.columns[1]
    label_column = observations.columns[0]
    return px.bar(observations, x=value_column, y=label_column, title=chart_title)

## Function for plotting a pie chart of the observations

In [590]:
def plot_pie_chart(observations: pd.DataFrame,chart_title: str):
    """
        Build a pie chart from a two-column observation table.

        :param observations: table whose first column holds the slice names and second column the slice values
        :param chart_title: title shown on the chart
        :return: the figure produced by px.pie
    """
    value_column = observations.columns[1]
    name_column = observations.columns[0]
    return px.pie(observations, values=value_column, names=name_column, title=chart_title)

## Function for making observations on categorical data

In [591]:
def calc_freq_categorical_data(data : pd.Series,title : str ,relative : bool = False ) -> pd.DataFrame:
    """
        Compute the frequency of every distinct value of a categorical series.

        Missing values are ignored. The given series itself is left untouched.

        :param data: the categorical values to count
        :param title: name given to the column holding the distinct values
        :param relative: when True the frequencies are percentages (rounded to 2 decimals)
                         of the non-missing values, otherwise absolute counts
        :return: a dataframe with the columns [title, 'Absolute freq' | 'Relative freq'],
                 ordered from the most to the least frequent value
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # dropna() returns a new series, so the caller's data is not modified.
    present_values = data.dropna()
    observation = pd.DataFrame(Counter(present_values).most_common(),columns=[title,observation_type_name])
    # With no values at all there is nothing to normalise (and the total would be 0).
    if relative and not observation.empty:
        total = observation[observation_type_name].sum()
        observation[observation_type_name] = (observation[observation_type_name] / (total / 100)).round(2)
    return observation

## Function for making observations on missing data

In [592]:
def calc_freq_missing_data(data : pd.DataFrame,relative : bool = False) -> pd.DataFrame:
    """
        Count the missing values of every column of a dataframe.

        Every column is exploded before counting, so the count is made on the exploded rows.
        NOTE(review): pandas documents that explode() turns an empty list into a missing row,
        so empty lists are presumably counted as missing as well - confirm.

        :param data: the dataframe to inspect
        :param relative: when True the count is expressed as a percentage (rounded to 2 decimals)
                         of the exploded column length, otherwise as an absolute count
        :return: a dataframe with one row per column that has at least one missing value;
                 the column names are moved out of the index by reset_index() and the
                 frequencies are in 'Absolute freq' | 'Relative freq'
    """
    observation_type_name = 'Absolute freq' if not relative else 'Relative freq'
    columns = data.columns
    # Collects one frequency per column name.
    tmp = pd.Series(dtype=object)
    for column in columns:
        series_tmp = data[column].explode()
        tmp[column]= series_tmp.isnull().sum()
        if relative:
            # Turn the count into a percentage of the exploded length.
            tmp[column]/=series_tmp.size/100
            tmp[column]=round(tmp[column],2)
    # Keep only the columns that really have missing values.
    observation = pd.DataFrame(tmp[tmp>0] ,columns=[observation_type_name])
    # Expose the column names as a regular column instead of the index.
    observation.reset_index(inplace=True)
    return observation

## EDA on categorical data from loaded dataset

In [593]:
def fast_categorical_analyze(data : pd.DataFrame,data_title : str = 'Unknown'):
    """
        Run a quick exploratory analysis of the categorical columns of a dataset.

        First the missing values of all columns are displayed (table and, when there are any,
        pie chart). Then, for every column of CATEGORICAL_COLUMNS, the relative frequencies of
        its exploded values are computed and the 10 most frequent ones are displayed as a table,
        a bar chart and a pie chart.

        :param data: the dataset to analyse; it must contain all CATEGORICAL_COLUMNS
        :param data_title: name of the dataset, used in the title of the missing values chart
        :return: a dict mapping each analysed column name to its complete relative frequency table
    """
    results = {}
    abs_miss_obs = calc_freq_missing_data(data)
    display(abs_miss_obs)

    if abs_miss_obs.size>0:
        plot_pie_chart(abs_miss_obs,data_title+' missing values').show()
    data = data[CATEGORICAL_COLUMNS]
    for column_name in data.columns:
        data_column = data[column_name].explode()
        try:
            rel_obs = calc_freq_categorical_data(data_column,column_name,True)
            # The complete table is kept; only the top 10 rows are shown.
            results[column_name] = rel_obs
            rel_obs = rel_obs.head(10)
            display(rel_obs)
            plot_bar_chart(rel_obs,column_name).show()
            plot_pie_chart(rel_obs,column_name).show()
        except Exception as error:
            # Best effort: a faulty column must not stop the analysis of the others.
            # `Exception` (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
            print('Observation on [',column_name,'] fault!')
            print('Check if column [',column_name,'] have compatible type!')
            print('Reason:', repr(error))
    return results

# Analyse the whole dataset and keep the relative frequency table of every categorical column.
eda_result = fast_categorical_analyze(df,"PWDB Dataset")




Unnamed: 0,index,Absolute freq
0,end_date,509
1,social_partner_form,1171
2,social_partner_role,1171
3,target_groups,303
4,sectors,729
5,occupations,1056
6,sources,1


Unnamed: 0,country,Relative freq
0,Spain,6.66
1,Italy,5.12
2,Greece,5.04
3,Germany,4.95
4,Austria,4.87
5,Portugal,4.53
6,France,4.01
7,Croatia,3.67
8,Lithuania,3.5
9,Norway,3.5


Unnamed: 0,category,Relative freq
0,Supporting businesses to stay afloat,28.1
1,"Protection of workers, adaptation of workplace",14.35
2,Income protection beyond short-time work,12.21
3,"Promoting the economic, labour market and soci...",10.16
4,Ensuring business continuity and support for e...,9.65
5,Employment protection and retention,9.39
6,Measures to prevent social hardship,7.09
7,Reorientation of business activities,5.72
8,Supporting businesses to get back to normal,3.33


Unnamed: 0,subcategory,Relative freq
0,Direct subsidies (full or partial),12.89
1,Access to finance,7.6
2,Income support for people in employment (e.g. ...,7.26
3,Other,5.81
4,Deferral of payments or liabilities,5.55
5,Occupational health and safety,4.95
6,Extensions of income support to workers not c...,4.78
7,Change of production/innovation,4.61
8,"Active labour market policies, incl. subsidise...",4.61
9,"Teleworking arrangements, remote working",4.36


Unnamed: 0,actors,Relative freq
0,National government,38.36
1,Company / Companies,18.97
2,Trade unions,7.9
3,Employers' organisations,7.29
4,Social partners jointly,6.45
5,Local / regional government,4.91
6,Public employment service,4.53
7,Social insurance,3.83
8,Other social actors (e.g. NGOs),3.36
9,Public support service providers,2.57


Unnamed: 0,target_groups,Relative freq
0,Employees in standard employment,15.97
1,Sector specific set of companies,14.02
2,Particular professions,6.67
3,Self-employed,6.6
4,SMEs,6.13
5,Unemployed,4.78
6,One person or microenterprises,4.38
7,Other groups of workers,3.17
8,Solo-self-employed,3.03
9,Workers in non-standard forms of employment,2.83


Unnamed: 0,Businesses,Relative freq
0,Businesses_False,65.67
1,Businesses_True,34.33


Unnamed: 0,Citizens,Relative freq
0,Citizens_False,92.66
1,Citizens_True,7.34


Unnamed: 0,Workers,Relative freq
0,Workers_False,52.35
1,Workers_True,47.65


Unnamed: 0,funding,Relative freq
0,National funds,46.41
1,No special funding required,19.45
2,Companies,10.48
3,European Funds,8.69
4,Employer,4.48
5,Regional funds,2.9
6,Other,1.79
7,Employers organisation,1.72
8,Local funds,1.66
9,Employees,1.24


## Analysis and visualization:
- Z score.
- cumulative frequencies
- difference in neighboring frequencies

In [594]:
# For every analysed column: add the z-score, the cumulative frequency and the
# difference with the previous frequency, then display the table and three bar charts.
for key, observation in eda_result.items():
    data = observation.copy()
    label_column = data.columns[0]
    column_name = data.columns[1]
    frequencies = data[column_name]
    std_deviation = frequencies.std()

    data[label_column + '_z_score'] = ((frequencies - frequencies.mean()) / std_deviation).round(2)
    data['Cumulative freq'] = frequencies.cumsum()
    data['Diff freq'] = frequencies.diff()

    display(Markdown(f"Std deviation for [{key}] is [{round(std_deviation,2)}]"))
    display(data)

    # Columns 2, 3 and 4 are the three columns added above.
    z_score_axis, cumulative_axis, diff_axis = data.columns[2], data.columns[3], data.columns[4]
    px.bar(data, x=z_score_axis, y=label_column).show()
    px.bar(data, x=label_column, y=cumulative_axis).show()
    px.bar(data, x=label_column, y=diff_axis).show()

Std deviation for [country] is [1.12]

Unnamed: 0,country,Relative freq,country_z_score,Cumulative freq,Diff freq
0,Spain,6.66,2.96,6.66,
1,Italy,5.12,1.59,11.78,-1.54
2,Greece,5.04,1.52,16.82,-0.08
3,Germany,4.95,1.44,21.77,-0.09
4,Austria,4.87,1.37,26.64,-0.08
5,Portugal,4.53,1.06,31.17,-0.34
6,France,4.01,0.6,35.18,-0.52
7,Croatia,3.67,0.3,38.85,-0.34
8,Lithuania,3.5,0.15,42.35,-0.17
9,Norway,3.5,0.15,45.85,0.0


Std deviation for [category] is [7.18]

Unnamed: 0,category,Relative freq,category_z_score,Cumulative freq,Diff freq
0,Supporting businesses to stay afloat,28.1,2.37,28.1,
1,"Protection of workers, adaptation of workplace",14.35,0.45,42.45,-13.75
2,Income protection beyond short-time work,12.21,0.15,54.66,-2.14
3,"Promoting the economic, labour market and soci...",10.16,-0.13,64.82,-2.05
4,Ensuring business continuity and support for e...,9.65,-0.2,74.47,-0.51
5,Employment protection and retention,9.39,-0.24,83.86,-0.26
6,Measures to prevent social hardship,7.09,-0.56,90.95,-2.3
7,Reorientation of business activities,5.72,-0.75,96.67,-1.37
8,Supporting businesses to get back to normal,3.33,-1.08,100.0,-2.39


Std deviation for [subcategory] is [2.66]

Unnamed: 0,subcategory,Relative freq,subcategory_z_score,Cumulative freq,Diff freq
0,Direct subsidies (full or partial),12.89,3.8,12.89,
1,Access to finance,7.6,1.81,20.49,-5.29
2,Income support for people in employment (e.g. ...,7.26,1.69,27.75,-0.34
3,Other,5.81,1.14,33.56,-1.45
4,Deferral of payments or liabilities,5.55,1.04,39.11,-0.26
5,Occupational health and safety,4.95,0.82,44.06,-0.6
6,Extensions of income support to workers not c...,4.78,0.75,48.84,-0.17
7,Change of production/innovation,4.61,0.69,53.45,-0.17
8,"Active labour market policies, incl. subsidise...",4.61,0.69,58.06,0.0
9,"Teleworking arrangements, remote working",4.36,0.6,62.42,-0.25


Std deviation for [actors] is [10.59]

Unnamed: 0,actors,Relative freq,actors_z_score,Cumulative freq,Diff freq
0,National government,38.36,2.83,38.36,
1,Company / Companies,18.97,1.0,57.33,-19.39
2,Trade unions,7.9,-0.04,65.23,-11.07
3,Employers' organisations,7.29,-0.1,72.52,-0.61
4,Social partners jointly,6.45,-0.18,78.97,-0.84
5,Local / regional government,4.91,-0.32,83.88,-1.54
6,Public employment service,4.53,-0.36,88.41,-0.38
7,Social insurance,3.83,-0.42,92.24,-0.7
8,Other social actors (e.g. NGOs),3.36,-0.47,95.6,-0.47
9,Public support service providers,2.57,-0.54,98.17,-0.79


Std deviation for [target_groups] is [3.37]

Unnamed: 0,target_groups,Relative freq,target_groups_z_score,Cumulative freq,Diff freq
0,Employees in standard employment,15.97,4.03,15.97,
1,Sector specific set of companies,14.02,3.45,29.99,-1.95
2,Particular professions,6.67,1.27,36.66,-7.35
3,Self-employed,6.6,1.25,43.26,-0.07
4,SMEs,6.13,1.11,49.39,-0.47
5,Unemployed,4.78,0.71,54.17,-1.35
6,One person or microenterprises,4.38,0.59,58.55,-0.4
7,Other groups of workers,3.17,0.23,61.72,-1.21
8,Solo-self-employed,3.03,0.19,64.75,-0.14
9,Workers in non-standard forms of employment,2.83,0.13,67.58,-0.2


Std deviation for [Businesses] is [22.16]

Unnamed: 0,Businesses,Relative freq,Businesses_z_score,Cumulative freq,Diff freq
0,Businesses_False,65.67,0.71,65.67,
1,Businesses_True,34.33,-0.71,100.0,-31.34


Std deviation for [Citizens] is [60.33]

Unnamed: 0,Citizens,Relative freq,Citizens_z_score,Cumulative freq,Diff freq
0,Citizens_False,92.66,0.71,92.66,
1,Citizens_True,7.34,-0.71,100.0,-85.32


Std deviation for [Workers] is [3.32]

Unnamed: 0,Workers,Relative freq,Workers_z_score,Cumulative freq,Diff freq
0,Workers_False,52.35,0.71,52.35,
1,Workers_True,47.65,-0.71,100.0,-4.7


Std deviation for [funding] is [13.23]

Unnamed: 0,funding,Relative freq,funding_z_score,Cumulative freq,Diff freq
0,National funds,46.41,2.88,46.41,
1,No special funding required,19.45,0.84,65.86,-26.96
2,Companies,10.48,0.16,76.34,-8.97
3,European Funds,8.69,0.03,85.03,-1.79
4,Employer,4.48,-0.29,89.51,-4.21
5,Regional funds,2.9,-0.41,92.41,-1.58
6,Other,1.79,-0.49,94.2,-1.11
7,Employers organisation,1.72,-0.5,95.92,-0.07
8,Local funds,1.66,-0.5,97.58,-0.06
9,Employees,1.24,-0.54,98.82,-0.42


## Additional functions for confidence interval analysis:

In [599]:
def confidence_interval_with_mean( series : pd.Series):
    """
        Compute the 95% confidence interval of the mean of a series of proportions.

        The interval is mean +/- 1.96 * standard error, expressed in percent,
        rounded to 2 decimals and clamped to the range [0, 100].

        :param series: the proportions (fractions, not percentages)
        :return: the list [left_limit, right_limit] in percent
    """
    # The computation uses the `series` argument only; it must not depend on
    # notebook-level variables.
    z = 1.96
    se = series.std()/np.sqrt(series.size)
    mean = series.mean()
    left_limit = max(round(100*(mean - z*se),2),0)
    right_limit = min(round(100*(mean + z*se),2),100)
    return [left_limit,right_limit]

In [596]:
def confidence_interval_for_proportion( series : pd.Series, sample_size=None):
    """
        Compute a confidence interval for every proportion of a series.

        :param series: the proportions (fractions, not percentages)
        :param sample_size: number of observations behind the proportions; when omitted,
                            the notebook-level variable `n` is used, so existing calls
                            keep their behaviour (a NameError is raised if `n` is not defined)
        :return: a list with one closed pd.Interval per proportion, limits in percent
                 rounded to 2 decimals
    """
    # The proportions come from the `series` argument, not from a notebook variable.
    observations = n if sample_size is None else sample_size
    limits = [list(sm.stats.proportion_confint(observations*p, observations)) for p in series]
    if not limits:
        return []
    limits = pd.DataFrame(limits).apply(lambda x: round(100*x,2))
    return [pd.Interval(low,high,closed='both') for low,high in zip(limits[0],limits[1])]

In [597]:
def z_score_for_series(series : pd.Series):
    """
        Compute the z-score of every value of a series.

        :param series: the numeric values
        :return: a new series (default integer index) with the z-scores rounded to 2 decimals
    """
    # The scores are computed from the `series` argument, not from a notebook variable.
    return pd.Series(stats.zscore(series)).round(2)

## Analysis of confidence intervals:
- calculating the confidence interval for each column
- calculating the confidence interval for each proportion from column
- calculating the records from column, that are overrepresented
- calculating the records from column, that are underrepresented

In [600]:
# For every analysed column: compute the confidence interval of the mean
# frequency, the interval and z-score of every record, then split the records
# into over-, normally and under-represented ones.
rel_f = 'Relative freq'
for key, observation in eda_result.items():
    data = observation.copy()
    # NOTE(review): DataFrame.size is rows x columns (cells), not the number of
    # rows or of observations - confirm this is the intended sample size.
    n = data.size
    # Relative frequencies as fractions instead of percentages.
    tmp_s = data[data.columns[1]] / 100

    ci_mean = confidence_interval_with_mean(tmp_s)
    lower_limit, upper_limit = ci_mean[0], ci_mean[1]
    display(Markdown(f"Confidence Interval for {key} is : [{lower_limit}%, {upper_limit}%]"))

    data["Confidence Interval"] = confidence_interval_for_proportion(tmp_s)
    data["z_score"] = z_score_for_series(tmp_s)
    display(data)

    frequencies = data[rel_f]
    sections = (
        (f"Overrepresented records from column : {key}", frequencies > upper_limit),
        (f"Normal represented records from column : {key}", (frequencies >= lower_limit) & (frequencies <= upper_limit)),
        (f"Underrepresented records from column : {key}", frequencies < lower_limit),
    )
    for heading, selection in sections:
        display(Markdown(heading))
        display(data.loc[selection])


Confidence Interval for country is : [2.93%, 3.74%]

Unnamed: 0,country,Relative freq,Confidence Interval,z_score
0,Spain,6.66,"[0.35, 12.97]",3.01
1,Italy,5.12,"[0.0, 10.7]",1.62
2,Greece,5.04,"[0.0, 10.58]",1.54
3,Germany,4.95,"[0.0, 10.44]",1.46
4,Austria,4.87,"[0.0, 10.32]",1.39
5,Portugal,4.53,"[0.0, 9.79]",1.08
6,France,4.01,"[0.0, 8.97]",0.61
7,Croatia,3.67,"[0.0, 8.43]",0.31
8,Lithuania,3.5,"[0.0, 8.15]",0.15
9,Norway,3.5,"[0.0, 8.15]",0.15


Overrepresented records from column : country

Unnamed: 0,country,Relative freq,Confidence Interval,z_score
0,Spain,6.66,"[0.35, 12.97]",3.01
1,Italy,5.12,"[0.0, 10.7]",1.62
2,Greece,5.04,"[0.0, 10.58]",1.54
3,Germany,4.95,"[0.0, 10.44]",1.46
4,Austria,4.87,"[0.0, 10.32]",1.39
5,Portugal,4.53,"[0.0, 9.79]",1.08
6,France,4.01,"[0.0, 8.97]",0.61


Normal represented records from column : country

Unnamed: 0,country,Relative freq,Confidence Interval,z_score
7,Croatia,3.67,"[0.0, 8.43]",0.31
8,Lithuania,3.5,"[0.0, 8.15]",0.15
9,Norway,3.5,"[0.0, 8.15]",0.15
10,Czechia,3.42,"[0.0, 8.02]",0.08
11,Belgium,3.33,"[0.0, 7.87]",-0.0
12,Finland,3.16,"[0.0, 7.59]",-0.16
13,Netherlands,3.16,"[0.0, 7.59]",-0.16
14,Malta,3.07,"[0.0, 7.43]",-0.24
15,Luxembourg,3.07,"[0.0, 7.43]",-0.24
16,Poland,3.07,"[0.0, 7.43]",-0.24


Underrepresented records from column : country

Unnamed: 0,country,Relative freq,Confidence Interval,z_score
17,Cyprus,2.9,"[0.0, 7.15]",-0.39
18,Slovenia,2.9,"[0.0, 7.15]",-0.39
19,Slovakia,2.82,"[0.0, 7.01]",-0.46
20,Latvia,2.82,"[0.0, 7.01]",-0.46
21,Denmark,2.82,"[0.0, 7.01]",-0.46
22,Romania,2.65,"[0.0, 6.71]",-0.62
23,United Kingdom,2.56,"[0.0, 6.56]",-0.7
24,Sweden,2.39,"[0.0, 6.25]",-0.85
25,Bulgaria,2.22,"[0.0, 5.95]",-1.01
26,Hungary,2.13,"[0.0, 5.78]",-1.09


Confidence Interval for category is : [6.42%, 15.8%]

Unnamed: 0,category,Relative freq,Confidence Interval,z_score
0,Supporting businesses to stay afloat,28.1,"[7.34, 48.86]",2.51
1,"Protection of workers, adaptation of workplace",14.35,"[0.0, 30.55]",0.48
2,Income protection beyond short-time work,12.21,"[0.0, 27.33]",0.16
3,"Promoting the economic, labour market and soci...",10.16,"[0.0, 24.12]",-0.14
4,Ensuring business continuity and support for e...,9.65,"[0.0, 23.29]",-0.22
5,Employment protection and retention,9.39,"[0.0, 22.87]",-0.25
6,Measures to prevent social hardship,7.09,"[0.0, 18.95]",-0.59
7,Reorientation of business activities,5.72,"[0.0, 16.45]",-0.8
8,Supporting businesses to get back to normal,3.33,"[0.0, 11.62]",-1.15


Overrepresented records from column : category

Unnamed: 0,category,Relative freq,Confidence Interval,z_score
0,Supporting businesses to stay afloat,28.1,"[7.34, 48.86]",2.51


Normal represented records from column : category

Unnamed: 0,category,Relative freq,Confidence Interval,z_score
1,"Protection of workers, adaptation of workplace",14.35,"[0.0, 30.55]",0.48
2,Income protection beyond short-time work,12.21,"[0.0, 27.33]",0.16
3,"Promoting the economic, labour market and soci...",10.16,"[0.0, 24.12]",-0.14
4,Ensuring business continuity and support for e...,9.65,"[0.0, 23.29]",-0.22
5,Employment protection and retention,9.39,"[0.0, 22.87]",-0.25
6,Measures to prevent social hardship,7.09,"[0.0, 18.95]",-0.59


Underrepresented records from column : category

Unnamed: 0,category,Relative freq,Confidence Interval,z_score
7,Reorientation of business activities,5.72,"[0.0, 16.45]",-0.8
8,Supporting businesses to get back to normal,3.33,"[0.0, 11.62]",-1.15


Confidence Interval for subcategory is : [1.91%, 3.65%]

Unnamed: 0,subcategory,Relative freq,Confidence Interval,z_score
0,Direct subsidies (full or partial),12.89,"[5.15, 20.63]",3.86
1,Access to finance,7.6,"[1.48, 13.72]",1.84
2,Income support for people in employment (e.g. ...,7.26,"[1.27, 13.25]",1.71
3,Other,5.81,"[0.41, 11.21]",1.16
4,Deferral of payments or liabilities,5.55,"[0.26, 10.84]",1.06
5,Occupational health and safety,4.95,"[0.0, 9.96]",0.83
6,Extensions of income support to workers not c...,4.78,"[0.0, 9.71]",0.76
7,Change of production/innovation,4.61,"[0.0, 9.45]",0.7
8,"Active labour market policies, incl. subsidise...",4.61,"[0.0, 9.45]",0.7
9,"Teleworking arrangements, remote working",4.36,"[0.0, 9.08]",0.6


Overrepresented records from column : subcategory

Unnamed: 0,subcategory,Relative freq,Confidence Interval,z_score
0,Direct subsidies (full or partial),12.89,"[5.15, 20.63]",3.86
1,Access to finance,7.6,"[1.48, 13.72]",1.84
2,Income support for people in employment (e.g. ...,7.26,"[1.27, 13.25]",1.71
3,Other,5.81,"[0.41, 11.21]",1.16
4,Deferral of payments or liabilities,5.55,"[0.26, 10.84]",1.06
5,Occupational health and safety,4.95,"[0.0, 9.96]",0.83
6,Extensions of income support to workers not c...,4.78,"[0.0, 9.71]",0.76
7,Change of production/innovation,4.61,"[0.0, 9.45]",0.7
8,"Active labour market policies, incl. subsidise...",4.61,"[0.0, 9.45]",0.7
9,"Teleworking arrangements, remote working",4.36,"[0.0, 9.08]",0.6


Normal represented records from column : subcategory

Unnamed: 0,subcategory,Relative freq,Confidence Interval,z_score
10,"Change of work arrangements (working time, rot...",2.99,"[0.0, 6.92]",0.08
11,Support for parents and carers (financial or i...,2.82,"[0.0, 6.64]",0.02
12,Remuneration and rewards for workers in essent...,2.82,"[0.0, 6.64]",0.02
13,Protection of vulnerable groups (beyond employ...,2.73,"[0.0, 6.49]",-0.02
14,"Support for spending, stimulus packages",2.56,"[0.0, 6.21]",-0.08
15,Paid sick leave,2.48,"[0.0, 6.07]",-0.11
16,Well-being of workers,2.13,"[0.0, 5.47]",-0.25
17,Keeping a safe home,2.05,"[0.0, 5.32]",-0.28
18,Income support for unemployed,1.96,"[0.0, 5.16]",-0.31


Underrepresented records from column : subcategory

Unnamed: 0,subcategory,Relative freq,Confidence Interval,z_score
19,Mobilisation of a larger workforce,1.79,"[0.0, 4.85]",-0.38
20,Smoothing frictions or reallocation of workers,1.62,"[0.0, 4.54]",-0.44
21,Enhancing employability and training,1.54,"[0.0, 4.38]",-0.47
22,Flexibilisation and security,1.28,"[0.0, 3.88]",-0.57
23,Measures to support a gradual relaunch of work,1.28,"[0.0, 3.88]",-0.57
24,Changes of working hours or work arrangements,1.2,"[0.0, 3.72]",-0.6
25,Preventing over-indebtedness,0.94,"[0.0, 3.17]",-0.7
26,Working time and working time flexibility,0.94,"[0.0, 3.17]",-0.7
27,Wage flexibility,0.85,"[0.0, 2.97]",-0.74
28,Changes in work organisation,0.85,"[0.0, 2.97]",-0.74


Confidence Interval for actors is : [2.34%, 14.33%]

Unnamed: 0,actors,Relative freq,Confidence Interval,z_score
0,National government,38.36,"[18.91, 57.81]",2.96
1,Company / Companies,18.97,"[3.28, 34.66]",1.05
2,Trade unions,7.9,"[0.0, 18.69]",-0.04
3,Employers' organisations,7.29,"[0.0, 17.69]",-0.1
4,Social partners jointly,6.45,"[0.0, 16.28]",-0.19
5,Local / regional government,4.91,"[0.0, 13.55]",-0.34
6,Public employment service,4.53,"[0.0, 12.85]",-0.37
7,Social insurance,3.83,"[0.0, 11.51]",-0.44
8,Other social actors (e.g. NGOs),3.36,"[0.0, 10.57]",-0.49
9,Public support service providers,2.57,"[0.0, 8.9]",-0.57


Overrepresented records from column : actors

Unnamed: 0,actors,Relative freq,Confidence Interval,z_score
0,National government,38.36,"[18.91, 57.81]",2.96
1,Company / Companies,18.97,"[3.28, 34.66]",1.05


Normal represented records from column : actors

Unnamed: 0,actors,Relative freq,Confidence Interval,z_score
2,Trade unions,7.9,"[0.0, 18.69]",-0.04
3,Employers' organisations,7.29,"[0.0, 17.69]",-0.1
4,Social partners jointly,6.45,"[0.0, 16.28]",-0.19
5,Local / regional government,4.91,"[0.0, 13.55]",-0.34
6,Public employment service,4.53,"[0.0, 12.85]",-0.37
7,Social insurance,3.83,"[0.0, 11.51]",-0.44
8,Other social actors (e.g. NGOs),3.36,"[0.0, 10.57]",-0.49
9,Public support service providers,2.57,"[0.0, 8.9]",-0.57


Underrepresented records from column : actors

Unnamed: 0,actors,Relative freq,Confidence Interval,z_score
10,"EU (Council, EC, EP)",1.4,"[0.0, 6.1]",-0.68
11,EU level social partners,0.42,"[0.0, 3.01]",-0.78


Confidence Interval for target_groups is : [1.36%, 3.4%]

Unnamed: 0,target_groups,Relative freq,Confidence Interval,z_score
0,Employees in standard employment,15.97,"[8.14, 23.8]",4.08
1,Sector specific set of companies,14.02,"[6.6, 21.44]",3.49
2,Particular professions,6.67,"[1.33, 12.01]",1.29
3,Self-employed,6.6,"[1.29, 11.91]",1.27
4,SMEs,6.13,"[1.0, 11.26]",1.13
5,Unemployed,4.78,"[0.22, 9.34]",0.72
6,One person or microenterprises,4.38,"[0.0, 8.76]",0.6
7,Other groups of workers,3.17,"[0.0, 6.92]",0.24
8,Solo-self-employed,3.03,"[0.0, 6.7]",0.2
9,Workers in non-standard forms of employment,2.83,"[0.0, 6.38]",0.14


Overrepresented records from column : target_groups

Unnamed: 0,target_groups,Relative freq,Confidence Interval,z_score
0,Employees in standard employment,15.97,"[8.14, 23.8]",4.08
1,Sector specific set of companies,14.02,"[6.6, 21.44]",3.49
2,Particular professions,6.67,"[1.33, 12.01]",1.29
3,Self-employed,6.6,"[1.29, 11.91]",1.27
4,SMEs,6.13,"[1.0, 11.26]",1.13
5,Unemployed,4.78,"[0.22, 9.34]",0.72
6,One person or microenterprises,4.38,"[0.0, 8.76]",0.6


Normal represented records from column : target_groups

Unnamed: 0,target_groups,Relative freq,Confidence Interval,z_score
7,Other groups of workers,3.17,"[0.0, 6.92]",0.24
8,Solo-self-employed,3.03,"[0.0, 6.7]",0.2
9,Workers in non-standard forms of employment,2.83,"[0.0, 6.38]",0.14
10,Other businesses,2.83,"[0.0, 6.38]",0.14
11,Companies providing essential services,2.83,"[0.0, 6.38]",0.14
12,Workers in essential services,2.76,"[0.0, 6.26]",0.11
13,Larger corporations,2.49,"[0.0, 5.82]",0.03
14,Disabled workers,1.95,"[0.0, 4.91]",-0.13
15,Workers in care facilities,1.89,"[0.0, 4.8]",-0.15
16,Parents in employment,1.82,"[0.0, 4.68]",-0.17


Underrepresented records from column : target_groups

Unnamed: 0,target_groups,Relative freq,Confidence Interval,z_score
20,Parents,1.28,"[0.0, 3.68]",-0.33
21,Older citizens,1.28,"[0.0, 3.68]",-0.33
22,The COVID-19 risk group,1.01,"[0.0, 3.15]",-0.41
23,Start-ups,1.01,"[0.0, 3.15]",-0.41
24,Children (minors),0.94,"[0.0, 3.0]",-0.43
25,Seasonal workers,0.74,"[0.0, 2.57]",-0.49
26,Youth (18-25),0.74,"[0.0, 2.57]",-0.49
27,Youth (18-25) in employment,0.67,"[0.0, 2.41]",-0.51
28,People in care facilities,0.47,"[0.0, 1.93]",-0.57
29,Migrants in employment,0.4,"[0.0, 1.75]",-0.59


Confidence Interval for Businesses is : [19.29%, 80.71%]

Unnamed: 0,Businesses,Relative freq,Confidence Interval,z_score
0,Businesses_False,65.67,"[19.14, 100.0]",1.0
1,Businesses_True,34.33,"[0.0, 80.86]",-1.0


Overrepresented records from column : Businesses

Unnamed: 0,Businesses,Relative freq,Confidence Interval,z_score


Normal represented records from column : Businesses

Unnamed: 0,Businesses,Relative freq,Confidence Interval,z_score
0,Businesses_False,65.67,"[19.14, 100.0]",1.0
1,Businesses_True,34.33,"[0.0, 80.86]",-1.0


Underrepresented records from column : Businesses

Unnamed: 0,Businesses,Relative freq,Confidence Interval,z_score


Confidence Interval for Citizens is : [0%, 100%]

Unnamed: 0,Citizens,Relative freq,Confidence Interval,z_score
0,Citizens_False,92.66,"[67.1, 100.0]",1.0
1,Citizens_True,7.34,"[0.0, 32.9]",-1.0


Overrepresented records from column : Citizens

Unnamed: 0,Citizens,Relative freq,Confidence Interval,z_score


Normal represented records from column : Citizens

Unnamed: 0,Citizens,Relative freq,Confidence Interval,z_score
0,Citizens_False,92.66,"[67.1, 100.0]",1.0
1,Citizens_True,7.34,"[0.0, 32.9]",-1.0


Underrepresented records from column : Citizens

Unnamed: 0,Citizens,Relative freq,Confidence Interval,z_score


Confidence Interval for Workers is : [45.39%, 54.61%]

Unnamed: 0,Workers,Relative freq,Confidence Interval,z_score
0,Workers_False,52.35,"[3.41, 100.0]",1.0
1,Workers_True,47.65,"[0.0, 96.59]",-1.0


Overrepresented records from column : Workers

Unnamed: 0,Workers,Relative freq,Confidence Interval,z_score


Normal represented records from column : Workers

Unnamed: 0,Workers,Relative freq,Confidence Interval,z_score
0,Workers_False,52.35,"[3.41, 100.0]",1.0
1,Workers_True,47.65,"[0.0, 96.59]",-1.0


Underrepresented records from column : Workers

Unnamed: 0,Workers,Relative freq,Confidence Interval,z_score


Confidence Interval for funding is : [0.85%, 15.82%]

Unnamed: 0,funding,Relative freq,Confidence Interval,z_score
0,National funds,46.41,"[26.46, 66.36]",3.01
1,No special funding required,19.45,"[3.61, 35.29]",0.88
2,Companies,10.48,"[0.0, 22.73]",0.17
3,European Funds,8.69,"[0.0, 19.96]",0.03
4,Employer,4.48,"[0.0, 12.76]",-0.3
5,Regional funds,2.9,"[0.0, 9.61]",-0.43
6,Other,1.79,"[0.0, 7.09]",-0.52
7,Employers organisation,1.72,"[0.0, 6.92]",-0.52
8,Local funds,1.66,"[0.0, 6.77]",-0.53
9,Employees,1.24,"[0.0, 5.67]",-0.56


Overrepresented records from column : funding

Unnamed: 0,funding,Relative freq,Confidence Interval,z_score
0,National funds,46.41,"[26.46, 66.36]",3.01
1,No special funding required,19.45,"[3.61, 35.29]",0.88


Normal represented records from column : funding

Unnamed: 0,funding,Relative freq,Confidence Interval,z_score
2,Companies,10.48,"[0.0, 22.73]",0.17
3,European Funds,8.69,"[0.0, 19.96]",0.03
4,Employer,4.48,"[0.0, 12.76]",-0.3
5,Regional funds,2.9,"[0.0, 9.61]",-0.43
6,Other,1.79,"[0.0, 7.09]",-0.52
7,Employers organisation,1.72,"[0.0, 6.92]",-0.52
8,Local funds,1.66,"[0.0, 6.77]",-0.53
9,Employees,1.24,"[0.0, 5.67]",-0.56
10,Trade union,0.97,"[0.0, 4.89]",-0.58


Underrepresented records from column : funding

Unnamed: 0,funding,Relative freq,Confidence Interval,z_score
11,Social partners jointly,0.21,"[0.0, 2.04]",-0.64


## Class collision analysis

### Define constants

In [None]:
# Columns of the dataset whose values are checked for collisions.
CLASS_COLUMNS = ['category', 'subcategory','actors','target_groups','Businesses','Citizens','Workers', 'funding']

## Function to get binary-matrix from DataFrame

In [None]:
def convert_to_binary_matrix(data : pd.DataFrame):
    """
        Convert a dataframe of categorical values into a binary (one-hot like) matrix.

        Every distinct value found in the dataframe becomes a column of the result.
        A cell holding a list contributes each of its elements, any other cell
        contributes its value. A result cell is 1 when the value is present in the
        corresponding row of `data`, otherwise 0.

        :param data: the dataframe with categorical values (scalars or lists)
        :return: the binary matrix, one row per row of `data`, with a default integer index
    """
    # The rows are collected first and the dataframe is built once:
    # DataFrame.append() no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    binary_rows = []
    for _, row in data.iterrows():
        new_row = {}
        for value in row:
            if isinstance(value, list):
                for column in value:
                    new_row[column] = 1
            else:
                new_row[value] = 1
        binary_rows.append(new_row)
    # Values absent from a row are missing after the construction; they become 0.
    return pd.DataFrame(binary_rows).fillna(0)

## Function to get dependency between columns in binary-matrix

In [None]:
def dependency_table( data : pd.DataFrame, dependecy_level : float = 0.9):
    """
        Find, for every column of a binary matrix, the columns that co-occur with it.

        For a column C only the rows where C equals 1 are considered. The share of
        those rows in which each other column is set is computed, and the columns
        whose share reaches `dependecy_level` are kept.

        :param data: the binary matrix
        :param dependecy_level: minimal share (0..1) required to keep a pair
        :return: a dataframe with one column per column C that has at least one kept pair;
                 the index holds the co-occurring columns, the cells the shares
                 (0 where the pair was not kept)
    """
    dependencies = {}
    for label in data.columns:
        # Column totals restricted to the rows where `label` is set.
        totals = data.loc[data[label] == 1].copy().sum()
        shares = (totals / totals[label]).drop(label)
        strong = shares.loc[shares.values >= dependecy_level]
        if strong.size == 0:
            continue
        dependencies[label] = {other: strong[other] for other in strong.index}
    return pd.DataFrame(dependencies).fillna(0)


## Categorical data collision analysis in columns

In [None]:
def class_collision_in_columns(data : pd.DataFrame):
    """
        Display, column by column, the collisions between the values of the same column.

        Each column is converted on its own to a binary matrix; the dependency table
        (level 0.7) of that matrix is shown as a heat map when it is not empty.

        :param data: the dataframe whose columns are analysed one by one
    """
    for column in data.columns:
        single_column = pd.DataFrame({column : data[column].values})
        collisions = dependency_table(convert_to_binary_matrix(single_column),0.7)
        if collisions.size == 0:
            continue
        display(Markdown(f"Collision in column : {column}"))
        display(px.imshow(collisions))

In [None]:
# Show the collisions inside each of the class columns of the dataset.
class_collision_in_columns(df[CLASS_COLUMNS])

## Analysis of categorical data collisions in DataFrame

In [None]:
def class_collision(data : pd.DataFrame):
    """
        Display the collisions between the values of all columns of a dataframe.

        The whole dataframe is converted to a binary matrix and its dependency
        table (level 0.7) is shown as a heat map.

        :param data: the dataframe to analyse
    """
    collisions = dependency_table(convert_to_binary_matrix(data),0.7)
    display(Markdown("Collision in dataframe"))
    heat_map = px.imshow(collisions,aspect='auto')
    display(heat_map)

In [None]:
# Show the collisions between the values of all class columns taken together.
class_collision(df[CLASS_COLUMNS])

# Results
## Analysis of confidence intervals
- This analysis gives us information about the records
that are overrepresented and that are underrepresented.
## Categorical data collision analysis in columns
- Some values in the columns collide 100% with another value,
which means that such a value appears only when the value it collides with is also present,
so a value that collides 100% with another one carries little additional information.
## Analysis of categorical data collisions in DataFrame
- This analysis helps us to highlight the independent characteristics,
 compared to the dependent ones that are in collision with a percentage higher than 70%.
## Observations and suggestions
- The granularity (cardinality) of the data is relatively large,
 with an uneven distribution, according to their proportion in the data set.
  To create subsequent classifiers, it would be good to reduce the granularity:
   - by reducing the dependent features (which have a collision with another feature greater than 70 ~ 90%),
   - by grouping the features by semantic value,
   - by segmenting the feature space and performing a max pooling or average pooling.
- One goal to be pursued is to reduce the collisions of characteristics that will be used as classes for classifiers.
   - To reduce the collisions, additional information has to be attached to these features, for example a semantic representation of each feature.
- Due to the relatively high granularity, the following classification methods are recommended:
    - space segmentation and weighted average for class centers, where granularity cannot be considerably reduced
    - KNN, SVM, Random Forest for data sets that do not have large collisions.
    - Deep learning methods (where the necessary collision-reducing transformations are learned automatically).