# Categorical EDA on PWDB

## Import libraries

In [30]:
import pathlib as path
import pandas as pd
import plotly.express as px
from collections import  Counter
import scipy.stats as stats
from IPython.display import display, Markdown
from tqdm import tqdm

## Define constants

In [31]:
# Candidate data roots: a local workstation path and the notebook-server path.
LOCAL_FOLDER = path.Path("/mnt/c/Users/Professional/Desktop/Works/MEANING/Sem-Covid19/data/")
SERVER_FOLDER = path.Path("/home/jovyan/data/")

# The server folder takes priority over the local one. WORK_DIR is left as an
# empty string when neither folder exists.
if SERVER_FOLDER.exists():
    WORK_DIR = SERVER_FOLDER
    print("Work with distant directory.")
elif not LOCAL_FOLDER.exists():
    WORK_DIR = ""
    print("ERROR: Invalid directory!")
else:
    WORK_DIR = LOCAL_FOLDER
    print("Work with local directory.")

Work with distant directory.


In [32]:
# File names of the two available serialisations of the dataset.
SRC_JSON_FILE_NAME = "covid19db.json"
SRC_PICKLE_FILE_NAME = "pwdb_prepared.pkl"

# The JSON file is the one currently selected as the source.
SRC_FILE_NAME = SRC_JSON_FILE_NAME
SRC_FILE_PATH = WORK_DIR / SRC_FILE_NAME

# Columns that are analysed as categorical data.
CATEGORICAL_COLUMNS = [
    'country',
    'category',
    'subcategory',
    'actors',
    'target_groups',
    'funding',
]



## Load PWDB dataset

In [33]:
# Load the dataset into `df`, choosing the pandas reader from the file suffix
# so that switching SRC_FILE_NAME between the JSON and the pickle file does
# not require editing this cell.
if not SRC_FILE_PATH.exists():
    # Stop here with the offending path: continuing would leave `df`
    # undefined and surface later as an unrelated NameError.
    raise FileNotFoundError(f"Source path is invalid: {SRC_FILE_PATH}")

if SRC_FILE_PATH.suffix == ".pkl":
    # NOTE(review): unpickling can execute arbitrary code; only load pickle
    # files that were produced by this project.
    df = pd.read_pickle(SRC_FILE_PATH)
else:
    df = pd.read_json(SRC_FILE_PATH)


## Function for plotting a bar chart of observations

In [34]:
def plot_bar_chart(observations: pd.DataFrame, chart_title: str):
    """Return a Plotly Express bar chart of an observation table.

    The second column of ``observations`` is plotted on the x axis and the
    first column on the y axis; ``chart_title`` becomes the figure title.
    """
    label_column = observations.columns[0]
    value_column = observations.columns[1]
    return px.bar(
        observations,
        x=value_column,
        y=label_column,
        title=chart_title,
    )

## Function for plotting a pie chart of observations

In [35]:
def plot_pie_chart(observations: pd.DataFrame, chart_title: str):
    """Return a Plotly Express pie chart of an observation table.

    The first column of ``observations`` supplies the slice names and the
    second column the slice values; ``chart_title`` becomes the figure title.
    """
    name_column = observations.columns[0]
    value_column = observations.columns[1]
    return px.pie(
        observations,
        values=value_column,
        names=name_column,
        title=chart_title,
    )

## Function for making observations on categorical data

In [36]:
def calc_freq_categorical_data(data: pd.Series, title: str, relative: bool = False) -> pd.DataFrame:
    """Count how often each value occurs in a categorical series.

    Missing values are ignored. The input series is left untouched.

    Args:
        data: Series holding the categorical values.
        title: Name given to the first column (the distinct values).
        relative: When True, frequencies are percentages of the non-missing
            total, rounded to two decimals; otherwise raw counts.

    Returns:
        A DataFrame with the columns ``title`` and ``'Absolute freq'`` or
        ``'Relative freq'``, ordered from the most to the least frequent value.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # Filter into a new Series: dropping in place would silently remove rows
    # from the caller's object.
    present_values = data.dropna()
    observation = pd.DataFrame(
        Counter(present_values).most_common(),
        columns=[title, observation_type_name],
    )
    # An empty table has nothing to normalise (and a zero total).
    if relative and not observation.empty:
        counts = observation[observation_type_name]
        observation[observation_type_name] = (counts / (counts.sum() / 100)).round(2)
    return observation

## Function for making observations on missing data

In [37]:
def calc_freq_missing_data(data: pd.DataFrame, relative: bool = False) -> pd.DataFrame:
    """Report how many values are missing in each column of a DataFrame.

    Every column is exploded first, so list-like cells contribute one entry
    per element. Only columns with a frequency greater than zero are reported.

    Args:
        data: DataFrame to inspect.
        relative: When True, the frequency is the percentage of missing
            entries among the exploded entries of the column, rounded to two
            decimals; otherwise it is the raw count.

    Returns:
        A DataFrame with the columns ``'index'`` (the column label) and
        ``'Absolute freq'`` or ``'Relative freq'``, in the column order of
        ``data``. It has no rows when nothing is missing.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    missing_by_column = {}
    for column in data.columns:
        exploded = data[column].explode()
        missing = int(exploded.isnull().sum())
        if relative:
            # A column without entries has nothing missing; this also avoids
            # dividing by zero.
            missing = round(missing / (exploded.size / 100), 2) if exploded.size else 0.0
        # Filter after rounding so that a share that rounds to 0.0 is dropped.
        if missing > 0:
            missing_by_column[column] = missing
    return pd.DataFrame({
        'index': list(missing_by_column.keys()),
        observation_type_name: list(missing_by_column.values()),
    })

## EDA on categorical data from loaded dataset

In [38]:
def fast_categorical_analyze(data: pd.DataFrame, data_title: str = 'Unknown', columns=None):
    """Display a quick categorical overview of a dataset.

    First the absolute missing-value counts of all columns are displayed
    (with a pie chart when anything is missing). Then, for every analysed
    column, the relative frequency table is computed, and its ten most
    frequent values are displayed as a table, a bar chart and a pie chart.

    Args:
        data: Dataset to analyse.
        data_title: Label used in the title of the missing-values chart.
        columns: Labels of the columns to analyse; defaults to
            ``CATEGORICAL_COLUMNS`` when omitted.

    Returns:
        A dict mapping each successfully analysed column name to its full
        (not truncated) relative frequency table.
    """
    if columns is None:
        columns = CATEGORICAL_COLUMNS
    results = {}
    abs_miss_obs = calc_freq_missing_data(data)
    display(abs_miss_obs)

    if abs_miss_obs.size > 0:
        plot_pie_chart(abs_miss_obs, data_title + ' missing values').show()
    data = data[columns]
    for column_name in data.columns:
        # List-like cells are split so that each element is counted on its own.
        data_column = data[column_name].explode()
        try:
            rel_obs = calc_freq_categorical_data(data_column, column_name, True)
            results[column_name] = rel_obs
            rel_obs = rel_obs.head(10)
            display(rel_obs)
            plot_bar_chart(rel_obs, column_name).show()
            plot_pie_chart(rel_obs, column_name).show()
        except Exception as error:
            # Best effort: one failing column must not abort the others, but
            # interrupts (KeyboardInterrupt/SystemExit) are no longer swallowed
            # and the actual reason is reported.
            print('Observation on [',column_name,'] fault!')
            print('Check if column [',column_name,'] have compatible type!')
            print('Reason:', repr(error))
    return results

# Run the categorical overview on the loaded dataset and keep the
# per-column frequency tables for the next cell.
eda_result = fast_categorical_analyze(data=df, data_title="PWDB Dataset")




Unnamed: 0,index,Absolute freq
0,end_date,509
1,social_partner_form,1171
2,social_partner_role,1171
3,target_groups,303
4,sectors,729
5,occupations,1056
6,sources,1


Unnamed: 0,country,Relative freq
0,Spain,6.66
1,Italy,5.12
2,Greece,5.04
3,Germany,4.95
4,Austria,4.87
5,Portugal,4.53
6,France,4.01
7,Croatia,3.67
8,Lithuania,3.5
9,Norway,3.5


Unnamed: 0,category,Relative freq
0,Supporting businesses to stay afloat,28.1
1,"Protection of workers, adaptation of workplace",14.35
2,Income protection beyond short-time work,12.21
3,"Promoting the economic, labour market and soci...",10.16
4,Ensuring business continuity and support for e...,9.65
5,Employment protection and retention,9.39
6,Measures to prevent social hardship,7.09
7,Reorientation of business activities,5.72
8,Supporting businesses to get back to normal,3.33


Unnamed: 0,subcategory,Relative freq
0,Direct subsidies (full or partial),12.89
1,Access to finance,7.6
2,Income support for people in employment (e.g. ...,7.26
3,Other,5.81
4,Deferral of payments or liabilities,5.55
5,Occupational health and safety,4.95
6,Extensions of income support to workers not c...,4.78
7,Change of production/innovation,4.61
8,"Active labour market policies, incl. subsidise...",4.61
9,"Teleworking arrangements, remote working",4.36


Unnamed: 0,actors,Relative freq
0,National government,38.36
1,Company / Companies,18.97
2,Trade unions,7.9
3,Employers' organisations,7.29
4,Social partners jointly,6.45
5,Local / regional government,4.91
6,Public employment service,4.53
7,Social insurance,3.83
8,Other social actors (e.g. NGOs),3.36
9,Public support service providers,2.57


Unnamed: 0,target_groups,Relative freq
0,Employees in standard employment,15.97
1,Sector specific set of companies,14.02
2,Particular professions,6.67
3,Self-employed,6.6
4,SMEs,6.13
5,Unemployed,4.78
6,One person or microenterprises,4.38
7,Other groups of workers,3.17
8,Solo-self-employed,3.03
9,Workers in non-standard forms of employment,2.83


Unnamed: 0,funding,Relative freq
0,National funds,46.41
1,No special funding required,19.45
2,Companies,10.48
3,European Funds,8.69
4,Employer,4.48
5,Regional funds,2.9
6,Other,1.79
7,Employers organisation,1.72
8,Local funds,1.66
9,Employees,1.24


In [39]:
# For every frequency table: add the z-score, the running total and the
# row-to-row difference of the frequency column, display the table with its
# standard deviation, then chart each of the three derived columns.
for key, data in eda_result.items():
    column_name = data.columns[1]
    zscore_column = data.columns[0] + '_z_score'
    cumulative_freq = 'Cumulative freq'
    diff_freq = 'Diff freq'

    freq_values = data[column_name]
    data[zscore_column] = ((freq_values - freq_values.mean()) / freq_values.std()).round(2)
    data[cumulative_freq] = freq_values.cumsum()
    data[diff_freq] = freq_values.diff()

    display(data)
    display(Markdown(f"Std deviation for [{key}] is [{round(freq_values.std(),2)}]"))

    # (x, y) column pairs of the three charts, in display order.
    for x_axis, y_axis in (
        (zscore_column, data.columns[0]),
        (data.columns[0], cumulative_freq),
        (data.columns[0], diff_freq),
    ):
        px.bar(data, x=x_axis, y=y_axis).show()

Ttest_1sampResult(statistic=0.0, pvalue=1.0)
Ttest_1sampResult(statistic=-7.424967886718749e-16, pvalue=0.9999999999999994)
Ttest_1sampResult(statistic=-1.0024133467986045e-15, pvalue=0.9999999999999992)
Ttest_1sampResult(statistic=-5.808204065620902e-16, pvalue=0.9999999999999996)
Ttest_1sampResult(statistic=8.535833571084494e-16, pvalue=0.9999999999999993)
Ttest_1sampResult(statistic=-4.652651803823097e-16, pvalue=0.9999999999999996)


Unnamed: 0,country,Relative freq,country_z_score,Cumulative freq,Diff freq
0,Spain,6.66,2.96,6.66,
1,Italy,5.12,1.59,11.78,-1.54
2,Greece,5.04,1.52,16.82,-0.08
3,Germany,4.95,1.44,21.77,-0.09
4,Austria,4.87,1.37,26.64,-0.08
5,Portugal,4.53,1.06,31.17,-0.34
6,France,4.01,0.6,35.18,-0.52
7,Croatia,3.67,0.3,38.85,-0.34
8,Lithuania,3.5,0.15,42.35,-0.17
9,Norway,3.5,0.15,45.85,0.0


Std deviation for [country] is [1.12]

Unnamed: 0,category,Relative freq,category_z_score,Cumulative freq,Diff freq
0,Supporting businesses to stay afloat,28.1,2.37,28.1,
1,"Protection of workers, adaptation of workplace",14.35,0.45,42.45,-13.75
2,Income protection beyond short-time work,12.21,0.15,54.66,-2.14
3,"Promoting the economic, labour market and soci...",10.16,-0.13,64.82,-2.05
4,Ensuring business continuity and support for e...,9.65,-0.2,74.47,-0.51
5,Employment protection and retention,9.39,-0.24,83.86,-0.26
6,Measures to prevent social hardship,7.09,-0.56,90.95,-2.3
7,Reorientation of business activities,5.72,-0.75,96.67,-1.37
8,Supporting businesses to get back to normal,3.33,-1.08,100.0,-2.39


Std deviation for [category] is [7.18]

Unnamed: 0,subcategory,Relative freq,subcategory_z_score,Cumulative freq,Diff freq
0,Direct subsidies (full or partial),12.89,3.8,12.89,
1,Access to finance,7.6,1.81,20.49,-5.29
2,Income support for people in employment (e.g. ...,7.26,1.69,27.75,-0.34
3,Other,5.81,1.14,33.56,-1.45
4,Deferral of payments or liabilities,5.55,1.04,39.11,-0.26
5,Occupational health and safety,4.95,0.82,44.06,-0.6
6,Extensions of income support to workers not c...,4.78,0.75,48.84,-0.17
7,Change of production/innovation,4.61,0.69,53.45,-0.17
8,"Active labour market policies, incl. subsidise...",4.61,0.69,58.06,0.0
9,"Teleworking arrangements, remote working",4.36,0.6,62.42,-0.25


Std deviation for [subcategory] is [2.66]

Unnamed: 0,actors,Relative freq,actors_z_score,Cumulative freq,Diff freq
0,National government,38.36,2.83,38.36,
1,Company / Companies,18.97,1.0,57.33,-19.39
2,Trade unions,7.9,-0.04,65.23,-11.07
3,Employers' organisations,7.29,-0.1,72.52,-0.61
4,Social partners jointly,6.45,-0.18,78.97,-0.84
5,Local / regional government,4.91,-0.32,83.88,-1.54
6,Public employment service,4.53,-0.36,88.41,-0.38
7,Social insurance,3.83,-0.42,92.24,-0.7
8,Other social actors (e.g. NGOs),3.36,-0.47,95.6,-0.47
9,Public support service providers,2.57,-0.54,98.17,-0.79


Std deviation for [actors] is [10.59]

Unnamed: 0,target_groups,Relative freq,target_groups_z_score,Cumulative freq,Diff freq
0,Employees in standard employment,15.97,4.03,15.97,
1,Sector specific set of companies,14.02,3.45,29.99,-1.95
2,Particular professions,6.67,1.27,36.66,-7.35
3,Self-employed,6.6,1.25,43.26,-0.07
4,SMEs,6.13,1.11,49.39,-0.47
5,Unemployed,4.78,0.71,54.17,-1.35
6,One person or microenterprises,4.38,0.59,58.55,-0.4
7,Other groups of workers,3.17,0.23,61.72,-1.21
8,Solo-self-employed,3.03,0.19,64.75,-0.14
9,Workers in non-standard forms of employment,2.83,0.13,67.58,-0.2


Std deviation for [target_groups] is [3.37]

Unnamed: 0,funding,Relative freq,funding_z_score,Cumulative freq,Diff freq
0,National funds,46.41,2.88,46.41,
1,No special funding required,19.45,0.84,65.86,-26.96
2,Companies,10.48,0.16,76.34,-8.97
3,European Funds,8.69,0.03,85.03,-1.79
4,Employer,4.48,-0.29,89.51,-4.21
5,Regional funds,2.9,-0.41,92.41,-1.58
6,Other,1.79,-0.49,94.2,-1.11
7,Employers organisation,1.72,-0.5,95.92,-0.07
8,Local funds,1.66,-0.5,97.58,-0.06
9,Employees,1.24,-0.54,98.82,-0.42


Std deviation for [funding] is [13.23]