# Categorical EDA on PWDB

## Import libraries

In [72]:
import pathlib as path
import pandas as pd
import plotly.express as px
from collections import  Counter
from IPython.display import display
from tqdm import tqdm

## Define constants

In [3]:
# Base directory against which the source and output file names are resolved.
FOLDER = path.Path("/home/jovyan/data/")

# Bare file names; the full paths are derived from FOLDER just below.
SRC_FILE_NAME = "covid19db.json"
OUTPUT_FILE_NAME = "eda_pwdb_result.html"

SRC_FILE_PATH = FOLDER.joinpath(SRC_FILE_NAME)
OUTPUT_FILE_PATH = FOLDER.joinpath(OUTPUT_FILE_NAME)

# Columns selected from the dataset by fast_categorical_analyze.
CATEGORICAL_COLUMNS = [
    'country',
    'category',
    'subcategory',
    'actors',
    'target_groups',
    'funding',
]



## Load PWDB dataset

In [4]:
# Load the PWDB dataset into `df`.
# Fail fast when the source file is missing: merely printing a message would
# leave `df` undefined and the analysis call further down would then die with
# an unrelated NameError instead of pointing at the real problem.
if not SRC_FILE_PATH.exists():
    raise FileNotFoundError(f"Source path is invalid: {SRC_FILE_PATH}")
df = pd.read_json(SRC_FILE_PATH)


## Function for plotting a bar chart of observations

In [5]:
def plot_bar_chart(observations: pd.DataFrame, chart_title: str):
    """Build a Plotly Express bar chart from an observation table.

    The second column of ``observations`` is passed to ``px.bar`` as ``x``
    and the first column as ``y``.

    Args:
        observations: Table with at least two columns.
        chart_title: Title passed to the chart.

    Returns:
        Whatever ``px.bar`` returns for these arguments.
    """
    value_column = observations.columns[1]
    label_column = observations.columns[0]
    return px.bar(
        observations,
        x=value_column,
        y=label_column,
        title=chart_title,
    )

## Function for plotting a pie chart of observations

In [59]:
def plot_pie_chart(observations: pd.DataFrame, chart_title: str):
    """Build a Plotly Express pie chart from an observation table.

    The second column of ``observations`` supplies the slice values and the
    first column supplies the slice names.

    Args:
        observations: Table with at least two columns.
        chart_title: Title passed to the chart.

    Returns:
        Whatever ``px.pie`` returns for these arguments.
    """
    value_column = observations.columns[1]
    name_column = observations.columns[0]
    return px.pie(
        observations,
        values=value_column,
        names=name_column,
        title=chart_title,
    )

## Function for computing frequency observations on categorical data

In [92]:
def calc_freq_categorical_data(data: pd.Series, title: str, relative: bool = False, top_n: int = 10):
    """Tabulate the most frequent non-null values of a categorical series.

    Args:
        data: Values to count. Null entries are ignored; the series itself is
            not modified.
        title: Name given to the column that holds the distinct values.
        relative: When True, the counts are converted to percentages rounded
            to two decimals and the column is named ``'Relative freq'``;
            otherwise raw counts are returned under ``'Absolute freq'``.
        top_n: Number of most common values to keep (default 10).

    Returns:
        A DataFrame with two columns, ``title`` and the frequency column,
        ordered from most to least common.

    Note:
        Percentages are computed against the sum of the rows that are kept
        (the ``top_n`` most common values), not against the whole series, so
        the returned column always adds up to roughly 100.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # Use the non-mutating dropna(): dropping in place would silently remove
    # rows from the Series object owned by the caller.
    values = data.dropna()
    observation = pd.DataFrame(
        Counter(values).most_common(top_n),
        columns=[title, observation_type_name],
    )
    # Nothing to normalise for an empty table (and its total would be zero).
    if relative and not observation.empty:
        shown_total = observation[observation_type_name].sum()
        observation[observation_type_name] = (
            observation[observation_type_name] / (shown_total / 100)
        ).round(2)
    return observation

## Function for computing frequency observations on missing data

In [90]:
def calc_freq_missing_data(data: pd.DataFrame, relative: bool = False):
    """Report, per column, how many null entries remain after ``explode()``.

    Args:
        data: Frame to inspect; every column is exploded before counting.
        relative: When True, each count is expressed as a percentage of the
            exploded column's size, rounded to two decimals, and the result
            column is named ``'Relative freq'``; otherwise the raw count is
            reported under ``'Absolute freq'``.

    Returns:
        A DataFrame with an ``index`` column holding the column names and one
        frequency column. Only columns whose value is greater than zero are
        listed.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    missing_by_column = {}
    for column in data.columns:
        exploded = data[column].explode()
        missing = exploded.isnull().sum()
        if relative:
            # Percentage of the exploded length, not of the original row count.
            missing = round(missing / (exploded.size / 100), 2)
        missing_by_column[column] = missing
    totals = pd.Series(missing_by_column, dtype=object)
    observation = pd.DataFrame(totals[totals > 0], columns=[observation_type_name])
    return observation.reset_index()

## EDA on categorical data from loaded dataset

In [93]:
def fast_categorical_analyze(data: pd.DataFrame, data_title: str = 'Unknown'):
    """Display a quick categorical overview of ``data``.

    First displays the per-column missing-value counts and, when any column
    has missing values, shows them as a pie chart. Then, for every column
    listed in ``CATEGORICAL_COLUMNS``, displays the relative-frequency table
    of its exploded values and shows it as a bar chart and a pie chart.

    A failure while analysing one column is reported on stdout and does not
    stop the remaining columns from being processed.

    Args:
        data: Dataset to analyse; must contain every column named in
            ``CATEGORICAL_COLUMNS``.
        data_title: Label used in the title of the missing-values chart.
    """
    abs_miss_obs = calc_freq_missing_data(data)
    display(abs_miss_obs)

    if not abs_miss_obs.empty:
        plot_pie_chart(abs_miss_obs, data_title + ' missing values').show()

    categorical = data[CATEGORICAL_COLUMNS]
    for column_name in categorical.columns:
        data_column = categorical[column_name].explode()
        try:
            rel_obs = calc_freq_categorical_data(data_column, column_name, True)
            display(rel_obs)
            plot_bar_chart(rel_obs, column_name).show()
            plot_pie_chart(rel_obs, column_name).show()
        except Exception as error:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the run, and report
            # the actual cause instead of discarding it.
            print('Observation on [',column_name,'] fault!')
            print('Check if column [',column_name,'] have compatible type!')
            print('Reason:', repr(error))


# Run the categorical analysis on the loaded dataset; the tables and charts
# are emitted as cell output. "PWDB Dataset" is used in the title of the
# missing-values chart.
fast_categorical_analyze(df,"PWDB Dataset")





Unnamed: 0,index,Absolute freq
0,end_date,509
1,social_partner_form,1171
2,social_partner_role,1171
3,target_groups,303
4,sectors,729
5,occupations,1056
6,sources,1


Unnamed: 0,country,Relative freq
0,Spain,14.53
1,Italy,11.17
2,Greece,10.99
3,Germany,10.8
4,Austria,10.61
5,Portugal,9.87
6,France,8.75
7,Croatia,8.01
8,Lithuania,7.64
9,Norway,7.64


Unnamed: 0,category,Relative freq
0,Supporting businesses to stay afloat,28.1
1,"Protection of workers, adaptation of workplace",14.35
2,Income protection beyond short-time work,12.21
3,"Promoting the economic, labour market and soci...",10.16
4,Ensuring business continuity and support for e...,9.65
5,Employment protection and retention,9.39
6,Measures to prevent social hardship,7.09
7,Reorientation of business activities,5.72
8,Supporting businesses to get back to normal,3.33


Unnamed: 0,subcategory,Relative freq
0,Direct subsidies (full or partial),20.66
1,Access to finance,12.18
2,Income support for people in employment (e.g. ...,11.63
3,Other,9.3
4,Deferral of payments or liabilities,8.89
5,Occupational health and safety,7.93
6,Extensions of income support to workers not c...,7.66
7,Change of production/innovation,7.39
8,"Active labour market policies, incl. subsidise...",7.39
9,"Teleworking arrangements, remote working",6.98


Unnamed: 0,actors,Relative freq
0,National government,39.08
1,Company / Companies,19.32
2,Trade unions,8.04
3,Employers' organisations,7.43
4,Social partners jointly,6.57
5,Local / regional government,5.0
6,Public employment service,4.62
7,Social insurance,3.9
8,Other social actors (e.g. NGOs),3.43
9,Public support service providers,2.62


Unnamed: 0,target_groups,Relative freq
0,Employees in standard employment,23.63
1,Sector specific set of companies,20.74
2,Particular professions,9.87
3,Self-employed,9.77
4,SMEs,9.07
5,Unemployed,7.08
6,One person or microenterprises,6.48
7,Other groups of workers,4.69
8,Solo-self-employed,4.49
9,Workers in non-standard forms of employment,4.19


Unnamed: 0,funding,Relative freq
0,National funds,46.96
1,No special funding required,19.68
2,Companies,10.61
3,European Funds,8.79
4,Employer,4.54
5,Regional funds,2.93
6,Other,1.81
7,Employers organisation,1.74
8,Local funds,1.67
9,Employees,1.26
