# Categorical EDA on the Ireland timeline data

## Import libraries

In [50]:
import os
import sys

# Root of the sem-covid checkout: added to the import path and used as the
# working directory for the remaining cells.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project package importable from this notebook.
sys.path.append(PROJECT_ROOT)
# Drop duplicate entries (e.g. after re-running this cell) while PRESERVING the
# original search order. A plain set() would reorder sys.path arbitrarily
# (string hashing is randomized per process), which can change which module
# wins when two entries provide the same name.
sys.path = list(dict.fromkeys(sys.path))

# Run the rest of the notebook from the project root.
# NOTE(review): presumably so that relative paths used by the project resolve —
# confirm against the sem_covid configuration.
os.chdir(PROJECT_ROOT)

import plotly.express as px

from IPython.display import display, Markdown
from sem_covid.services.data_registry import Dataset
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.categorical_analyze import fast_categorical_analyze
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.confidence_interval_analysis import (
    confidence_interval_with_mean, z_score_for_series, confidence_interval_for_proportion)
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.collision_analysis import (class_collision_in_columns,
                                                                                       class_collision)

## Define constants

In [51]:
# Columns analysed as categorical variables throughout this notebook; the same
# list is passed to the frequency analysis and to both collision analyses.
CATEGORICAL_COLUMNS = [
    'keyword',
    'page_type',
]

## Fetch the data

In [52]:
# Load the Ireland action-timeline dataset through the project's data registry.
# NOTE(review): the storage backend and the returned type are defined in
# sem_covid.services.data_registry; the cells below index the result with a list
# of column names, so it is presumably a pandas DataFrame — confirm.
ireland_timeline = Dataset.IRELAND_ACTION_TIMELINE.fetch()

## EDA on categorical data from loaded dataset

In [53]:
# Frequency analysis of the categorical columns. The result is keyed by column
# name (see the loops below, which iterate over eda_result.keys()).
eda_result = fast_categorical_analyze(
    ireland_timeline,
    CATEGORICAL_COLUMNS,
    'Ireland Timeline Data',
)

Unnamed: 0,index,Absolute freq
0,page_type,5
1,published_date,13
2,title,3
3,content,5
4,content_links,139
5,campaigns_links,323
6,part_of_links,229
7,documents,348


Unnamed: 0,keyword,Relative freq
0,crisis,2.44
1,social impact,2.44
2,aid programme,2.44
3,health risk,2.44
4,organisation of health care,2.44
5,working environment,2.44
6,protective equipment,2.44
7,e-Health,2.44
8,air transport,2.2
9,aid to disadvantaged groups,2.2


Unnamed: 0,page_type,Relative freq
0,Press release,58.27
1,Publication,27.16
2,Speech,7.41
3,News,4.94
4,Collection,1.98
5,Form,0.25


## Analysis and visualization:
- z-score of each relative frequency
- cumulative frequencies
- difference between neighboring frequencies

In [54]:
# Labels of the derived columns appended to every frequency table.
cumulative_label = 'Cumulative freq'
diff_label = 'Diff freq'

for key in eda_result.keys():
    # Work on a copy so the tables stored in eda_result stay untouched.
    freq_table = eda_result[key].copy()

    # Each table has the category label in its first column and the relative
    # frequency in its second one.
    category_col, freq_col = freq_table.columns[0], freq_table.columns[1]
    z_score_col = category_col + '_z_score'
    frequencies = freq_table[freq_col]

    # Append, in this order: z-score, running total, and the difference to the
    # previous row (NaN for the first row).
    enriched = freq_table.assign(**{
        z_score_col: z_score_for_series(frequencies),
        cumulative_label: frequencies.cumsum(),
        diff_label: frequencies.diff(),
    })

    std_dev = round(frequencies.std(), 2)
    display(Markdown(f"Std deviation for [{key}] is [{std_dev}]"))
    display(enriched)

    # One bar chart per derived column: z-score against category, then
    # category against the cumulative and the neighbor-difference frequencies.
    chart_axes = (
        (z_score_col, category_col),
        (category_col, cumulative_label),
        (category_col, diff_label),
    )
    for x_axis, y_axis in chart_axes:
        px.bar(enriched, x=x_axis, y=y_axis).show()

Std deviation for [keyword] is [0.68]

Unnamed: 0,keyword,Relative freq,keyword_z_score,Cumulative freq,Diff freq
0,crisis,2.44,1.38,2.44,
1,social impact,2.44,1.38,4.88,0.00
2,aid programme,2.44,1.38,7.32,0.00
3,health risk,2.44,1.38,9.76,0.00
4,organisation of health care,2.44,1.38,12.20,0.00
...,...,...,...,...,...
61,medical research,0.49,-1.53,99.06,0.00
62,hospital infection,0.24,-1.90,99.30,-0.25
63,economic aid,0.24,-1.90,99.54,0.00
64,endemic disease,0.24,-1.90,99.78,0.00


Std deviation for [page_type] is [22.58]

Unnamed: 0,page_type,Relative freq,page_type_z_score,Cumulative freq,Diff freq
0,Press release,58.27,2.02,58.27,
1,Publication,27.16,0.51,85.43,-31.11
2,Speech,7.41,-0.45,92.84,-19.75
3,News,4.94,-0.57,97.78,-2.47
4,Collection,1.98,-0.71,99.76,-2.96
5,Form,0.25,-0.8,100.01,-1.73


## Analysis of confidence intervals:
- calculating the confidence interval of the mean relative frequency for each column
- calculating the confidence interval for each proportion in the column
- identifying the records in each column that are overrepresented (above the interval)
- identifying the records in each column that are underrepresented (below the interval)

In [55]:
# Column holding the relative frequencies (in percent) in every table.
relative_freq_col = 'Relative freq'

for key in eda_result.keys():
    # Work on a copy so the tables stored in eda_result stay untouched.
    report = eda_result[key].copy()

    # The helpers are fed proportions (0..1), hence the division of the
    # percentage column (second column of the table) by 100.
    proportions = report[report.columns[1]] / 100

    interval = confidence_interval_with_mean(proportions)
    lower_bound, upper_bound = interval[0], interval[1]
    display(Markdown(f"Confidence Interval for {key} is : [{lower_bound}%, {upper_bound}%]"))

    report['Confidence Interval'] = confidence_interval_for_proportion(proportions)
    report['z_score'] = z_score_for_series(proportions)
    display(report)

    # Split the records by where their relative frequency falls with respect
    # to the interval; both bounds count as "normal".
    relative_freq = report[relative_freq_col]
    sections = (
        (f"Overrepresented records from column : {key}",
         relative_freq > upper_bound),
        (f"Normal represented records from column : {key}",
         relative_freq.between(lower_bound, upper_bound)),
        (f"Underrepresented records from column : {key}",
         relative_freq < lower_bound),
    )
    for heading, mask in sections:
        display(Markdown(heading))
        display(report.loc[mask])

Confidence Interval for keyword is : [1.35%, 1.68%]

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
0,crisis,2.44,"[0.0, 6.16]",1.38
1,social impact,2.44,"[0.0, 6.16]",1.38
2,aid programme,2.44,"[0.0, 6.16]",1.38
3,health risk,2.44,"[0.0, 6.16]",1.38
4,organisation of health care,2.44,"[0.0, 6.16]",1.38
...,...,...,...,...
61,medical research,0.49,"[0.0, 2.17]",-1.53
62,hospital infection,0.24,"[0.0, 1.42]",-1.90
63,economic aid,0.24,"[0.0, 1.42]",-1.90
64,endemic disease,0.24,"[0.0, 1.42]",-1.90


Overrepresented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
0,crisis,2.44,"[0.0, 6.16]",1.38
1,social impact,2.44,"[0.0, 6.16]",1.38
2,aid programme,2.44,"[0.0, 6.16]",1.38
3,health risk,2.44,"[0.0, 6.16]",1.38
4,organisation of health care,2.44,"[0.0, 6.16]",1.38
5,working environment,2.44,"[0.0, 6.16]",1.38
6,protective equipment,2.44,"[0.0, 6.16]",1.38
7,e-Health,2.44,"[0.0, 6.16]",1.38
8,air transport,2.2,"[0.0, 5.74]",1.02
9,aid to disadvantaged groups,2.2,"[0.0, 5.74]",1.02


Normal represented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
31,communications policy,1.46,"[0.0, 4.35]",-0.08
32,health policy,1.46,"[0.0, 4.35]",-0.08
33,economic support,1.46,"[0.0, 4.35]",-0.08
34,European Centre for Disease Prevention and Con...,1.46,"[0.0, 4.35]",-0.08
35,health service,1.46,"[0.0, 4.35]",-0.08
36,research and development,1.46,"[0.0, 4.35]",-0.08
37,social participation,1.46,"[0.0, 4.35]",-0.08
38,public health,1.46,"[0.0, 4.35]",-0.08
39,occupational health,1.46,"[0.0, 4.35]",-0.08
40,disease prevention,1.46,"[0.0, 4.35]",-0.08


Underrepresented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
42,distance learning,1.22,"[0.0, 3.87]",-0.44
43,labour market,1.22,"[0.0, 3.87]",-0.44
44,disease surveillance,1.22,"[0.0, 3.87]",-0.44
45,standard of living,1.22,"[0.0, 3.87]",-0.44
46,public hygiene,1.22,"[0.0, 3.87]",-0.44
47,infectious disease,1.22,"[0.0, 3.87]",-0.44
48,social sciences,1.22,"[0.0, 3.87]",-0.44
49,viral disease,1.22,"[0.0, 3.87]",-0.44
50,self-regulation,1.22,"[0.0, 3.87]",-0.44
51,patient rights,0.98,"[0.0, 3.36]",-0.8


Confidence Interval for page_type is : [0%, 34.74%]

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
0,Press release,58.27,"[18.81, 97.73]",2.02
1,Publication,27.16,"[0.0, 62.75]",0.51
2,Speech,7.41,"[0.0, 28.37]",-0.45
3,News,4.94,"[0.0, 22.28]",-0.57
4,Collection,1.98,"[0.0, 13.13]",-0.71
5,Form,0.25,"[0.0, 4.25]",-0.8


Overrepresented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
0,Press release,58.27,"[18.81, 97.73]",2.02


Normal represented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
1,Publication,27.16,"[0.0, 62.75]",0.51
2,Speech,7.41,"[0.0, 28.37]",-0.45
3,News,4.94,"[0.0, 22.28]",-0.57
4,Collection,1.98,"[0.0, 13.13]",-0.71
5,Form,0.25,"[0.0, 4.25]",-0.8


Underrepresented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score


## Categorical data collision analysis in columns

In [56]:
# Run the per-column collision analysis on the categorical columns only.
# NOTE(review): what counts as a "collision" is defined in
# eda_wrangling.collision_analysis, which is not visible from this notebook.
class_collision_in_columns(ireland_timeline[CATEGORICAL_COLUMNS])

## Analysis of categorical data collisions in DataFrame

In [57]:
# Run the DataFrame-wide collision analysis on the categorical columns only.
# NOTE(review): presumably compares classes across columns rather than within
# one column — confirm in eda_wrangling.collision_analysis.
class_collision(ireland_timeline[CATEGORICAL_COLUMNS])

Collision in dataframe