# Categorical EDA on EUR-Lex data

## Import libraries

In [39]:
import os
import sys

# Root of the sem-covid checkout; it is added to the import path and used as
# the working directory for the rest of the notebook.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project importable. Duplicates are dropped with dict.fromkeys, which
# keeps first-seen order; round-tripping through set() would shuffle sys.path
# and make the module resolution order arbitrary from run to run.
sys.path.append(PROJECT_ROOT)
sys.path[:] = dict.fromkeys(sys.path)

# Run the remaining cells with the project root as the current directory.
os.chdir(PROJECT_ROOT)

import plotly.express as px

from IPython.display import display, Markdown
from sem_covid.services.data_registry import Dataset
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.categorical_analyze import fast_categorical_analyze
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.confidence_interval_analysis import (
    confidence_interval_with_mean, z_score_for_series, confidence_interval_for_proportion)
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.collision_analysis import (class_collision_in_columns,
                                                                                       class_collision)

## Define constants

In [40]:
# Columns of the dataset that are passed to the categorical analyses and the
# collision checks in this notebook.
CATEGORICAL_COLUMNS = [
    'resource_type_labels',
    'eurovoc_concept_labels',
    'subject_matter_labels',
    'directory_codes_labels',
    'author_labels',
    'internal_comments',
]

## Fetch the data

In [41]:
# Load the EU_CELLAR dataset through the project's data registry.
# NOTE(review): later cells index the result with a list of column names, so it
# is presumably a pandas DataFrame — confirm against Dataset.fetch().
eurlex = Dataset.EU_CELLAR.fetch()

## EDA on categorical data from loaded dataset

In [42]:
# Run the project's categorical summary over the selected columns.
# Later cells iterate eda_result by key and call .copy() / .columns on each
# value, so it is used as a mapping of column name -> frequency table.
# NOTE(review): the third argument is presumably a display title — confirm.
eda_result = fast_categorical_analyze(eurlex, CATEGORICAL_COLUMNS , 'Eurlex Dataset')

Unnamed: 0,index,Absolute freq
0,title,714
1,cdm_type_labels,9792
2,subject_matters,6488
3,subject_matter_labels,6488
4,directory_codes,6419
5,directory_codes_labels,6419
6,celex_numbers,5206
7,legal_elis,8476
8,authors,5050
9,author_labels,5050


Unnamed: 0,resource_type_labels,Relative freq
0,General publications,41.48
1,Study,11.45
2,Judicial information,5.81
3,Notice,4.75
4,Corrigendum,4.67
5,Regulation,2.77
6,Staff working document,2.29
7,Executive summary of a study,2.19
8,Decision,2.15
9,Implementing regulation,2.04


Unnamed: 0,eurovoc_concept_labels,Relative freq
0,epidemic,12.97
1,coronavirus disease,10.62
2,innovation,9.78
3,infectious disease,6.49
4,public health,4.36
5,air transport,4.15
6,labour market,3.98
7,research and development,3.9
8,health risk,3.48
9,economic consequence,3.06


Unnamed: 0,subject_matter_labels,Relative freq
0,Transport,9.46
1,Public health,7.95
2,Economic policy,4.55
3,Internal market - Principles,4.45
4,External relations,3.88
5,Environment,3.63
6,State aids,3.5
7,Economic and Monetary Union,3.47
8,European Free Trade Association (EFTA),3.08
9,Approximation of laws,2.72


Unnamed: 0,directory_codes_labels,Relative freq
0,Health protection,5.09
1,State aids and other subsidies,4.46
2,Economic and monetary union,4.28
3,Member countries of the European Free Trade As...,3.96
4,Conventions with non-member countries,3.38
5,Protection of health and safety,3.23
6,Motor vehicles,2.94
7,Technical and safety conditions,2.84
8,Employment and unemployment,2.84
9,Budget,2.65


Unnamed: 0,author_labels,Relative freq
0,European Commission,34.42
1,Council of the European Union,9.77
2,Court of Justice,6.09
3,European Parliament,5.34
4,Secretariat-General,3.97
5,Directorate-General for Health and Food Safety,3.85
6,Directorate-General for Mobility and Transport,3.71
7,General Court,3.03
8,Directorate-General for Translation,2.52
9,Directorate-General for Competition,2.32


Unnamed: 0,internal_comments,Relative freq
0,MAN2,58.84
1,COVID19,39.27
2,BREXIT,1.54
3,"MAN2, COVID19",0.21
4,552,0.05
5,COVID-19,0.05
6,COOVID19,0.03


## Analysis and visualization:
- z-score of the relative frequencies
- cumulative frequencies
- difference between neighboring frequencies

In [43]:
# Names of the derived columns; they do not change between iterations.
cumulative_freq = 'Cumulative freq'
diff_freq = 'Diff freq'

for key, frequency_table in eda_result.items():
    # Work on a copy so the tables held in eda_result stay untouched.
    data = frequency_table.copy()
    # First column holds the category labels, second the relative frequency.
    label_column, column_name = data.columns[0], data.columns[1]
    z_score_column = label_column + '_z_score'

    data[z_score_column] = z_score_for_series(data[column_name])
    data[cumulative_freq] = data[column_name].cumsum()
    # Difference to the previous row; the first row has no predecessor (NaN).
    data[diff_freq] = data[column_name].diff()

    display(Markdown(f"Std deviation for [{key}] is [{round(data[column_name].std(),2)}]"))
    display(data)

    # Columns are addressed by name rather than by position so the charts stay
    # correct even if the input table carries additional columns.
    px.bar(data, x=z_score_column, y=label_column,
           title=f"{key}: z-score of relative frequency").show()
    px.bar(data, x=label_column, y=cumulative_freq,
           title=f"{key}: cumulative relative frequency").show()
    px.bar(data, x=label_column, y=diff_freq,
           title=f"{key}: difference between neighboring frequencies").show()

Std deviation for [resource_type_labels] is [5.19]

Unnamed: 0,resource_type_labels,Relative freq,resource_type_labels_z_score,Cumulative freq,Diff freq
0,General publications,41.48,7.77,41.48,
1,Study,11.45,1.94,52.93,-30.03
2,Judicial information,5.81,0.85,58.74,-5.64
3,Notice,4.75,0.64,63.49,-1.06
4,Corrigendum,4.67,0.62,68.16,-0.08
...,...,...,...,...,...
64,Annual report,0.02,-0.28,99.96,0.00
65,Draft implementing decision,0.02,-0.28,99.98,0.00
66,Proposal for an implementing regulation,0.02,-0.28,100.00,0.00
67,Executive summary of the fitness check,0.02,-0.28,100.02,0.00


Std deviation for [eurovoc_concept_labels] is [1.49]

Unnamed: 0,eurovoc_concept_labels,Relative freq,eurovoc_concept_labels_z_score,Cumulative freq,Diff freq
0,epidemic,12.97,8.44,12.97,
1,coronavirus disease,10.62,6.86,23.59,-2.35
2,innovation,9.78,6.29,33.37,-0.84
3,infectious disease,6.49,4.08,39.86,-3.29
4,public health,4.36,2.65,44.22,-2.13
...,...,...,...,...,...
228,European Social Fund,0.01,-0.28,99.58,0.00
229,European security,0.01,-0.28,99.59,0.00
230,distribution of EU funding,0.01,-0.28,99.60,0.00
231,external border of the EU,0.01,-0.28,99.61,0.00


Std deviation for [subject_matter_labels] is [1.38]

Unnamed: 0,subject_matter_labels,Relative freq,subject_matter_labels_z_score,Cumulative freq,Diff freq
0,Transport,9.46,6.32,9.46,
1,Public health,7.95,5.22,17.41,-1.51
2,Economic policy,4.55,2.76,21.96,-3.40
3,Internal market - Principles,4.45,2.68,26.41,-0.10
4,External relations,3.88,2.27,30.29,-0.57
...,...,...,...,...,...
128,European Investment Bank (EIB),0.03,-0.52,99.91,0.00
129,Space,0.03,-0.52,99.94,0.00
130,General provisions,0.03,-0.52,99.97,0.00
131,Tourism,0.03,-0.52,100.00,0.00


Std deviation for [directory_codes_labels] is [0.86]

Unnamed: 0,directory_codes_labels,Relative freq,directory_codes_labels_z_score,Cumulative freq,Diff freq
0,Health protection,5.09,5.34,5.09,
1,State aids and other subsidies,4.46,4.61,9.55,-0.63
2,Economic and monetary union,4.28,4.40,13.83,-0.18
3,Member countries of the European Free Trade As...,3.96,4.03,17.79,-0.32
4,Conventions with non-member countries,3.38,3.35,21.17,-0.58
...,...,...,...,...,...
193,General provisions,0.04,-0.54,99.84,0.00
194,Aeronautical industry,0.04,-0.54,99.88,0.00
195,Aid to developing countries,0.04,-0.54,99.92,0.00
196,Nuclear research,0.02,-0.57,99.94,-0.02


Std deviation for [author_labels] is [4.08]

Unnamed: 0,author_labels,Relative freq,author_labels_z_score,Cumulative freq,Diff freq
0,European Commission,34.42,8.19,34.42,
1,Council of the European Union,9.77,2.10,44.19,-24.65
2,Court of Justice,6.09,1.19,50.28,-3.68
3,European Parliament,5.34,1.01,55.62,-0.75
4,Secretariat-General,3.97,0.67,59.59,-1.37
...,...,...,...,...,...
75,European Union Intellectual Property Office,0.03,-0.30,99.99,0.00
76,Single Resolution Board,0.03,-0.30,100.02,0.00
77,Directorate-General for Structural Reform Support,0.03,-0.30,100.05,0.00
78,Committee on the Internal Market and Consumer ...,0.01,-0.31,100.06,-0.02


Std deviation for [internal_comments] is [24.42]

Unnamed: 0,internal_comments,Relative freq,internal_comments_z_score,Cumulative freq,Diff freq
0,MAN2,58.84,1.97,58.84,
1,COVID19,39.27,1.11,98.11,-19.57
2,BREXIT,1.54,-0.56,99.65,-37.73
3,"MAN2, COVID19",0.21,-0.62,99.86,-1.33
4,552,0.05,-0.63,99.91,-0.16
5,COVID-19,0.05,-0.63,99.96,0.0
6,COOVID19,0.03,-0.63,99.99,-0.02


## Analysis of confidence intervals:
- calculating the confidence interval for each column
- calculating the confidence interval for each proportion in the column
- listing the records in the column that are overrepresented (relative frequency above the column's confidence interval)
- listing the records in the column that are underrepresented (relative frequency below the column's confidence interval)

In [44]:
# Name of the relative-frequency column used for the representation filters.
rel_f = 'Relative freq'

for key, frequency_table in eda_result.items():
    data = frequency_table.copy()

    # Relative frequencies are stored in percent; the helpers receive fractions.
    proportions = data[data.columns[1]] / 100

    ci_mean = confidence_interval_with_mean(proportions)
    lower_bound, upper_bound = ci_mean[0], ci_mean[1]
    display(Markdown(f"Confidence Interval for {key} is : [{lower_bound}%, {upper_bound}%]"))

    data["Confidence Interval"] = confidence_interval_for_proportion(proportions)
    data["z_score"] = z_score_for_series(proportions)
    display(data)

    # Split the rows by where their relative frequency falls with respect to
    # the column-level interval: above it, inside it (bounds included), below it.
    frequencies = data[rel_f]
    sections = (
        (f"Overrepresented records from column: {key}", frequencies > upper_bound),
        (f"Normal represented records from column : {key}", frequencies.between(lower_bound, upper_bound)),
        (f"Underrepresented records from column : {key}", frequencies < lower_bound),
    )
    for heading, row_mask in sections:
        display(Markdown(heading))
        display(data.loc[row_mask])

Confidence Interval for resource_type_labels is : [0.23%, 2.67%]

Unnamed: 0,resource_type_labels,Relative freq,Confidence Interval,z_score
0,General publications,41.48,"[29.85, 53.11]",7.77
1,Study,11.45,"[3.94, 18.96]",1.94
2,Judicial information,5.81,"[0.29, 11.33]",0.85
3,Notice,4.75,"[0.0, 9.77]",0.64
4,Corrigendum,4.67,"[0.0, 9.65]",0.62
...,...,...,...,...
64,Annual report,0.02,"[0.0, 0.35]",-0.28
65,Draft implementing decision,0.02,"[0.0, 0.35]",-0.28
66,Proposal for an implementing regulation,0.02,"[0.0, 0.35]",-0.28
67,Executive summary of the fitness check,0.02,"[0.0, 0.35]",-0.28


Overrepresented records from column: resource_type_labels

Unnamed: 0,resource_type_labels,Relative freq,Confidence Interval,z_score
0,General publications,41.48,"[29.85, 53.11]",7.77
1,Study,11.45,"[3.94, 18.96]",1.94
2,Judicial information,5.81,"[0.29, 11.33]",0.85
3,Notice,4.75,"[0.0, 9.77]",0.64
4,Corrigendum,4.67,"[0.0, 9.65]",0.62
5,Regulation,2.77,"[0.0, 6.64]",0.26


Normal represented records from column : resource_type_labels

Unnamed: 0,resource_type_labels,Relative freq,Confidence Interval,z_score
6,Staff working document,2.29,"[0.0, 5.82]",0.16
7,Executive summary of a study,2.19,"[0.0, 5.64]",0.14
8,Decision,2.15,"[0.0, 5.57]",0.14
9,Implementing regulation,2.04,"[0.0, 5.38]",0.11
10,Implementing decision,1.83,"[0.0, 4.99]",0.07
11,Report,1.77,"[0.0, 4.88]",0.06
12,Communication,1.59,"[0.0, 4.54]",0.03
13,Announcements,1.54,"[0.0, 4.45]",0.02
14,Opinion,1.2,"[0.0, 3.77]",-0.05
15,Annex to a study,1.15,"[0.0, 3.67]",-0.06


Underrepresented records from column : resource_type_labels

Unnamed: 0,resource_type_labels,Relative freq,Confidence Interval,z_score
28,Exploratory opinion,0.21,"[0.0, 1.29]",-0.24
29,Position,0.21,"[0.0, 1.29]",-0.24
30,Statement of reasons,0.21,"[0.0, 1.29]",-0.24
31,Resolution,0.21,"[0.0, 1.29]",-0.24
32,Addendum,0.18,"[0.0, 1.18]",-0.25
33,Notification,0.18,"[0.0, 1.18]",-0.25
34,Communication concerning the position of the C...,0.17,"[0.0, 1.14]",-0.25
35,Proposal for a recommendation,0.17,"[0.0, 1.14]",-0.25
36,Declaration,0.16,"[0.0, 1.1]",-0.25
37,Proposal for a directive,0.15,"[0.0, 1.06]",-0.25


Confidence Interval for eurovoc_concept_labels is : [0.24%, 0.62%]

Unnamed: 0,eurovoc_concept_labels,Relative freq,Confidence Interval,z_score
0,epidemic,12.97,"[8.66, 17.28]",8.44
1,coronavirus disease,10.62,"[6.66, 14.58]",6.86
2,innovation,9.78,"[5.97, 13.59]",6.29
3,infectious disease,6.49,"[3.33, 9.65]",4.08
4,public health,4.36,"[1.74, 6.98]",2.65
...,...,...,...,...
228,European Social Fund,0.01,"[0.0, 0.14]",-0.28
229,European security,0.01,"[0.0, 0.14]",-0.28
230,distribution of EU funding,0.01,"[0.0, 0.14]",-0.28
231,external border of the EU,0.01,"[0.0, 0.14]",-0.28


Overrepresented records from column: eurovoc_concept_labels

Unnamed: 0,eurovoc_concept_labels,Relative freq,Confidence Interval,z_score
0,epidemic,12.97,"[8.66, 17.28]",8.44
1,coronavirus disease,10.62,"[6.66, 14.58]",6.86
2,innovation,9.78,"[5.97, 13.59]",6.29
3,infectious disease,6.49,"[3.33, 9.65]",4.08
4,public health,4.36,"[1.74, 6.98]",2.65
5,air transport,4.15,"[1.59, 6.71]",2.51
6,labour market,3.98,"[1.47, 6.49]",2.39
7,research and development,3.9,"[1.41, 6.39]",2.34
8,health risk,3.48,"[1.13, 5.83]",2.05
9,economic consequence,3.06,"[0.85, 5.27]",1.77


Normal represented records from column : eurovoc_concept_labels

Unnamed: 0,eurovoc_concept_labels,Relative freq,Confidence Interval,z_score
26,economic activity,0.53,"[0.0, 1.46]",0.07
27,vaccination,0.51,"[0.0, 1.42]",0.06
28,tourism,0.49,"[0.0, 1.39]",0.04
29,social media,0.49,"[0.0, 1.39]",0.04
30,quality of life,0.49,"[0.0, 1.39]",0.04
31,socioeconomic conditions,0.48,"[0.0, 1.37]",0.04
32,disinformation,0.47,"[0.0, 1.35]",0.03
33,freedom of movement,0.42,"[0.0, 1.25]",-0.01
34,e-Health,0.41,"[0.0, 1.23]",-0.01
35,protective equipment,0.41,"[0.0, 1.23]",-0.01


Underrepresented records from column : eurovoc_concept_labels

Unnamed: 0,eurovoc_concept_labels,Relative freq,Confidence Interval,z_score
48,illness,0.21,"[0.0, 0.8]",-0.15
49,working environment,0.21,"[0.0, 0.8]",-0.15
50,teleworking,0.18,"[0.0, 0.72]",-0.17
51,public hygiene,0.15,"[0.0, 0.65]",-0.19
52,social sciences,0.13,"[0.0, 0.59]",-0.20
...,...,...,...,...
228,European Social Fund,0.01,"[0.0, 0.14]",-0.28
229,European security,0.01,"[0.0, 0.14]",-0.28
230,distribution of EU funding,0.01,"[0.0, 0.14]",-0.28
231,external border of the EU,0.01,"[0.0, 0.14]",-0.28


Confidence Interval for subject_matter_labels is : [0.52%, 0.99%]

Unnamed: 0,subject_matter_labels,Relative freq,Confidence Interval,z_score
0,Transport,9.46,"[4.49, 14.43]",6.32
1,Public health,7.95,"[3.35, 12.55]",5.22
2,Economic policy,4.55,"[1.01, 8.09]",2.76
3,Internal market - Principles,4.45,"[0.95, 7.95]",2.68
4,External relations,3.88,"[0.6, 7.16]",2.27
...,...,...,...,...
128,European Investment Bank (EIB),0.03,"[0.0, 0.32]",-0.52
129,Space,0.03,"[0.0, 0.32]",-0.52
130,General provisions,0.03,"[0.0, 0.32]",-0.52
131,Tourism,0.03,"[0.0, 0.32]",-0.52


Overrepresented records from column: subject_matter_labels

Unnamed: 0,subject_matter_labels,Relative freq,Confidence Interval,z_score
0,Transport,9.46,"[4.49, 14.43]",6.32
1,Public health,7.95,"[3.35, 12.55]",5.22
2,Economic policy,4.55,"[1.01, 8.09]",2.76
3,Internal market - Principles,4.45,"[0.95, 7.95]",2.68
4,External relations,3.88,"[0.6, 7.16]",2.27
5,Environment,3.63,"[0.45, 6.81]",2.09
6,State aids,3.5,"[0.38, 6.62]",1.99
7,Economic and Monetary Union,3.47,"[0.36, 6.58]",1.97
8,European Free Trade Association (EFTA),3.08,"[0.14, 6.02]",1.69
9,Approximation of laws,2.72,"[0.0, 5.48]",1.43


Normal represented records from column : subject_matter_labels

Unnamed: 0,subject_matter_labels,Relative freq,Confidence Interval,z_score
27,Freedom of establishment,0.98,"[0.0, 2.65]",0.17
28,Foodstuffs,0.92,"[0.0, 2.54]",0.12
29,Taxation,0.87,"[0.0, 2.45]",0.09
30,Information and verification,0.82,"[0.0, 2.35]",0.05
31,Safety at work and elsewhere,0.79,"[0.0, 2.29]",0.03
32,Energy,0.79,"[0.0, 2.29]",0.03
33,Free movement of persons,0.7,"[0.0, 2.12]",-0.04
34,Value added tax,0.65,"[0.0, 2.02]",-0.07
35,Common organisation of agricultural markets,0.65,"[0.0, 2.02]",-0.07
36,Justice and home affairs,0.62,"[0.0, 1.95]",-0.1


Underrepresented records from column : subject_matter_labels

Unnamed: 0,subject_matter_labels,Relative freq,Confidence Interval,z_score
44,Protective measures,0.47,"[0.0, 1.63]",-0.20
45,Cooperation,0.47,"[0.0, 1.63]",-0.20
46,Investments,0.46,"[0.0, 1.61]",-0.21
47,Industrial policy,0.41,"[0.0, 1.5]",-0.25
48,Commercial policy,0.36,"[0.0, 1.38]",-0.28
...,...,...,...,...
128,European Investment Bank (EIB),0.03,"[0.0, 0.32]",-0.52
129,Space,0.03,"[0.0, 0.32]",-0.52
130,General provisions,0.03,"[0.0, 0.32]",-0.52
131,Tourism,0.03,"[0.0, 0.32]",-0.52


Confidence Interval for directory_codes_labels is : [0.39%, 0.62%]

Unnamed: 0,directory_codes_labels,Relative freq,Confidence Interval,z_score
0,Health protection,5.09,"[2.03, 8.15]",5.34
1,State aids and other subsidies,4.46,"[1.58, 7.34]",4.61
2,Economic and monetary union,4.28,"[1.46, 7.1]",4.40
3,Member countries of the European Free Trade As...,3.96,"[1.24, 6.68]",4.03
4,Conventions with non-member countries,3.38,"[0.86, 5.9]",3.35
...,...,...,...,...
193,General provisions,0.04,"[0.0, 0.32]",-0.54
194,Aeronautical industry,0.04,"[0.0, 0.32]",-0.54
195,Aid to developing countries,0.04,"[0.0, 0.32]",-0.54
196,Nuclear research,0.02,"[0.0, 0.22]",-0.57


Overrepresented records from column: directory_codes_labels

Unnamed: 0,directory_codes_labels,Relative freq,Confidence Interval,z_score
0,Health protection,5.09,"[2.03, 8.15]",5.34
1,State aids and other subsidies,4.46,"[1.58, 7.34]",4.61
2,Economic and monetary union,4.28,"[1.46, 7.1]",4.4
3,Member countries of the European Free Trade As...,3.96,"[1.24, 6.68]",4.03
4,Conventions with non-member countries,3.38,"[0.86, 5.9]",3.35
5,Protection of health and safety,3.23,"[0.77, 5.69]",3.18
6,Motor vehicles,2.94,"[0.59, 5.29]",2.84
7,Technical and safety conditions,2.84,"[0.53, 5.15]",2.72
8,Employment and unemployment,2.84,"[0.53, 5.15]",2.72
9,Budget,2.65,"[0.41, 4.89]",2.5


Normal represented records from column : directory_codes_labels

Unnamed: 0,directory_codes_labels,Relative freq,Confidence Interval,z_score
41,Proprietary medicinal products,0.6,"[0.0, 1.68]",0.11
42,Working conditions,0.58,"[0.0, 1.64]",0.09
43,Common Foreign and Security Policy,0.58,"[0.0, 1.64]",0.09
44,Dissemination of information,0.54,"[0.0, 1.56]",0.04
45,Cooperation with international and non-governm...,0.52,"[0.0, 1.52]",0.02
46,Safety at work,0.46,"[0.0, 1.4]",-0.05
47,Social conditions,0.46,"[0.0, 1.4]",-0.05
48,Water protection and management,0.46,"[0.0, 1.4]",-0.05
49,Statistics,0.46,"[0.0, 1.4]",-0.05
50,The Near and Middle East,0.42,"[0.0, 1.32]",-0.1


Underrepresented records from column : directory_codes_labels

Unnamed: 0,directory_codes_labels,Relative freq,Confidence Interval,z_score
54,Commission,0.38,"[0.0, 1.24]",-0.15
55,Social policy,0.38,"[0.0, 1.24]",-0.15
56,Police and judicial cooperation in criminal an...,0.38,"[0.0, 1.24]",-0.15
57,Programmes,0.38,"[0.0, 1.24]",-0.15
58,Plant health,0.38,"[0.0, 1.24]",-0.15
...,...,...,...,...
193,General provisions,0.04,"[0.0, 0.32]",-0.54
194,Aeronautical industry,0.04,"[0.0, 0.32]",-0.54
195,Aid to developing countries,0.04,"[0.0, 0.32]",-0.54
196,Nuclear research,0.02,"[0.0, 0.22]",-0.57


Confidence Interval for author_labels is : [0.36%, 2.14%]

Unnamed: 0,author_labels,Relative freq,Confidence Interval,z_score
0,European Commission,34.42,"[24.01, 44.83]",8.19
1,Council of the European Union,9.77,"[3.26, 16.28]",2.10
2,Court of Justice,6.09,"[0.85, 11.33]",1.19
3,European Parliament,5.34,"[0.41, 10.27]",1.01
4,Secretariat-General,3.97,"[0.0, 8.25]",0.67
...,...,...,...,...
75,European Union Intellectual Property Office,0.03,"[0.0, 0.41]",-0.30
76,Single Resolution Board,0.03,"[0.0, 0.41]",-0.30
77,Directorate-General for Structural Reform Support,0.03,"[0.0, 0.41]",-0.30
78,Committee on the Internal Market and Consumer ...,0.01,"[0.0, 0.23]",-0.31


Overrepresented records from column: author_labels

Unnamed: 0,author_labels,Relative freq,Confidence Interval,z_score
0,European Commission,34.42,"[24.01, 44.83]",8.19
1,Council of the European Union,9.77,"[3.26, 16.28]",2.1
2,Court of Justice,6.09,"[0.85, 11.33]",1.19
3,European Parliament,5.34,"[0.41, 10.27]",1.01
4,Secretariat-General,3.97,"[0.0, 8.25]",0.67
5,Directorate-General for Health and Food Safety,3.85,"[0.0, 8.07]",0.64
6,Directorate-General for Mobility and Transport,3.71,"[0.0, 7.85]",0.61
7,General Court,3.03,"[0.0, 6.79]",0.44
8,Directorate-General for Translation,2.52,"[0.0, 5.95]",0.31
9,Directorate-General for Competition,2.32,"[0.0, 5.62]",0.26


Normal represented records from column : author_labels

Unnamed: 0,author_labels,Relative freq,Confidence Interval,z_score
12,EFTA Surveillance Authority,2.04,"[0.0, 5.14]",0.19
13,European Economic and Social Committee,1.88,"[0.0, 4.86]",0.16
14,"Directorate-General for Internal Market, Indus...",1.48,"[0.0, 4.13]",0.06
15,European Committee of the Regions,1.09,"[0.0, 3.37]",-0.04
16,"Directorate-General for Employment, Social Aff...",1.03,"[0.0, 3.24]",-0.05
17,Directorate-General for Migration and Home Aff...,1.02,"[0.0, 3.22]",-0.06
18,Directorate-General for Environment,0.94,"[0.0, 3.05]",-0.08
19,Directorate-General for Taxation and Customs U...,0.6,"[0.0, 2.29]",-0.16
20,"Directorate-General for Financial Stability, F...",0.58,"[0.0, 2.24]",-0.17
21,Directorate-General for Budget,0.58,"[0.0, 2.24]",-0.17


Underrepresented records from column : author_labels

Unnamed: 0,author_labels,Relative freq,Confidence Interval,z_score
30,European Personnel Selection Office,0.32,"[0.0, 1.56]",-0.23
31,Eurostat,0.26,"[0.0, 1.38]",-0.24
32,Directorate-General for Neighbourhood and Enla...,0.26,"[0.0, 1.38]",-0.24
33,EFTA Court,0.24,"[0.0, 1.31]",-0.25
34,"Commission for Social Policy, Education, Emplo...",0.23,"[0.0, 1.28]",-0.25
35,Representatives of the Governments of the Memb...,0.21,"[0.0, 1.21]",-0.26
36,Directorate-General for Trade,0.2,"[0.0, 1.18]",-0.26
37,European Systemic Risk Board,0.19,"[0.0, 1.14]",-0.26
38,Commission for Natural Resources,0.19,"[0.0, 1.14]",-0.26
39,Commission for Territorial Cohesion Policy and...,0.19,"[0.0, 1.14]",-0.26


Confidence Interval for internal_comments is : [0%, 32.38%]

Unnamed: 0,internal_comments,Relative freq,Confidence Interval,z_score
0,MAN2,58.84,"[22.38, 95.3]",1.97
1,COVID19,39.27,"[3.09, 75.45]",1.11
2,BREXIT,1.54,"[0.0, 10.66]",-0.56
3,"MAN2, COVID19",0.21,"[0.0, 3.6]",-0.62
4,552,0.05,"[0.0, 1.71]",-0.63
5,COVID-19,0.05,"[0.0, 1.71]",-0.63
6,COOVID19,0.03,"[0.0, 1.31]",-0.63


Overrepresented records from column: internal_comments

Unnamed: 0,internal_comments,Relative freq,Confidence Interval,z_score
0,MAN2,58.84,"[22.38, 95.3]",1.97
1,COVID19,39.27,"[3.09, 75.45]",1.11


Normal represented records from column : internal_comments

Unnamed: 0,internal_comments,Relative freq,Confidence Interval,z_score
2,BREXIT,1.54,"[0.0, 10.66]",-0.56
3,"MAN2, COVID19",0.21,"[0.0, 3.6]",-0.62
4,552,0.05,"[0.0, 1.71]",-0.63
5,COVID-19,0.05,"[0.0, 1.71]",-0.63
6,COOVID19,0.03,"[0.0, 1.31]",-0.63


Underrepresented records from column : internal_comments

Unnamed: 0,internal_comments,Relative freq,Confidence Interval,z_score


## Categorical data collision analysis in columns

In [45]:
# Run the project's per-column class-collision check on the categorical columns only.
# NOTE(review): what counts as a "collision" is defined in
# eda_wrangling.collision_analysis, not in this notebook — see that module.
class_collision_in_columns(eurlex[CATEGORICAL_COLUMNS])

Collision in column : resource_type_labels

Collision in column : eurovoc_concept_labels

Collision in column : subject_matter_labels

Collision in column : directory_codes_labels

Collision in column : author_labels

Collision in column : internal_comments

## Analysis of categorical data collisions in DataFrame

In [None]:
# Run the project's class-collision check on the categorical columns.
# NOTE(review): presumably this looks for collisions across the whole frame
# rather than per column — confirm in eda_wrangling.collision_analysis.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# presumably did not finish running; re-run before relying on its output.
class_collision(eurlex[CATEGORICAL_COLUMNS])

Collision in dataframe