# EDA on Irish timeline data

Import libraries


In [4]:
import json
from IPython.display import display, Markdown
import statsmodels.api as sm

import plotly.express as px
import pandas as pd

from ml_experiments.config import config
from ml_experiments.adapters.minio_adapter import MinioAdapter
from ml_experiments.entrypoints.notebooks.EDA.eda_wrangling.categorical_analyze import fast_categorical_analyze
from ml_experiments.entrypoints.notebooks.EDA.eda_wrangling.confidence_interval_analysis import (
    confidence_interval_with_mean, confidence_interval_for_proportion, z_score_for_series)
from ml_experiments.entrypoints.notebooks.EDA.eda_wrangling.collision_analysis import (class_collision_in_columns,
                                                                                       class_collision)


In [5]:
# Columns of the loaded frame that the rest of the notebook treats as categorical.
CATEGORICAL_COLUMNS = ['keyword', 'page_type']

# Adapter configured with the MinIO URL, access/secret keys and the Irish
# timeline bucket name, all taken from the project config.
minio = MinioAdapter(
    config.MINIO_URL,
    config.MINIO_ACCESS_KEY,
    config.MINIO_SECRET_KEY,
    config.IRISH_TIMELINE_BUCKET_NAME,
)

# Read the object named by config.IRISH_TIMELINE_JSON, parse it as JSON and
# build a DataFrame from the parsed records.
irish_timeline_payload = minio.get_object(config.IRISH_TIMELINE_JSON)
irish_timeline_json = json.loads(irish_timeline_payload)
irish_timeline_dataframe = pd.DataFrame.from_records(irish_timeline_json)

## EDA on categorical data from loaded dataset


In [6]:
# Run the categorical analysis on the selected columns; the result is indexed
# by column name in the cells below.
# NOTE(review): the label passed here says "Eurlex Dataset" although the frame
# was loaded from the Irish timeline bucket — confirm the label is intended.
eda_result = fast_categorical_analyze(
    irish_timeline_dataframe,
    CATEGORICAL_COLUMNS,
    "Eurlex Dataset",
)

Unnamed: 0,index,Absolute freq
0,content,1
1,content_links,11
2,campaigns_links,17
3,part_of_links,14
4,documents,22


Unnamed: 0,keyword,Relative freq
0,crisis,31.25
1,covid,31.25
2,pandemic,28.12
3,virus,9.38


Unnamed: 0,page_type,Relative freq
0,Press release,78.12
1,Publication,18.75
2,,3.12


## Analysis and visualization
- Z-score of each frequency
- Cumulative frequencies
- Differences between neighboring frequencies

In [7]:
# For every frequency table in eda_result: add the z-score, the cumulative sum
# and the row-to-row difference of the frequency column, show the enriched
# table, and draw one bar chart per derived column.
cumulative_freq = 'Cumulative freq'
diff_freq = 'Diff freq'

for key, table in eda_result.items():
    data = table.copy()  # work on a copy so eda_result stays untouched

    # Per the rendered tables, column 0 holds the category labels and
    # column 1 the frequency values.
    category_column = data.columns[0]
    column_name = data.columns[1]
    zscore_column = category_column + '_z_score'

    freq = data[column_name]
    std_dev = freq.std()  # computed once; reused for the z-score and the message
    data[zscore_column] = round((freq - freq.mean()) / std_dev, 2)
    data[cumulative_freq] = freq.cumsum()
    data[diff_freq] = freq.diff()  # first row has no predecessor -> NaN

    display(Markdown(f"Std deviation for [{key}] is [{round(std_dev, 2)}]"))
    display(data)

    # Address the derived columns by name instead of by position so the charts
    # stay correct even if a frequency table carries extra columns.
    px.bar(data, x=zscore_column, y=category_column,
           title=f"{key}: z-score of {column_name}").show()
    px.bar(data, x=category_column, y=cumulative_freq,
           title=f"{key}: cumulative {column_name}").show()
    px.bar(data, x=category_column, y=diff_freq,
           title=f"{key}: difference between neighboring {column_name} values").show()

Std deviation for [keyword] is [10.52]

Unnamed: 0,keyword,Relative freq,keyword_z_score,Cumulative freq,Diff freq
0,crisis,31.25,0.59,31.25,
1,covid,31.25,0.59,62.5,0.0
2,pandemic,28.12,0.3,90.62,-3.13
3,virus,9.38,-1.49,100.0,-18.74


Std deviation for [page_type] is [39.57]

Unnamed: 0,page_type,Relative freq,page_type_z_score,Cumulative freq,Diff freq
0,Press release,78.12,1.13,78.12,
1,Publication,18.75,-0.37,96.87,-59.37
2,,3.12,-0.76,99.99,-15.63


## Additional functions for confidence interval analysis:

In [8]:
# For every frequency table in eda_result: compute the confidence interval of
# the mean frequency, attach a per-row confidence interval and z-score, and
# split the rows into over-, normally and under-represented groups relative to
# the interval of the mean.
for key, table in eda_result.items():
    data = table.copy()  # work on a copy so eda_result stays untouched

    # Column 1 is the frequency column ('Relative freq' in the rendered
    # tables); use one reference for both the helpers and the filters below.
    rel_f = data.columns[1]
    # The helpers are fed proportions, i.e. the percent values divided by 100.
    proportions = data[rel_f] / 100

    ci_mean = confidence_interval_with_mean(proportions)
    display(Markdown(f"Confidence Interval for {key} is : [{ci_mean[0]}%, {ci_mean[1]}%]"))
    data["Confidence Interval"] = confidence_interval_for_proportion(proportions)
    data["z_score"] = z_score_for_series(proportions)
    display(data)

    # NOTE(review): the bounds are compared directly against the percent
    # column; the rendered output suggests the helper returns percent values
    # even though it receives proportions — confirm against the helper.
    lower, upper = ci_mean[0], ci_mean[1]
    groups = (
        ("Overrepresented", data[rel_f] > upper),
        ("Normal represented", (data[rel_f] >= lower) & (data[rel_f] <= upper)),
        ("Underrepresented", data[rel_f] < lower),
    )
    for label, mask in groups:
        display(Markdown(f"{label} records from column : {key}"))
        display(data.loc[mask])

Confidence Interval for keyword is : [14.69%, 35.31%]

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
0,crisis,31.25,"[0.0, 76.67]",0.69
1,covid,31.25,"[0.0, 76.67]",0.69
2,pandemic,28.12,"[0.0, 72.18]",0.34
3,virus,9.38,"[0.0, 37.95]",-1.71


Overrepresented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score


Normal represented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
0,crisis,31.25,"[0.0, 76.67]",0.69
1,covid,31.25,"[0.0, 76.67]",0.69
2,pandemic,28.12,"[0.0, 72.18]",0.34


Underrepresented records from column : keyword

Unnamed: 0,keyword,Relative freq,Confidence Interval,z_score
3,virus,9.38,"[0.0, 37.95]",-1.71


Confidence Interval for page_type is : [-11.45%, 78.11%]

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
0,Press release,78.12,"[31.34, 100.0]",1.39
1,Publication,18.75,"[0.0, 62.92]",-0.45
2,,3.12,"[0.0, 22.79]",-0.94


Overrepresented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
0,Press release,78.12,"[31.34, 100.0]",1.39


Normal represented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score
1,Publication,18.75,"[0.0, 62.92]",-0.45
2,,3.12,"[0.0, 22.79]",-0.94


Underrepresented records from column : page_type

Unnamed: 0,page_type,Relative freq,Confidence Interval,z_score


## Categorical data collision analysis in columns

In [11]:
# Restrict the frame to the categorical columns and hand it to the
# column-level collision helper; the call stays last so its result is shown.
categorical_frame = irish_timeline_dataframe[CATEGORICAL_COLUMNS]
class_collision_in_columns(categorical_frame)

## Analysis of categorical data collisions in DataFrame

In [13]:
# Restrict the frame to the categorical columns and hand it to the
# frame-level collision helper; the call stays last so its result is shown.
categorical_frame = irish_timeline_dataframe[CATEGORICAL_COLUMNS]
class_collision(categorical_frame)

Collision in dataframe