# Categorical EDA on the enriched EU timeline data

## Import libraries

In [2]:
# import sys
# sys.path.append("/home/jovyan/work/sem-covid/")
# sys.path = list(set(sys.path))
#
# import os
# os.getcwd()
# os.chdir('/home/jovyan/work/sem-covid/')

import plotly.express as px

from IPython.display import display, Markdown
from sem_covid.services.data_registry import Dataset
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.categorical_analyze import fast_categorical_analyze
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.confidence_interval_analysis import (
    confidence_interval_with_mean, z_score_for_series, confidence_interval_for_proportion)
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.collision_analysis import (class_collision_in_columns,
                                                                                       class_collision)
from sem_covid.entrypoints.notebooks.EDA.eda_wrangling.cramer_analysis import get_cramer_corr_matrix

In [None]:
# Default figure size (in pixels) for every Plotly Express chart created after this cell.
px.defaults.width = 800
px.defaults.height = 400

## Define constants

In [None]:
# Columns passed to the frequency analysis and to both collision analyses.
CATEGORICAL_COLUMNS = [
    'category',
    'subcategory',
    'businesses',
    'citizens',
    'workers',
    'type_of_measure',
]

# Columns passed to the Cramer correlation matrix. Currently the same set as
# CATEGORICAL_COLUMNS, kept as an independent list so the two can diverge.
CRAMER_ANALYSIS_COLUMNS = [
    'category',
    'subcategory',
    'businesses',
    'citizens',
    'workers',
    'type_of_measure',
]

## Fetch the data

In [None]:
# Load the enriched EU action timeline dataset through the project's data registry.
# NOTE(review): the result is later indexed with a list of column names, so it is
# presumably a pandas DataFrame — confirm against Dataset.fetch().
eu_timeline_enriched = Dataset.EU_ACTION_TIMELINE_ENRICHED.fetch()

## EDA on categorical data from loaded dataset

In [None]:
# Run the project's categorical EDA helper on the selected columns.
# The cells below iterate the result as a mapping: analysed column -> table
# (each table is copied and read through `.columns`).
eda_result = fast_categorical_analyze(
    eu_timeline_enriched,
    CATEGORICAL_COLUMNS,
    'Eu Timeline Data',
)

## Analysis and visualization:
- Z score
- cumulative frequencies
- difference in neighboring frequencies

In [None]:
# For each analysed column: derive the z-score, cumulative sum and
# neighbour-to-neighbour difference of the table's second column, show the
# enriched table, then draw three bar charts.
bar_colours = ['#003d66']
# (x, y) column positions of the three charts, resolved after the derived
# columns have been added to the table.
chart_axes = ((2, 0), (0, 3), (0, 4))

for column_key, source_table in eda_result.items():
    table = source_table.copy()  # never mutate the cached EDA result
    value_col = table.columns[1]
    label_col = table.columns[0]

    table[label_col + '_z_score'] = z_score_for_series(table[value_col])
    table['Cumulative freq'] = table[value_col].cumsum()
    table['Diff freq'] = table[value_col].diff()

    std_dev = round(table[value_col].std(), 2)
    display(Markdown(f"Std deviation for [{column_key}] is [{std_dev}]"))
    display(table)

    for x_pos, y_pos in chart_axes:
        figure = px.bar(
            table,
            x=table.columns[x_pos],
            y=table.columns[y_pos],
            color_discrete_sequence=bar_colours,
        )
        figure.show()

## Analysis of confidence intervals:
- calculating the confidence interval for each column
- calculating the confidence interval for each proportion in the column
- identifying the records in the column that are overrepresented
- identifying the records in the column that are underrepresented

In [None]:
# For each analysed column: compute a confidence interval over the values of
# the table's second column (scaled down by 100), attach per-row interval and
# z-score columns, then list the rows above, inside and below the interval.
# NOTE(review): the row filters compare the unscaled 'Relative freq' column
# with bounds computed from the scaled series — confirm that
# confidence_interval_with_mean returns bounds on the same scale.
for column_key, source_table in eda_result.items():
    table = source_table.copy()  # never mutate the cached EDA result
    proportions = table[table.columns[1]] / 100

    interval = confidence_interval_with_mean(proportions)
    ci_lower, ci_upper = interval[0], interval[1]
    display(Markdown(f"Confidence Interval for {column_key} is : [{ci_lower}%, {ci_upper}%]"))

    table['Confidence Interval'] = confidence_interval_for_proportion(proportions)
    table['z_score'] = z_score_for_series(proportions)
    display(table)

    display(Markdown(f"Overrepresented records from column : {column_key}"))
    relative = table['Relative freq']
    above = relative > ci_upper
    display(table.loc[above])

    display(Markdown(f"Normal represented records from column : {column_key}"))
    inside = (relative >= ci_lower) & (relative <= ci_upper)
    display(table.loc[inside])

    display(Markdown(f"Underrepresented records from column : {column_key}"))
    below = relative < ci_lower
    display(table.loc[below])

## Categorical data collision analysis in columns

In [None]:
# Run the project's per-column collision helper on the categorical columns only.
# The call is the cell's last expression, so the notebook renders whatever it returns.
class_collision_in_columns(eu_timeline_enriched[CATEGORICAL_COLUMNS])

## Analysis of categorical data collisions in DataFrame

In [None]:
# Run the project's collision helper on the categorical columns taken together.
# The call is the cell's last expression, so the notebook renders whatever it returns.
class_collision(eu_timeline_enriched[CATEGORICAL_COLUMNS])

## Cramer analysis

In [None]:
# Build the Cramer correlation matrix for the columns in CRAMER_ANALYSIS_COLUMNS.
# The call is the cell's last expression, so the notebook renders whatever it returns.
get_cramer_corr_matrix(eu_timeline_enriched[CRAMER_ANALYSIS_COLUMNS])
