# Classificação de Variáveis

In [None]:
%%capture
!pip install pingouin
!pip install researchpy

In [None]:
import sys
import warnings
sys.path.append('..')
import seaborn as sns
import pingouin as pg
import researchpy as rpy
from myst_nb import glue
from global_configurations import *

# Silence every warning category so library chatter does not end up in the
# rendered notebook output.
warnings.filterwarnings('ignore')

# Passing None removes pandas' display limit for each option, so wide and
# long frames are printed in full.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the dataset through the project helpers (star-imported from
# global_configurations; their exact semantics are not visible here).
conf_dict = read_config_file(DATASET_INFO_PATH)
df = read_dataset(conf_dict, DATASET_PATH)

# infer_column_types hands back the (possibly modified) frame plus the column
# groupings used by the cells below.
# NOTE(review): the name `datetime` would shadow the stdlib module if that
# module were ever imported in this notebook -- confirm this is intended.
(df,
 numerical,
 categorical,
 categorical_dt,
 datetime,
 target) = infer_column_types(df, conf_dict)

In [None]:
# Rank the numerical variables by their association with the target.
# For each variable a one-way ANOVA (target as the between-groups factor) is
# run with pingouin, and the effect size from its 'np2' column is kept.
eta_sq = {}
for v in numerical:
    aov = pg.anova(dv=v, between=target, data=df, detailed=True)
    # No effect-size column in the ANOVA table: score the variable as 0.
    if 'np2' not in aov.columns:
        eta_sq[v] = 0.0
        continue
    # First row of the table by position (presumably the between-groups
    # term); .iloc avoids relying on the index labels being 0..n.
    effect = aov['np2'].iloc[0]
    # A NaN score would make the ordering produced by sorted() undefined,
    # so treat it like the "no effect size" case above.
    eta_sq[v] = 0.0 if pd.isna(effect) else effect
# List of (variable, score) pairs, strongest association first.
eta_sq = sorted(eta_sq.items(), key=operator.itemgetter(1), reverse=True)

fig = plt.figure(figsize=(15, 5))
# Only the 40 strongest variables are plotted, matching the "Top 40" heading
# and the categorical (Cramér's V) plot.
ax = sns.barplot(x='Variável', y='Eta-Squared', data=pd.DataFrame(eta_sq, columns=['Variável', 'Eta-Squared']).iloc[:40])
ax.tick_params(axis='x', rotation=90)

# Register the figure under a key so the markdown can embed it with
# {glue:figure}; display=False keeps it out of this cell's own output.
glue("numerical_eta_sq_plot", fig, display=False)

## Top 40 variáveis numéricas com maior associação com o alvo
```{glue:figure} numerical_eta_sq_plot
```

In [None]:
# Rank the categorical variables by their association with the target, using
# the statistic researchpy reports alongside its chi-square test.
cramers_v = {}
# TODO Identifier variables
# The first three entries of `categorical` are skipped (presumably the
# identifier columns the TODO refers to -- confirm).
for column in categorical[3:]:
    contingency, chi2_summary = rpy.crosstab(df[target], df[column], test='chi-square')
    # Entry labelled 2 of the 'results' column -- presumably Cramér's V, per
    # the variable name; verify against researchpy's output layout.
    cramers_v[column] = chi2_summary['results'][2]

# Turn the mapping into (variable, score) pairs, strongest association first.
cramers_v = list(cramers_v.items())
cramers_v.sort(key=operator.itemgetter(1), reverse=True)

fig = plt.figure(figsize=(15, 5))
ranking = pd.DataFrame(cramers_v, columns=['Variável', 'Cramers V'])
# Plot only the 40 highest-scoring variables.
ax = sns.barplot(data=ranking.head(40), x='Variável', y='Cramers V')
ax.tick_params(axis='x', rotation=90)

# Register the figure for the {glue:figure} directive without showing it here.
glue("categorical_cramers_v_plot", fig, display=False)

## Top 40 variáveis categóricas com maior associação com o alvo
```{glue:figure} categorical_cramers_v_plot
```