# Seleção de Features usando o teste chi quadrado de independência - abordagem 2

## 1. Carregando módulos e dados

In [5]:
# Manipulação dos dados
import numpy  as np
import scipy  as sp
import pandas as pd

# Visualização de dados
import matplotlib.pyplot as plt
import seaborn           as sns
%matplotlib inline

from scipy.stats import chi2_contingency

In [6]:
# Carregando os dados
dados = pd.read_csv('../../Dados/abordagem2.csv')

In [7]:
# Visualização da tabela
dados.head(5)

Unnamed: 0,desfecho,Idade,TTO_anterior_TB_triagem_enfermeiro_Não,TTO_anterior_TB_triagem_enfermeiro_Sim,TTO_anterior_TB_triagem_enfermeiro_ignorado,Cicatriz_BCG_Não,Cicatriz_BCG_Sim,Cicatriz_BCG_ignorado,Sexo_Feminino,Sexo_Masculino,...,Contato_TBP_2anos_Não,Contato_TBP_2anos_Sim,Contato_TBP_2anos_ignorado,Fuma_Ex-fumante,Fuma_Fumante,Fuma_Jamais fumante,Fuma_ignorado,CAGE_Negativo,CAGE_Positivo,CAGE_ignorado
0,TB+,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,TB+,53,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TB+,84,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,TB+,18,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,TB+,36,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## 2. Seleção de variáveis utilizando o teste do chi quadrado de independência

O teste do chi quadrado verifica se existe a relação entre duas variáveis categóricas. Dessa forma, será verificada a relação entre as variáveis presentes e o desfecho, ou seja, se o atributo possui alguma relação com o diagnóstico da doença ou não. O teste do Chi-quadrado é realizado para a verificação de independência entre duas variáveis categóricas:

- Hipótese nula $H_0$ : As variáveis são independentes.
- Hipótese alternativa $H_a$: As variáveis são dependentes

In [8]:
# Criando um DataFrame para o armazenamento dos valores

dict_features_ignorados = dict()

for feature in list(dados.columns)[1:]:
        
    print('Tabela de Contingência entre o desfecho e {}\n'.format(feature))
    
    # Contingency table from pandas
    cross_tab = pd.crosstab(dados[feature],dados['desfecho'],margins=True)
    print(pd.crosstab(dados[feature],dados['desfecho'],margins=True))
    print()
    
    # Chi Squared Test 
    chi2, p, dof, expected =  chi2_contingency(cross_tab)
    print('Chi2: {}\tp-value: {}\tDOF: {}\nExpected_val: \n{}\n\n'.format(chi2,p,dof,expected))
    
    dict_features_ignorados[feature] = [chi2,p,dof,(len(dados[feature].unique()) - 1),]

Tabela de Contingência entre o desfecho e Idade

desfecho  TB+   TB-   All
Idade                    
9           0     1     1
12          2     1     3
13          4     5     9
14          7    16    23
15          9     7    16
16         13    12    25
17         12    17    29
18         19    24    43
19         17    19    36
20         18    18    36
21         20    22    42
22         28    22    50
23         22    30    52
24         26    26    52
25         11    20    31
26         22    22    44
27         27    26    53
28         15    33    48
29         16    23    39
30         24    30    54
31         19    14    33
32         16    31    47
33         26    30    56
34         22    27    49
35         18    27    45
36         15    36    51
37         11    34    45
38         17    25    42
39         13    18    31
40         15    23    38
...       ...   ...   ...
65          5    36    41
66          4    20    24
67          7    26    33
68          6  

In [13]:
# Criando um DataFrame a partir do dicionário criado anteriormente
chi2_dict = pd.DataFrame.from_dict(dict_features_ignorados,orient='index')
chi2_dict.columns = ['chi2','pval','dof','dof_real']

In [14]:
variaveis_dependentes   = chi2_dict[chi2_dict.pval <= 0.05]
variaveis_independentes = chi2_dict[chi2_dict.pval > 0.05]

In [15]:
variaveis_dependentes

Unnamed: 0,chi2,pval,dof,dof_real
Semanas_tosse_Sim,58.151622,7.091746e-12,4,1
Dispnéia_Sim,20.340945,0.0004276529,4,1
Cicatriz_BCG_Sim,14.609215,0.005584296,4,1
Cicatriz_BCG_Não,16.081225,0.002912087,4,1
Perda_de_apetite_Não,30.755863,3.433273e-06,4,1
Prisão_2anos_Não,29.959147,4.989065e-06,4,1
Contato_TBP_2anos_Sim,16.7594,0.002152442,4,1
Dispnéia_Não,20.891141,0.0003328004,4,1
Contato_TBP_2anos_Não,14.267672,0.006487985,4,1
Delegacia_2anos_Sim,20.448214,0.0004072664,4,1
