# Seleção de Features usando o teste chi quadrado de independência

## 1. Carregando módulos e dados

In [1]:
# Manipulação dos dados
import numpy  as np
import scipy  as sp
import pandas as pd

# Visualização de dados
import matplotlib.pyplot as plt
import seaborn           as sns
%matplotlib inline

In [2]:
# Carregando os dados
data_normal_ohe   = pd.read_csv('../../Dados/one-hot.csv')
data_full_ohe = pd.read_csv('../../Dados/one-hot-total.csv')

In [3]:
# Visualização da tabela
data_normal_ohe.head(5)

Unnamed: 0,desfecho,TTO_anterior_TB_triagem_enfermeiro,Cicatriz_BCG,Idade,Sexo,Raça,Tem_companheiro,Tosse,Semanas_tosse,Expectoração,...,CAGE,Estado_civil_Casado,Estado_civil_Separado,Estado_civil_Solteiro,Estado_civil_ignorado,Estado_civil_viúvo,Fuma_Ex-fumante,Fuma_Fumante,Fuma_Jamais fumante,Fuma_ignorado
0,TB+,Sim,Sim,30,Masculino,Não branco,Não,Sim,ignorado,Não,...,Não,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,TB+,Sim,ignorado,53,Feminino,Branco,Não,Sim,Não,Sim,...,Não,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,TB+,Não,Não,84,Feminino,Branco,Não,Sim,Sim,Sim,...,Não,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,TB+,Não,Sim,18,Feminino,Branco,Sim,Sim,Sim,Não,...,Não,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,TB+,Sim,ignorado,36,Masculino,Não branco,Não,Sim,Não,Sim,...,Sim,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# Visualização da tabela
data_full_ohe.head(5)

Unnamed: 0,desfecho,Idade,TTO_anterior_TB_triagem_enfermeiro_Não,TTO_anterior_TB_triagem_enfermeiro_Sim,TTO_anterior_TB_triagem_enfermeiro_ignorado,Cicatriz_BCG_Não,Cicatriz_BCG_Sim,Cicatriz_BCG_ignorado,Sexo_Feminino,Sexo_Masculino,...,Contato_TBP_2anos_Não,Contato_TBP_2anos_Sim,Contato_TBP_2anos_ignorado,Fuma_Ex-fumante,Fuma_Fumante,Fuma_Jamais fumante,Fuma_ignorado,CAGE_Não,CAGE_Sim,CAGE_ignorado
0,TB+,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,TB+,53,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TB+,84,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,TB+,18,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,TB+,36,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## 2. Seleção de variáveis utilizando o teste do chi quadrado de independência

O teste do chi quadrado verifica se existe a relação entre duas variáveis categóricas. Dessa forma, será verificada a relação entre as variáveis presentes e o desfecho, ou seja, se o atributo possui alguma relação com o diagnóstico da doença ou não. 

O teste do Chi-quadrado é realizado para a verificação de independência entre duas variáveis categóricas.

- Hipótese nula $H_0$ : As variáveis são independentes.
- Hipótese alternativa $H_a$: As variáveis são dependentes

### 2.1 Abordagem conservadora: Deletar os valores ignorados

In [5]:
# Criando um novo dataframe
dados_sem_ignorados = data_normal_ohe.copy()

# Substituindo os valores por nulos 
dados_sem_ignorados.replace(['ignorado'],np.nan,inplace=True)

# Remover todos os casos ignorados
dados_sem_ignorados = dados_sem_ignorados.dropna(how='any')

In [6]:
from scipy.stats import chi2_contingency

# Criando um DataFrame para o armazenamento dos valores

dict_features = dict()

for feature in list(dados_sem_ignorados.columns)[1:]:
    
#     if dados_con[feature].dtype != 'O':
#         continue
    
    print('Tabela de Contingência entre o desfecho e {}\n'.format(feature))
    
    # Contingency table from pandas
    cross_tab = pd.crosstab(dados_sem_ignorados[feature],dados_sem_ignorados['desfecho'],margins=True)
    print(pd.crosstab(dados_sem_ignorados[feature],dados_sem_ignorados['desfecho'],margins=True))
    print()
    
    # Chi Squared Test 
    chi2, p, dof, expected =  chi2_contingency(cross_tab)
    print('Chi2: {}\tp-value: {}\tDOF: {}\nExpected_val: \n{}\n\n'.format(chi2,p,dof,expected))
    
    dict_features[feature] = [chi2,p,dof,(len(dados_sem_ignorados[feature].unique()) - 1)]

Tabela de Contingência entre o desfecho e TTO_anterior_TB_triagem_enfermeiro

desfecho                            TB+  TB-   All
TTO_anterior_TB_triagem_enfermeiro                
Não                                 351  741  1092
Sim                                  81  234   315
All                                 432  975  1407

Chi2: 4.7486250000000005	p-value: 0.3140758065337092	DOF: 4
Expected_val: 
[[  335.28358209   756.71641791  1092.        ]
 [   96.71641791   218.28358209   315.        ]
 [  432.           975.          1407.        ]]


Tabela de Contingência entre o desfecho e Cicatriz_BCG

desfecho      TB+  TB-   All
Cicatriz_BCG                
Não           144  444   588
Sim           288  531   819
All           432  975  1407

Chi2: 18.33183431952663	p-value: 0.0010627700874393108	DOF: 4
Expected_val: 
[[  180.53731343   407.46268657   588.        ]
 [  251.46268657   567.53731343   819.        ]
 [  432.           975.          1407.        ]]


Tabela de Contingê

In [7]:
# Criando um DataFrame a partir do dicionário criado anteriormente
chi2_dict = pd.DataFrame.from_dict(dict_features,orient='index')
chi2_dict.columns = ['chi2','pval','dof','dof_real']

# Dataframe criado
chi2_dict

Unnamed: 0,chi2,pval,dof,dof_real
Fuma_Fumante,1.073085,0.8985187,4,1
Cicatriz_BCG,18.331834,0.00106277,4,1
Idade,139.157302,0.8571087,158,78
Prisão_2anos,19.333824,0.0006756972,4,1
Fuma_Jamais fumante,2.781989,0.5949457,4,1
Hemoptise,2.710957,0.6072987,4,1
Perda_de_apetite,6.557697,0.1611911,4,1
Internação_hospital_2anos,0.623353,0.9604337,4,1
Tosse,0.11665,0.9983638,4,1
Estado_civil_ignorado,0.558024,0.967612,4,1


In [8]:
variaveis_dependentes   = chi2_dict[chi2_dict.pval < 0.05]
variaveis_independentes = chi2_dict[chi2_dict.pval > 0.05]

In [9]:
variaveis_dependentes

Unnamed: 0,chi2,pval,dof,dof_real
Cicatriz_BCG,18.331834,0.00106277,4,1
Prisão_2anos,19.333824,0.0006756972,4,1
Estado_civil_viúvo,11.737643,0.01941267,4,1
Dispnéia,11.179443,0.02461965,4,1
Semanas_tosse,33.193965,1.090085e-06,4,1
Febre,37.044453,1.763716e-07,4,1
Perda_peso_10percent,111.468003,3.5389470000000006e-23,4,1
Delegacia_2anos,17.674704,0.00142845,4,1
Sexo,25.371503,4.235617e-05,4,1
Estado_civil_Casado,10.877413,0.02797683,4,1


In [10]:
variaveis_independentes

Unnamed: 0,chi2,pval,dof,dof_real
Fuma_Fumante,1.073085,0.898519,4,1
Idade,139.157302,0.857109,158,78
Fuma_Jamais fumante,2.781989,0.594946,4,1
Hemoptise,2.710957,0.607299,4,1
Perda_de_apetite,6.557697,0.161191,4,1
Internação_hospital_2anos,0.623353,0.960434,4,1
Tosse,0.11665,0.998364,4,1
Estado_civil_ignorado,0.558024,0.967612,4,1
Contato_TBP_2anos,6.904814,0.141005,4,1
Fuma_Ex-fumante,0.72433,0.948295,4,1


In [29]:
len(variaveis_dependentes)

13

In [30]:
len(variaveis_independentes)

18

### 2.2 Considerando os casos ignorados

In [13]:
# Criando um DataFrame para o armazenamento dos valores

dict_features_ignorados = dict()

for feature in list(data_normal_ohe.columns)[1:]:
    
#     if dad[feature].dtype != 'O':
#         continue
    
    print('Tabela de Contingência entre o desfecho e {}\n'.format(feature))
    
    # Contingency table from pandas
    cross_tab = pd.crosstab(data_normal_ohe[feature],data_normal_ohe['desfecho'],margins=True)
    print(pd.crosstab(data_normal_ohe[feature],data_normal_ohe['desfecho'],margins=True))
    print()
    
    # Chi Squared Test 
    chi2, p, dof, expected =  chi2_contingency(cross_tab)
    print('Chi2: {}\tp-value: {}\tDOF: {}\nExpected_val: \n{}\n\n'.format(chi2,p,dof,expected))
    
    dict_features_ignorados[feature] = [chi2,p,dof,(len(data_normal_ohe[feature].unique()) - 1),]

Tabela de Contingência entre o desfecho e TTO_anterior_TB_triagem_enfermeiro

desfecho                            TB+   TB-   All
TTO_anterior_TB_triagem_enfermeiro                 
Não                                 668  1388  2056
Sim                                 202   453   655
ignorado                              1    15    16
All                                 871  1856  2727

Chi2: 5.508782301963685	p-value: 0.48039588021335167	DOF: 6
Expected_val: 
[[  656.68353502  1399.31646498  2056.        ]
 [  209.20608728   445.79391272   655.        ]
 [    5.1103777     10.8896223     16.        ]
 [  871.          1856.          2727.        ]]


Tabela de Contingência entre o desfecho e Cicatriz_BCG

desfecho      TB+   TB-   All
Cicatriz_BCG                 
Não           222   614   836
Sim           434   780  1214
ignorado      215   462   677
All           871  1856  2727

Chi2: 19.267143857915137	p-value: 0.0037354693229748413	DOF: 6
Expected_val: 
[[  267.01723506   568.9

In [14]:
# Criando um DataFrame a partir do dicionário criado anteriormente
chi2_dict_ignorados = pd.DataFrame.from_dict(dict_features_ignorados,orient='index')
chi2_dict_ignorados.columns = ['chi2','pval','dof','dof_real']

# Dataframe criado
chi2_dict_ignorados

Unnamed: 0,chi2,pval,dof,dof_real
Fuma_Fumante,3.705622,0.4473087,4,1
Cicatriz_BCG,19.267144,0.003735469,6,2
Idade,209.25384,0.00974427,164,81
Prisão_2anos,39.278211,6.312696e-07,6,2
Fuma_Jamais fumante,8.903652,0.06355336,4,1
Hemoptise,14.441106,0.02507872,6,2
Perda_de_apetite,37.388323,1.479082e-06,6,2
Internação_hospital_2anos,1.063507,0.9830899,6,2
Tosse,3.643238,0.7248251,6,2
Estado_civil_ignorado,0.541689,0.9693177,4,1


In [15]:
variaveis_dependentes_ignorados   = chi2_dict_ignorados[chi2_dict.pval < 0.05]
variaveis_independentes_ignorados = chi2_dict_ignorados[chi2_dict.pval > 0.05]

In [16]:
variaveis_dependentes_ignorados

Unnamed: 0,chi2,pval,dof,dof_real
Cicatriz_BCG,19.267144,0.003735469,6,2
Prisão_2anos,39.278211,6.312696e-07,6,2
Estado_civil_viúvo,24.049972,7.805348e-05,4,1
Dispnéia,21.087264,0.001769541,6,2
Semanas_tosse,47.31824,1.61664e-08,6,2
Febre,94.59521,3.356747e-18,6,2
Perda_peso_10percent,202.580545,5.356408999999999e-41,6,2
Delegacia_2anos,20.875915,0.001931197,6,2
Sexo,53.659613,8.640003e-10,6,2
Estado_civil_Casado,23.673339,9.286502e-05,4,1


In [17]:
variaveis_independentes_ignorados

Unnamed: 0,chi2,pval,dof,dof_real
Fuma_Fumante,3.705622,0.447309,4,1
Idade,209.25384,0.009744,164,81
Fuma_Jamais fumante,8.903652,0.063553,4,1
Hemoptise,14.441106,0.025079,6,2
Perda_de_apetite,37.388323,1e-06,6,2
Internação_hospital_2anos,1.063507,0.98309,6,2
Tosse,3.643238,0.724825,6,2
Estado_civil_ignorado,0.541689,0.969318,4,1
Contato_TBP_2anos,17.496413,0.007622,6,2
Fuma_Ex-fumante,1.802774,0.771975,4,1


In [18]:
len(variaveis_dependentes_ignorados)

13

In [19]:
len(variaveis_independentes_ignorados)

18

### 2.3 Abordagem utilizando One-Hot Encoding

Na MINHA OPINIÃO, não faz sentido.

In [23]:
# Criando um DataFrame para o armazenamento dos valores

dict_features_ohe = dict()

for feature in list(data_full_ohe.columns)[1:]:
    
    print('Tabela de Contingência entre o desfecho e {}\n'.format(feature))
    
    # Contingency table from pandas
    cross_tab = pd.crosstab(data_full_ohe[feature],data_full_ohe['desfecho'],margins=True)
    print(pd.crosstab(data_full_ohe[feature],data_full_ohe['desfecho'],margins=True))
    print()
    
    # Chi Squared Test 
    chi2, p, dof, expected =  chi2_contingency(cross_tab)
    print('Chi2: {}\tp-value: {}\tDOF: {}\nExpected_val: \n{}\n\n'.format(chi2,p,dof,expected))
    
    dict_features_ohe[feature] = [chi2,p,dof,(len(data_full_ohe[feature].unique()) - 1),]

Tabela de Contingência entre o desfecho e Idade

desfecho  TB+   TB-   All
Idade                    
9           0     1     1
12          2     1     3
13          4     5     9
14          7    16    23
15          9     7    16
16         13    12    25
17         12    17    29
18         19    24    43
19         17    19    36
20         18    18    36
21         20    22    42
22         28    22    50
23         22    30    52
24         26    26    52
25         11    20    31
26         22    22    44
27         27    26    53
28         15    33    48
29         16    23    39
30         24    30    54
31         19    14    33
32         16    31    47
33         26    30    56
34         22    27    49
35         18    27    45
36         15    36    51
37         11    34    45
38         17    25    42
39         13    18    31
40         15    23    38
...       ...   ...   ...
64          7    36    43
65          5    36    41
66          4    20    24
67          7  

In [25]:
# Criando um DataFrame a partir do dicionário criado anteriormente
chi2_dict_ohe = pd.DataFrame.from_dict(dict_features_ohe,orient='index')
chi2_dict_ohe.columns = ['chi2','pval','dof','dof_real']

# Dataframe criado
chi2_dict_ohe

Unnamed: 0,chi2,pval,dof,dof_real
CAGE_Não,8.979776,6.160699e-02,4,1
Sudorese_noturna_Sim,44.503437,5.042623e-09,4,1
TTO_anterior_TB_triagem_enfermeiro_Sim,0.479985,9.754200e-01,4,1
Hemoptóicos_Não,0.063836,9.995013e-01,4,1
Prisão_2anos_ignorado,0.000151,1.000000e+00,4,1
Hemoptise_Não,3.981800,4.084746e-01,4,1
Fuma_Ex-fumante,1.802774,7.719747e-01,4,1
Expectoração_Não,3.282287,5.117436e-01,4,1
Tem_companheiro_ignorado,0.027435,9.999068e-01,4,1
Dispnéia_Não,20.891141,3.328004e-04,4,1


In [26]:
variaveis_dependentes_ohe   = chi2_dict_ohe[chi2_dict_ohe.pval < 0.05]
variaveis_independentes_ohe = chi2_dict_ohe[chi2_dict_ohe.pval > 0.05]

In [27]:
variaveis_dependentes_ohe

Unnamed: 0,chi2,pval,dof,dof_real
Sudorese_noturna_Sim,44.503437,5.042623e-09,4,1
Dispnéia_Não,20.891141,0.0003328004,4,1
Semanas_tosse_Sim,46.817708,1.664222e-09,4,1
Tem_companheiro_Não,9.64644,0.04682367,4,1
Sexo_Masculino,52.819153,9.297558e-11,4,1
Raça_Branco,10.532149,0.03235701,4,1
Estado_civil_Casado,23.673339,9.286502e-05,4,1
Prisão_2anos_Sim,39.272006,6.120937e-08,4,1
Dispnéia_Sim,20.340945,0.0004276529,4,1
Sudorese_noturna_Não,44.478725,5.102603e-09,4,1


In [28]:
variaveis_independentes_ohe

Unnamed: 0,chi2,pval,dof,dof_real
CAGE_Não,8.979776,0.061607,4,1
TTO_anterior_TB_triagem_enfermeiro_Sim,0.479985,0.97542,4,1
Hemoptóicos_Não,0.063836,0.999501,4,1
Prisão_2anos_ignorado,0.000151,1.0,4,1
Hemoptise_Não,3.9818,0.408475,4,1
Fuma_Ex-fumante,1.802774,0.771975,4,1
Expectoração_Não,3.282287,0.511744,4,1
Tem_companheiro_ignorado,0.027435,0.999907,4,1
Dor_torácica_Não,0.01669,0.999965,4,1
CAGE_ignorado,0.005371,0.999996,4,1


In [29]:
features_normal_ohe = list(variaveis_dependentes.T.keys())

In [30]:
features_normal_ohe.append('desfecho')

In [31]:
features_full_ohe = list(variaveis_dependentes_ohe.T.keys())

In [32]:
features_full_ohe.append('desfecho')

In [33]:
data_full_ohe[features_full_ohe]

Unnamed: 0,Sudorese_noturna_Sim,Dispnéia_Não,Semanas_tosse_Sim,Tem_companheiro_Não,Sexo_Masculino,Raça_Branco,Estado_civil_Casado,Prisão_2anos_Sim,Dispnéia_Sim,Sudorese_noturna_Não,...,Perda_peso_10percent_Sim,Contato_TBP_2anos_Não,Estado_civil_Solteiro,Semanas_tosse_Não,Prisão_2anos_Não,Febre_Não,Contato_TBP_2anos_Sim,Perda_de_apetite_ignorado,Delegacia_2anos_Sim,desfecho
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TB+
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,TB+
2,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,TB+
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,TB+
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,TB+
5,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TB+
6,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TB+
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,TB+
8,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TB+
9,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TB+


In [35]:
features_full_ohe.append('Idade')
features_full_ohe.append('desfecho')

In [36]:
data_normal_ohe[features_normal_ohe]

Unnamed: 0,Cicatriz_BCG,Prisão_2anos,Estado_civil_viúvo,Dispnéia,Semanas_tosse,Febre,Perda_peso_10percent,Delegacia_2anos,Sexo,Estado_civil_Casado,Expectoração,Sudorese_noturna,Estado_civil_Solteiro,desfecho
0,Sim,Não,0.0,Não,ignorado,Não,Não,Não,Masculino,0.0,Não,Não,1.0,TB+
1,ignorado,Não,0.0,Sim,Não,Não,Não,Não,Feminino,0.0,Sim,Não,0.0,TB+
2,Não,Não,1.0,Não,Sim,Sim,Sim,Não,Feminino,0.0,Sim,Não,0.0,TB+
3,Sim,Não,0.0,Sim,Sim,Não,Não,Não,Feminino,0.0,Não,Não,1.0,TB+
4,ignorado,Não,0.0,Sim,Não,Não,Não,Não,Masculino,0.0,Sim,Sim,1.0,TB+
5,Sim,Não,0.0,Sim,Sim,Sim,Não,Não,Masculino,1.0,Sim,Sim,0.0,TB+
6,Não,Não,0.0,Sim,Sim,Não,Não,Não,Masculino,0.0,Sim,Não,1.0,TB+
7,Sim,Não,0.0,Sim,Não,Não,Não,Não,Feminino,0.0,Não,Não,1.0,TB+
8,Não,Não,0.0,Sim,Sim,Sim,Sim,Não,Masculino,1.0,Sim,Sim,0.0,TB+
9,Sim,Não,0.0,Sim,Sim,Não,Não,Não,Masculino,0.0,Não,Não,1.0,TB+


In [37]:
# Salvando as features utilizando o chi2 normal, com ohe em somente duas features
data_normal_ohe[features_normal_ohe].to_csv('../../Dados/one-hot-chi2.csv',index=False,columns=list(data_normal_ohe[features_normal_ohe].columns))

In [38]:
data_full_ohe[features_full_ohe].to_csv('../../Dados/one-hot-full-chi2.csv',index=False,columns=list(data_full_ohe[features_full_ohe].columns))

In [39]:
data_normal_ohe[features_normal_ohe]

Unnamed: 0,Cicatriz_BCG,Prisão_2anos,Estado_civil_viúvo,Dispnéia,Semanas_tosse,Febre,Perda_peso_10percent,Delegacia_2anos,Sexo,Estado_civil_Casado,Expectoração,Sudorese_noturna,Estado_civil_Solteiro,desfecho
0,Sim,Não,0.0,Não,ignorado,Não,Não,Não,Masculino,0.0,Não,Não,1.0,TB+
1,ignorado,Não,0.0,Sim,Não,Não,Não,Não,Feminino,0.0,Sim,Não,0.0,TB+
2,Não,Não,1.0,Não,Sim,Sim,Sim,Não,Feminino,0.0,Sim,Não,0.0,TB+
3,Sim,Não,0.0,Sim,Sim,Não,Não,Não,Feminino,0.0,Não,Não,1.0,TB+
4,ignorado,Não,0.0,Sim,Não,Não,Não,Não,Masculino,0.0,Sim,Sim,1.0,TB+
5,Sim,Não,0.0,Sim,Sim,Sim,Não,Não,Masculino,1.0,Sim,Sim,0.0,TB+
6,Não,Não,0.0,Sim,Sim,Não,Não,Não,Masculino,0.0,Sim,Não,1.0,TB+
7,Sim,Não,0.0,Sim,Não,Não,Não,Não,Feminino,0.0,Não,Não,1.0,TB+
8,Não,Não,0.0,Sim,Sim,Sim,Sim,Não,Masculino,1.0,Sim,Sim,0.0,TB+
9,Sim,Não,0.0,Sim,Sim,Não,Não,Não,Masculino,0.0,Não,Não,1.0,TB+


In [40]:
data_full_ohe[features_full_ohe]

Unnamed: 0,Sudorese_noturna_Sim,Dispnéia_Não,Semanas_tosse_Sim,Tem_companheiro_Não,Sexo_Masculino,Raça_Branco,Estado_civil_Casado,Prisão_2anos_Sim,Dispnéia_Sim,Sudorese_noturna_Não,...,Prisão_2anos_Não,Febre_Não,Contato_TBP_2anos_Sim,Perda_de_apetite_ignorado,Delegacia_2anos_Sim,desfecho,Idade,desfecho.1,Idade.1,desfecho.2
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,TB+,30,TB+,30,TB+
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,TB+,53,TB+,53,TB+
2,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,TB+,84,TB+,84,TB+
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,TB+,18,TB+,18,TB+
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,TB+,36,TB+,36,TB+
5,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,TB+,71,TB+,71,TB+
6,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,TB+,26,TB+,26,TB+
7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,TB+,22,TB+,22,TB+
8,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,TB+,54,TB+,54,TB+
9,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,TB+,52,TB+,52,TB+


In [94]:
data_normal_ohe[features_normal_ohe].head(5)

Unnamed: 0,Cicatriz_BCG,Estado_civil_Casado,Febre,Prisão_2anos,Dispnéia,Perda_peso_10percent,Sexo,Sudorese_noturna,Estado_civil_Solteiro,Delegacia_2anos,Expectoração,Semanas_tosse,Estado_civil_viúvo,Idade,desfecho
0,Sim,0.0,Não,Não,Não,Não,Masculino,Não,1.0,Não,Não,ignorado,0.0,30,TB+
1,ignorado,0.0,Não,Não,Sim,Não,Feminino,Não,0.0,Não,Sim,Não,0.0,53,TB+
2,Não,0.0,Sim,Não,Não,Sim,Feminino,Não,0.0,Não,Sim,Sim,1.0,84,TB+
3,Sim,0.0,Não,Não,Sim,Não,Feminino,Não,1.0,Não,Não,Sim,0.0,18,TB+
4,ignorado,0.0,Não,Não,Sim,Não,Masculino,Sim,1.0,Não,Sim,Não,0.0,36,TB+


In [41]:
data_normal_ohe[features_normal_ohe].shape

(2727, 14)

In [42]:
data_full_ohe[features_full_ohe].shape

(2727, 34)

In [43]:
data_normal_ohe

Unnamed: 0,desfecho,TTO_anterior_TB_triagem_enfermeiro,Cicatriz_BCG,Idade,Sexo,Raça,Tem_companheiro,Tosse,Semanas_tosse,Expectoração,...,CAGE,Estado_civil_Casado,Estado_civil_Separado,Estado_civil_Solteiro,Estado_civil_ignorado,Estado_civil_viúvo,Fuma_Ex-fumante,Fuma_Fumante,Fuma_Jamais fumante,Fuma_ignorado
0,TB+,Sim,Sim,30,Masculino,Não branco,Não,Sim,ignorado,Não,...,Não,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,TB+,Sim,ignorado,53,Feminino,Branco,Não,Sim,Não,Sim,...,Não,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,TB+,Não,Não,84,Feminino,Branco,Não,Sim,Sim,Sim,...,Não,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,TB+,Não,Sim,18,Feminino,Branco,Sim,Sim,Sim,Não,...,Não,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,TB+,Sim,ignorado,36,Masculino,Não branco,Não,Sim,Não,Sim,...,Sim,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,TB+,Não,Sim,71,Masculino,Não branco,Sim,Sim,Sim,Sim,...,Não,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,TB+,Não,Não,26,Masculino,Branco,Sim,Sim,Sim,Sim,...,Não,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7,TB+,Não,Sim,22,Feminino,Branco,Não,Sim,Não,Não,...,Não,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,TB+,Não,Não,54,Masculino,Não branco,Não,Sim,Sim,Sim,...,Sim,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,TB+,Sim,Sim,52,Masculino,Não branco,Não,Sim,Sim,Não,...,Sim,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [44]:
data_normal_ohe.columns

Index(['desfecho', 'TTO_anterior_TB_triagem_enfermeiro', 'Cicatriz_BCG',
       'Idade', 'Sexo', 'Raça', 'Tem_companheiro', 'Tosse', 'Semanas_tosse',
       'Expectoração', 'Hemoptóicos', 'Hemoptise', 'Sudorese_noturna', 'Febre',
       'Dispnéia', 'Perda_de_apetite', 'Perda_peso_10percent', 'Dor_torácica',
       'Internação_hospital_2anos', 'Prisão_2anos', 'Delegacia_2anos',
       'Contato_TBP_2anos', 'CAGE', 'Estado_civil_Casado',
       'Estado_civil_Separado', 'Estado_civil_Solteiro',
       'Estado_civil_ignorado', 'Estado_civil_viúvo', 'Fuma_Ex-fumante',
       'Fuma_Fumante', 'Fuma_Jamais fumante', 'Fuma_ignorado'],
      dtype='object')

In [45]:
data_full_ohe

Unnamed: 0,desfecho,Idade,TTO_anterior_TB_triagem_enfermeiro_Não,TTO_anterior_TB_triagem_enfermeiro_Sim,TTO_anterior_TB_triagem_enfermeiro_ignorado,Cicatriz_BCG_Não,Cicatriz_BCG_Sim,Cicatriz_BCG_ignorado,Sexo_Feminino,Sexo_Masculino,...,Contato_TBP_2anos_Não,Contato_TBP_2anos_Sim,Contato_TBP_2anos_ignorado,Fuma_Ex-fumante,Fuma_Fumante,Fuma_Jamais fumante,Fuma_ignorado,CAGE_Não,CAGE_Sim,CAGE_ignorado
0,TB+,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,TB+,53,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TB+,84,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,TB+,18,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,TB+,36,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,TB+,71,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6,TB+,26,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7,TB+,22,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,TB+,54,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,TB+,52,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [46]:
variaveis_dependentes

Unnamed: 0,chi2,pval,dof,dof_real
Cicatriz_BCG,18.331834,0.00106277,4,1
Prisão_2anos,19.333824,0.0006756972,4,1
Estado_civil_viúvo,11.737643,0.01941267,4,1
Dispnéia,11.179443,0.02461965,4,1
Semanas_tosse,33.193965,1.090085e-06,4,1
Febre,37.044453,1.763716e-07,4,1
Perda_peso_10percent,111.468003,3.5389470000000006e-23,4,1
Delegacia_2anos,17.674704,0.00142845,4,1
Sexo,25.371503,4.235617e-05,4,1
Estado_civil_Casado,10.877413,0.02797683,4,1
