In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import graphviz
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import copy
from collections import Counter

In [2]:
# Load the case-level CSV; index_col=1 uses the file's second column as the row index.
# The encoding is pinned to UTF-8 so accented text (e.g. "México") is decoded
# correctly even where the platform default is a legacy code page; without it
# such values come out as mojibake ("MÃ©xico").
# NOTE(review): assumes the file on disk is UTF-8 — confirm.
data_raw = pd.read_csv('200512COVID19MEXICO.csv', index_col=1, sep=';',
                       engine='python', encoding='utf-8')
# Keep only the records known to have coronavirus (RESULTADO code 1).
data_raw = data_raw[data_raw["RESULTADO"] == 1]
data_raw.head()

Unnamed: 0_level_0,FECHA_ACTUALIZACION,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,FECHA_INGRESO,...,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,RESULTADO,MIGRANTE,PAIS_NACIONALIDAD,PAIS_ORIGEN,UCI
ID_REGISTRO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,12/05/2020,2,1,9,1,9,9,16,1,25/04/2020,...,2,2,2,2,99,1,99,MÃ©xico,99,97
2,12/05/2020,2,3,2,2,2,2,2,1,30/03/2020,...,2,2,2,2,1,1,99,MÃ©xico,99,97
3,12/05/2020,2,3,15,2,15,15,39,1,17/03/2020,...,2,2,2,2,2,1,99,MÃ©xico,99,97
4,12/05/2020,2,3,2,2,2,2,2,1,21/03/2020,...,2,2,2,2,1,1,99,MÃ©xico,99,97
5,12/05/2020,2,3,27,2,27,27,4,1,30/03/2020,...,2,2,2,2,1,1,99,MÃ©xico,99,97


In [3]:
# Columns of interest: age, the comorbidity flags, sex, date of death,
# the UCI flag and the patient type.
vars_de_int = [
    'EDAD',
    'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION',
    'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO', 'OTRA_COM',
    'SEXO', 'FECHA_DEF', 'UCI', 'TIPO_PACIENTE',
]
# Restrict the frame to those columns, keeping every row.
data_raw = data_raw.loc[:, vars_de_int]
print(data_raw.columns)
data_raw.head()

Index(['EDAD', 'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION',
       'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO', 'OTRA_COM',
       'SEXO', 'FECHA_DEF', 'UCI', 'TIPO_PACIENTE'],
      dtype='object')


Unnamed: 0_level_0,EDAD,DIABETES,EPOC,ASMA,INMUSUPR,HIPERTENSION,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRA_COM,SEXO,FECHA_DEF,UCI,TIPO_PACIENTE
ID_REGISTRO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,65,2,2,2,2,1,2,2,2,2,2,1,3/05/2020,97,1
2,23,2,2,2,2,2,2,2,2,2,2,2,9999-99-99,97,1
3,51,2,2,2,2,1,2,2,2,2,2,2,9999-99-99,97,1
4,52,2,2,2,2,2,2,2,2,2,2,2,9999-99-99,97,1
5,54,1,2,2,2,2,2,2,2,2,2,2,9999-99-99,97,1


In [4]:
# Decoding of the coded variables
def dicotomia(x):
    """Decode a yes/no field coded as 1 = yes, 2 = no.

    Returns 1 for yes, 0 for no, and NaN for any other code (unknown).
    """
    if x == 1:
        # Yes
        return 1
    elif x == 2:
        # No
        return 0
    else:
        # Unknown. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    
def sex(x):
    """Recode the sex field: 1 (woman) -> 1, 2 (man) -> 0, anything else -> 2."""
    # Woman
    if x == 1:
        return 1
    # Man
    if x == 2:
        return 0
    # Any other code
    return 2
    
def defuncion(x):
    """Turn the date-of-death field into a 0/1 death indicator.

    The placeholder '9999-99-99' means the patient is alive (0); any other
    value is treated as a death (1).
    """
    sigue_vivo = x == '9999-99-99'
    return 0 if sigue_vivo else 1
    
def tipific(x):
    """Decode the patient-type field.

    Returns 0 for code 1 (outpatient), 1 for code 2 (hospitalized), and NaN
    for any other code (unknown).
    """
    if x == 1:
        # Outpatient
        return 0
    elif x == 2:
        # Hospitalized
        return 1
    else:
        # Unknown. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    
# Columns decoded with dicotomia (1 -> 1, 2 -> 0, anything else -> NaN).
var_si_no = [
    'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION',
    'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO',
    'OTRA_COM', 'UCI',
]

# Readable column names used from here on.
nombres_legibles = {
    'EDAD': 'Edad',
    'DIABETES': 'Diabetes',
    'EPOC': 'Enf. pulm. obstrusiva crónica',
    'ASMA': 'Asma',
    'INMUSUPR': 'Inmunosuprimido',
    'HIPERTENSION': 'Hipertensión',
    'CARDIOVASCULAR': 'Enf. cardiovascular',
    'OBESIDAD': 'Obesidad',
    'RENAL_CRONICA': 'Insuf. renal crónica',
    'TABAQUISMO': 'Tabaquismo',
    'OTRA_COM': 'Otras comorbilidades',
    'SEXO': 'Sexo',
    'FECHA_DEF': 'Muerto',
    'TIPO_PACIENTE': 'Hospitalización',
}

# Step 1: decode every yes/no column.
decodificado = data_raw.assign(
    **{columna: data_raw[columna].apply(dicotomia) for columna in var_si_no}
)
# Step 2: UCI values that decoded to NaN become 0; sex, death and patient type
# are recoded with their own helpers.
decodificado = decodificado.assign(
    UCI=decodificado['UCI'].fillna(0),
    SEXO=decodificado['SEXO'].apply(sex),
    FECHA_DEF=decodificado['FECHA_DEF'].apply(defuncion),
    TIPO_PACIENTE=decodificado['TIPO_PACIENTE'].apply(tipific),
)
# Step 3: rename the columns and drop every row that still has a missing value.
data_raw = decodificado.rename(columns=nombres_legibles).dropna()
data_raw.head()

Unnamed: 0_level_0,Edad,Diabetes,Enf. pulm. obstrusiva crónica,Asma,Inmunosuprimido,Hipertensión,Enf. cardiovascular,Obesidad,Insuf. renal crónica,Tabaquismo,Otras comorbilidades,Sexo,Muerto,UCI,Hospitalización
ID_REGISTRO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,65,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0
2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0
3,51,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0
4,52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0
5,54,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0


In [5]:
# Correlation of every column with the death indicator, using DataFrame.corr defaults.
correlaciones = data_raw.corr()
correlaciones['Muerto']

Edad                             0.265468
Diabetes                         0.177501
Enf. pulm. obstrusiva crónica    0.085935
Asma                            -0.012662
Inmunosuprimido                  0.046018
Hipertensión                     0.170435
Enf. cardiovascular              0.064529
Obesidad                         0.064523
Insuf. renal crónica             0.106728
Tabaquismo                       0.009568
Otras comorbilidades             0.024454
Sexo                            -0.068797
Muerto                           1.000000
UCI                              0.234910
Hospitalización                  0.346610
Name: Muerto, dtype: float64

In [6]:
# Declare the categorical variables
var_cat = [
    'Enf. pulm. obstrusiva crónica', 'Muerto', 'Inmunosuprimido', 'Hospitalización',
    'Sexo', 'Asma', 'Hipertensión', 'Otras comorbilidades', 'Enf. cardiovascular',
    'Tabaquismo', 'Obesidad', 'Diabetes', 'Insuf. renal crónica', 'UCI',
]
# Cast all of them to the pandas category dtype in one call.
data_raw = data_raw.astype({columna: 'category' for columna in var_cat})

# Model inputs; the order here is the column order the classifier is fitted on.
caracteristicas = [
    'Enf. pulm. obstrusiva crónica', 'Inmunosuprimido', 'Sexo', 'Asma',
    'Hipertensión', 'Edad', 'Otras comorbilidades', 'Enf. cardiovascular',
    'Tabaquismo', 'Obesidad', 'Diabetes', 'Insuf. renal crónica',
]
# Prediction target, kept as a one-element list.
objetivo = ['Muerto']

In [7]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is reproducible.
X_full = data_raw[caracteristicas]
y_full = data_raw[objetivo]
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=1405,
)
print(X_train.head())
y_train.head()

            Enf. pulm. obstrusiva crónica Inmunosuprimido Sexo Asma  \
ID_REGISTRO                                                           
2889                                  0.0             0.0    0  0.0   
12196                                 0.0             0.0    1  0.0   
15155                                 0.0             0.0    0  0.0   
21533                                 0.0             0.0    0  0.0   
37621                                 0.0             0.0    0  0.0   

            Hipertensión  Edad Otras comorbilidades Enf. cardiovascular  \
ID_REGISTRO                                                               
2889                 0.0    37                  0.0                 0.0   
12196                0.0    42                  0.0                 0.0   
15155                0.0    42                  0.0                 0.0   
21533                0.0    60                  0.0                 0.0   
37621                0.0    54                  0.0 

Unnamed: 0_level_0,Muerto
ID_REGISTRO,Unnamed: 1_level_1
2889,0
12196,0
15155,0
21533,0
37621,0


In [9]:
# Randomly oversample the training split only (the validation split is untouched).
# sampling_strategy=0.5 asks imblearn for a minority/majority ratio of 0.5
# after resampling; the seed is fixed for reproducibility.
oversampler = RandomOverSampler(
    random_state=805,
    sampling_strategy=0.5,
)
remuestreado = oversampler.fit_resample(X_train, y_train)
X_train, y_train = remuestreado

In [10]:
# Forest hyperparameters: maximum depth of each tree and the split criterion.
tree_depth = 6
crit = 'entropy'

# 1000 trees, each split considering 80% of the features, with the
# out-of-bag score enabled and all CPU cores used (n_jobs=-1).
cl = RandomForestClassifier(random_state=1505, n_estimators=1000, criterion=crit,
                            max_depth=tree_depth, max_features=0.8, oob_score=True,
                            n_jobs=-1)
# y_train originates from a one-column selection (objetivo is a list), so it
# can reach fit() as a column vector, which makes scikit-learn emit a
# DataConversionWarning. np.ravel flattens it to the 1-D shape fit() expects
# and is a no-op when y_train is already 1-D.
cl.fit(X_train, np.ravel(y_train))

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=6, max_features=0.8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=True, random_state=1505, verbose=0,
                       warm_start=False)

In [11]:
# Predict on the held-out validation split and print the per-class
# precision / recall / f1 report.
preds = cl.predict(X_val)
reporte = classification_report(y_true=y_val, y_pred=preds)
print(reporte)

              precision    recall  f1-score   support

           0       0.94      0.85      0.89      6816
           1       0.25      0.46      0.33       749

    accuracy                           0.81      7565
   macro avg       0.59      0.66      0.61      7565
weighted avg       0.87      0.81      0.83      7565



In [12]:
# Confusion matrix of the forest's validation predictions.
matriz = confusion_matrix(y_true=y_val, y_pred=preds)
print('Matriz de confusión del bosque:\n', matriz)

Matriz de confusión del árbol:
 [[5773 1043]
 [ 401  348]]


In [28]:
# Pair each importance with its feature name and rank from most to least important.
ranking = sorted(zip(cl.feature_importances_, caracteristicas),
                 key=lambda par: par[0], reverse=True)
# One line per feature: the name padded to 51 characters, then its importance.
lineas = []
for importancia, nombre in ranking:
    lineas.append(str(nombre).ljust(51) + str(importancia))
print('Variables más importantes del bosque:\n', '\n'.join(lineas), sep='')

Variables más importantes del bosque:
Edad                                               0.6982622635480568
Diabetes                                           0.1321367913159455
Sexo                                               0.048984905065296465
Obesidad                                           0.036302316818782446
Hipertensión                                       0.0350030837059075
Insuf. renal crónica                               0.02673549242633653
Otras comorbilidades                               0.0052183424618134095
Enf. pulm. obstrusiva crónica                      0.00520453181618114
Inmunosuprimido                                    0.004964091783854387
Tabaquismo                                         0.0028162006631532364
Enf. cardiovascular                                0.0025218943870753095
Asma                                               0.0018500860075972266
