# ML Model

In [1]:
from covid_data import daily_covid, convert_to_state_date
import health_data
import mlpipeline as pipeline
import pandas as pd

# Load the daily COVID case data. Per the output recorded below, this call
# requests data from "datos abiertos Mexico" and temporarily stores a raw zip
# that is deleted afterwards.
covid = daily_covid()

Requesting data to datos abiertos Mexico
Getting zip raw data into directory, will delete soon
Raw data deleted. If you specified filename, clean data will be saved in data directory


In [2]:
# Keep only rows with resultado == 1 (treated here as positive cases).
covid_positive = covid[covid.resultado == 1]

# Columns used by the model: residence codes (state / municipality), patient
# attributes and comorbidities, plus the hospitalizado and muertos columns.
features_covid = ['entidad_res', 'municipio_res', 'embarazo', 'edad', 'diabetes', 'epoc',
                  'asma', 'inmusupr', 'hipertension', 'cardiovascular', 'obesidad', 'tabaquismo',
                  'hospitalizado', 'muertos']

# Take an explicit copy so that adding a column below writes to an independent
# frame instead of a selection derived from `covid_positive` (this also avoids
# pandas' SettingWithCopyWarning).
data_model = covid_positive.loc[:, features_covid].copy()

# Municipality id = state code * 1000 + municipality code
# (e.g. state 1, municipality 1 -> 1001). Inserted directly as the third column.
data_model.insert(2, 'CVE_MUN',
                  data_model['municipio_res'] + data_model['entidad_res'] * 1000)

print(data_model.shape)
data_model.sort_values(by='CVE_MUN', ascending=True).head()

(74560, 15)


Unnamed: 0,entidad_res,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,cardiovascular,obesidad,tabaquismo,hospitalizado,muertos
34580,1,1,1001,0.0,24,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
58238,1,1,1001,0.0,28,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0
29630,1,1,1001,0.0,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
14985,1,1,1001,0.0,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
14983,1,1,1001,0.0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [3]:
# All merges below are left joins keyed on CVE_MUN, so the number of cases must
# stay the same; remember it to verify that at the end of the cell.
n_cases = len(data_model)

# Merge with municipality-level poverty data.
poverty_data = health_data.get_poverty_data()
data_model = pd.merge(data_model, poverty_data[['CVE_MUN', 'pobreza']], on='CVE_MUN', how='left')

# Add population (POB) and population density (POB / superficie).
mun_territory = health_data.get_mun_territory()
conapo_mun = health_data.get_conapo_mun()
pop_den = pd.merge(mun_territory, conapo_mun, on='CVE_MUN', how='left')
pop_den['Densidad_pob'] = pop_den['POB'] / pop_den['superficie']
data_model = pd.merge(data_model, pop_den[['CVE_MUN', 'POB', 'Densidad_pob']],
                      on='CVE_MUN', how='left')

# Add hospital data.
hospitals = health_data.get_hospital_data()
# NOTE(review): the labels below must match the source columns exactly; the
# missing space in 'Número deMédicos Neumólogos' is presumably how the source
# data spells it -- verify before "fixing" it.
hospitals['num_medicos'] = (hospitals['Número de Médicos Generales'] +
                            hospitals['Número deMédicos Neumólogos'] +
                            hospitals['Número de Médicos Infectólogos'] +
                            hospitals['Número de Médicos Urgenciólogos'] +
                            hospitals['Número de Médicos Epidemiólogos'])
hospitals['CVE_MUN'] = hospitals['Clave Municipio'] + hospitals['Clave Estado'] * 1000
hosp_cols = ['TOTAL CAMAS AREA HOSPITALIZACIÓN', 'num_medicos',
             'Total enfermeras en contacto con el paciente']
# Aggregate hospitals to municipality level. numeric_only=True restricts the
# sum to numeric columns so text columns are neither concatenated nor able to
# raise on pandas versions where numeric_only defaults to False.
hospitals = hospitals.groupby('CVE_MUN').sum(numeric_only=True)

# After the groupby, CVE_MUN is the index of `hospitals`; merge matches the
# left column against that named index level.
data_model = pd.merge(data_model, hospitals[hosp_cols],
                      on='CVE_MUN', how='left')

# Express hospital resources per 10,000 inhabitants of the municipality.
per_capita = {
    'medicos': 'num_medicos',
    'camas_hosp': 'TOTAL CAMAS AREA HOSPITALIZACIÓN',
    'enfermeras': 'Total enfermeras en contacto con el paciente',
}
for rate_col, count_col in per_capita.items():
    data_model[rate_col] = data_model[count_col] / data_model['POB'] * 10000

# Drop the raw counts and population now that the rates are computed
# (reassignment instead of inplace=True).
data_model = data_model.drop(columns=['POB', 'TOTAL CAMAS AREA HOSPITALIZACIÓN',
                                      'num_medicos', 'Total enfermeras en contacto con el paciente'])

# A duplicated CVE_MUN on the right side of any merge would silently multiply cases.
assert len(data_model) == n_cases, "left merges changed the number of rows"

print(data_model.shape)
data_model.head()

(74560, 20)


Unnamed: 0,entidad_res,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,cardiovascular,obesidad,tabaquismo,hospitalizado,muertos,pobreza,Densidad_pob,medicos,camas_hosp,enfermeras
0,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,36.3,430.537057,5.44881,13.92624,41.575909
1,30,135,30135,0.0,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,76.5,918.759398,,,
2,25,6,25006,0.0,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,31.2,152.715706,2.637944,10.489463,30.502528
3,14,63,14063,0.0,65,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,1,36.6,437.050315,0.942871,6.977249,22.911776
4,25,1,25001,0.0,51,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,33.8,120.301546,1.747619,7.88509,19.598297


In [4]:
# How many missing values does each column have after the merges above?
# The counts show which municipality-level columns need imputation.
pipeline.count_nan(data_model)

entidad_res          0
municipio_res        0
CVE_MUN              0
embarazo             0
edad                 0
diabetes             0
epoc                 0
asma                 0
inmusupr             0
hipertension         0
cardiovascular       0
obesidad             0
tabaquismo           0
hospitalizado        0
muertos              0
pobreza             26
Densidad_pob        32
medicos           6040
camas_hosp        6040
enfermeras        6040
dtype: int64

In [5]:
# Impute missing municipality-level values with the mean of the same state
# (entidad_res). If a state has no observed value for a column, its mean is NaN
# and those rows stay missing.
impute_cols = ['pobreza', 'Densidad_pob', 'camas_hosp', 'medicos', 'enfermeras']

# transform('mean') returns a frame aligned row-for-row with data_model, so the
# fill does not depend on how groupby.apply indexes its result (which differs
# between pandas versions).
state_means = data_model.groupby('entidad_res')[impute_cols].transform('mean')
data_model[impute_cols] = data_model[impute_cols].fillna(state_means)

# Confirm that no missing values remain.
pipeline.count_nan(data_model)

entidad_res       0
municipio_res     0
CVE_MUN           0
embarazo          0
edad              0
diabetes          0
epoc              0
asma              0
inmusupr          0
hipertension      0
cardiovascular    0
obesidad          0
tabaquismo        0
hospitalizado     0
muertos           0
pobreza           0
Densidad_pob      0
medicos           0
camas_hosp        0
enfermeras        0
dtype: int64