# ML Model

In [1]:
from covid_data import daily_covid, convert_to_state_date
import health_data
import mlpipeline as pipeline
import pandas as pd

covid = daily_covid()

Requesting data to datos abiertos Mexico
Getting zip raw data into directory, will delete soon
Raw data deleted. If you specified filename, clean data will be saved in data directory


In [2]:
# Separate features to use in the model
features_covid = ['entidad_um', 'municipio_res', 'embarazo', 'edad', 'diabetes', 'epoc', 
                  'asma', 'inmusupr', 'hipertension', 'cardiovascular', 'obesidad', 'tabaquismo', 'muertos']
data_model = covid.loc[:, features_covid]

#generate municipality id
data_model['CVE_MUN'] = data_model['municipio_res'] + data_model['entidad_um'] * 1000
mun = data_model.pop('CVE_MUN')
data_model.insert(2, 'CVE_MUN', mun)
print(data_model.shape)
data_model.head()

(235129, 14)


Unnamed: 0,entidad_um,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,cardiovascular,obesidad,tabaquismo,muertos
0,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,30,135,30135,0.0,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,25,6,25006,0.0,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,14,63,14063,0.0,65,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
4,25,1,25001,0.0,51,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
#merge with poverty data
poverty_data = health_data.get_poverty_data()
data_model = pd.merge(data_model, poverty_data[['CVE_MUN', 'pobreza']], on='CVE_MUN', how='left')

# add population density 
mun_territory = health_data.get_mun_territory()
conapo_mun = health_data.get_conapo_mun()
pop_den = pd.merge(mun_territory, conapo_mun, on='CVE_MUN', how='left')
pop_den['Densidad_pob'] = pop_den['POB'] / pop_den['superficie']
data_model = pd.merge(data_model, pop_den[['CVE_MUN', 'Densidad_pob']], 
                      on='CVE_MUN', how='left')
print(data_model.shape)
data_model.head()

(235129, 16)


Unnamed: 0,entidad_um,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,cardiovascular,obesidad,tabaquismo,muertos,pobreza,Densidad_pob
0,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057
1,30,135,30135,0.0,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,76.5,918.759398
2,25,6,25006,0.0,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,31.2,152.715706
3,14,63,14063,0.0,65,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,36.6,437.050315
4,25,1,25001,0.0,51,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,33.8,120.301546


In [4]:
#add hospital data (lots of missing values)
hospitals = health_data.get_hospital_data()
hospitals['CVE_MUN'] = hospitals['Clave Municipio'] + hospitals['Clave Estado']*1000
data_model = pd.merge(data_model, hospitals[['CVE_MUN', 'TOTAL CAMAS AREA HOSPITALIZACIÓN']], 
                      on='CVE_MUN', how='left')
data_model.head()

Unnamed: 0,entidad_um,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,cardiovascular,obesidad,tabaquismo,muertos,pobreza,Densidad_pob,TOTAL CAMAS AREA HOSPITALIZACIÓN
0,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057,142.0
1,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057,60.0
2,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057,92.0
3,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057,32.0
4,27,4,27004,0.0,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,36.3,430.537057,82.0


In [5]:
# How many missing values do we have?
pipeline.count_nan(data_model)

entidad_um                              0
municipio_res                           0
CVE_MUN                                 0
embarazo                                0
edad                                    0
diabetes                                0
epoc                                    0
asma                                    0
inmusupr                                0
hipertension                            0
cardiovascular                          0
obesidad                                0
tabaquismo                              0
muertos                                 0
pobreza                             12906
Densidad_pob                        12929
TOTAL CAMAS AREA HOSPITALIZACIÓN    37031
dtype: int64

In [6]:
#Split training and testing data 
train, test = pipeline.split_data(data_model, 0.2)
print('test obs:', test.shape[0])
print('train obs:', train.shape[0])

test obs: 45130
train obs: 180520
