In [1]:
import pandas as pd
import janitor

In [2]:
# Group and keep columns with 'as_index'
# https://stackoverflow.com/questions/31569549/how-to-groupby-a-dataframe-in-pandas-and-keep-columns

### Daily report data

In [3]:
data = pd.read_csv('filter.csv', encoding='utf-8')
data = data.drop(['MUNICIPIO_RES'], axis=1) # readily drop this column

In [4]:
print(data.set_index(['ID_REGISTRO']).index.is_unique) # ids are not unique... it's unclear how reports are aggregated

False


In [5]:
data['FECHA_INGRESO'] = pd.to_datetime(data.FECHA_INGRESO, format='%Y-%m-%d', errors='coerce')
data['FECHA_INGRESO'] = data['FECHA_INGRESO'].dt.strftime('%Y-%m-%d')
data = data.set_index(pd.DatetimeIndex(data['FECHA_INGRESO'])).sort_index()

In [6]:
len(data)

243756

### Note
Daily report for april 20 (ie, reported in 21 april) stops using testing site (ENTIDAD_UM),
<br>
And starts using region of residence (ENTIDAD_RES) so we shall treat both sets of data accordingly

In [7]:
dat = data.copy()

In [8]:
tbl_x = dat.filter_date('FECHA_INGRESO','2020-01-06','2020-04-20')
tbl_x = tbl_x.drop(['ENTIDAD_RES'], axis=1) # drop by region
tbl_x = tbl_x.set_index(
    ['FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'ENTIDAD_UM', 'ID_REGISTRO', 'PAIS_ORIGEN']
)
tbl_x = tbl_x.groupby(
    ['FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'ENTIDAD_UM', 'ID_REGISTRO', 'PAIS_ORIGEN'], as_index=True).count().reset_index()
tbl_x = tbl_x.rename(columns={'ENTIDAD_UM':'Region_ID'})

In [9]:
tbl_y = dat.filter_date('FECHA_INGRESO','2020-04-21','2022-12-31') # the long con
tbl_y = tbl_y.drop(['ENTIDAD_UM'], axis=1) # drop by region
tbl_y = tbl_y.set_index(
    ['FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'ENTIDAD_RES', 'ID_REGISTRO', 'PAIS_ORIGEN']
)
tbl_y = tbl_y.groupby(
    ['FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'ENTIDAD_RES', 'ID_REGISTRO', 'PAIS_ORIGEN'], as_index=True).count().reset_index()
tbl_y = tbl_y.rename(columns={'ENTIDAD_RES':'Region_ID'})

In [10]:
tbl = pd.concat([tbl_x, tbl_y])

In [11]:
tbl

Unnamed: 0,FECHA_INGRESO,FECHA_SINTOMAS,FECHA_DEF,Region_ID,ID_REGISTRO,PAIS_ORIGEN,RESULTADO
0,2020-01-06,2020-01-06,9999-99-99,18,09f6d9,Local,13
1,2020-01-08,2020-01-08,9999-99-99,30,1c29f8,Local,12
2,2020-01-14,2020-01-14,9999-99-99,1,0ceb0f,Local,5
3,2020-01-27,2020-01-27,9999-99-99,15,1879e0,Local,1
4,2020-02-07,2020-02-07,9999-99-99,11,03e780,Local,10
...,...,...,...,...,...,...,...
9055,2020-05-01,2020-04-29,9999-99-99,21,10320c,Local,1
9056,2020-05-01,2020-04-30,9999-99-99,4,09c19e,Local,1
9057,2020-05-01,2020-04-30,9999-99-99,19,05d5f6,Local,1
9058,2020-05-01,2020-04-30,9999-99-99,19,178d36,Local,1


In [12]:
tbl = tbl.rename(
    columns={
        'FECHA_INGRESO':'Date_Confirmed',
        'FECHA_SINTOMAS':'Date_Symptoms',
        'ID_REGISTRO':'Case_ID',
        'RESULTADO':'Status',
        'PAIS_ORIGEN':'Origin'
    })

In [13]:
tbl['Status'] = '1'

In [14]:
tbl

Unnamed: 0,Date_Confirmed,Date_Symptoms,FECHA_DEF,Region_ID,Case_ID,Origin,Status
0,2020-01-06,2020-01-06,9999-99-99,18,09f6d9,Local,1
1,2020-01-08,2020-01-08,9999-99-99,30,1c29f8,Local,1
2,2020-01-14,2020-01-14,9999-99-99,1,0ceb0f,Local,1
3,2020-01-27,2020-01-27,9999-99-99,15,1879e0,Local,1
4,2020-02-07,2020-02-07,9999-99-99,11,03e780,Local,1
...,...,...,...,...,...,...,...
9055,2020-05-01,2020-04-29,9999-99-99,21,10320c,Local,1
9056,2020-05-01,2020-04-30,9999-99-99,4,09c19e,Local,1
9057,2020-05-01,2020-04-30,9999-99-99,19,05d5f6,Local,1
9058,2020-05-01,2020-04-30,9999-99-99,19,178d36,Local,1


In [15]:
#tbl[~tbl.FECHA_DEF.str.startswith(('9999'))]
tbl = tbl[tbl.FECHA_DEF.str.startswith(('9999'))] # Not deceased

In [16]:
tbl = tbl.drop(['FECHA_DEF'], axis=1)

In [17]:
tbl

Unnamed: 0,Date_Confirmed,Date_Symptoms,Region_ID,Case_ID,Origin,Status
0,2020-01-06,2020-01-06,18,09f6d9,Local,1
1,2020-01-08,2020-01-08,30,1c29f8,Local,1
2,2020-01-14,2020-01-14,1,0ceb0f,Local,1
3,2020-01-27,2020-01-27,15,1879e0,Local,1
4,2020-02-07,2020-02-07,11,03e780,Local,1
...,...,...,...,...,...,...
9055,2020-05-01,2020-04-29,21,10320c,Local,1
9056,2020-05-01,2020-04-30,4,09c19e,Local,1
9057,2020-05-01,2020-04-30,19,05d5f6,Local,1
9058,2020-05-01,2020-04-30,19,178d36,Local,1


In [21]:
geo = pd.read_csv('../data/geo/entidades.csv')

In [22]:
geo.keys()

Index(['CLAVE_ENTIDAD', 'ENTIDAD_FEDERATIVA', 'ABREVIATURA'], dtype='object')

In [23]:
geo = geo.rename(columns={'CLAVE_ENTIDAD':'Region_ID', 'ENTIDAD_FEDERATIVA':'Region', 'ABREVIATURA':'Region_Key'})

In [24]:
output = pd.merge(tbl, geo, on='Region_ID') 

In [25]:
output = output[['Date_Confirmed', 'Date_Symptoms', 'Region', 'Region_ID', 'Region_Key', 'Origin']]

In [26]:
output

Unnamed: 0,Date_Confirmed,Date_Symptoms,Region,Region_ID,Region_Key,Origin
0,2020-01-06,2020-01-06,NAYARIT,18,NT,Local
1,2020-03-18,2020-03-17,NAYARIT,18,NT,Local
2,2020-03-19,2020-03-10,NAYARIT,18,NT,Local
3,2020-03-23,2020-03-19,NAYARIT,18,NT,Local
4,2020-03-24,2020-03-21,NAYARIT,18,NT,Local
...,...,...,...,...,...,...
21523,2020-04-30,2020-04-26,TLAXCALA,29,TL,Local
21524,2020-04-30,2020-04-26,TLAXCALA,29,TL,Local
21525,2020-04-30,2020-04-28,TLAXCALA,29,TL,Local
21526,2020-04-30,2020-04-30,TLAXCALA,29,TL,Local


In [27]:
output['Region'] = output['Region'].str.replace('COAHUILA DE ZARAGOZA','COAHUILA')
output['Region'] = output['Region'].str.replace('MICHOACÁN DE OCAMPO','MICHOACAN')
output['Region'] = output['Region'].str.replace('VERACRUZ DE IGNACIO DE LA LLAVE','VERACRUZ')

In [28]:
output = output.sort_values(['Date_Confirmed', 'Date_Symptoms', 'Region'])

In [29]:
output.Region.unique()

array(['NAYARIT', 'VERACRUZ', 'AGUASCALIENTES', 'MÉXICO', 'GUANAJUATO',
       'BAJA CALIFORNIA', 'CIUDAD DE MÉXICO', 'COAHUILA', 'SINALOA',
       'CHIAPAS', 'YUCATÁN', 'MORELOS', 'NUEVO LEÓN', 'QUERÉTARO',
       'PUEBLA', 'DURANGO', 'JALISCO', 'QUINTANA ROO', 'CAMPECHE',
       'SAN LUIS POTOSÍ', 'GUERRERO', 'OAXACA', 'CHIHUAHUA', 'COLIMA',
       'SONORA', 'TAMAULIPAS', 'HIDALGO', 'TABASCO', 'MICHOACAN',
       'BAJA CALIFORNIA SUR', 'ZACATECAS', 'TLAXCALA'], dtype=object)

In [30]:
output.Origin.unique()

array(['Local', 'Alemania', 'Estados Unidos de América',
       'Estados Unidos de Am�rica', 'República de Honduras',
       'Rep�blica de Honduras', 'Venezuela', 'Cuba', 'Guatemala', 'Otro',
       'Perú', 'Per�', 'Camerún', 'Camer�n', 'Chile', 'El Salvador',
       'Canad�', 'Brasil', 'Colombia', 'Ecuador'], dtype=object)

In [31]:
output.to_csv('../../latest.csv', index=False)