In [2]:
# Produces time series files for COVID-19 (starting March 5, when major changes to reports were made).
# Before judging this mess, please note that the daily report data is very inconsistent,
# so each report is treated on a case-by-case basis.

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Most recent report that still uses the old format
df5 = pd.read_csv(filepath_or_buffer='mexico-covid19-2020-04-05.csv')

In [5]:
# The report of 06/04/2020 is the first one in the new format;
# Date_Arrival is no longer part of the reports from here on.
df6 = pd.read_csv(filepath_or_buffer='../data/mexico_covid_19_data/csv/raw_covid_mex_20200406.csv')

In [6]:
# The raw files were basically converted from pdf to csv using zamzar
df6 = df6.drop(columns=['Case_ID'])  # Remove this column to merge
# New data doesn't specify the origin, so we assume it remains by contact unless
# otherwise specified. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on the df6['Origin'] selection is a chained in-place
# update, which pandas 2.2 warns about and which leaves df6 untouched once
# Copy-on-Write is active.
df6['Origin'] = df6['Origin'].fillna('Contacto')
df6.index = np.arange(1, len(df6) + 1)  # New index that doesn't start from 0
df6 = df6.reset_index()  # Add index to columns (the index itself restarts at 0)
df6 = df6.rename(columns={'index': 'Case_ID'})  # Rename index
len(df6)

2439

In [7]:
# https://stackoverflow.com/questions/19125091/pandas-merge-how-to-avoid-duplicating-columns
# Match for confirmed dates from previous report. Rows of both frames are
# paired on their index. `on='Case_ID'` is intentionally not passed: current
# pandas raises MergeError when `on` is combined with left_index/right_index.
# Every column present in both frames (Case_ID included) therefore gets a
# '_y'-suffixed copy from df5, which is removed right below.
df6 = df6.merge(df5, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
# https://stackoverflow.com/questions/19071199/drop-columns-whose-name-contains-a-specific-string-from-pandas-dataframe
# Only names that END in '_y' are dropped, so a column that merely contains
# '_y' somewhere in its name is never removed by accident.
df6 = df6.loc[:, ~df6.columns.str.endswith('_y')]  # Drop suffix
len(df6)

2439

In [8]:
# Expected number of new cases: rows in the merged frame minus rows in the previous report
df6.shape[0] - df5.shape[0]

296

In [9]:
# Inspect the merged frame: rows beyond the previous report have no
# Date_Arrival / Date_Confirmed yet.
df6

Unnamed: 0,Case_ID,Region,Sex,Age,Date_Symptoms,Tested,Origin,Date_Arrival,Date_Confirmed
0,1,AGUASCALIENTES,F,31,23/03/2020,Confirmado,Contacto,02/22/2020,02/27/2020
1,2,MEXICO,M,21,20/03/2020,Confirmado,Contacto,02/21/2020,02/28/2020
2,3,COAHUILA,M,40,25/03/2020,Confirmado,Contacto,02/22/2020,02/29/2020
3,4,CHIHUAHUA,M,27,19/03/2020,Confirmado,Contacto,02/25/2020,02/29/2020
4,5,SONORA,F,45,15/03/2020,Confirmado,Contacto,02/25/2020,03/01/2020
...,...,...,...,...,...,...,...,...,...
2434,2435,QUINTANA ROO,F,30,29/03/2020,Confirmado,Contacto,,
2435,2436,JALISCO,M,25,17/03/2020,Confirmado,Estados,,
2436,2437,JALISCO,F,34,21/03/2020,Confirmado,Contacto,,
2437,2438,TABASCO,M,59,26/03/2020,Confirmado,Contacto,,


In [10]:
# https://stackoverflow.com/questions/37313691/how-to-remove-a-pandas-dataframe-from-another-dataframe
# Stack the merged report on top of the previous one (pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0) ...
out6 = pd.concat([df6, df5], sort=False)
# ... and keep only the index labels that occur once, i.e. the rows that exist
# in just one of the two frames. The resulting length after this operation
# should match the new cases! (Had to test several times to guess...)
out6 = out6[~out6.index.duplicated(keep=False)]
# The remaining rows get the date of this report wherever no confirmation date is known.
out6 = out6.assign(Date_Confirmed=out6['Date_Confirmed'].fillna('04/06/2020'))
# Previous report first, new rows after it.
out6 = pd.concat([df5, out6], sort=True)
out6[['Case_ID', 'Age']] = out6[['Case_ID', 'Age']].astype(int)
out6 = out6[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out6.to_csv('mexico-covid19-2020-04-06.csv', index=False)

In [11]:
# From the analysis above we can deduce that the new data is probably just being appended to the end of the last report (?)
# Will use the same method for the next few days and, if it remains consistent, will turn it into a function
# But for now it is more prudent to treat each report on a case-by-case basis, sadly

In [12]:
# Raw report for 07/04/2020
df7 = pd.read_csv('../data/mexico_covid_19_data/csv/raw_covid_mex_20200407.csv')
# Missing origins default to 'Contacto'. The result is assigned back because
# fillna(inplace=True) on the df7['Origin'] selection is a chained in-place
# update: pandas 2.2 warns about it and, with Copy-on-Write, df7 stays unchanged.
df7['Origin'] = df7['Origin'].fillna('Contacto')

In [13]:
# Inspect the 07/04/2020 report
df7

Unnamed: 0,Case_ID,Region,Sex,Age,Date_Symptoms,Tested,Origin
0,1,CIUDAD DE MÉXICO,M,40,17/03/2020,Confirmado,Contacto
1,2,CIUDAD DE MÉXICO,F,29,26/03/2020,Confirmado,Contacto
2,3,MÉXICO,F,84,26/03/2020,Confirmado,Contacto
3,4,NUEVO LEÓN,M,54,20/03/2020,Confirmado,Contacto
4,5,VERACRUZ,F,65,18/03/2020,Confirmado,España
...,...,...,...,...,...,...,...
2780,2781,JALISCO,M,31,,Confirmado,Contacto
2781,2782,NUEVO LEÓN,M,36,,Confirmado,Contacto
2782,2783,COAHUILA,F,92,,Confirmado,Estados Unidos
2783,2784,DURANGO,M,74,,Confirmado,Estados Unidos


In [14]:
# 07/04/2020
# Stack today's report on top of yesterday's result (pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0).
out7 = pd.concat([df7, out6], sort=False)
# Keep only the index labels that occur once. The resulting length after this
# operation should match the new cases! (Had to test several times to guess...)
# NOTE(review): new cases are identified purely by row position (index label),
# which is only valid if every report lists the earlier cases first — TODO confirm.
out7 = out7[~out7.index.duplicated(keep=False)]
out7 = out7.assign(Date_Confirmed=out7['Date_Confirmed'].fillna('04/07/2020'))
out7 = pd.concat([out6, out7], sort=True)  # Use last df in memory, sorry
# https://stackoverflow.com/questions/42719749/pandas-convert-string-to-int
# Values that cannot be parsed as numbers become 0.
out7['Case_ID'] = pd.to_numeric(out7['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out7['Age'] = pd.to_numeric(out7['Age'], errors='coerce').fillna(0).astype(np.int64)
out7 = out7[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out7.to_csv('mexico-covid19-2020-04-07.csv', index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [15]:
# 08/04/2020
df8 = pd.read_csv('../data/mexico_covid_19_data/csv/raw_covid_mex_20200408.csv')  # Origin is now removed from the daily report
# Match deprecated columns from last report. Rows are paired on the index of
# both frames; `on='Case_ID'` is not passed because current pandas raises
# MergeError when `on` is combined with left_index/right_index.
df8 = df8.merge(df7, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df8 = df8.loc[:, ~df8.columns.str.endswith('_y')]  # Drop the suffixed duplicates that came from df7
# Fill after merge. Assigned back instead of a chained fillna(inplace=True),
# which warns on pandas 2.2 and has no effect on df8 under Copy-on-Write.
df8['Origin'] = df8['Origin'].fillna('Contacto')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
out8 = pd.concat([df8, out7], sort=False)
# Keep only the index labels that occur once. The resulting length after this
# operation should match the new cases! (Had to test several times to guess...)
out8 = out8[~out8.index.duplicated(keep=False)]
out8 = out8.assign(Date_Confirmed=out8['Date_Confirmed'].fillna('04/08/2020'))
out8 = pd.concat([out7, out8], sort=True)  # Use last df in memory, sorry
# Values that cannot be parsed as numbers become 0.
out8['Case_ID'] = pd.to_numeric(out8['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out8['Age'] = pd.to_numeric(out8['Age'], errors='coerce').fillna(0).astype(np.int64)
out8 = out8[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out8.to_csv('mexico-covid19-2020-04-08.csv', index=False)

In [16]:
# 09/04/2020
df9 = pd.read_csv('../data/mexico_covid_19_data/csv/raw_covid_mex_20200409.csv')  # Origin is now removed from the daily report
# Match deprecated columns from last report. Rows are paired on the index of
# both frames; `on='Case_ID'` is not passed because current pandas raises
# MergeError when `on` is combined with left_index/right_index.
df9 = df9.merge(df8, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df9 = df9.loc[:, ~df9.columns.str.endswith('_y')]  # Drop the suffixed duplicates that came from df8
# Fill after merge. Assigned back instead of a chained fillna(inplace=True),
# which warns on pandas 2.2 and has no effect on df9 under Copy-on-Write.
df9['Origin'] = df9['Origin'].fillna('Contacto')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
out9 = pd.concat([df9, out8], sort=False)
out9 = out9[~out9.index.duplicated(keep=False)]  # Keep only the index labels that occur once
out9 = out9.assign(Date_Confirmed=out9['Date_Confirmed'].fillna('04/09/2020'))
out9 = pd.concat([out8, out9], sort=True)  # Use last df in memory, sorry
# Same dtype coercion and column selection as the 07/04 and 08/04 cells, so
# every daily file is written with one schema. Unparseable values become 0.
out9['Case_ID'] = pd.to_numeric(out9['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out9['Age'] = pd.to_numeric(out9['Age'], errors='coerce').fillna(0).astype(np.int64)
out9 = out9[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out9.to_csv('mexico-covid19-2020-04-09.csv', index=False)

In [17]:
# 10/04/2020
df10 = pd.read_csv('../data/mexico_covid_19_data/csv/raw_covid_mex_20200410.csv')  # Origin is now removed from the daily report
# Match deprecated columns from last report. Rows are paired on the index of
# both frames; `on='Case_ID'` is not passed because current pandas raises
# MergeError when `on` is combined with left_index/right_index.
df10 = df10.merge(df9, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df10 = df10.loc[:, ~df10.columns.str.endswith('_y')]  # Drop the suffixed duplicates that came from df9
# Fill after merge. Assigned back instead of a chained fillna(inplace=True),
# which warns on pandas 2.2 and has no effect on df10 under Copy-on-Write.
df10['Origin'] = df10['Origin'].fillna('Contacto')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
out10 = pd.concat([df10, out9], sort=False)
out10 = out10[~out10.index.duplicated(keep=False)]  # Keep only the index labels that occur once
out10 = out10.assign(Date_Confirmed=out10['Date_Confirmed'].fillna('04/10/2020'))
out10 = pd.concat([out9, out10], sort=True)  # Use last df in memory, sorry
# Same dtype coercion and column selection as the 07/04 and 08/04 cells, so
# every daily file is written with one schema. Unparseable values become 0.
out10['Case_ID'] = pd.to_numeric(out10['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out10['Age'] = pd.to_numeric(out10['Age'], errors='coerce').fillna(0).astype(np.int64)
out10 = out10[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out10.to_csv('mexico-covid19-2020-04-10.csv', index=False)