In [40]:
# Produces time series files for COVID-19 (starting April 5, when major changes to reports were made).
# Before judging this mess, please note that daily report data is very inconsistent.
# Thus, each report is treated individually, until data reports seem stable.

In [41]:
import pandas as pd
import numpy as np

In [42]:
# Baseline frame: the final report that still uses the old format (2020-04-05).
df5 = pd.read_csv(filepath_or_buffer='mexico-covid19-2020-04-05.csv')

In [43]:
# New data format starts at 2020-04-06 (Date_Arrival is now removed from reports)
df6 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-06.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df6['Date_Symptoms'] = (
    pd.to_datetime(df6['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

In [44]:
# The raw files were basically converted from pdf to csv using zamzar
# Discard the Case_ID shipped with the raw file; a fresh 0-based positional index is kept for the merge.
df6 = df6.drop(columns=['Case_ID']).reset_index(drop=True)
# New data doesn't specify the origin, so assume it remains by contact unless otherwise specified.
# Assign the result back instead of a chained `inplace=True` fill, which is a no-op under Copy-on-Write.
df6['Origin'] = df6['Origin'].fillna('Contacto')
# Rebuild Case_ID as a 1-based running number and place it as the first column.
df6.insert(0, 'Case_ID', np.arange(1, len(df6) + 1, dtype=np.int64))
len(df6)

2439

In [45]:
# Align the new report with the previous one row-by-row (positional index) so that confirmed
# dates from the previous report are matched to the same cases.
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError), so only
# the index alignment is requested.
# https://stackoverflow.com/questions/19125091/pandas-merge-how-to-avoid-duplicating-columns
df6 = df6.merge(df5, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
# https://stackoverflow.com/questions/19071199/drop-columns-whose-name-contains-a-specific-string-from-pandas-dataframe
df6 = df6.loc[:, ~df6.columns.str.contains('_y', case=False)]  # Drop the suffixed duplicates coming from df5
len(df6)

2439

In [46]:
# Expected number of new cases: rows in the new report minus rows in the previous one.
df6.shape[0] - df5.shape[0]

296

In [47]:
# Display the merged frame for visual inspection.
df6

Unnamed: 0,Case_ID,Region,Sex,Age,Date_Symptoms,Tested,Origin,Date_Arrival,Date_Confirmed
0,1,AGUASCALIENTES,F,31,03-23-2020,Confirmado,Contacto,02-22-2020,02-27-2020
1,2,MEXICO,M,21,03-20-2020,Confirmado,Contacto,02-21-2020,02-28-2020
2,3,COAHUILA,M,40,03-25-2020,Confirmado,Contacto,02-22-2020,02-29-2020
3,4,CHIHUAHUA,M,27,03-19-2020,Confirmado,Contacto,02-25-2020,02-29-2020
4,5,SONORA,F,45,03-15-2020,Confirmado,Contacto,02-25-2020,03-01-2020
...,...,...,...,...,...,...,...,...,...
2434,2435,QUINTANA ROO,F,30,03-29-2020,Confirmado,Contacto,,
2435,2436,JALISCO,M,25,03-17-2020,Confirmado,Estados,,
2436,2437,JALISCO,F,34,03-21-2020,Confirmado,Contacto,,
2437,2438,TABASCO,M,59,03-26-2020,Confirmado,Contacto,,


In [48]:
# https://stackoverflow.com/questions/37313691/how-to-remove-a-pandas-dataframe-from-another-dataframe
# Stack today's frame on the previous one; index labels that occur exactly once are the new cases.
# The resulting length after this operation should match the new cases!
out6 = pd.concat([df6, df5])  # DataFrame.append was removed in pandas 2.0
out6 = out6[~out6.index.duplicated(keep=False)].copy()  # copy so the assignment below is not made on a slice
out6['Date_Confirmed'] = out6['Date_Confirmed'].fillna('04-06-2020')  # new cases are confirmed on the report date
out6 = pd.concat([df5, out6], sort=True)  # previous report first, new cases appended at the end
out6[['Case_ID', 'Age']] = out6[['Case_ID', 'Age']].astype(int)
out6 = out6[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out6.to_csv('mexico-covid19-2020-04-06.csv', index=False)

In [49]:
# From the analysis above we can deduce that the new data is probably just being appended to the end of the last report (?)
# Will do the same method for the next few days/week, and if it remains consistent, will make it functional!

In [50]:
# 2020-04-07
df7 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-07.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df7['Date_Symptoms'] = (
    pd.to_datetime(df7['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
# The resulting length after this operation should match the new cases!
out7 = pd.concat([df7, out6])  # DataFrame.append was removed in pandas 2.0
out7 = out7[~out7.index.duplicated(keep=False)].copy()  # copy so the assignment below is not made on a slice
out7['Date_Confirmed'] = out7['Date_Confirmed'].fillna('04-07-2020')
out7 = pd.concat([out6, out7], sort=True)  # Use last df in memory, sorry
# Non-numeric values are coerced to 0 before casting to integers.
# https://stackoverflow.com/questions/42719749/pandas-convert-string-to-int
out7['Case_ID'] = pd.to_numeric(out7['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out7['Age'] = pd.to_numeric(out7['Age'], errors='coerce').fillna(0).astype(np.int64)
out7 = out7[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out7.to_csv('mexico-covid19-2020-04-07.csv', index=False)

In [51]:
# 2020-04-08
# Origin is now removed from the daily report
df8 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-08.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df8['Date_Symptoms'] = (
    pd.to_datetime(df8['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df8 = df8.merge(df7, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df8 = df8.loc[:, ~df8.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df8['Origin'] = df8['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
# The resulting length after this operation should match the new cases!
out8 = pd.concat([df8, out7])  # DataFrame.append was removed in pandas 2.0
out8 = out8[~out8.index.duplicated(keep=False)].copy()
out8['Date_Confirmed'] = out8['Date_Confirmed'].fillna('04-08-2020')
out8 = pd.concat([out7, out8], sort=True)  # Use last df in memory, sorry
# Non-numeric values are coerced to 0 before casting to integers.
out8['Case_ID'] = pd.to_numeric(out8['Case_ID'], errors='coerce').fillna(0).astype(np.int64)
out8['Age'] = pd.to_numeric(out8['Age'], errors='coerce').fillna(0).astype(np.int64)
out8 = out8[['Case_ID','Region','Sex','Age','Date_Symptoms','Tested','Origin','Date_Arrival','Date_Confirmed']]
out8.to_csv('mexico-covid19-2020-04-08.csv', index=False)

In [52]:
# 2020-04-09
# Origin is now removed from the daily report
df9 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-09.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df9['Date_Symptoms'] = (
    pd.to_datetime(df9['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df9 = df9.merge(df8, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df9 = df9.loc[:, ~df9.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df9['Origin'] = df9['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out9 = pd.concat([df9, out8])  # DataFrame.append was removed in pandas 2.0
out9 = out9[~out9.index.duplicated(keep=False)].copy()
out9['Date_Confirmed'] = out9['Date_Confirmed'].fillna('04-09-2020')
out9 = pd.concat([out8, out9], sort=True)  # Use last df in memory, sorry
# NOTE(review): unlike the 04-07/04-08 cells, no integer cast or column selection is applied
# before writing - confirm this is intended.
out9.to_csv('mexico-covid19-2020-04-09.csv', index=False)

In [53]:
# 2020-04-10
# Origin is now removed from the daily report
df10 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-10.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df10['Date_Symptoms'] = (
    pd.to_datetime(df10['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df10 = df10.merge(df9, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df10 = df10.loc[:, ~df10.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df10['Origin'] = df10['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out10 = pd.concat([df10, out9])  # DataFrame.append was removed in pandas 2.0
out10 = out10[~out10.index.duplicated(keep=False)].copy()
out10['Date_Confirmed'] = out10['Date_Confirmed'].fillna('04-10-2020')
out10 = pd.concat([out9, out10], sort=True)  # Use last df in memory, sorry
out10.to_csv('mexico-covid19-2020-04-10.csv', index=False)

In [54]:
# 2020-04-11
# Origin is now removed from the daily report
df11 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-11.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df11['Date_Symptoms'] = (
    pd.to_datetime(df11['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df11 = df11.merge(df10, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df11 = df11.loc[:, ~df11.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df11['Origin'] = df11['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out11 = pd.concat([df11, out10])  # DataFrame.append was removed in pandas 2.0
out11 = out11[~out11.index.duplicated(keep=False)].copy()
out11['Date_Confirmed'] = out11['Date_Confirmed'].fillna('04-11-2020')
out11 = pd.concat([out10, out11], sort=True)  # Use last df in memory, sorry
out11.to_csv('mexico-covid19-2020-04-11.csv', index=False)

In [55]:
# 2020-04-12
# Origin is now removed from the daily report
df12 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-12.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df12['Date_Symptoms'] = (
    pd.to_datetime(df12['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df12 = df12.merge(df11, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df12 = df12.loc[:, ~df12.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df12['Origin'] = df12['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out12 = pd.concat([df12, out11])  # DataFrame.append was removed in pandas 2.0
out12 = out12[~out12.index.duplicated(keep=False)].copy()
out12['Date_Confirmed'] = out12['Date_Confirmed'].fillna('04-12-2020')
out12 = pd.concat([out11, out12], sort=True)  # Use last df in memory, sorry
out12.to_csv('mexico-covid19-2020-04-12.csv', index=False)

In [56]:
# 2020-04-13
# Origin is now removed from the daily report
df13 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-13.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df13['Date_Symptoms'] = (
    pd.to_datetime(df13['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df13 = df13.merge(df12, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df13 = df13.loc[:, ~df13.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df13['Origin'] = df13['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out13 = pd.concat([df13, out12])  # DataFrame.append was removed in pandas 2.0
out13 = out13[~out13.index.duplicated(keep=False)].copy()
out13['Date_Confirmed'] = out13['Date_Confirmed'].fillna('04-13-2020')
out13 = pd.concat([out12, out13], sort=True)  # Use last df in memory, sorry
out13.to_csv('mexico-covid19-2020-04-13.csv', index=False)

In [57]:
# 2020-04-14
# Origin is now removed from the daily report
df14 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-14.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df14['Date_Symptoms'] = (
    pd.to_datetime(df14['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report by aligning both frames row-by-row (positional index).
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df14 = df14.merge(df13, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df14 = df14.loc[:, ~df14.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df14['Origin'] = df14['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
out14 = pd.concat([df14, out13])  # DataFrame.append was removed in pandas 2.0
out14 = out14[~out14.index.duplicated(keep=False)].copy()
out14['Date_Confirmed'] = out14['Date_Confirmed'].fillna('04-14-2020')
out14 = pd.concat([out13, out14], sort=True)  # Use last df in memory, sorry
out14.to_csv('mexico-covid19-2020-04-14.csv', index=False)

In [58]:
# 2020-04-15
# Origin is now removed from the daily report
df15 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-15.csv')
# Parse the symptom dates, render them as MM-DD-YYYY text and blank out any literal 'NaT'.
df15['Date_Symptoms'] = (
    pd.to_datetime(df15['Date_Symptoms'])
    .dt.strftime('%m-%d-%Y')
    .replace('NaT', '')
)

# Match deprecated columns from the last report (df14, the 04-14 frame) by aligning both frames
# row-by-row (positional index). The previous version merged with df13 by mistake.
# `on=` cannot be combined with left_index/right_index in current pandas (MergeError).
df15 = df15.merge(df14, left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
df15 = df15.loc[:, ~df15.columns.str.contains('_y', case=False)].copy()  # Drop suffixed duplicates
df15['Origin'] = df15['Origin'].fillna('Contacto')  # Fill after merge (assigned back, not chained inplace)

# Index labels that occur exactly once across today's report and yesterday's output are the new cases.
# The diff must be taken against out14: diffing against out13 also picked up the 04-14 cases,
# which were then written twice (once from out14, once re-dated 04-15).
out15 = pd.concat([df15, out14])  # DataFrame.append was removed in pandas 2.0
out15 = out15[~out15.index.duplicated(keep=False)].copy()
out15['Date_Confirmed'] = out15['Date_Confirmed'].fillna('04-15-2020')
out15 = pd.concat([out14, out15], sort=True)  # Use last df in memory, sorry
out15.to_csv('mexico-covid19-2020-04-15.csv', index=False)