In [1]:
# https://github.com/pandas-dev/pandas/issues/7509
# https://github.com/pandas-dev/pandas/issues/2626

In [2]:
import pandas as pd
import numpy as np

### Old reports
Tried different approaches but using _'combine_first()'_ works ok.

In [3]:
df1 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-01.csv')
df2 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-02.csv')
df3 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-03.csv')
df4 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-04.csv')
df5 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-05.csv')

In [4]:
my_dfs = [df1, df2, df3, df4, df5]
for df in my_dfs:
    for column in df.columns:
        if 'Case_ID' in column:
            df.drop(column, axis = 1, inplace=True)

            df['Date_Confirmed'] = pd.to_datetime(df.Date_Confirmed)
            df['Date_Confirmed'] = df['Date_Confirmed'].dt.strftime('%d-%m-%Y')
            df['Date_Confirmed'] = df['Date_Confirmed'].replace('NaT', '')

            df['Date_Symptoms'] = pd.to_datetime(df.Date_Symptoms)
            df['Date_Symptoms'] = df['Date_Symptoms'].dt.strftime('%d-%m-%Y')
            df['Date_Symptoms'] = df['Date_Symptoms'].replace('NaT', '')

In [5]:
def combine_data(df_x, df_y):
    out = df_x.combine_first(df_y)
    out['Case_ID'] = np.arange(1, (len(out)+1))
    out['Age'] = out['Age'].astype(int)
    out = out [['Case_ID','Region','Sex','Age','Origin','Date_Symptoms','Date_Confirmed']]
    return out

In [6]:
def remove_ascii(df):
    df.Region = df.Region.str.replace('Á', 'A')
    df.Region = df.Region.str.replace('É', 'E')
    df.Region = df.Region.str.replace('Í', 'I')
    df.Region = df.Region.str.replace('Ó', 'O')
    df.Region = df.Region.str.replace('Ú', 'U')
    return df

In [7]:
out1 = df1

In [8]:
out2 = combine_data(out1, df2)

In [9]:
out3 = combine_data(out2, df3)

In [10]:
out4 = combine_data(out3, df4)

In [11]:
out5 = combine_data(out4, df5)

In [12]:
out5

Unnamed: 0,Case_ID,Region,Sex,Age,Origin,Date_Symptoms,Date_Confirmed
0,1,CIUDAD DE MÉXICO,M,35,Italia,22-02-2020,27-02-2020
1,2,SINALOA,M,41,Italia,22-02-2020,28-02-2020
2,3,CIUDAD DE MÉXICO,M,59,Italia,23-02-2020,29-02-2020
3,4,COAHUILA,F,20,Italia,27-02-2020,29-02-2020
4,5,CHIAPAS,F,18,Italia,25-02-2020,01-03-2020
...,...,...,...,...,...,...,...
2138,2139,CIUDAD DE MÉXICO,F,68,Contacto,20-03-2020,04-05-2020
2139,2140,CIUDAD DE MÉXICO,M,34,Contacto,20-03-2020,04-05-2020
2140,2141,CIUDAD DE MÉXICO,F,65,Contacto,21-03-2020,04-05-2020
2141,2142,CIUDAD DE MÉXICO,M,60,Contacto,22-03-2020,04-05-2020


In [13]:
(remove_ascii(out5)).to_csv('mexico-covid19-2020-04-05.csv', index=False)

### Test
1. Find if cases in new reports match the ones already reported
2. If that is the case, remove them from the new report
3. Finally concat to the previous report

In [14]:
old_data = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-05.csv')

In [15]:
new_data = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-06.csv')

In [16]:
new_data = new_data.drop(['Case_ID'], axis=1) # Remove this column before merge
new_data['Origin'].fillna('Contacto', inplace=True) # Doesn't specify but we'll assume NA means 'Contact'
new_data.index = np.arange(1, (len(new_data)+1)) # New index that doesn't start from 0
new_data = new_data.reset_index() # Add index to columns
new_data = new_data.rename(columns={'index':'Case_ID'}) # Rename index
len(new_data)

2439

In [17]:
new_data = new_data.merge(old_data, on='Case_ID', left_index=True, right_index=True, how='outer', suffixes=('', '_y')) # Match cases from previous report
new_data = new_data.loc[:,~new_data.columns.str.contains('_y', case=False)] # Drop suffix
len(new_data)

2439

In [18]:
# Number of new cases should be...
len(new_data) - len(old_data)

296

In [19]:
test = new_data.append(old_data, sort=True) # The resulting length after appending
test = test[~test.index.duplicated(keep=False)] # And removing duplicate should match number of new cases
len(test)

296

### Make new reports

In [20]:
df6 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-06.csv')
df6 = df6.drop(['Origin'], axis=1)
df7 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-07.csv')
df7 = df7.drop(['Origin'], axis=1)

In [21]:
df8 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-08.csv')
df9 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-09.csv')

In [22]:
df10 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-10.csv')
df11 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-11.csv')
df12 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-12.csv')
df13 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-13.csv')
df14 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-14.csv')
df15 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-15.csv')
df16 = pd.read_csv('../data/mexico_covid19_data/raw-mexico-covid19-2020-04-16.csv')

In [23]:
my_new_dfs = [df10, df11, df12, df13, df14, df15, df16]
for df in my_new_dfs:
    df['Sex'] = df['Sex'].str.replace('FEMENINO','F')
    df['Sex'] = df['Sex'].str.replace('MASCULINO','M')

In [24]:
all_new_dfs = [df6, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16]
for df in my_new_dfs:
    df['Date_Symptoms'] = pd.to_datetime(df.Date_Symptoms)
    df['Date_Symptoms'] = df['Date_Symptoms'].dt.strftime('%d-%m-%Y')
    df['Date_Symptoms'] = df['Date_Symptoms'].replace('NaT', '')
    df.index = np.arange(1, (len(df)+1))
    df = df.reset_index()
    df = df.rename(columns={'index':'Case_ID'})

In [25]:
def make_new_report(new_data, old_data, new_date):
    
    new_data['Date_Symptoms'] = pd.to_datetime(new_data.Date_Symptoms)
    new_data['Date_Symptoms'] = new_data['Date_Symptoms'].dt.strftime('%d-%m-%Y')
    new_data['Date_Symptoms'] = new_data['Date_Symptoms'].replace('NaT', '')
    
    old_data['Date_Symptoms'] = pd.to_datetime(new_data.Date_Symptoms)
    old_data['Date_Symptoms'] = old_data['Date_Symptoms'].dt.strftime('%d-%m-%Y')
    old_data['Date_Symptoms'] = old_data['Date_Symptoms'].replace('NaT', '')
    
    new_data = new_data.merge(old_data, on='Case_ID', left_index=True, right_index=True, how='outer', suffixes=('', '_y'))
    new_data = new_data.loc[:,~new_data.columns.str.contains('_y', case=False)]
    
    app_new_data = new_data.append(old_data, sort=True)
    
    new_cases = app_new_data[~app_new_data.index.duplicated(keep=False)]
    new_cases.Date_Confirmed = new_cases.Date_Confirmed.fillna(new_date)
    
    new_report = pd.concat([old_data, new_cases], sort=True)
    new_report = new_report[['Case_ID','Region','Sex','Age','Date_Symptoms','Date_Confirmed']]
    
    new_report['Date_Symptoms'] = pd.to_datetime(new_report.Date_Symptoms)
    new_report['Date_Symptoms'] = new_report['Date_Symptoms'].dt.strftime('%d-%m-%Y')
    new_report['Date_Symptoms'] = new_report['Date_Symptoms'].replace('NaT', '')
    
    new_report['Age'] = new_report['Age'].astype(int)
    
    return new_report

In [26]:
out6 = make_new_report(df6, out5, '06-04-2020')
(remove_ascii(out6)).to_csv('mexico-covid19-2020-04-06.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [27]:
out7 = make_new_report(df7, out6, '07-04-2020')
(remove_ascii(out7)).to_csv('mexico-covid19-2020-04-07.csv', index=False)

In [28]:
out8 = make_new_report(df8, out7, '08-04-2020')
(remove_ascii(out8)).to_csv('mexico-covid19-2020-04-08.csv', index=False)

In [29]:
out9 = make_new_report(df9, out8, '09-04-2020')
(remove_ascii(out9)).to_csv('mexico-covid19-2020-04-09.csv', index=False)

In [30]:
out10 = make_new_report(df10, out9, '10-04-2020')
(remove_ascii(out10)).to_csv('mexico-covid19-2020-04-10.csv', index=False)

In [31]:
out11 = make_new_report(df11, out10, '11-04-2020')
(remove_ascii(out11)).to_csv('mexico-covid19-2020-04-11.csv', index=False)

In [32]:
out12 = make_new_report(df12, out11, '12-04-2020')
(remove_ascii(out12)).to_csv('mexico-covid19-2020-04-12.csv', index=False)

In [33]:
out13 = make_new_report(df13, out12, '13-04-2020')
(remove_ascii(out13)).to_csv('mexico-covid19-2020-04-13.csv', index=False)

In [34]:
out14 = make_new_report(df14, out13, '14-04-2020')
(remove_ascii(out14)).to_csv('mexico-covid19-2020-04-14.csv', index=False)

In [35]:
out15 = make_new_report(df15, out14, '15-04-2020')
(remove_ascii(out15)).to_csv('mexico-covid19-2020-04-15.csv', index=False)

In [36]:
out16 = make_new_report(df16, out15, '16-04-2020')
(remove_ascii(out16)).to_csv('mexico-covid19-2020-04-16.csv', index=False)