In [170]:
import pandas as pd
import datetime

In [171]:
# Load the processed case table (tab-separated) and parse the three date
# columns into datetimes while reading.
df = pd.read_csv(
    '../datos/tablas_procesadas/tabla_completa.tsv',
    sep='\t',
    parse_dates=['fecha_inicio_sintomas', 'fecha_llegada_mx', 'fecha_caso_nuevo'],
)

In [172]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,pseudo_indice,estado,sexo,edad,fecha_inicio_sintomas,id_rt-pcr,procedencia,fecha_llegada_mx,fecha_caso_nuevo
0,20200316_1,Ciudad de México,M,35,2020-02-22,confirmado,Italia,2020-02-22,NaT
1,20200316_2,Sinaloa,M,41,2020-02-22,confirmado,Italia,2020-02-21,NaT
2,20200316_3,Ciudad de México,M,59,2020-02-23,confirmado,Italia,2020-02-22,NaT
3,20200316_4,Coahuila,F,20,2020-02-27,confirmado,Italia,2020-02-25,NaT
4,20200316_5,Chiapas,F,18,2020-02-25,confirmado,Italia,2020-02-25,NaT


In [173]:
def _suspicious_dates(dates, valid_months):
    """Return the distinct non-null dates whose month is not in ``valid_months``."""
    # value_counts() drops nulls by default and orders by frequency, so the
    # report lists the most common offending dates first.
    return [date for date in dates.value_counts().index
            if date.month not in valid_months]


def fix_dates(df, column_name, valid_months=(2, 3)):
    """Repair dates whose day and month were swapped when they were parsed.

    Any non-null date whose month is not in ``valid_months`` is treated as
    having day and month transposed and is rebuilt with the two swapped
    (the time-of-day component is discarded). Dates whose day is greater
    than 12 cannot be swapped into a valid calendar date; they are left
    untouched and therefore show up in the second "Suspicious dates" report.

    A before/after summary (null count and suspicious dates) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the column to repair. The column is reassigned on this
        very object, so the caller's frame is modified.
    column_name : str
        Name of a datetime column in ``df``.
    valid_months : collection of int, optional
        Month numbers considered plausible. Defaults to February and March.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the column repaired.
    """
    dates = df[column_name]

    nulls = int(dates.isnull().sum())
    print(f'original nulls: {nulls}')

    print('Suspicious dates:')
    for date in _suspicious_dates(dates, valid_months):
        print(date)
    print('')

    new_dts = []
    for value in dates:
        if pd.isnull(value):
            new_dt = pd.NaT
        elif value.month not in valid_months and value.day <= 12:
            # Swap day and month. The `day <= 12` guard guarantees the new
            # month is valid; the new day (an old month, 1..12) always is.
            new_dt = datetime.datetime(value.year, value.day, value.month)
        else:
            # Either already plausible, or not repairable by a swap.
            new_dt = value
        new_dts.append(new_dt)

    df[column_name] = new_dts

    new_nulls = int(df[column_name].isnull().sum())
    print(f'new nulls: {new_nulls}')

    print('Suspicious dates:')
    for date in _suspicious_dates(df[column_name], valid_months):
        print(date)

    return df

## Columna `fecha_inicio_sintomas`

In [174]:
# Repair swapped day/month values in the symptom-onset date column.
df2 = fix_dates(df, column_name='fecha_inicio_sintomas')

original nulls: 0
Suspicious dates:
2020-09-03 00:00:00
2020-12-03 00:00:00
2020-10-03 00:00:00
2020-11-03 00:00:00
2020-08-03 00:00:00
2020-07-03 00:00:00
2020-06-03 00:00:00
2020-05-03 00:00:00
2020-04-03 00:00:00
2020-01-03 00:00:00

new nulls: 0
Suspicious dates:


In [175]:
# Display the table after the first repair. fix_dates assigns the column on
# the frame it receives and returns that frame, so df2 refers to the same
# object as df.
df2

Unnamed: 0,pseudo_indice,estado,sexo,edad,fecha_inicio_sintomas,id_rt-pcr,procedencia,fecha_llegada_mx,fecha_caso_nuevo
0,20200316_1,Ciudad de México,M,35,2020-02-22,confirmado,Italia,2020-02-22,NaT
1,20200316_2,Sinaloa,M,41,2020-02-22,confirmado,Italia,2020-02-21,NaT
2,20200316_3,Ciudad de México,M,59,2020-02-23,confirmado,Italia,2020-02-22,NaT
3,20200316_4,Coahuila,F,20,2020-02-27,confirmado,Italia,2020-02-25,NaT
4,20200316_5,Chiapas,F,18,2020-02-25,confirmado,Italia,2020-02-25,NaT
...,...,...,...,...,...,...,...,...,...
847,20200328_844,Ciudad de México,F,43,2020-03-15,confirmado,Contacto,NaT,2020-03-28
848,20200328_845,Ciudad de México,M,27,2020-03-14,confirmado,Estados Unidos,2020-03-15,2020-03-28
849,20200328_846,México,M,35,2020-03-26,confirmado,Estados Unidos,2020-03-15,2020-03-28
850,20200328_847,Ciudad de México,M,43,2020-03-11,confirmado,Estados Unidos,2020-08-03,2020-03-28


## Columna `fecha_llegada_mx`

In [177]:
# Repair swapped day/month values in the arrival-date column.
df3 = fix_dates(df2, column_name='fecha_llegada_mx')

original nulls: 303
Suspicious dates:
2020-09-03 00:00:00
2020-12-03 00:00:00
2020-10-03 00:00:00
2020-11-03 00:00:00
2020-08-03 00:00:00
2020-07-03 00:00:00
2020-06-03 00:00:00
2020-04-03 00:00:00
2020-01-03 00:00:00
2020-05-03 00:00:00
2020-10-02 00:00:00
2020-05-02 00:00:00

new nulls: 303
Suspicious dates:


In [179]:
# Write the repaired table back as TSV without the row index.
# NOTE: this is the same path the notebook loaded from, so the source file
# is overwritten with the corrected dates.
df3.to_csv('../datos/tablas_procesadas/tabla_completa.tsv',
           sep='\t',
           index=False)