In [1]:
import pandas as pd 
import geopandas as gpd
import numpy as np 
import os 
import matplotlib.pyplot as plt 
from datetime import datetime
from unidecode import unidecode


def parse_df(province_name, n_columns=575):
    """Parse one province's raw case table into a long-format DataFrame.

    Reads ``./raw-epi-data/<province_name>.csv`` (tab-separated, UTF-16, first
    line skipped). In the remaining table the first three rows hold, for every
    date column, the year, the full month name and the day; each following row
    holds a municipality name in its first cell and one case count per date
    (thousands separated by commas), which is treated as a cumulative count.

    The result is also written to ``./processed-by-province/<province_name>.csv``
    (the directory is created if it does not exist).

    Parameters
    ----------
    province_name : str
        File name of the province table, without the ``.csv`` extension.
    n_columns : int, default 575
        Number of column labels handed to ``read_csv``. Files with fewer
        columns are padded by pandas with empty columns, which are discarded
        here instead of breaking the date parsing.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``date``, ``new_cases`` and ``tot_cases``, one row per
        (municipality, date) pair.
    """
    # import data
    raw = pd.read_csv(
        "./raw-epi-data/" + province_name + ".csv",
        sep="\t",
        encoding="utf-16",
        skiprows=1,
        names=range(n_columns),
    )

    # Discard the all-empty padding columns created when the file has fewer
    # than n_columns fields; the name column is always kept, even when its
    # cells are empty.
    keep = raw.notna().any(axis=0)
    keep.iloc[0] = True
    raw = raw.loc[:, keep.to_numpy()]

    # Rows 0-2 carry year / month name / day for every date column.
    # Note: "%B" parses full month names according to the current locale.
    years, months, days = (raw.iloc[k, 1:].values for k in range(3))
    dates = [
        datetime.strptime("-".join([yy, mm, dd]), "%Y-%B-%d")
        for yy, mm, dd in zip(years, months, days)
    ]

    # Remaining rows: one municipality each.
    tot_dates, tot_names, tot_cum_cases, tot_new_cases = [], [], [], []
    for _, row in raw.iloc[3:].iterrows():
        name = row.iloc[0]

        # Counts are stored as text with thousands separators ("1,234").
        cases = [int(el.replace(",", "")) for el in row.iloc[1:]]
        tot_cum_cases.extend(cases)
        # Daily increments; prepend=0 makes the first increment equal the first count.
        tot_new_cases.extend(np.diff(cases, prepend=0))
        tot_dates.extend(dates)
        tot_names.extend([name] * len(cases))

    df_parsed = pd.DataFrame(
        data={
            "name": tot_names,
            "date": tot_dates,
            "new_cases": tot_new_cases,
            "tot_cases": tot_cum_cases,
        }
    )
    df_parsed.date = pd.to_datetime(df_parsed.date)

    os.makedirs("./processed-by-province/", exist_ok=True)
    df_parsed.to_csv("./processed-by-province/" + province_name + ".csv", index=False)
    return df_parsed

In [2]:
def parse_gid2(name):
    """Return the GID_2 code stored for ``name`` in the module-level ``matcher``.

    ``matcher`` is looked up as a global at call time, so it must already be
    populated with an entry for ``name``; a missing entry propagates the
    mapping's own lookup error.
    """
    gid_2 = matcher[name]
    return gid_2

# Build the municipality-level case table: parse every province file, attach
# the GADM identifiers (GID_1 for the province, GID_2 for the municipality)
# and write the combined result to ./df_new_data_municipio.csv.

# import GADM data for matching (shapefile joined on the index with the
# columns of its companion CSV)
gadm2 = gpd.read_file(
        "../master-files/master_file.shp").join(pd.read_csv(
        "../master-files/master_file.csv"))

# Normalise names so they compare equal to the (ASCII-folded) names used below:
# provinces are matched against the file name, municipalities in lower case.
gadm2["NAME_1"] = [unidecode(e) for e in gadm2["NAME_1"].values]
gadm2["NAME_2"] = [unidecode(e).lower() for e in gadm2["NAME_2"].values]

# Municipality names as they appear in the case data -> spelling used in
# gadm2.NAME_2 (after normalisation).
NAME_ALIASES = {
    "san crtistobal": "san cristobal",
    "salitre": "urbina jado",
    "puebloviejo": "pueblo viejo",
    "distrito metropolitano de quito": "quito",
}

OUTPUT_COLUMNS = ["name", "date", "new_cases", "tot_cases", "GID_2", "GID_1"]

# process data by province (add GID_2 and GID_1)
# Sorted so the row order of the output does not depend on directory order.
files = sorted(os.listdir("./raw-epi-data/"))
province_frames = []
for file in files:
    # Only real "<province>.csv" files; skips e.g. "x.csv.bak" or a bare ".csv".
    province_name, extension = os.path.splitext(file)
    if extension != ".csv":
        continue

    df_parsed = parse_df(province_name)

    # add GID_1
    gadm1 = gadm2.loc[gadm2.NAME_1 == province_name]
    if gadm1.empty:
        raise ValueError(
            f"Province {province_name!r} (from file {file!r}) not found in GADM NAME_1"
        )
    df_parsed["GID_1"] = gadm1.GID_1.values[0]

    # add GID_2
    df_parsed["name"] = [unidecode(e).lower() for e in df_parsed["name"].values]
    matcher = {}
    for name in df_parsed["name"].unique():
        matchname = NAME_ALIASES.get(name, name)

        # match within the current province only
        candidates = gadm1.loc[gadm1.NAME_2 == matchname].GID_2.values
        if len(candidates) == 0:
            raise ValueError(
                f"Municipality {name!r} (matched as {matchname!r}) not found in "
                f"GADM NAME_2 for province {province_name!r}"
            )
        matcher[name] = candidates[0]

    df_parsed["GID_2"] = df_parsed["name"].map(matcher)
    province_frames.append(df_parsed[OUTPUT_COLUMNS])

if province_frames:
    df_tot = pd.concat(province_frames, ignore_index=True)
else:
    # No province files found: still produce an (empty) table with the right columns.
    df_tot = pd.DataFrame(columns=OUTPUT_COLUMNS)

df_tot.date = pd.to_datetime(df_tot.date)
df_tot.to_csv("./df_new_data_municipio.csv", index=False)