In [41]:
import pandas as pd

In [44]:
def clean_summon_data(
    raw_df: pd.DataFrame,
) -> pd.DataFrame:
    """Clean a raw summon listing and return the result as a new DataFrame.

    The raw column names are renamed to English ones, columns that are not
    kept are dropped, ``publication_date`` is parsed from ``YYYY-MM-DD``
    strings, ``points`` is cast to ``float`` and rounded to 5 decimals, and
    three derived columns are appended:

    * ``function_code``: last three characters of ``function_full_code``
      converted to ``int``.
    * ``list_full_code``: ``"<list_type>.<list_subtype>"``.
    * ``full_name``: ``name``, ``first_surname`` and ``second_surname``
      joined with single spaces.

    Args:
        raw_df: Raw summon data. It must contain every column that is
            dropped below and every column used to build the derived
            fields. The frame itself is not modified.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a column to drop, or a column needed for the derived
            fields, is missing.
    """
    date_format = "%Y-%m-%d"

    df = raw_df.rename(
        columns={
            "tipolista": "list_type",
            "subtipo": "list_subtype",
            "norden": "order",
            "puntos": "points",
            "dfuncion": "function",
            "fcpublico": "publication_date",
            "cfunc": "function_full_code",
            "ittipoacto": "act_type",
            "nombre": "name",
            "papellid": "first_surname",
            "sapellid": "second_surname",
        }
    )
    # The name columns ("nombre", "papellid", "sapellid") must NOT be listed
    # here: they have already been renamed above, so dropping them under the
    # old labels raises KeyError, and the renamed columns are needed to build
    # ``full_name`` further down.
    df = df.drop(
        columns=[
            "desempate",
            "acceso",
            "colectivo",
            "resp",
            "cdtpvia",
            "domicilio",
            "nmvial",
            "dcuerpo",
            "strpuntos",
            "fcFinPreSol",
            "fcIniPreSol",
            "exp",
            "perfilespecifico",
        ]
    )

    # Interpret publication_date as a datetime.
    df["publication_date"] = pd.to_datetime(df["publication_date"], format=date_format)
    # The function code is the trailing three characters of the full code.
    df["function_code"] = df["function_full_code"].apply(lambda code: int(str(code)[-3:]))
    df["list_full_code"] = (
        df["list_type"].astype(str).str.cat(df["list_subtype"].astype(str), sep=".")
    )
    df["points"] = df["points"].astype(float).apply(lambda value: round(value, 5))
    # With str.cat's default na_rep, a missing value in any of the three
    # parts makes the resulting full_name missing as well.
    df["full_name"] = df["name"].str.cat(
        [df["first_surname"], df["second_surname"]], sep=" "
    )
    return df

In [45]:
# Read each raw summon JSON file, clean it and store the result as CSV
# (one input and one output file per list type).
for list_type in ("O", "V", "E"):
    raw_df = pd.read_json(f"data/raw/summon-{list_type}.json")
    clean_df = raw_df.pipe(clean_summon_data)
    clean_df.to_csv(
        f"data/processed/summon-{list_type}.csv",
        index=False,  # do not write the DataFrame index to the file
    )

In [59]:
def clean_assign_data(
    raw_df: pd.DataFrame,
) -> pd.DataFrame:
    """Clean a raw assignment listing and return the result as a new DataFrame.

    Raw column names are renamed to English ones, columns that are not kept
    are dropped, ``publication_date`` and ``end_date`` are parsed from
    ``YYYY-MM-DD`` strings, and three derived columns are appended:

    * ``function_code``: last three characters of ``function_full_code``
      converted to ``int``.
    * ``list_full_code``: ``"<list_type>.<list_subtype>"``.
    * ``full_name``: ``name``, ``first_surname`` and ``second_surname``
      joined with single spaces.

    Args:
        raw_df: Raw assignment data. It must contain every column that is
            dropped below and every column used to build the derived
            fields. The frame itself is not modified.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a column to drop, or a column needed for the derived
            fields, is missing.
    """
    date_format = "%Y-%m-%d"

    # "norden" and "puntos" are not renamed here; if present in the input
    # they keep their raw names.
    english_names = {
        "lista": "list_type",
        "subtipo": "list_subtype",
        "dfuncion": "function",
        "fcpublico": "publication_date",
        "fhasta": "end_date",
        "cfunc": "function_full_code",
        "ittipoacto": "act_type",
        "nombre": "name",
        "papellid": "first_surname",
        "sapellid": "second_surname",
        "dloc": "location",
        "centro": "school",
    }
    # Labels refer to the frame AFTER renaming, hence "act_type" (the
    # renamed "ittipoacto" column) rather than its raw name.
    unused_columns = [
        "desempate",
        "acceso",
        "resp",
        "cdtpvia",
        "domicilio",
        "nmvial",
        "dcuerpo",
        "perfilespecifico",
        "cloc",
        "numasig",
        "cdabrev",
        "ccentro",
        "observaciones",
        "miniDat",
        "year",
        "strfhasta",
        "act_type",
        "distrito",
    ]

    def last_three_as_int(full_code) -> int:
        """Return the trailing three characters of ``full_code`` as an int."""
        return int(str(full_code)[-3:])

    cleaned = raw_df.rename(columns=english_names).drop(columns=unused_columns)

    # Existing columns keep their position; the derived ones are appended in
    # the order given here.
    return cleaned.assign(
        publication_date=pd.to_datetime(cleaned["publication_date"], format=date_format),
        end_date=pd.to_datetime(cleaned["end_date"], format=date_format),
        function_code=cleaned["function_full_code"].apply(last_three_as_int),
        list_full_code=cleaned["list_type"]
        .astype(str)
        .str.cat(cleaned["list_subtype"].astype(str), sep="."),
        full_name=cleaned["name"].str.cat(
            [cleaned["first_surname"], cleaned["second_surname"]], sep=" "
        ),
    )

In [60]:
# Read each raw assignment JSON file, clean it and store the result as CSV
# (one input and one output file per list type).
for list_type in ("O", "V", "E"):
    raw_df = pd.read_json(f"data/raw/assign-{list_type}.json")
    clean_df = raw_df.pipe(clean_assign_data)
    clean_df.to_csv(
        f"data/processed/assign-{list_type}.csv",
        index=False,  # do not write the DataFrame index to the file
    )