# Session attendees exploration

Notebook for merging two datasets:

- [../data/session_29-12-2020_attendees.csv](../data/session_29-12-2020_attendees.csv): dataset with the list of attendees of the session (including not only senators but also the session's president and secretaries). This dataset was manually enhanced with each senator's vote decision.
- [../data/senators_data.csv](../data/senators_data.csv): dataset downloaded from the [Senate webpage](https://www.senado.gob.ar/), with information about the senators (their term of office, their party, etc).

In [None]:
import os
import pandas as pd

In [None]:
# Locate the data directory: the "data" folder inside the parent of the
# current working directory (i.e. ../data).
here = os.getcwd()
project_path = os.path.split(here)[0]
data_path = os.path.join(project_path, "data")

In [None]:
# Load the attendees list. The file is pipe-separated and the column names
# are supplied explicitly through `names`.
session_attendees_path = os.path.join(
    data_path, "session_29-12-2020_attendees.csv"
)
session_attendees = pd.read_csv(
    session_attendees_path,
    sep="|",
    names=["rol", "name", "vote"],
)
session_attendees.head(2)

In [None]:
# Display the (rows, columns) dimensions of the attendees table.
session_attendees.shape

In [None]:
# Load the senators table using pandas' default CSV settings.
senators_data_path = os.path.join(data_path, "senators_data.csv")
senators_data = pd.read_csv(senators_data_path)
senators_data.head(2)

In [None]:
# Display the (rows, columns) dimensions of the senators table.
senators_data.shape

In [None]:
def preprocess_name(name: str) -> str:
    """Normalize a person's name so that spelling variants compare equal.

    The name is lowercased, vowels carrying an acute, grave or diaeresis
    accent are replaced by the plain vowel, and the whitespace-separated
    tokens are sorted alphabetically and joined with single spaces. Any
    other character (e.g. "ñ" or punctuation) is left untouched.

    Args:
        name: Raw name.

    Returns:
        The normalized name.
    """
    # Each group of five accented vowels maps onto "aeiou".
    accent_table = str.maketrans(dict(zip("áéíóúàèìòùäëïöü", "aeiou" * 3)))
    words = name.lower().translate(accent_table).split()
    words.sort()
    return " ".join(words)

In [None]:
# Add a normalized copy of the name column to both tables (see
# `preprocess_name`) so that the names can be compared across datasets.
session_attendees["prep_name"] = session_attendees["name"].apply(preprocess_name)
senators_data["prep_senador"] = senators_data["Senador"].apply(preprocess_name)

In [None]:
# Lookup table keyed by normalized senator name; every value is a dict of the
# form {"Senador": <name as written in senators_data>}.
names_mapping = (
    senators_data
    .set_index("prep_senador")[["Senador"]]
    .to_dict(orient="index")
)

In [None]:
def map_name(name: str, mapping: dict = None) -> list:
    """Return the canonical senator names that match a preprocessed name.

    Two names match when the token set of one is contained in the token set
    of the other, so a shortened form of a name still matches the full one.

    Args:
        name: Preprocessed attendee name (see ``preprocess_name``).
        mapping: Dict keyed by preprocessed senator name whose values are
            dicts holding the canonical name under the ``"Senador"`` key.
            When omitted, the notebook-level ``names_mapping`` is used.

    Returns:
        A list with the ``"Senador"`` value of every matching entry, in
        mapping order. The list is empty when nothing matches or when
        ``name`` contains no tokens.
    """
    if mapping is None:
        # Looked up at call time so that re-running the cell that builds
        # ``names_mapping`` takes effect without redefining this function.
        mapping = names_mapping

    name_set = set(name.split())
    if not name_set:
        # An empty set is a subset of every key and would match all senators.
        return []

    canon = []
    for key, value in mapping.items():
        key_set = set(key.split())
        if not key_set:
            # Likewise, an entry without tokens would match every name.
            continue
        if name_set <= key_set or key_set <= name_set:
            canon.append(value["Senador"])
    return canon

In [None]:
def postprocess_name(name: list):
    """Collapse the list of matches produced by ``map_name`` into one value.

    Args:
        name: List of matched canonical names (possibly empty or None).

    Returns:
        The single matched name, or None when there is no match.

    Raises:
        AssertionError: If the list holds more than one match.
    """
    if not name:
        return None
    if len(name) != 1:
        # Raised explicitly instead of through an ``assert`` statement so the
        # ambiguity check still runs when Python is started with -O; the
        # exception type and message are the same as before.
        raise AssertionError(f"{name} with several matches.")
    return " ".join(name)

In [None]:
# First store, for every attendee, the list of senator names matched by
# `map_name`; then reduce each list to a single name (or None) with
# `postprocess_name`.
session_attendees["senador"] = session_attendees["prep_name"].apply(map_name)
session_attendees["senador"] = session_attendees["senador"].apply(postprocess_name)
session_attendees.head(2)

In [None]:
# Left join: every attendee row is kept and the senators' columns are attached
# wherever the matched name equals `Senador`. The helper columns used for the
# matching are dropped afterwards.
session_attendees = pd.merge(
    session_attendees,
    senators_data,
    how="left",
    left_on="senador",
    right_on="Senador",
).drop(columns=["prep_name", "prep_senador", "senador"])
session_attendees.head(2)

In [None]:
# Keep only the rows whose `rol` is "PRESENTES", restrict the table to the
# columns of interest and give the Spanish column names English equivalents.
session_attendees = (
    session_attendees
    .loc[session_attendees["rol"] == "PRESENTES"]
    .loc[:, ["name", "vote", "Senador", "Provincia", "Partido Político o alianza"]]
    .rename(
        columns={
            "Senador": "senator",
            "Provincia": "province",
            "Partido Político o alianza": "party",
        }
    )
)

In [None]:
# Write the result to a path derived from the input path by replacing
# "_attendees.csv" with "_senators.csv"; the index is not written.
session_senators_path = session_attendees_path.replace(
    "_attendees.csv", "_senators.csv"
)
session_attendees.to_csv(session_senators_path, index=False)