# Data description

In [None]:
import os
import json
import bs4 as bs
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain
import matplotlib.pyplot as plt

from utilities import *

In [None]:
# Locate the project folders: the project root is taken to be the parent of
# the current working directory, with "data" and "visualizations" inside it.
here = os.getcwd()
project_path = os.path.dirname(here)
data_path, visualizations_path = (
    os.path.join(project_path, folder) for folder in ("data", "visualizations")
)

## Senators distribution

In [None]:
# Load the senators table for the 29-12-2020 session and preview two rows.
senators_path = os.path.join(data_path, "session_29-12-2020_senators.csv")
senators = pd.read_csv(filepath_or_buffer=senators_path)
senators.head(n=2)

In [None]:
# Map known misspelled party names to their canonical form; every other
# value passes through unchanged.
party_correction = {"FRENTE TODOS": "FRENTE DE TODOS"}
senators["party"] = senators["party"].map(
    lambda raw_party: party_correction.get(raw_party, raw_party)
)

In [None]:
# Summary statistics of the senators table (shown as the cell output).
senators.describe()

In [None]:
# Number of senators per party, exported to the visualizations folder.
senators_party_count = senators["party"].value_counts().reset_index()
save_dataframe(
    senators_party_count,
    folder=visualizations_path,
    filename="senators_party_count",
)

# Number of senators per province, exported the same way.
senators_province_count = senators["province"].value_counts().reset_index()
save_dataframe(
    senators_province_count,
    folder=visualizations_path,
    filename="senators_province_count",
)

## Vote information

In [None]:
# Votes per party: one row per (title-cased) party and one column per vote
# value, each cell holding the number of senators of that party who cast
# that vote (0 when the combination does not occur).
senators_vote = (
    senators
    .groupby(["party", "vote"])
    .size()
    .reset_index()
    # Per-party totals come from `senators_party_count` (columns "party" and
    # "count", as produced by pandas >= 2.0 `value_counts`). The name
    # `senators_count` used here before is not defined in the visible
    # notebook and raised NameError.
    .merge(senators_party_count, on="party")
    .assign(party=lambda x: x.party.str.title())
    .rename(columns={0: "n_votes", "count": "n_senators"})
    # NOTE(review): pivot_table sorts its index by default, so this ordering
    # by party size is not reflected in the final table.
    .sort_values(by=["n_senators", "party"], ascending=[False, True])
    .reset_index(drop=True)
    .pivot_table(
        index=["party"],
        columns=["vote"],
        values=["n_votes"],
        fill_value=0,
    )
)
# Flatten the ("n_votes", <vote>) column MultiIndex down to the vote labels
# and blank the columns' name so it does not show up as a header.
senators_vote.columns = senators_vote.columns.get_level_values(1)
senators_vote.columns.name = ""

In [None]:
# Stacked bar chart of the vote table: one bar per row of `senators_vote`,
# one stacked segment per vote column.
fig, ax = plt.subplots(figsize=(10, 5))
senators_vote.plot.bar(stacked=True, ax=ax)

ax.set_xlabel("")
ax.set_ylabel("Cantidad de senadores")
# Tilt the category labels so long names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
ax.legend(title="Voto")
ax.set_title("Voto por alianza o partido")

fig.tight_layout()
fig.savefig(os.path.join(visualizations_path, "senators_vote.png"))

In [None]:
# Turn the party index back into a regular column, then export the table
# with a fixed column order.
senators_vote = senators_vote.reset_index()
save_dataframe(
    senators_vote.loc[:, ["party", "positivo", "negativo", "ausente", "abstención"]],
    folder=visualizations_path,
    filename="senators_vote",
)

## Speech data

In [None]:
# Read the session transcript (XML) and parse it into a BeautifulSoup tree.
speech_path = os.path.join(data_path, "session_29-12-2020_discourse.xml")

# Decode explicitly instead of relying on the platform default encoding,
# which is not UTF-8 everywhere (e.g. Windows) and can garble or reject
# accented characters.
# NOTE(review): assumes the XML file is UTF-8 encoded — confirm.
with open(speech_path, "r", encoding="utf-8") as f:
    speech_text = f.read()

speech = bs.BeautifulSoup(speech_text, "lxml")

In [None]:
# Distinct "speaker" attribute values over every <discourse speech="true">
# element of the transcript.
speakers = list(set(chain.from_iterable(
    node.get_attribute_list("speaker")
    for node in speech.find_all("discourse", {"speech": "true"})
)))
# Lookup from the preprocessed speaker name back to the raw attribute value.
speakers_map = {preprocess_name(raw_name): raw_name for raw_name in speakers}

In [None]:
# Check that each senator matches exactly one speaker: collect the candidate
# speakers per senator, record how many there are, and sort the table by
# that number so rows with the fewest candidates come first.
senators["speaker"] = senators["name"].apply(
    lambda full_name: match_senator_name(full_name, list(speakers_map))
)
senators["n_speaker"] = senators["speaker"].str.len()
senators.sort_values(by="n_speaker", inplace=True)

In [None]:
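# One-off export of the automatic name -> speaker matches for review; the
# resulting map_name2speaker.json is loaded in the next cell.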
#senators[["name", "speaker"]].set_index("name").to_json(
#    "map_name2speaker.json", orient="index", indent=4, force_ascii=False
#)
#senators.drop(columns=["speaker", "n_speaker"], inplace=True)

In [None]:
# Load the senator-name -> speaker mapping from map_name2speaker.json.
# The file holds non-ASCII characters verbatim (it is exported with
# force_ascii=False), so decode it explicitly as UTF-8 instead of relying on
# the platform default encoding.
with open("map_name2speaker.json", "r", encoding="utf-8") as f:
    map_name2speaker = json.load(f)

In [None]:
# Replace the candidate matches with the mapping loaded from JSON: look up
# each senator's entry ({"speaker": <key of speakers_map or null>}) and
# translate it back to the raw speaker attribute via `speakers_map`.
# Senators missing from the JSON, or whose entry is null, get None instead
# of raising AttributeError on `None.get`.
senators["speaker"] = senators["name"].apply(
    lambda x: speakers_map.get((map_name2speaker.get(x) or {}).get("speaker"))
)
# Put the rows back in index order.
senators.sort_index(inplace=True)

In [None]:
# Attach each senator's interventions and their preprocessed versions.
senators["speech"] = senators["speaker"].apply(
    lambda who: assign_speech(who, speech)
)
senators["speech_prep"] = senators["speech"].apply(
    lambda texts: [preprocess_text(text) for text in texts]
)

# Number of interventions per senator.
senators["n_interventions"] = senators["speech_prep"].str.len()

# Token counts (total and unique) for every single intervention.
senators["n_tokens_interventions"] = senators["speech_prep"].apply(
    lambda texts: [count_tokens(text) for text in texts]
)
senators["n_unique_tokens_interventions"] = senators["speech_prep"].apply(
    lambda texts: [count_tokens(text, unique=True) for text in texts]
)

# Mean tokens per intervention; senators without interventions get 0.
senators["mean_tokens_interventions"] = senators["n_tokens_interventions"].apply(
    lambda counts: np.mean(counts) if len(counts) > 0 else 0
)
senators["mean_unique_tokens_interventions"] = senators[
    "n_unique_tokens_interventions"
].apply(
    lambda counts: np.mean(counts) if len(counts) > 0 else 0
)

In [None]:
# Two stacked box plots over all senators: mean total tokens and mean unique
# tokens per intervention.
cols = ["mean_tokens_interventions", "mean_unique_tokens_interventions"]
titles = ["Tokens totales", "Tokens únicos"]

fig, axs = plt.subplots(nrows=2, ncols=1)

for panel, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x=col, ax=panel)
    panel.set_title(title)
    panel.set_xlabel("Tokens")
    panel.tick_params(left=False)

fig.suptitle("Promedio de tokens por intervención")
fig.tight_layout()
fig.savefig(os.path.join(visualizations_path, "tokens_global.png"))

In [None]:
# Box plots of mean total / unique tokens per intervention, split by party,
# on two rows sharing the x axis.
cols = ["mean_tokens_interventions", "mean_unique_tokens_interventions"]
titles = ["Tokens totales", "Tokens únicos"]

fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex=True)

for panel, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x="party", y=col, ax=panel)
    panel.set_title(title)
    panel.set_xlabel("")
    panel.set_ylabel("Tokens")
    panel.tick_params(left=False)

# Tilt the x tick labels of the current axes.
plt.xticks(rotation=45, horizontalalignment="right")
fig.suptitle("Promedio de tokens por intervención por partido")
fig.tight_layout()
fig.savefig(os.path.join(visualizations_path, "tokens_party.png"))

In [None]:
# Side-by-side box plots of mean total / unique tokens per intervention,
# split by vote.
cols = ["mean_tokens_interventions", "mean_unique_tokens_interventions"]
titles = ["Tokens totales", "Tokens únicos"]

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

for panel, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x="vote", y=col, ax=panel)
    panel.set_title(title)
    panel.set_xlabel("Voto")
    panel.set_ylabel("Tokens")
    panel.tick_params(left=False)

fig.suptitle("Promedio de tokens por intervención por voto")
fig.tight_layout()
fig.savefig(os.path.join(visualizations_path, "tokens_vote.png"))