# Descripción de datos de texto

In [51]:
import ast
import json
import os
from itertools import chain

import numpy as np
import pandas as pd

from config import DATA_PATH, VISUALIZATIONS_PATH
from utilities import SPACY_MODEL, preprocess_text, count_tokens

## Preprocesado

In [11]:
# The "speech" column holds the text representation of a Python list, so it has
# to be parsed while reading. ast.literal_eval only accepts literals, whereas
# eval would execute any expression that happens to be stored in the CSV.
session_speech = pd.read_csv(
    os.path.join(DATA_PATH, "session_speech_speaker_info.csv"),
    converters={"speech": ast.literal_eval},
)
session_speech.head(2)

Unnamed: 0,name,vote,senator,province,party,party_family,speaker,speech
0,Ana Claudia Almirón,positivo,ALMIRÓN ANA CLAUDIA,CORRIENTES,ALIANZA FRENTE PARA LA VICTORIA,Frente para la victoria,Almirón,"[Hace dos años, tuvimos una sesión histórica c..."
1,Roberto Gustavo Basualdo,negativo,BASUALDO ROBERTO GUSTAVO,SAN JUAN,ALIANZA CAMBIEMOS SAN JUAN,Juntos por el cambio,Basualdo,"[Gracias, señor presidente. Hoy es un día en e..."


In [12]:
# Unpack the list in "speech" into one row per element, then discard the rows
# whose speech is missing and renumber the index from zero.
session_speech = session_speech.explode("speech").dropna(subset=["speech"], ignore_index=True)
session_speech.head(2)

Unnamed: 0,name,vote,senator,province,party,party_family,speaker,speech
0,Ana Claudia Almirón,positivo,ALMIRÓN ANA CLAUDIA,CORRIENTES,ALIANZA FRENTE PARA LA VICTORIA,Frente para la victoria,Almirón,"Hace dos años, tuvimos una sesión histórica cu..."
1,Roberto Gustavo Basualdo,negativo,BASUALDO ROBERTO GUSTAVO,SAN JUAN,ALIANZA CAMBIEMOS SAN JUAN,Juntos por el cambio,Basualdo,"Gracias, señor presidente. Hoy es un día en el..."


In [16]:
# Run every intervention through preprocess_text (imported from utilities).
session_speech["speech_preprocessed"] = session_speech["speech"].apply(preprocess_text)

In [17]:
# Load the lookup that maps a word to its candidate POS tags, each with a
# "lemma" and a "count" (see the sample entry displayed below).
# NOTE(review): despite its name, `out_file` is an input path in this cell.
out_file = os.path.join(DATA_PATH, "words2lemmas.json")

# The encoding is stated explicitly so the file is read the same way on every
# platform instead of depending on the locale default.
with open(out_file, "r", encoding="utf-8") as f:
    words2lemmas = json.load(f)

words2lemmas["es"]

{'VERB': {'lemma': 'ser', 'count': 1247}}

In [18]:
def select_lemma_and_pos(text: str) -> tuple[str, str]:
    """Lemmatise and POS-tag ``text`` using the ``words2lemmas`` lookup.

    The text is tokenised with ``SPACY_MODEL``. Each token is passed through
    ``preprocess_text`` and looked up in ``words2lemmas``; tokens without an
    entry are skipped. When a word has a single candidate, that candidate is
    used. Otherwise the candidate whose key equals the tag spaCy assigned to
    the token is preferred, and if there is none, the candidate with the
    highest ``"count"`` is taken.

    Args:
        text: Text to process.

    Returns:
        A ``(lemmas, pos_tags)`` pair of space-separated strings holding one
        lemma and one tag per kept token, in token order.

    Raises:
        ValueError: If the selected candidate has no ``"lemma"`` value.
    """
    lemmas_out: list[str] = []
    pos_out: list[str] = []
    for t in SPACY_MODEL(text):
        token = preprocess_text(t.text)
        candidates = words2lemmas.get(token)
        if not candidates:
            # Word missing from the lookup (or present with no candidates).
            continue
        if len(candidates) == 1:
            pos, entry = next(iter(candidates.items()))
        else:
            # Keep the tag in step with the chosen entry; previously the tag
            # from an earlier token leaked through when this lookup succeeded.
            pos, entry = t.pos_, candidates.get(t.pos_)
            if entry is None:
                # max() returns the first maximal item, matching the earlier
                # strict ">" scan, and still picks one when all counts are 0.
                pos, entry = max(
                    candidates.items(), key=lambda item: item[1]["count"]
                )
        lemma = entry.get("lemma")
        if lemma is None:
            raise ValueError(
                f"Error while processing token {token} ({t.pos_}), lemmas found {candidates}"
            )
        lemmas_out.append(str(lemma))
        pos_out.append(str(pos))
    return " ".join(lemmas_out).strip(), " ".join(pos_out).strip()

In [19]:
# select_lemma_and_pos returns a (lemmas, pos) pair per row; transpose the
# pairs into two parallel sequences and store each one as its own column.
lemma_pos_pairs = session_speech["speech"].apply(select_lemma_and_pos)
speech_lemmas, speech_pos = zip(*lemma_pos_pairs)
session_speech["speech_lemmas"] = speech_lemmas
session_speech["speech_pos"] = speech_pos
session_speech[["speech", "speech_lemmas", "speech_pos"]].head(2)

Unnamed: 0,speech,speech_lemmas,speech_pos
0,"Hace dos años, tuvimos una sesión histórica cu...",hacer año tener una sesión histórica cuando de...,VERB NOUN VERB DET NOUN ADJ SCONJ VERB ADP ADJ...
1,"Gracias, señor presidente. Hoy es un día en el...",gracia señor presidente hoy ser un día en el q...,NOUN NOUN NOUN ADV VERB DET NOUN ADP DET SCONJ...


In [20]:
# Write the table, including the columns derived above, to CSV without the index.
out_file = os.path.join(DATA_PATH, "session_speech.csv")
session_speech.to_csv(path_or_buf=out_file, index=False)

## Métricas de resumen

### Distribución de tokens

In [44]:
metrics = ["count", "count_unique"]
# Columns created by this cell also start with "speech_"; they are left out so
# that re-running the cell does not try to count tokens of the count columns.
columns = [
    col
    for col in session_speech.filter(regex="^speech_").columns
    if not col.endswith(tuple(f"_{metric}" for metric in metrics))
]

for col in columns:
    for metric in metrics:
        new_col = f"{col}_{metric}"
        print(f"* Calculando {metric} para {col}")
        print(f"** Nueva columna: {new_col}")
        # "count_unique" asks count_tokens for distinct tokens only.
        unique = "unique" in metric
        session_speech[new_col] = session_speech[col].apply(count_tokens, unique=unique)

* Calculando count para speech_preprocessed
** Nueva columna: speech_preprocessed_count
* Calculando count_unique para speech_preprocessed
** Nueva columna: speech_preprocessed_count_unique
* Calculando count para speech_lemmas
** Nueva columna: speech_lemmas_count
* Calculando count_unique para speech_lemmas
** Nueva columna: speech_lemmas_count_unique
* Calculando count para speech_pos
** Nueva columna: speech_pos_count
* Calculando count_unique para speech_pos
** Nueva columna: speech_pos_count_unique


In [45]:
# Preview every column whose label contains "count".
session_speech.filter(like="count").head()

Unnamed: 0,speech_preprocessed_count,speech_preprocessed_count_unique,speech_lemmas_count,speech_lemmas_count_unique,speech_pos_count,speech_pos_count_unique
0,1723,602,1684,459,1684,10
1,411,187,405,146,405,10
2,1155,443,1144,365,1144,10
3,1075,452,1070,397,1070,10
4,1486,576,1479,460,1479,10


In [66]:
# Mean, median and standard deviation of every count column per senator name.
# String aliases are used instead of NumPy callables: pandas deprecates passing
# np.mean/np.median/np.std to agg, and the aliases keep the pandas
# implementations (sample standard deviation) that produce the current output.
# fillna(0) replaces the undefined std of single-intervention groups with 0.
(
    session_speech
    .filter(regex=r"name|count")
    .groupby("name")
    .agg(["mean", "median", "std"])
    .fillna(0)
).head()

Unnamed: 0_level_0,speech_preprocessed_count,speech_preprocessed_count,speech_preprocessed_count,speech_preprocessed_count_unique,speech_preprocessed_count_unique,speech_preprocessed_count_unique,speech_lemmas_count,speech_lemmas_count,speech_lemmas_count,speech_lemmas_count_unique,speech_lemmas_count_unique,speech_lemmas_count_unique,speech_pos_count,speech_pos_count,speech_pos_count,speech_pos_count_unique,speech_pos_count_unique,speech_pos_count_unique
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Alberto Edgardo Weretilneck,1155.0,145.0,1874.577019,359.0,88.0,545.561179,1123.333333,144.0,1820.615647,263.333333,85.0,383.017406,1123.333333,144.0,1820.615647,7.0,9.0,4.358899
Alfredo Héctor Luenzo,784.0,784.0,219.203102,287.5,287.5,74.246212,780.5,780.5,218.495995,239.5,239.5,57.275649,780.5,780.5,218.495995,10.0,10.0,0.0
Alfredo Luis De Angeli,844.0,844.0,0.0,345.0,345.0,0.0,835.0,835.0,0.0,262.0,262.0,0.0,835.0,835.0,0.0,10.0,10.0,0.0
Ana Claudia Almirón,1723.0,1723.0,0.0,602.0,602.0,0.0,1684.0,1684.0,0.0,459.0,459.0,0.0,1684.0,1684.0,0.0,10.0,10.0,0.0
Ana María Ianni,218.2,8.0,290.745077,95.6,8.0,124.66074,216.2,8.0,288.109701,79.6,8.0,101.82976,216.2,8.0,288.109701,6.2,5.0,3.63318


In [67]:
# Mean, median and standard deviation of every count column per vote.
# String aliases are used instead of NumPy callables: pandas deprecates passing
# np.mean/np.median/np.std to agg, and the aliases keep the pandas
# implementations (sample standard deviation) that produce the current output.
# fillna(0) replaces the undefined std of single-intervention groups with 0.
(
    session_speech
    .filter(regex=r"vote|count")
    .groupby("vote")
    .agg(["mean", "median", "std"])
    .fillna(0)
).head()

Unnamed: 0_level_0,speech_preprocessed_count,speech_preprocessed_count,speech_preprocessed_count,speech_preprocessed_count_unique,speech_preprocessed_count_unique,speech_preprocessed_count_unique,speech_lemmas_count,speech_lemmas_count,speech_lemmas_count,speech_lemmas_count_unique,speech_lemmas_count_unique,speech_lemmas_count_unique,speech_pos_count,speech_pos_count,speech_pos_count,speech_pos_count_unique,speech_pos_count_unique,speech_pos_count_unique
Unnamed: 0_level_1,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std,mean,median,std
vote,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
abstención,192.0,192.0,0.0,116.0,116.0,0.0,190.0,190.0,0.0,109.0,109.0,0.0,190.0,190.0,0.0,10.0,10.0,0.0
ausente,1143.0,1143.0,0.0,443.0,443.0,0.0,1140.0,1140.0,0.0,351.0,351.0,0.0,1140.0,1140.0,0.0,10.0,10.0,0.0
negativo,349.761364,8.0,661.138151,136.897727,8.0,224.504412,346.238636,8.0,653.820496,111.215909,8.0,175.469515,346.227273,8.0,653.781568,6.215909,5.0,3.217941
positivo,516.711712,13.0,826.768753,185.108108,12.0,269.485953,510.945946,13.0,816.148748,144.468468,12.0,203.290514,510.927928,13.0,816.123717,6.576577,6.0,3.369784


### Intervenciones por voto

In [63]:
# Number of interventions (rows) per vote, largest first. size() counts the
# rows of each group directly, replacing the detour through a Python list.
(
    session_speech
    .groupby("vote")["speech"]
    .size()
    .sort_values(ascending=False)
    .to_frame()
)

Unnamed: 0_level_0,speech
vote,Unnamed: 1_level_1
positivo,111
negativo,88
abstención,1
ausente,1


## Visualizaciones

In [None]:
# NOTE(review): `plt`, `sns` and `senators` are not defined in the visible
# notebook; they must be in scope before this cell can run.
# Figures are saved under VISUALIZATIONS_PATH, the constant imported from
# config (the lowercase `visualizations_path` was never defined).

# Histogram of the number of interventions.
fig, ax = plt.subplots(figsize=(5,3))
sns.histplot(data=senators, x="n_interventions", ax=ax, alpha=0.7)
ax.set_xlabel("Cantidad de intervenciones")
ax.set_ylabel("Cantidad de senadores")
fig.suptitle("Distribución de intervenciones")
fig.tight_layout()
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_histplot_interventions.png"))

# Box plot of the same variable.
fig, ax = plt.subplots(figsize=(5,3))
sns.boxplot(data=senators, x="n_interventions", ax=ax, boxprops=dict(alpha=.7))
ax.set_xlabel("Cantidad de intervenciones")
ax.set_ylabel("Cantidad de senadores")
fig.suptitle("Distribución de intervenciones")
fig.tight_layout()
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_boxplot_interventions.png"))

In [None]:
# Interventions: one-row summary of `n_interventions` (describe() without the
# count entry), extended with the median and the mode.
(
    senators["n_interventions"]
    .describe()
    .drop("count")
    .to_frame()
    .T
    .assign(
        median=np.median(senators["n_interventions"]),
        mode=mode(senators["n_interventions"]),
    )
    .reset_index(drop=True)
)

In [None]:
# Count the rows of `senators` whose speech has length zero.
senators_with_no_speech = senators["speech"].str.len().eq(0).sum()
print(f"{senators_with_no_speech} senadores no intervinieron en la sesión.")

In [None]:
# NOTE(review): `plt`, `sns`, `senators` and `calculate_univariant_metrics`
# are not defined in the visible notebook.
cols = ["n_tokens_interventions", "n_unique_tokens_interventions"]
titles = ["Distribución de tokens en cada intervención", "Distribución de tokens únicos en cada intervención"]

for col, title in zip(cols, titles):
    fig, ax = plt.subplots(figsize=(7,3))
    # Each cell of the column is an iterable of values; flatten them into one list.
    data = list(chain(*senators[col].to_list()))
    mean, median, std = calculate_univariant_metrics(data, round_=3)
    sns.histplot(data=data, ax=ax, bins=10)
    ax.set_xlabel("Cantidad de tokens")
    ax.set_ylabel("Cantidad de intervenciones")
    x_lim, y_lim = ax.get_xlim()[1], ax.get_ylim()[0]
    fig.suptitle(title)
    # Place the summary just right of the axes, aligned with the lower y limit.
    ax.text(
        x_lim + x_lim*0.05, y_lim,
        f"Media={mean}\nMediana={median}\nDesvío Estándar={std}",
        horizontalalignment='left',
        fontsize=10
    )
    fig.tight_layout()
    # VISUALIZATIONS_PATH is the constant imported from config; the lowercase
    # `visualizations_path` was never defined.
    fig.savefig(os.path.join(VISUALIZATIONS_PATH, f"distrib_histplot_{col}.png"))

In [None]:
# NOTE(review): `plt`, `sns`, `senators` and `calculate_univariant_metrics`
# are not defined in the visible notebook.
cols = ["n_tokens_interventions", "n_unique_tokens_interventions"]
titles = ["Distribución de tokens en cada intervención", "Distribución de tokens únicos en cada intervención"]

for col, title in zip(cols, titles):
    fig, ax = plt.subplots(figsize=(6,2))
    # Each cell of the column is an iterable of values; flatten them into one list.
    data = list(chain(*senators[col].to_list()))
    mean, median, std = calculate_univariant_metrics(data, round_=3)
    sns.boxplot(x=data, ax=ax)
    ax.set_xlabel("")
    ax.tick_params(left=False)
    x_lim, y_lim = ax.get_xlim()[1], ax.get_ylim()[0]
    # Place the summary just right of the axes, aligned with the first y limit.
    ax.text(
        x_lim + x_lim*0.05, y_lim,
        f"Media={mean}\nMediana={median}\nDesvío Estándar={std}",
        horizontalalignment='left',
        fontsize=10
    )
    fig.suptitle(title)
    fig.tight_layout()
    # VISUALIZATIONS_PATH is the constant imported from config; the lowercase
    # `visualizations_path` was never defined.
    fig.savefig(os.path.join(VISUALIZATIONS_PATH, f"distrib_boxplot_{col}.png"))

In [None]:
# NOTE(review): `plt`, `sns` and `senators` are not defined in the visible notebook.
cols = ["mean_tokens_interventions", "median_tokens_interventions"]
titles = ["Distribución de medias", "Distribución de medianas"]

fig, axs = plt.subplots(2,1, figsize=(5,3))

# One box plot per column, each drawn on its own row of the figure.
for ax, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x=col, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Tokens")
    ax.tick_params(left=False)
fig.suptitle("Tokens totales por intervención")
fig.tight_layout()
# VISUALIZATIONS_PATH is the constant imported from config; the lowercase
# `visualizations_path` was never defined.
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_boxplot_tokens.png"))

In [None]:
# NOTE(review): `plt`, `sns` and `senators` are not defined in the visible notebook.
cols = ["mean_unique_tokens_interventions", "median_unique_tokens_interventions"]
titles = ["Distribución de medias", "Distribución de medianas"]

fig, axs = plt.subplots(2,1, figsize=(5,3))

# One box plot per column, each drawn on its own row of the figure.
for ax, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x=col, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Tokens")
    ax.tick_params(left=False)
fig.suptitle("Tokens únicos por intervención")
fig.tight_layout()
# VISUALIZATIONS_PATH is the constant imported from config; the lowercase
# `visualizations_path` was never defined.
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_boxplot_tokens_uniq.png"))

In [None]:
# NOTE(review): `plt`, `sns` and `senators` are not defined in the visible notebook.
cols = ["mean_tokens_interventions", "mean_unique_tokens_interventions"]
titles = ["Tokens totales", "Tokens únicos"]

fig, axs = plt.subplots(2,1, figsize=(10,8), sharex=True)

# One row per column, both grouped by party along the shared x axis.
for ax, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x="party", y=col, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("")
    ax.set_ylabel("Tokens")
    ax.tick_params(left=False)
# Acts on the current axes, as in the original cell.
plt.xticks(rotation=45, horizontalalignment="right")
fig.suptitle("Distribución de medias de tokens")
fig.tight_layout()
# VISUALIZATIONS_PATH is the constant imported from config; the lowercase
# `visualizations_path` was never defined.
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_boxplot_mean_tokens_party.png"))

In [None]:
# NOTE(review): `plt`, `sns` and `senators` are not defined in the visible notebook.
cols = ["mean_tokens_interventions", "mean_unique_tokens_interventions"]
titles = ["Tokens totales", "Tokens únicos"]

fig, axs = plt.subplots(1,2, figsize=(8,4))

# One panel per column, both grouped by vote.
for ax, col, title in zip(axs, cols, titles):
    sns.boxplot(data=senators, x="vote", y=col, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Voto")
    ax.set_ylabel("Tokens")
    ax.tick_params(left=False)
fig.suptitle("Distribución de medias de tokens")
fig.tight_layout()
# NOTE(review): the plot is grouped by "vote" but the file name says
# "party_family" — confirm which one is intended before relying on the file.
# VISUALIZATIONS_PATH is the constant imported from config; the lowercase
# `visualizations_path` was never defined.
fig.savefig(os.path.join(VISUALIZATIONS_PATH, "distrib_boxplot_mean_tokens_party_family.png"))

In [None]:
# DATA_PATH is the constant imported from config; the lowercase `data_path`
# was never defined in this notebook.
# NOTE(review): this targets the same "session_speech.csv" written earlier from
# `session_speech`, so running it replaces that file with only these three
# columns of `senators` (which is not defined in the visible notebook).
# Confirm the cell is still wanted.
out_file = os.path.join(DATA_PATH, "session_speech.csv")
senators[["name", "vote", "speech"]].to_csv(out_file, index=False)