# _Datasets_ de entrenamiento y testeo

In [3]:
import ast
import os
import re
from typing import Any
from copy import deepcopy
from string import punctuation

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

from config import DATA_PATH, VISUALIZATIONS_PATH, MODELS_PATH

In [4]:
# Use seaborn's "whitegrid" style for subsequent figures.
sns.set_style("whitegrid")

In [5]:
# Create the models output directory if it does not exist yet.
os.makedirs(MODELS_PATH, exist_ok=True)

In [6]:
# Load the speeches CSV from DATA_PATH and list its columns.
data_path = os.path.join(DATA_PATH, "session_speech.csv")
data = pd.read_csv(data_path)
data.columns

Index(['name', 'vote', 'senator', 'province', 'party', 'party_family',
       'speaker', 'speech', 'speech_preprocessed', 'speech_lemmas',
       'speech_pos'],
      dtype='object')

In [7]:
# Preview the first two rows.
data.head(2)

Unnamed: 0,name,vote,senator,province,party,party_family,speaker,speech,speech_preprocessed,speech_lemmas,speech_pos
0,Ana Claudia Almirón,positivo,ALMIRÓN ANA CLAUDIA,CORRIENTES,ALIANZA FRENTE PARA LA VICTORIA,Frente para la victoria,Almirón,"Hace dos años, tuvimos una sesión histórica cu...",hace dos años tuvimos una sesión histórica cua...,hacer año tener una sesión histórica cuando de...,VERB NOUN VERB DET NOUN ADJ SCONJ VERB ADP ADJ...
1,Roberto Gustavo Basualdo,negativo,BASUALDO ROBERTO GUSTAVO,SAN JUAN,ALIANZA CAMBIEMOS SAN JUAN,Juntos por el cambio,Basualdo,"Gracias, señor presidente. Hoy es un día en el...",gracias señor presidente hoy es un día en el q...,gracia señor presidente hoy ser un día en el q...,NOUN NOUN NOUN ADV VERB DET NOUN ADP DET SCONJ...


In [8]:
# Keep only rows that have a speech and whose vote is neither "abstención"
# nor "ausente", then renumber the index from 0.
data = (
    data[(data.speech.notna()) & (~data.vote.isin(["abstención", "ausente"]))]
    .reset_index(drop=True)
)
data.shape

(199, 11)

In [9]:
# Relative frequency of each remaining vote class.
data.vote.value_counts(normalize=True)

vote
positivo    0.557789
negativo    0.442211
Name: proportion, dtype: float64

## Separación en _train_ y _test_

In [10]:
# Split the row index (not the rows themselves) into 80% train / 20% test,
# shuffled with a fixed seed and stratified by vote.
X_train_index, X_test_index = train_test_split(
    data.index,
    test_size=.2,
    random_state=6300,
    shuffle=True,
    stratify=data.vote
)

In [11]:
# Report the size and the vote distribution of each split.
for index, name in zip([X_train_index, X_test_index], ["entrenamiento", "testeo"]):
    print(f"** Cantidad de datos en conjunto de {name}: {index.shape[0]}")
    print("** Distribución de la variable target:")
    print(f"{data.loc[index, 'vote'].value_counts(normalize=True).to_frame()}", end="\n\n")

** Cantidad de datos en conjunto de entrenamiento: 159
** Distribución de la variable target:
          proportion
vote                
positivo    0.559748
negativo    0.440252

** Cantidad de datos en conjunto de testeo: 40
** Distribución de la variable target:
          proportion
vote                
positivo        0.55
negativo        0.45



In [12]:
# Persist the train/test row indices as one-column CSV files under MODELS_PATH/index.
INDEX = os.path.join(MODELS_PATH, "index")
os.makedirs(INDEX, exist_ok=True)

# Explicit name -> index mapping: avoids eval() on variable names, which would
# silently pick up whatever object currently holds that name in the kernel.
for file, dataset in {"X_train_index": X_train_index, "X_test_index": X_test_index}.items():
    dataset.to_series().to_csv(os.path.join(INDEX, f"{file}.csv"), header=None, index=False)

## _Encoding_ de variables

### Predictoras

#### Etiquetas POS

In [13]:
# With use_idf=False and norm="l1" the vectorizer yields, for each speech, the
# count of every POS tag normalised so that the row sums to 1.
vectorizer = TfidfVectorizer(norm="l1", use_idf=False, smooth_idf=False, sublinear_tf=False)
X = vectorizer.fit_transform(data.speech_pos)
# Keep only the ADJ, ADV, NOUN and VERB columns (feature names are upper-cased
# because the vectorizer lower-cases tokens by default).
pos = (
    pd.DataFrame(X.toarray(), columns=map(str.upper,vectorizer.get_feature_names_out()))
    [["ADJ", "ADV", "NOUN", "VERB"]]
)
pos.head()

Unnamed: 0,ADJ,ADV,NOUN,VERB
0,0.065321,0.051069,0.204869,0.179929
1,0.061728,0.079012,0.167901,0.246914
2,0.074301,0.060315,0.217657,0.151224
3,0.101869,0.042056,0.236449,0.139252
4,0.091278,0.067613,0.18526,0.187289


#### Lemmas

In [14]:
# Directory under VISUALIZATIONS_PATH from which the word-statistics CSVs below are read.
STATS = os.path.join(VISUALIZATIONS_PATH, "stats")

##### Proporciones sin _stopwords_ (Zipf)

In [21]:
# Load the per-word proportions table (columns: word, total, diff, pos, neg).
proportions = pd.read_csv(os.path.join(STATS, "proporciones_sin_stopwords_zipf.csv"))
proportions.head()

Unnamed: 0,word,total,diff,pos,neg
0,abajo,0.693147,-3.1e-05,3.9e-05,7e-05
1,abandonada,0.0,3.9e-05,3.9e-05,0.0
2,abandonado,0.0,3.9e-05,3.9e-05,0.0
3,abandonar,0.0,-7e-05,0.0,7e-05
4,abandono,0.0,-7e-05,0.0,7e-05


In [27]:
# Words with the 250 largest and the 250 smallest `diff` values. keep="all"
# retains every row tied at the cut-off, so a list can hold more than 250 words.
pos_words = proportions.nlargest(n=250, columns=["diff"], keep="all").word.to_list()
neg_words = proportions.nsmallest(n=250, columns=["diff"], keep="all").word.to_list()

In [31]:
# number of features
proportions_words = pos_words+neg_words
len(proportions_words)

505

##### Ratio de _log-odds_

In [32]:
# Load the log-odds table (columns: word, total, diff, pos, neg).
log_odds = pd.read_csv(os.path.join(STATS, "log_odds.csv"))
log_odds.head()

Unnamed: 0,word,total,diff,pos,neg
0,abajo,0.693147,-0.615795,1.9e-05,3.5e-05
1,abandonada,0.0,inf,1.9e-05,0.0
2,abandonado,0.0,inf,1.9e-05,0.0
3,abandonar,0.0,-inf,0.0,3.5e-05
4,abandono,0.0,-inf,0.0,3.5e-05


In [42]:
# NOTE(review): only the last expression of a cell is displayed, so the nlargest
# result below is computed and then discarded. `diff` is +/-inf for words seen in
# only one class, and keep="all" returns every row tied at the cut-off, so the
# selection can be far larger than 200 — confirm this is the intended ranking.
log_odds.nlargest(n=200, columns=["diff", "total"], keep="all") #.word.to_list()
log_odds.nsmallest(n=200, columns=["diff", "total"], keep="all") #.word.to_list()

Unnamed: 0,word,total,diff,pos,neg
3,abandonar,0.0,-inf,0.0,0.000035
4,abandono,0.0,-inf,0.0,0.000035
8,aberración,0.0,-inf,0.0,0.000035
11,abiertamente,0.0,-inf,0.0,0.000035
14,abocado,0.0,-inf,0.0,0.000035
...,...,...,...,...,...
4749,zimmermann,0.0,-inf,0.0,0.000035
4751,zoom,0.0,-inf,0.0,0.000035
4754,ángulo,0.0,-inf,0.0,0.000035
4762,éxito,0.0,-inf,0.0,0.000035


### Variable _target_

In [None]:
# Encode the vote labels as integers in a new "target" column.
le = LabelEncoder()

data["target"] = le.fit_transform(data.vote)

In [None]:
# Show which integer each vote category was mapped to.
for value in data.vote.unique():
    print(f"Categoría {value} ---> {le.transform([value])[0]}")

# Parameters Selection

In [None]:
def build_gridsearch(clf: Any, clf_params: dict[str, list]) -> GridSearchCV:
    """Build a 5-fold grid search over a TF-IDF + classifier pipeline.

    Parameters
    ----------
    clf
        Unfitted classifier used as the ``clf`` step of the pipeline.
    clf_params
        Grid entries for the classifier, keyed with the ``clf__`` prefix
        (e.g. ``{"clf__alpha": [0.1, 1.0]}``). They are merged into the
        TF-IDF grid and override it on key collisions.

    Returns
    -------
    GridSearchCV
        Unfitted search scored with F1 that also records train scores and
        uses all available cores.
    """
    # "tfidf__min_df" used to appear twice in this literal. Python keeps only the
    # last value for a repeated key, so [0.05, 0.1] was the grid actually searched;
    # the shadowed [0.1, 0.3, 0.5, 0.7] entry is removed to make that explicit.
    params = {
        "tfidf__min_df": [0.05, 0.1],
        "tfidf__norm": ["l1", "l2"],
        "tfidf__smooth_idf": [False, True],
        "tfidf__sublinear_tf": [False, True],
        **clf_params
    }
    pipeline = Pipeline([
        (
            'tfidf',
            TfidfVectorizer(
                lowercase=True,
                # NOTE(review): `preprocess` is not defined in any cell shown in
                # this notebook — it must exist before this function is called.
                preprocessor=preprocess
            )
        ),
        (
            'clf',
            clf
        )
    ])
    return GridSearchCV(
        pipeline,
        params,
        scoring='f1',
        cv=5,
        return_train_score=True,
        n_jobs=-1
    )


def build_results_df(cv_results: dict[str, np.ndarray]) -> pd.DataFrame:
    """Turn ``GridSearchCV.cv_results_`` into a ranked, readable table.

    The per-parameter ``param_*`` columns are dropped, each ``params`` dict is
    rendered as a ``key=value`` string joined with ``-`` and used as the index,
    rows are ordered by ``rank_test_score`` and columns alphabetically.
    """
    results = pd.DataFrame(cv_results)
    results = results.drop(columns=results.filter(regex="param_").columns)
    labels = [
        "-".join(f"{k}={v}" for k, v in combination.items())
        for combination in results["params"]
    ]
    results = (
        results
        .assign(params=labels)
        .set_index("params")
        .sort_values(by="rank_test_score")
    )
    return results[sorted(results.columns)]


def build_results_df2plot(results_df: pd.DataFrame) -> pd.DataFrame:
    """Shortlist parameter sets and reshape their per-split scores for plotting.

    The shortlist is the union of the 3 rows with the highest
    ``mean_test_score`` and the 3 rows with the lowest ``std_test_score``.
    The result has one row per (params, split) with columns ``params``,
    ``measure``, ``value``, ``mean_test_score`` and ``std_test_score``.
    """
    summary_columns = ["mean_test_score", "std_test_score"]
    per_split_columns = list(results_df.filter(regex=r"(split\d+\_test)").columns)

    best_mean = results_df.nlargest(3, "mean_test_score")
    most_stable = results_df.nsmallest(3, "std_test_score")
    shortlist = (
        pd.concat([best_mean, most_stable])
        .drop_duplicates(keep="first")
        .reset_index()
    )

    long_format = shortlist.melt(
        id_vars=["params"],
        value_vars=per_split_columns,
        var_name="measure",
    )
    with_summary = long_format.merge(
        results_df[summary_columns],
        left_on="params",
        right_index=True,
    )
    return with_summary.reset_index(drop=True)


def plot_results(results_df2plot: pd.DataFrame, clf: str, file_name: str) -> None:
    """Plot the cross-validated F1 scores of the shortlisted parameter sets and save the figure.

    Left panel: mean test score with its standard deviation per parameter set.
    Right panel: test score of every parameter set on each CV split.

    Parameters
    ----------
    results_df2plot
        Long-format frame with ``params``, ``measure``, ``value``,
        ``mean_test_score`` and ``std_test_score`` columns.
    clf
        Classifier name shown in the figure title.
    file_name
        Output file name inside ``VISUALIZATIONS_PATH``; ``.png`` is appended
        only when it is missing.
    """
    fig, axs = plt.subplots(1, 2, figsize=(9, 3))
    # Split names in the order they are first plotted, i.e. the order in which
    # the categorical x axis assigns positions 0, 1, 2, ...
    split_names: dict[str, None] = {}
    for params, df in results_df2plot.groupby("params"):
        summary = df[["params", "mean_test_score", "std_test_score"]].drop_duplicates(keep="first")
        axs[0].errorbar(
            x=summary.mean_test_score,
            xerr=summary.std_test_score,
            y=summary.params,
            fmt="o",
            label=params
        )
        axs[1].errorbar(
            x=df.measure,
            y=df.value,
            label=params
        )
        split_names.update(dict.fromkeys(df.measure))
    # The y ticks (parameter strings) are hidden; the legend identifies each set.
    # The former set_yticklabels("F1 score") call was dropped: its labels were
    # discarded by the set_yticks([]) that followed it.
    axs[0].set_yticks([])
    axs[0].set_xlabel("")
    axs[0].set_title("Promedio y desvío")
    # Derive the "split N" labels from the data instead of parsing the tick Text
    # objects, which may still be empty before the figure is drawn.
    axs[1].set_xticks(range(len(split_names)))
    axs[1].set_xticklabels([
        "split {n}".format(n=re.search(r'\d+', split_name).group())
        for split_name in split_names
    ])
    axs[1].set_xlabel("")
    axs[1].set_title("Por split")
    title = fig.suptitle(f"{clf}: F1-score en test en validación cruzada", y=1.1)
    lgd = axs[1].legend(loc="lower center", bbox_to_anchor=(-0.1, -0.8))
    # `project_path` is not defined in this notebook; VISUALIZATIONS_PATH (imported
    # from config) is used instead — assumed to be the project's visualizations folder.
    output_name = file_name if file_name.lower().endswith(".png") else f"{file_name}.png"
    fig.savefig(
        os.path.join(VISUALIZATIONS_PATH, output_name),
        bbox_extra_artists=[lgd, title],
        bbox_inches='tight'
    )

def print_best_estimator_info(gs_estimators: GridSearchCV) -> None:
    """Print the best cross-validation score and parameter set of a fitted grid search."""
    bullet = "\n\t-- "
    entries = (f"{name}: {setting}" for name, setting in gs_estimators.best_params_.items())
    best_params = bullet.join(entries)
    print(f"""
    - Best Cross-Validation score : {gs_estimators.best_score_}
    - Best parameters set:\n\t-- {best_params}
    """)

## Multinomial Naive Bayes

In [None]:
# Grid-search the Naive Bayes `alpha` parameter together with the TF-IDF options.
# NOTE(review): `y` is not defined in any cell shown in this notebook, and the only
# visible `X` is the POS-tag matrix built in the encoding section, whereas the
# pipeline's TfidfVectorizer vectorises its input itself — confirm which text
# column and which split X and y are meant to be.
nb_gridsearch = build_gridsearch(
    clf=MultinomialNB(),
    clf_params={"clf__alpha": [0.01, 0.1, 1.0]}
)
nb_gridsearch.fit(X, y)

In [None]:
# Tabulate the CV results (sorted by rank) and show the top-ranked parameter set.
nb_gridsearch_cv_results = build_results_df(nb_gridsearch.cv_results_)
nb_gridsearch_cv_results.iloc[0]

In [None]:
# Plot the shortlisted Naive Bayes parameter sets. The file name is passed without
# extension, like the other plotting calls in this notebook, so the saved file
# does not end in ".png.png".
nb_gridsearch_cv_results2plot = build_results_df2plot(nb_gridsearch_cv_results)
plot_results(nb_gridsearch_cv_results2plot, clf="Naive Bayes Multinomial", file_name="clf__nb_cv")

In [None]:
# Report the best score and parameters found by the Naive Bayes search.
print_best_estimator_info(nb_gridsearch)

In [None]:
# Manually chosen Naive Bayes configuration, looked up by its "key=value" label.
# regex=False makes the match literal (otherwise "." in "1.0" matches any character).
selected_nb = nb_gridsearch_cv_results[
    nb_gridsearch_cv_results.index.str.contains("alpha=1.0", regex=False)
    & nb_gridsearch_cv_results.index.str.contains("min_df=0.1", regex=False)
    & nb_gridsearch_cv_results.index.str.contains("norm=l1", regex=False)
    & nb_gridsearch_cv_results.index.str.contains("smooth_idf=False", regex=False)
    & nb_gridsearch_cv_results.index.str.contains("sublinear_tf=False", regex=False)
]
# Fail with a clear message instead of an IndexError if the grid no longer contains it.
assert not selected_nb.empty, "selected Naive Bayes parameter set not found in the grid-search results"
selected_nb_params = "\n\t-- ".join(selected_nb.index.tolist()[0].split('-'))
print(f"""
    - Selected Cross-Validation score: {selected_nb.mean_test_score.values[0]}
    - Selected parameters set:\n\t-- {selected_nb_params}
""")

## Logistic Regression

In [None]:
# Grid-search the Logistic Regression penalty and C together with the TF-IDF options.
# more about LR solvers on: https://scikit-learn.org/stable/modules/linear_model.html#solvers
# NOTE(review): `y` is not defined in any cell shown in this notebook, and the only
# visible `X` is the POS-tag matrix built in the encoding section — confirm which
# text column and which split X and y are meant to be.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; verify against
# the installed version.
lr_gridsearch = build_gridsearch(
    clf = LogisticRegression(multi_class="ovr", solver="liblinear", random_state=2023),
    clf_params = {"clf__penalty": ["l1", "l2"], "clf__C": [0.1, 0.5, 1]}
)
lr_gridsearch.fit(X, y)

In [None]:
# Tabulate the CV results (sorted by rank) and show the top-ranked parameter set.
lr_gridsearch_cv_results = build_results_df(lr_gridsearch.cv_results_)
lr_gridsearch_cv_results.iloc[0]

In [None]:
# Plot the shortlisted Logistic Regression parameter sets. The file name is passed
# without extension, like the other plotting calls in this notebook, so the saved
# file does not end in ".png.png".
lr_gridsearch_cv_results2plot = build_results_df2plot(lr_gridsearch_cv_results)
plot_results(lr_gridsearch_cv_results2plot, clf="Regresión Logística", file_name="clf__lr_cv")

In [None]:
# Report the best score and parameters found by the Logistic Regression search.
print_best_estimator_info(lr_gridsearch)

In [None]:
# Manually chosen Logistic Regression configuration, looked up by its "key=value" label.
# regex=False makes the match literal (otherwise "." in "0.5" matches any character).
selected_lr = lr_gridsearch_cv_results[
    lr_gridsearch_cv_results.index.str.contains("C=0.5", regex=False)
    & lr_gridsearch_cv_results.index.str.contains("penalty=l1", regex=False)
    & lr_gridsearch_cv_results.index.str.contains("min_df=0.1", regex=False)
    & lr_gridsearch_cv_results.index.str.contains("norm=l1", regex=False)
    & lr_gridsearch_cv_results.index.str.contains("smooth_idf=True", regex=False)
    & lr_gridsearch_cv_results.index.str.contains("sublinear_tf=True", regex=False)
]
# Fail with a clear message instead of an IndexError if the grid no longer contains it.
assert not selected_lr.empty, "selected Logistic Regression parameter set not found in the grid-search results"
selected_lr_params = "\n\t-- ".join(selected_lr.index.tolist()[0].split('-'))
print(f"""
    - Selected Cross-Validation score: {selected_lr.mean_test_score.values[0]}
    - Selected parameters set:\n\t-- {selected_lr_params}
""")

# Feature Importance

In [None]:
def get_seleted_params(params:str) -> dict[str, dict[str,Any]]:
    """Parse a printed parameter listing back into per-step keyword arguments.

    Parameters
    ----------
    params
        ``step__name=value`` entries separated by a newline, a tab and ``-- ``
        (the format printed for the selected parameter sets).

    Returns
    -------
    dict
        ``{step: {name: value}}``. Values that are valid Python literals
        (numbers, booleans, ``None``, ...) are converted; anything else is
        kept as the original string.

    Raises
    ------
    ValueError
        If an entry has no ``=`` or its key has no ``__`` separator.
    """
    selected_params: dict[str, dict[str, Any]] = {}
    for entry in params.split("\n\t-- "):
        # Split on the first "=" only, so values that contain "=" stay intact.
        key, raw_value = entry.split("=", 1)
        try:
            # literal_eval parses literals only. eval() would also resolve any
            # name that happens to exist in the notebook (e.g. a value "pos"
            # would become the `pos` DataFrame) and can run arbitrary code.
            value = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError):
            value = raw_value
        # Split on the first "__" only, so nested names such as
        # "clf__estimator__C" keep their remainder as the parameter name.
        step, param = key.split("__", 1)
        selected_params.setdefault(step, {})[param] = value
    return selected_params

def get_best_params(params: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Group ``step__name`` grid-search parameters by pipeline step.

    Parameters
    ----------
    params
        Flat mapping such as ``GridSearchCV.best_params_``, keyed by
        ``step__name``.

    Returns
    -------
    dict
        ``{step: {name: value}}`` with the values left untouched.

    Raises
    ------
    ValueError
        If a key has no ``__`` separator.
    """
    selected_params: dict[str, dict[str, Any]] = {}
    for key, value in params.items():
        # Split on the first "__" only, so nested names such as
        # "clf__estimator__C" keep their remainder as the parameter name.
        step, param = key.split("__", 1)
        selected_params.setdefault(step, {})[param] = value
    return selected_params

def plot_nb_weights(df: pd.DataFrame, title:str, file_name:str) -> None:
    """Plot and save histograms of the Naive Bayes per-word weights.

    Left panel: distribution of the ``neg`` and ``pos`` weights.
    Right panel: distribution of the per-word difference ``pos - neg``.

    Parameters
    ----------
    df
        Frame with ``word``, ``neg`` and ``pos`` columns.
    title
        Figure title.
    file_name
        Output file name inside ``VISUALIZATIONS_PATH``; ``.png`` is appended
        only when it is missing.
    """
    fig, axs = plt.subplots(1,2, figsize=(12,3), sharey=True)
    sns.histplot(
        data = df.melt(id_vars=["word"], var_name="weight"),
        x="value",
        hue="weight",
        bins=30,
        ax=axs[0]
    )
    axs[0].set_xlabel("")
    axs[0].set_ylabel("Cantidad de observaciones")
    axs[0].set_title("Pesos por categoría")
    axs[0].get_legend().set_title("Pesos")
    sns.histplot(
        data = df.assign(diff=lambda x: x.pos-x.neg),
        x="diff",
        bins=30,
        ax=axs[1]
    )
    axs[1].set_title("Diferencia de pesos: $pos-neg$")
    suptitle = fig.suptitle(title,  y=1.05)
    # `project_path` is not defined in this notebook; VISUALIZATIONS_PATH (imported
    # from config) is used instead — assumed to be the project's visualizations folder.
    output_name = file_name if file_name.lower().endswith(".png") else f"{file_name}.png"
    fig.savefig(
        os.path.join(VISUALIZATIONS_PATH, output_name),
        bbox_extra_artists=[suptitle],
        bbox_inches='tight'
    )

def plot_lr_weights(df: pd.DataFrame, title:str, file_name:str) -> None:
    """Plot and save a histogram of Logistic Regression coefficients, coloured by sign.

    Parameters
    ----------
    df
        Frame with a ``coef`` column; it is not modified.
    title
        Figure title.
    file_name
        Output file name inside ``VISUALIZATIONS_PATH``; ``.png`` is appended
        only when it is missing.
    """
    # Work on a copy so the caller's frame does not gain the helper column.
    df_copy = df.copy()
    df_copy.loc[df_copy.coef >= 0, "weight"] = "pos"
    df_copy.loc[df_copy.coef < 0, "weight"] = "neg"
    fig, ax = plt.subplots(figsize=(6,3))
    sns.histplot(
        data = df_copy,
        x="coef",
        hue="weight",
        bins=30,
        ax=ax
    )
    ax.set_xlabel("")
    ax.set_ylabel("Cantidad de observaciones")
    ax.get_legend().set_title("Pesos")
    fig.suptitle(title)
    # `project_path` is not defined in this notebook; VISUALIZATIONS_PATH (imported
    # from config) is used instead — assumed to be the project's visualizations folder.
    output_name = file_name if file_name.lower().endswith(".png") else f"{file_name}.png"
    fig.savefig(os.path.join(VISUALIZATIONS_PATH, output_name))


## Multinomial Naive Bayes

### Selected Params

In [None]:
# Rebuild and refit the pipeline with the manually selected Naive Bayes parameters.
# selected_nb_params starts as the printed string and is replaced by the parsed
# dict; the isinstance guard keeps this cell re-runnable after that replacement.
if isinstance(selected_nb_params, str):
    selected_nb_params = get_seleted_params(selected_nb_params)
selected_nb_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **selected_nb_params["tfidf"]),
    ),
    (
        "clf",
        MultinomialNB(**selected_nb_params["clf"]),
    )
])
selected_nb_fi = selected_nb_pipeline.fit(X, y)

In [None]:
# Per-word log-probabilities for class 0 ("neg") and class 1 ("pos").
# The words come from get_feature_names_out(), which is ordered by column index
# like feature_log_prob_. vocabulary_.keys() iterates in dict insertion order
# and would pair words with the wrong weights.
selected_nb_weights = pd.DataFrame({
    "word": selected_nb_fi[0].get_feature_names_out(),
    "neg": selected_nb_fi[1].feature_log_prob_[0],
    "pos": selected_nb_fi[1].feature_log_prob_[1],
})

In [None]:
# Plot the weight distributions of the manually selected Naive Bayes model.
plot_nb_weights(selected_nb_weights, title="Importancia de rasgos en Naive Bayes", file_name="nb_selected_feature_importance")

### Best Params

In [None]:
# Rebuild and refit the pipeline with the best parameters found by the grid search.
best_nb_params = get_best_params(nb_gridsearch.best_params_)
best_nb_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **best_nb_params["tfidf"]),
    ),
    (
        "clf",
        MultinomialNB(**best_nb_params["clf"]
        )
    )
])
best_nb_fi = best_nb_pipeline.fit(X, y)

In [None]:
# Per-word log-probabilities for class 0 ("neg") and class 1 ("pos").
# The words come from get_feature_names_out(), which is ordered by column index
# like feature_log_prob_. vocabulary_.keys() iterates in dict insertion order
# and would pair words with the wrong weights.
best_nb_weights = pd.DataFrame({
    "word": best_nb_fi[0].get_feature_names_out(),
    "neg": best_nb_fi[1].feature_log_prob_[0],
    "pos": best_nb_fi[1].feature_log_prob_[1],
})

In [None]:
# Plot the weight distributions of the best Naive Bayes model from the grid search.
plot_nb_weights(best_nb_weights, title="Importancia de rasgos en Naive Bayes", file_name="nb_best_feature_importance")

## Logistic Regression

### Selected Params

In [None]:
# NOTE(review): le_target_pos is not used by any later cell shown in this notebook.
# The trailing `le_target_pos?` IPython help lookup (a development leftover) was dropped.
le_target_pos = LabelEncoder()

In [None]:
# Rebuild and refit the pipeline with the manually selected Logistic Regression parameters.
# selected_lr_params starts as the printed string and is replaced by the parsed
# dict; the isinstance guard keeps this cell re-runnable after that replacement.
if isinstance(selected_lr_params, str):
    selected_lr_params = get_seleted_params(selected_lr_params)
selected_lr_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **selected_lr_params["tfidf"]),
    ),
    (
        "clf",
        LogisticRegression(
            multi_class="ovr", solver="liblinear", random_state=2023, **selected_lr_params["clf"]),
    )
])
selected_lr_fi = selected_lr_pipeline.fit(X, y)

In [None]:
# vocabulary_ maps each term to its column index; coef_ holds the fitted coefficients.
# NOTE(review): to pair words with weights, order the terms by that index (or use
# get_feature_names_out()) — the dict's iteration order need not match the columns.
selected_lr_features = selected_lr_fi[0].vocabulary_
selected_lr_weights  = selected_lr_fi[1].coef_

In [None]:
# Inspect the negative coefficients only.
selected_lr_weights[selected_lr_weights<0]

### Best Params

In [None]:
# Rebuild and refit the pipeline with the best parameters found by the grid search.
best_lr_params = get_best_params(lr_gridsearch.best_params_)
best_lr_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **best_lr_params["tfidf"]),
    ),
    (
        "clf",
        LogisticRegression(
            multi_class="ovr", solver="liblinear", random_state=2023, **best_lr_params["clf"])
    )
])
best_lr_fi = best_lr_pipeline.fit(X, y)

In [None]:
# One coefficient per word. The words come from get_feature_names_out(), which is
# ordered by column index like coef_. vocabulary_.keys() iterates in dict
# insertion order and would pair words with the wrong coefficients.
best_lr_weights = pd.DataFrame({
    "word": best_lr_fi[0].get_feature_names_out(),
    "coef": best_lr_fi[1].coef_[0],
})

In [None]:
# Plot the coefficient distribution of the best Logistic Regression model
# (figure title typo "ragos" corrected to "rasgos").
plot_lr_weights(best_lr_weights, title="Importancia de rasgos en Regresión Logística", file_name="lr_best_feature_importance")

In [None]:
# armar una RL para predecir el negativo y ver cuáles son las palabras más importantes
# y contrastar con las más importantes del positivo

# visualización
# armar un gráfico de barras para cada RL con las palabras más representativas en el eje y
# y los valores en el eje x