# Entrenamiento de modelos

In [131]:
import os
import re
from typing import Any

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from config import DATA_PATH, MODELS_PATH, VISUALIZATIONS_PATH

## Carga de datos y vectorizadores

In [88]:
data = pd.read_csv(os.path.join(DATA_PATH, "session_speech.csv"))

INDEX = os.path.join(MODELS_PATH, "index")
X_train_index = (
    pd
    .read_csv(os.path.join(INDEX, f"X_train_index.csv"), header=None, index_col=0)
    .index
)
X_test_index = (
    pd
    .read_csv(os.path.join(INDEX, f"X_test_index.csv"), header=None, index_col=0)
    .index
)

le = joblib.load(os.path.join(MODELS_PATH, "labelencoder.pkl"))

VECTORIZERS = os.path.join(MODELS_PATH, "vectorizers")
pos_vectorizer = joblib.load(os.path.join(VECTORIZERS, "pos_vectorizer.pkl"))
proportions_vectorizer = joblib.load(os.path.join(VECTORIZERS, "proportions_vectorizer.pkl"))
logodds_vectorizer = joblib.load(os.path.join(VECTORIZERS, "logodds_vectorizer.pkl"))

In [89]:
data = (
    data[(data.speech.notna()) & (~data.vote.isin(["abstención", "ausente"]))]
    .reset_index(drop=True)
)

## Vectorización de datos

In [90]:
TRAINSET_FOLDER = os.path.join(MODELS_PATH, "trainsets")
os.makedirs(TRAINSET_FOLDER, exist_ok=True)

In [91]:
def build_features(
    df: pd.DataFrame, converters: dict[str, CountVectorizer], test: bool = True
):
    features, features_names = list(), list()
    for column, vectorizer in converters.items():
        if test:
            X = vectorizer.transform(df[column])
        else:
            X = vectorizer.fit_transform(df[column])
        print(f"{X.shape[1]} features added")
        features.append(X)
        features_names.extend(list(vectorizer.get_feature_names_out()))
    features_array = np.concatenate(list(map(lambda x: x.toarray(), features)), axis=1)
    features_df = pd.DataFrame(features_array, columns=features_names)
    return features_df

In [102]:
out_file = os.path.join(TRAINSET_FOLDER, "X_train_proportions.csv")

X_train_proportions = build_features(
    data.iloc[X_train_index],
    converters={"speech_lemmas": proportions_vectorizer, "speech_pos": pos_vectorizer}
)
X_train_proportions.to_csv(out_file, index=False)

505 features added
13 features added


In [103]:
out_file = os.path.join(TRAINSET_FOLDER, "X_train_logodds.csv")

X_train_logodds = build_features(
    data.iloc[X_train_index],
    converters={"speech_lemmas": logodds_vectorizer, "speech_pos": pos_vectorizer}
)
X_train_logodds.to_csv(out_file, index=False)

480 features added
13 features added


In [95]:
y_train = le.transform(data.loc[X_train_index, "vote"])

## Selección de parámetros

In [110]:
def fit_grid_search(
    clf: BaseEstimator, clf_parameters: dict[str, Any],
    X:pd.DataFrame, y:pd.Series,
    out_file: str = None
):
    gs = GridSearchCV(
        clf,
        clf_parameters,
        scoring='f1',
        cv=5,
        return_train_score=True,
        n_jobs=-1
    )
    gs.fit(X, y)
    if out_file:
        FOLDER = os.path.join(MODELS_PATH, "gridsearch")
        os.makedirs(FOLDER, exist_ok=True)
        FILE = os.path.join(FOLDER, out_file)
        joblib.dump(gs, FILE, compress=True)
    return gs

In [117]:
def build_results(cv_results: dict[str, np.ndarray]) -> pd.DataFrame:
    df_cv_results = pd.DataFrame(cv_results)
    param_cols = df_cv_results.filter(regex="param_").columns.tolist()
    df_cv_results.drop(columns=param_cols, inplace=True)
    df_cv_results["params"] = (
        df_cv_results
        .params
        .apply(lambda x: "-".join([f"{k}={v}" for k, v in x.items()]))
    )
    df_cv_results.set_index("params", inplace=True)
    df_cv_results.sort_values(by="rank_test_score", inplace=True)
    df_cv_results = df_cv_results[df_cv_results.columns.sort_values().tolist()]
    return df_cv_results

In [132]:
def build_results2plot(results: pd.DataFrame) -> pd.DataFrame:
    split_cols = results.filter(regex=r"(split\d+\_test)").columns.to_list()
    return (
        pd.concat([
            results.nlargest(3, "mean_test_score"),
            results.nsmallest(3, "std_test_score")
        ])
        .drop_duplicates(keep="first")
        .reset_index()
        .melt(
            id_vars = ["params"],
            value_vars = split_cols,
            var_name = "measure"
        )
        .merge(
            results[["mean_test_score", "std_test_score"]],
            left_on = "params", right_index = True
        )
        .reset_index(drop=True)
    )

In [133]:
def plot_results(results: pd.DataFrame, clf: str, file_name: str = None) -> None:
    results2plot = build_results2plot(results)
    fig, axs = plt.subplots(1, 2, figsize=(9,3))
    for params, df in results2plot.groupby("params"):
        _df = df[["params", "mean_test_score", "std_test_score"]].drop_duplicates(keep="first")
        axs[0].errorbar(
            x=_df.mean_test_score,
            xerr=_df.std_test_score,
            y=_df.params,
            fmt="o",
            label=params
        )
        axs[1].errorbar(
            x=df.measure,
            y=df.value,
            label=params
        )
    axs[0].set_yticklabels("F1 score")
    axs[0].set_yticks([])
    axs[0].set_xlabel("")
    axs[0].set_title("Promedio y desvío")
    axs[1].set_xticklabels([
        "split {n}".format(n=re.search(r'\d+', label.get_text()).group())
        for label in axs[1].get_xticklabels()
    ])
    axs[1].set_xlabel("")
    axs[1].set_title("Por split")
    title = fig.suptitle(f"{clf}: F1-score en test en validación cruzada", y=1.1)
    lgd = plt.legend(loc="lower center", bbox_to_anchor=(-0.1, -0.8))
    if file_name:
        FOLDER = os.path.join(VISUALIZATIONS_PATH, "models")
        os.makedirs(FOLDER, exist_ok=True)
        out_file = os.path.join(FOLDER, f"{file_name}.png")
        fig.savefig(
            out_file, bbox_extra_artists=[lgd,title], bbox_inches='tight'
        )

In [134]:
def print_best_estimator_info(gs_estimators: GridSearchCV) -> None:
    best_params = "\n\t-- ".join(
        [f"{k}: {v}" for k, v in gs_estimators.best_params_.items()]
    )
    print(f"""
    - Best Cross-Validation score : {gs_estimators.best_score_}
    - Best parameters set:\n\t-- {best_params}
    """)

### Naive Bayer Multinomial

In [111]:
nb_gridsearch = dict()

for df, name in zip([X_train_proportions, X_train_logodds], ["proportions", "logodds"]):
    gs = fit_grid_search(
        clf=MultinomialNB(),
        clf_parameters={"alpha": [0.01, 0.1, 1.0]},
        X=df,
        y=y_train,
        out_file=f"nb_{name}.pkl"
    )
    nb_gridsearch[name] = gs

In [105]:
nb_gridsearch

{'proportions': GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=-1,
              param_grid={'alpha': [0.01, 0.1, 1.0]}, return_train_score=True,
              scoring='f1'),
 'logodds': GridSearchCV(cv=5, estimator=MultinomialNB(), n_jobs=-1,
              param_grid={'alpha': [0.01, 0.1, 1.0]}, return_train_score=True,
              scoring='f1')}

In [126]:
nb_gridsearch_results = dict()

for key, value in nb_gridsearch.items():
    results = build_results(value.cv_results_)
    nb_gridsearch_results[key] = results
    print(f"--- {key.upper()} ---")
    print(results.T.head())

--- PROPORTIONS ---
params            alpha=0.01  alpha=1.0  alpha=0.1
mean_fit_time       0.011003   0.006440   0.006336
mean_score_time     0.006255   0.005514   0.006112
mean_test_score     0.691389   0.691060   0.672669
mean_train_score    0.746886   0.748336   0.746886
rank_test_score     1.000000   2.000000   3.000000
--- LOGODDS ---
params            alpha=0.1  alpha=1.0  alpha=0.01
mean_fit_time      0.005635   0.005844    0.006163
mean_score_time    0.005682   0.005848    0.005240
mean_test_score    0.626103   0.604519    0.593642
mean_train_score   0.668947   0.669140    0.671339
rank_test_score    1.000000   2.000000    3.000000


### Regresión Logística

In [112]:
lr_gridsearch = dict()

for df, name in zip([X_train_proportions, X_train_logodds], ["proportions", "logodds"]):
    gs = fit_grid_search(
        clf=LogisticRegression(multi_class="ovr", solver="liblinear", random_state=6300),
        clf_parameters={"penalty": ["l1", "l2"], "C": [0.1, 0.5, 1]},
        X=df,
        y=y_train,
        out_file=f"lr_{name}.pkl"
    )
    lr_gridsearch[name] = gs

In [113]:
lr_gridsearch

{'proportions': GridSearchCV(cv=5,
              estimator=LogisticRegression(multi_class='ovr', random_state=6300,
                                           solver='liblinear'),
              n_jobs=-1,
              param_grid={'C': [0.1, 0.5, 1], 'penalty': ['l1', 'l2']},
              return_train_score=True, scoring='f1'),
 'logodds': GridSearchCV(cv=5,
              estimator=LogisticRegression(multi_class='ovr', random_state=6300,
                                           solver='liblinear'),
              n_jobs=-1,
              param_grid={'C': [0.1, 0.5, 1], 'penalty': ['l1', 'l2']},
              return_train_score=True, scoring='f1')}

In [129]:
lr_gridsearch_results = dict()

for key, value in lr_gridsearch.items():
    results = build_results(value.cv_results_)
    lr_gridsearch_results[key] = results
    print(f"--- {key.upper()} ---")
    print(results.T.head())

--- PROPORTIONS ---
params            C=0.1-penalty=l2  C=0.5-penalty=l2  C=1-penalty=l2  \
mean_fit_time             0.007429          0.006107        0.006738   
mean_score_time           0.006045          0.005780        0.005621   
mean_test_score           0.683181          0.681639        0.673618   
mean_train_score          0.817717          0.859061        0.873400   
rank_test_score           1.000000          2.000000        3.000000   

params            C=1-penalty=l1  C=0.5-penalty=l1  C=0.1-penalty=l1  
mean_fit_time           0.006723          0.005573          0.006696  
mean_score_time         0.005568          0.005170          0.005815  
mean_test_score         0.612484          0.590736          0.454496  
mean_train_score        0.811028          0.779690          0.518188  
rank_test_score         4.000000          5.000000          6.000000  
--- LOGODDS ---
params            C=0.1-penalty=l2  C=1-penalty=l2  C=1-penalty=l1  \
mean_fit_time             0.009211 

### Máquinas de Soporte Vectorial

In [115]:
svm_gridsearch = dict()

for df, name in zip([X_train_proportions, X_train_logodds], ["proportions", "logodds"]):
    gs = fit_grid_search(
        clf=SVC(random_state=6300),
        clf_parameters={
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "C": [0.1, 0.5, 1],
            "decision_function_shape": ["ovo", "ovr"]
        },
        X=df,
        y=y_train,
        out_file=f"svm_{name}.pkl"
    )
    svm_gridsearch[name] = gs

In [116]:
svm_gridsearch

{'proportions': GridSearchCV(cv=5, estimator=SVC(random_state=6300), n_jobs=-1,
              param_grid={'C': [0.1, 0.5, 1],
                          'decision_function_shape': ['ovo', 'ovr'],
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
              return_train_score=True, scoring='f1'),
 'logodds': GridSearchCV(cv=5, estimator=SVC(random_state=6300), n_jobs=-1,
              param_grid={'C': [0.1, 0.5, 1],
                          'decision_function_shape': ['ovo', 'ovr'],
                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
              return_train_score=True, scoring='f1')}

In [130]:
svm_gridsearch_results = dict()

for key, value in svm_gridsearch.items():
    results = build_results(value.cv_results_)
    svm_gridsearch_results[key] = results
    print(f"--- {key.upper()} ---")
    print(results.T.head())

--- PROPORTIONS ---
params            C=0.1-decision_function_shape=ovo-kernel=rbf  \
mean_fit_time                                         0.015444   
mean_score_time                                       0.010647   
mean_test_score                                       0.717667   
mean_train_score                                      0.717737   
rank_test_score                                       1.000000   

params            C=0.1-decision_function_shape=ovo-kernel=sigmoid  \
mean_fit_time                                             0.011034   
mean_score_time                                           0.007772   
mean_test_score                                           0.717667   
mean_train_score                                          0.717737   
rank_test_score                                           1.000000   

params            C=0.1-decision_function_shape=ovr-kernel=rbf  \
mean_fit_time                                         0.008193   
mean_score_time               

# Parameters Selection

## Multinomial Naive Bayes

In [None]:
nb_gridsearch_cv_results2plot = build_results_df2plot(nb_gridsearch_cv_results)
plot_results(nb_gridsearch_cv_results2plot, clf="Naive Bayes Multinomial", file_name="clf__nb_cv.png")

In [None]:
print_best_estimator_info(nb_gridsearch)

In [None]:
selected_nb = nb_gridsearch_cv_results[
    nb_gridsearch_cv_results.index.str.contains("alpha=1.0")
    & nb_gridsearch_cv_results.index.str.contains("min_df=0.1")
    & nb_gridsearch_cv_results.index.str.contains("norm=l1")
    & nb_gridsearch_cv_results.index.str.contains("smooth_idf=False")
    & nb_gridsearch_cv_results.index.str.contains("sublinear_tf=False")
]
selected_nb_params = "\n\t-- ".join(selected_nb.index.tolist()[0].split('-'))
print(f"""
    - Selected Cross-Validation score: {selected_nb.mean_test_score.values[0]}
    - Selected parameters set:\n\t-- {selected_nb_params}
""")

## Logistic Regression

In [None]:
lr_gridsearch_cv_results2plot = build_results_df2plot(lr_gridsearch_cv_results)
plot_results(lr_gridsearch_cv_results2plot, clf="Regresión Logística", file_name="clf__lr_cv.png")

In [None]:
print_best_estimator_info(lr_gridsearch)

In [None]:
selected_lr = lr_gridsearch_cv_results[
    lr_gridsearch_cv_results.index.str.contains("C=0.5")
    & lr_gridsearch_cv_results.index.str.contains("penalty=l1")
    & lr_gridsearch_cv_results.index.str.contains("min_df=0.1")
    & lr_gridsearch_cv_results.index.str.contains("norm=l1")
    & lr_gridsearch_cv_results.index.str.contains("smooth_idf=True")
    & lr_gridsearch_cv_results.index.str.contains("sublinear_tf=True")
]
selected_lr_params = "\n\t-- ".join(selected_lr.index.tolist()[0].split('-'))
print(f"""
    - Selected Cross-Validation score: {selected_lr.mean_test_score.values[0]}
    - Selected parameters set:\n\t-- {selected_lr_params}
""")

# Feature Importance

In [142]:
def get_seleted_params(params:str) -> dict[str, dict[str,Any]]:
    selected_params = dict()
    key_value_params = [p.split("=") for p in params.split("\n\t-- ")]
    for key, value in key_value_params:
        try:
            value = eval(value)
        except:
            value = value
        clf, param = key.split("__")
        if clf in selected_params:
            selected_params[clf][param] = value
        else:
            selected_params[clf] = {param: value}
    return selected_params

def get_best_params(params: dict[str:Any]) -> dict[str, dict[str, Any]]:
    selected_params = dict()
    for key, value in params.items():
        clf, param = key.split("__")
        if clf in selected_params:
            selected_params[clf][param] = value
        else:
            selected_params[clf] = {param: value}
    return selected_params

def plot_nb_weights(df: pd.DataFrame, title:str, file_name:str):
    fig, axs = plt.subplots(1,2, figsize=(12,3), sharey=True)
    sns.histplot(
        data = df.melt(id_vars=["word"], var_name="weight"),
        x="value",
        hue="weight",
        bins=30,
        ax=axs[0]
    )
    axs[0].set_xlabel("")
    axs[0].set_ylabel("Cantidad de observaciones")
    axs[0].set_title("Pesos por categoría")
    axs[0].get_legend().set_title("Pesos")
    sns.histplot(
        data = df.assign(diff=lambda x: x.pos-x.neg),
        x="diff",
        bins=30,
        ax=axs[1]
    )
    axs[1].set_title("Diferencia de pesos: $pos-neg$")
    title = fig.suptitle(title,  y=1.05)
    fig.savefig(
    f"{project_path}/visualizations/{file_name}.png",
    bbox_extra_artists=[title], bbox_inches='tight'
)

def plot_lr_weights(df: pd.DataFrame, title:str, file_name:str):
    df_copy = deepcopy(df)
    df_copy.loc[df_copy.coef >= 0, "weight"] = "pos"
    df_copy.loc[df_copy.coef < 0, "weight"] = "neg"
    fig, ax = plt.subplots(figsize=(6,3))
    sns.histplot(
        data = df_copy,
        x="coef",
        hue="weight",
        bins=30,
        ax=ax
    )
    ax.set_xlabel("")
    ax.set_ylabel("Cantidad de observaciones")
    ax.get_legend().set_title("Pesos")
    fig.suptitle(title)
    fig.savefig(
    f"{project_path}/visualizations/{file_name}.png"
)


## Multinomial Naive Bayes

### Selected Params

In [143]:
selected_nb_params = get_seleted_params(selected_nb_params)
selected_nb_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **selected_nb_params["tfidf"]),
    ),
    (
        "clf",
        MultinomialNB(**selected_nb_params["clf"]
        )
    )
])
selected_nb_fi = selected_nb_pipeline.fit(X, y)

NameError: name 'selected_nb_params' is not defined

In [None]:
selected_nb_weights = pd.DataFrame({
        "word": selected_nb_fi[0].vocabulary_.keys(),
        "neg": selected_nb_fi[1].feature_log_prob_[0],
        "pos": selected_nb_fi[1].feature_log_prob_[1]
    }
)

In [None]:
plot_nb_weights(selected_nb_weights, title="Importancia de rasgos en Naive Bayes", file_name="nb_selected_feature_importance")

### Best Params

In [None]:
best_nb_params = get_best_params(nb_gridsearch.best_params_)
best_nb_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **best_nb_params["tfidf"]),
    ),
    (
        "clf",
        MultinomialNB(**best_nb_params["clf"]
        )
    )
])
best_nb_fi = best_nb_pipeline.fit(X, y)

In [None]:
best_nb_weights = pd.DataFrame({
        "word": best_nb_fi[0].vocabulary_.keys(),
        "neg": best_nb_fi[1].feature_log_prob_[0],
        "pos": best_nb_fi[1].feature_log_prob_[1]
    }
)

In [None]:
plot_nb_weights(best_nb_weights, title="Importancia de rasgos en Naive Bayes", file_name="nb_best_feature_importance")

## Logistic Regression

### Selected Params

In [None]:
le_target_pos = LabelEncoder()
le_target_pos?

In [None]:
selected_lr_params = get_seleted_params(selected_lr_params)
selected_lr_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **selected_lr_params["tfidf"]),
    ),
    (
        "clf",
        LogisticRegression(
            multi_class="ovr", solver="liblinear", random_state=2023, **selected_lr_params["clf"])
    )
])
selected_lr_fi = selected_lr_pipeline.fit(X, y)

In [None]:
selected_lr_features = selected_lr_fi[0].vocabulary_
selected_lr_weights  = selected_lr_fi[1].coef_

In [None]:
selected_lr_weights[selected_lr_weights<0]

### Best Params

In [None]:
best_lr_params = get_best_params(lr_gridsearch.best_params_)
best_lr_pipeline = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(lowercase=True, preprocessor=preprocess, **best_lr_params["tfidf"]),
    ),
    (
        "clf",
        LogisticRegression(
            multi_class="ovr", solver="liblinear", random_state=2023, **best_lr_params["clf"])
    )
])
best_lr_fi = best_lr_pipeline.fit(X, y)

In [None]:
best_lr_weights = pd.DataFrame({
        "word": best_lr_fi[0].vocabulary_.keys(),
        "coef": best_lr_fi[1].coef_[0]
    }
)

In [None]:
plot_lr_weights(best_lr_weights, title="Importancia de ragos en Regresión Logística", file_name="lr_best_feature_importance")