# Selección de vectorizador

In [1]:
from copy import deepcopy
import math
import os
import re
import sys

import joblib
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    make_scorer, f1_score, precision_score, recall_score, roc_auc_score
)
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import DATA_PATH, MODELS_PATH, VISUALIZATIONS_PATH
from notebooks.src.vectorizers import *

In [None]:
# Inspect the parent of the current working directory.
# NOTE(review): the original call passed no argument, which raises TypeError;
# the working directory is assumed to be the intended path -- confirm.
os.path.dirname(os.getcwd())

In [None]:
# Display the interpreter's module search path.
sys.path

In [2]:
# Create the MODELS_PATH directory; exist_ok avoids an error when it already exists.
os.makedirs(MODELS_PATH, exist_ok=True)

In [None]:
# Load the session speeches. The "speech_lemma_pos" column is parsed from its
# textual form back into Python objects while reading.
# literal_eval only accepts Python literals (lists, tuples, strings, numbers...),
# whereas eval would execute any expression found in the CSV file.
from ast import literal_eval

data_path = os.path.join(DATA_PATH, "session_speech.csv")
data = pd.read_csv(data_path, converters={"speech_lemma_pos": literal_eval})
data.columns

In [None]:
# Keep the rows that have a speech and whose vote is neither "abstención" nor
# "ausente", renumber them from zero, and flatten "speech_lemma_pos": every
# item is joined with "_" and the resulting tokens are joined with spaces.
has_speech = data["speech"].notna()
excluded_vote = data["vote"].isin(["abstención", "ausente"])
data = data.loc[has_speech & ~excluded_vote].reset_index(drop=True)
data = data.assign(
    speech_lemma_pos=[
        " ".join("_".join(item) for item in items)
        for items in data["speech_lemma_pos"]
    ]
)
data[["speech_lemma_pos", "vote"]]

In [None]:
# Relative frequency of each vote category.
data["vote"].value_counts(normalize=True)

## Separación en _train_ y _test_

In [6]:
# Shuffled 80/20 split of the row labels (not of the rows themselves),
# stratified on the vote so both parts keep the same class proportions.
X_train_index, X_test_index = train_test_split(
    data.index,
    stratify=data["vote"],
    shuffle=True,
    test_size=0.2,
    random_state=6300,
)

In [None]:
# Report, for each split, its size and the relative frequency of every vote.
splits = {"entrenamiento": X_train_index, "testeo": X_test_index}
for name, index in splits.items():
    target_share = data.loc[index, "vote"].value_counts(normalize=True).to_frame()
    print(f"** Cantidad de datos en conjunto de {name}: {index.shape[0]}")
    print("** Distribución de la variable target:")
    print(f"{target_share}", end="\n\n")

In [8]:
# Write the train/test index labels to CSV files inside MODELS_PATH/index.
# Each file holds a single column without header or row labels.
INDEX = os.path.join(MODELS_PATH, "index")
os.makedirs(INDEX, exist_ok=True)

# Map file names to the index objects explicitly instead of resolving the
# variable names through eval().
index_by_file = {"X_train_index": X_train_index, "X_test_index": X_test_index}
for file, dataset in index_by_file.items():
    dataset.to_series().to_csv(
        os.path.join(INDEX, f"{file}.csv"), header=False, index=False
    )

## _Encoding_ de variables

### Variable _target_

In [9]:
# Fit the label encoder on the votes and store the resulting integer codes in
# a new "target" column; the fitted encoder stays available as `le`.
le = LabelEncoder()
encoded_votes = le.fit_transform(data["vote"])
data["target"] = encoded_votes

In [None]:
# Print the integer code assigned to each vote category, in order of first
# appearance in the data.
vote_labels = data["vote"].unique()
for value, code in zip(vote_labels, le.transform(vote_labels)):
    print(f"Categoría {value} ---> {code}")

In [11]:
# Serialize the fitted label encoder to MODELS_PATH/labelencoder.pkl.
le_path = os.path.join(MODELS_PATH, "labelencoder.pkl")
# The value returned by joblib.dump is assigned to `_`, i.e. deliberately ignored.
_ = joblib.dump(le, le_path)

## Selección de vectorizador

In [12]:
# Candidate vectorizers to compare: (display name, class, extra constructor
# keyword arguments). The same class may appear several times with different
# arguments.
_vectorizer_specs = [
    ("Frecuencias absolutas", CustomFrequenciesVectorizer, {}),
    ("Proporciones", CustomProportionsVectorizer, {}),
    ("Proporciones", CustomProportionsVectorizer, {"stop_words": "nltk"}),
    ("Proporciones", CustomProportionsVectorizer, {"stop_words": "zipf"}),
    ("Ratio de odds", CustomOddsRatioVectorizer, {}),
    ("Ratio de log odds", CustomLogOddsRatioVectorizer, {}),
    ("Ratio de log odds", CustomLogOddsRatioVectorizer, {"smooth": 0.5}),
    ("Word scores", CustomWordScoresVectorizer, {}),
    ("TF-IDF", CustomTfidfVectorizer, {}),
    ("TF-IDF", CustomTfidfVectorizer, {"log_idf": True}),
]
vectorizers = [
    {"name": name, "class": cls, "kwargs": kwargs}
    for name, cls, kwargs in _vectorizer_specs
]

In [13]:
# Directory MODELS_PATH/features_selection; created if it does not exist yet.
FEATURES = os.path.join(MODELS_PATH, "features_selection")
os.makedirs(FEATURES, exist_ok=True)

In [None]:
# Cross-validate every candidate vectorizer with the same classifier and the
# same folds, gather the per-fold timings and train/validation scores in one
# table, and write that table to FEATURES/features_selection.csv.
SEED = 6300

X_trainset, y_trainset = (
    data.loc[X_train_index, "speech_lemma_pos"], data.loc[X_train_index, "target"]
)
cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
scoring_method = {
    "accuracy": "accuracy",
    "f1": "f1",
    "f1_micro": make_scorer(f1_score, average="micro"),
    "f1_macro": make_scorer(f1_score, average="macro"),
    "precision": make_scorer(precision_score),
    "precision_micro": make_scorer(precision_score, average="micro"),
    "precision_macro": make_scorer(precision_score, average="macro"),
    "recall": make_scorer(recall_score),
    "recall_micro": make_scorer(recall_score, average="micro"),
    "recall_macro": make_scorer(recall_score, average="macro"),
    # ROC AUC must be computed from continuous scores. A plain make_scorer()
    # hands the hard labels returned by predict() to roc_auc_score, so the
    # built-in "roc_auc" scorer (which uses decision_function / predict_proba)
    # is used instead. The other scorers in this dict ("f1", default
    # precision/recall) already require a binary target, for which the
    # `average` argument of roc_auc_score has no effect; both keys are kept so
    # the result columns keep their names.
    "roc_auc_micro": "roc_auc",
    "roc_auc_macro": "roc_auc",
}

# One results frame per vectorizer, concatenated once after the loop. This
# avoids re-copying the accumulated table on every iteration and avoids
# concatenating onto an empty DataFrame.
fold_frames = []
for vectorizer in vectorizers:
    # Work on a copy so the shared configuration list is not modified.
    kwargs = deepcopy(vectorizer.get("kwargs"))
    # "stop_words" is a shorthand here: "nltk" is expanded to the NLTK Spanish
    # stop-word list, "zipf" is forwarded as custom_stop_words="zipf".
    stop_words = kwargs.pop("stop_words", None)
    if stop_words == "nltk":
        kwargs["stop_words"] = stopwords.words("spanish")
    elif stop_words == "zipf":
        kwargs["custom_stop_words"] = "zipf"
    pipeline = Pipeline([
        (
            "vectorizer",
            vectorizer["class"](
                positive_values=1, dimension=300, **kwargs
            )
        ),
        (
            "clf",
            LogisticRegression(random_state=SEED)
        )
    ])
    print(f"-- Running cross_validate for {pipeline.steps[0][1]}")
    cv_results = cross_validate(
        pipeline, X_trainset, y_trainset, cv=cv_method,
        n_jobs=-1, return_train_score=True, scoring=scoring_method
    )
    # One row per fold; "split" numbers the folds starting at 1.
    cv_results_df = (
        pd.DataFrame(cv_results)
        .assign(split=lambda x: x.index + 1)
    )
    cv_results_df["vectorizer"] = vectorizer["name"]
    # Keep the original (unexpanded) kwargs of the configuration on every row.
    cv_results_df["kwargs"] = [vectorizer["kwargs"]] * len(cv_results_df)
    fold_frames.append(cv_results_df)

cv_results_to_compare = (
    pd.concat(fold_frames, ignore_index=True) if fold_frames else pd.DataFrame()
)
file_path = os.path.join(FEATURES, "features_selection.csv")
cv_results_to_compare.to_csv(file_path, index=False)

In [15]:
# Add two text columns: "kwargs_" renders each kwargs dict as "key=value"
# pairs separated by ", ", and "title" is the vectorizer name followed by that
# text in parentheses when the dict is not empty.
kwargs_text = cv_results_to_compare["kwargs"].apply(
    lambda options: ", ".join([f"{key}={value}" for key, value in options.items()])
)
titles = [
    f"{name} ({text})" if options else name
    for name, text, options in zip(
        cv_results_to_compare["vectorizer"],
        kwargs_text,
        cv_results_to_compare["kwargs"],
    )
]
cv_results_to_compare = cv_results_to_compare.assign(
    kwargs_=kwargs_text, title=titles
)

In [None]:
# Mean fit time per configuration, with the standard deviation as error bar.
fit_times = cv_results_to_compare[["title", "fit_time", "split"]]
sns.barplot(data=fit_times, x="fit_time", y="title", estimator="mean", errorbar="sd")

In [None]:
# Fit time of every configuration on each individual split.
fit_times = cv_results_to_compare[["title", "fit_time", "split"]]
sns.lineplot(data=fit_times, x="split", y="fit_time", hue="title")

In [18]:
# Directory VISUALIZATIONS_PATH/features; created if it does not exist yet.
FEATURES_PLOTS = os.path.join(VISUALIZATIONS_PATH, "features")
os.makedirs(FEATURES_PLOTS, exist_ok=True)

In [None]:
# For every scoring metric, save two figures that compare the configurations
# on the training folds (left panel) and the validation folds (right panel):
# "<metric>_mean_sd.png" with the mean and standard deviation across folds,
# and "<metric>_by_split.png" with the score of each individual fold.
train_scores_cols = cv_results_to_compare.filter(like="train_").columns.sort_values().to_list()
test_scores_cols = cv_results_to_compare.filter(like="test_").columns.sort_values().to_list()

# Both lists are sorted, so the train/validation columns of a metric sit at
# the same position; iterate over the pairs directly instead of by index.
for train_col, test_col in zip(train_scores_cols, test_scores_cols):
    # e.g. "train_f1_macro" -> "f1 macro"; used in the title and file names.
    metric = train_col.replace("train_", "").replace("_", " ")
    title = metric.capitalize()

    # mean and sd
    fig, axs = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
    sns.barplot(
        cv_results_to_compare[["title", train_col, "split"]],
        hue="title", y=train_col, legend=False,
        estimator="mean", errorbar="sd", ax=axs[0], alpha=.7
    )
    sns.barplot(
        cv_results_to_compare[["title", test_col, "split"]],
        hue="title", y=test_col,
        estimator="mean", errorbar="sd", ax=axs[1], alpha=.7
    )
    axs[0].set_ylabel("")
    axs[0].set_title("Entrenamiento")
    axs[1].set_title("Validación")
    suptitle = fig.suptitle(f"{title}", y=1.05, style="italic")
    # Single legend for both panels, attached to the right-hand axes and
    # placed below them; it is passed to savefig together with the suptitle so
    # the tight bounding box includes both.
    lgd = axs[1].legend(loc="lower center", bbox_to_anchor=(-0.1, -0.6), ncol=2)
    fig.savefig(
        os.path.join(FEATURES_PLOTS, f"{metric}_mean_sd.png"),
        bbox_extra_artists=[lgd, suptitle], bbox_inches='tight'
    )

    # deaggregated
    fig, bxs = plt.subplots(1, 2, figsize=(9, 3), sharey=True)
    sns.lineplot(
        cv_results_to_compare[["title", train_col, "split"]],
        y=train_col, x="split",
        hue="title", ax=bxs[0], legend=False, alpha=.7
    )
    sns.lineplot(
        cv_results_to_compare[["title", test_col, "split"]],
        y=test_col, x="split", alpha=.7,
        hue="title", ax=bxs[1]
    )
    bxs[0].set_ylabel("")
    bxs[0].set_xlabel("")
    bxs[1].set_xlabel("")
    bxs[0].set_title("Entrenamiento")
    bxs[1].set_title("Validación")
    suptitle = fig.suptitle(f"{title}", y=1.05, style="italic")
    lgd = bxs[1].legend(loc="lower center", bbox_to_anchor=(-0.1, -0.65), ncol=2)
    fig.savefig(
        os.path.join(FEATURES_PLOTS, f"{metric}_by_split.png"),
        bbox_extra_artists=[lgd, suptitle], bbox_inches='tight'
    )

In [None]:
# Mean validation score of each configuration for the main metrics, shown as a
# styled table with the highest value of every column highlighted.
summary_metrics = [
    "test_accuracy", "test_precision", "test_recall",
    "test_f1", "test_f1_micro", "test_f1_macro",
]
N = cv_results_to_compare.groupby("title")[summary_metrics].mean()
# Remove the "test_" prefix by slicing. str.strip("test_") strips any of the
# characters "t", "e", "s" and "_" from both ends rather than the prefix, so it
# would mangle a metric whose name starts or ends with one of those characters.
prefix = "test_"
N.columns = [
    col[len(prefix):] if col.startswith(prefix) else col for col in N.columns
]
N.rename_axis(index="vectorizador", inplace=True)
N.style.highlight_max(color='green')
