# Selección de vectorizador

In [1]:
import math
import os

import joblib
from nltk.corpus import stopwords
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
)
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from config import DATA_PATH, MODELS_PATH, VISUALIZATIONS_PATH
from vectorizers import *

In [2]:
# Make sure the output directory for fitted artifacts exists before anything is saved.
os.makedirs(name=MODELS_PATH, exist_ok=True)

In [3]:
# Load the per-speech dataset. The "speech_lemma_pos" column is stored as text and
# is turned back into a Python object by the converter passed to `read_csv`.
# NOTE(review): `eval` executes whatever expression each cell contains, so this is
# only safe while the CSV is a trusted, locally generated file. If the column holds
# plain Python literals, `ast.literal_eval` would presumably be a safer converter —
# confirm against the script that writes this file.
data_path = os.path.join(DATA_PATH, "session_speech.csv")
data = pd.read_csv(data_path, converters={"speech_lemma_pos":eval})
data.columns

Index(['name', 'vote', 'senator', 'province', 'party', 'party_family',
       'speaker', 'speech', 'speech_preprocessed', 'speech_lemmas',
       'speech_pos', 'speech_lemma_pos', 'speech_preprocessed_count',
       'speech_preprocessed_count_unique', 'speech_lemmas_count',
       'speech_lemmas_count_unique', 'speech_pos_count',
       'speech_pos_count_unique', 'speech_lemma_pos_count_unique'],
      dtype='object')

In [4]:
# Keep only rows that have a speech and whose vote is not an abstention or absence.
has_speech = data["speech"].notna()
is_excluded_vote = data["vote"].isin(["abstención", "ausente"])

# Each document is a sequence of string groups; every group is glued with "_" and
# the groups are then joined with spaces into a single string per speech.
# NOTE: this cell overwrites `data` and the column in place, so it is not idempotent:
# run a second time, the join would iterate over the characters of the string.
data = (
    data.loc[has_speech & ~is_excluded_vote]
    .reset_index(drop=True)
    .assign(
        speech_lemma_pos=lambda frame: frame["speech_lemma_pos"].apply(
            lambda groups: " ".join("_".join(group) for group in groups)
        )
    )
)
data[["speech_lemma_pos", "vote"]]

Unnamed: 0,speech_lemma_pos,vote
0,hacer_VERB año_NOUN tener_VERB una_DET sesión_...,positivo
1,gracia_NOUN señor_NOUN presidente_NOUN hoy_ADV...,negativo
2,como_SCONJ ir_VERB a_ADP pedir_VERB la_DET ins...,negativo
3,gracia_NOUN presidenta_NOUN nuevamente_ADV yo_...,negativo
4,gracia_NOUN señora_NOUN presidenta_NOUN realme...,negativo
...,...,...
194,gracia_NOUN presidenta_NOUN antes_ADV de_ADP c...,positivo
195,mucha_ADJ gracia_NOUN señora_NOUN presidenta_N...,positivo
196,ya_ADV presidenta_NOUN,positivo
197,en_ADP cuanto_ADJ al_ADP artículo_NOUN propone...,positivo


In [5]:
# Relative frequency of each vote class in the filtered dataset.
data["vote"].value_counts(normalize=True)

vote
positivo    0.557789
negativo    0.442211
Name: proportion, dtype: float64

## Separación en _train_ y _test_

In [6]:
# Split the row labels (not the rows themselves) 80/20, stratifying on the vote so
# both subsets keep the same class proportions. The fixed seed makes the split repeatable.
X_train_index, X_test_index = train_test_split(
    data.index,
    test_size=0.2,
    shuffle=True,
    stratify=data["vote"],
    random_state=6300,
)

In [7]:
# Report the size and the target class proportions of each split.
splits_by_name = {"entrenamiento": X_train_index, "testeo": X_test_index}
for split_name, split_index in splits_by_name.items():
    target_proportions = (
        data.loc[split_index, "vote"].value_counts(normalize=True).to_frame()
    )
    print(f"** Cantidad de datos en conjunto de {split_name}: {len(split_index)}")
    print("** Distribución de la variable target:")
    print(str(target_proportions), end="\n\n")

** Cantidad de datos en conjunto de entrenamiento: 159
** Distribución de la variable target:
          proportion
vote                
positivo    0.559748
negativo    0.440252

** Cantidad de datos en conjunto de testeo: 40
** Distribución de la variable target:
          proportion
vote                
positivo        0.55
negativo        0.45



In [8]:
# Persist the train/test row labels so the same split can be reloaded later.
INDEX = os.path.join(MODELS_PATH, "index")
os.makedirs(INDEX, exist_ok=True)

# Map each output file name to its index object explicitly instead of resolving
# variable names through eval(), which hides the dependency and runs arbitrary code.
split_indexes = {"X_train_index": X_train_index, "X_test_index": X_test_index}
for file, dataset in split_indexes.items():
    # One value per line, with no header row and no index column.
    dataset.to_series().to_csv(
        os.path.join(INDEX, f"{file}.csv"), header=False, index=False
    )

## _Encoding_ de variables

### Variable _target_

In [9]:
# Fit the label encoder on the vote column and store the integer codes as "target".
le = LabelEncoder()
encoded_votes = le.fit_transform(data["vote"])
data["target"] = encoded_votes

In [10]:
# Show which integer code the fitted encoder assigns to each vote category,
# in order of first appearance in the data.
for category in data["vote"].unique():
    code = le.transform([category])[0]
    print(f"Categoría {category} ---> {code}")

Categoría positivo ---> 1
Categoría negativo ---> 0


In [11]:
# Save the fitted label encoder next to the other model artifacts.
le_path = os.path.join(MODELS_PATH, "labelencoder.pkl")
# Bind the return value to `_` so the cell does not echo it.
_ = joblib.dump(value=le, filename=le_path)

## Selección de vectorizador

In [12]:
# Candidate vectorizers to compare, written as (class, constructor kwargs) pairs
# and expanded into the {"class": ..., "kwargs": ...} records used further down.
vectorizer_specs = [
    (CustomFrequenciesVectorizer, {}),
    (CustomProportionsVectorizer, {}),
    (CustomProportionsVectorizer, {"stop_words": stopwords.words("spanish")}),
    (CustomProportionsVectorizer, {"custom_stop_words": "zipf"}),
    (CustomOddsRatioVectorizer, {}),
    (CustomLogOddsRatioVectorizer, {}),
    (CustomLogOddsRatioVectorizer, {"smooth": .5}),
    (CustomTfidfVectorizer, {}),
    (CustomWordScoresVectorizer, {}),
]
vectorizers = [
    {"class": vectorizer_class, "kwargs": vectorizer_kwargs}
    for vectorizer_class, vectorizer_kwargs in vectorizer_specs
]

In [13]:
SEED = 6300

# Training subset: the joined lemma_POS documents as features, encoded vote as target.
X_trainset = data.loc[X_train_index, "speech_lemma_pos"]
y_trainset = data.loc[X_train_index, "target"]

# Stratified 5-fold cross-validation, shuffled with a fixed seed for repeatable folds.
cv_method = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# `cross_validate` expects scorer names or scorer callables with the signature
# (estimator, X, y). Raw metric functions such as `accuracy_score` have the signature
# (y_true, y_pred) and are rejected by scikit-learn with a ValueError, so the built-in
# scorer names are used instead. The dict keys are unchanged, so the result keys
# (`test_accuracy`, `train_f1`, ...) stay the same.
# The "roc_auc" scorer evaluates continuous scores from the classifier rather than
# hard class predictions.
scoring_method = {
    "accuracy": "accuracy",
    "f1": "f1",
    "precision": "precision",
    "recall": "recall",
    "roc_auc": "roc_auc",
}

for vectorizer in vectorizers:
    pipeline = Pipeline([
        (
            'vectorizer',
            vectorizer["class"](positive_values="positivo", **vectorizer["kwargs"])
        ),
        (
            'clf',
            LogisticRegression(random_state=SEED)
        )
    ])
    cv_results = cross_validate(
        pipeline, X_trainset, y_trainset, cv=cv_method, 
        n_jobs=-1, return_train_score=True, verbose=0,
        scoring=scoring_method
    )