# _Datasets_ de entrenamiento y testeo

In [3]:
import math
import os

import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from config import DATA_PATH, MODELS_PATH, VISUALIZATIONS_PATH

In [4]:
# Create the directory named by MODELS_PATH (imported from config) if it is missing;
# exist_ok=True makes re-running this cell a no-op.
os.makedirs(MODELS_PATH, exist_ok=True)

In [5]:
# Location of the speeches CSV inside DATA_PATH (imported from config).
data_path = os.path.join(DATA_PATH, "session_speech.csv")
# Read it into a DataFrame and show the available columns.
data = pd.read_csv(filepath_or_buffer=data_path)
data.columns

Index(['name', 'vote', 'senator', 'province', 'party', 'party_family',
       'speaker', 'speech', 'speech_preprocessed', 'speech_lemmas',
       'speech_pos'],
      dtype='object')

In [6]:
# Preview the first two rows.
data.head(n=2)

Unnamed: 0,name,vote,senator,province,party,party_family,speaker,speech,speech_preprocessed,speech_lemmas,speech_pos
0,Ana Claudia Almirón,positivo,ALMIRÓN ANA CLAUDIA,CORRIENTES,ALIANZA FRENTE PARA LA VICTORIA,Frente para la victoria,Almirón,"Hace dos años, tuvimos una sesión histórica cu...",hace dos años tuvimos una sesión histórica cua...,hacer año tener una sesión histórica cuando de...,VERB NOUN VERB DET NOUN ADJ SCONJ VERB ADP ADJ...
1,Roberto Gustavo Basualdo,negativo,BASUALDO ROBERTO GUSTAVO,SAN JUAN,ALIANZA CAMBIEMOS SAN JUAN,Juntos por el cambio,Basualdo,"Gracias, señor presidente. Hoy es un día en el...",gracias señor presidente hoy es un día en el q...,gracia señor presidente hoy ser un día en el q...,NOUN NOUN NOUN ADV VERB DET NOUN ADP DET SCONJ...


In [7]:
# Keep rows whose speech is not missing and whose vote is neither
# "abstención" nor "ausente", then renumber the rows from 0.
has_speech = data.speech.notna()
excluded_vote = data.vote.isin(["abstención", "ausente"])
data = data[has_speech & ~excluded_vote].reset_index(drop=True)
data.shape

(199, 11)

In [8]:
# Share of each vote value among the remaining rows.
data["vote"].value_counts(normalize=True)

vote
positivo    0.557789
negativo    0.442211
Name: proportion, dtype: float64

## Separación en _train_ y _test_

In [9]:
# Split the row labels (not the rows themselves) 80/20, stratified on the vote,
# with a fixed seed so the partition is reproducible.
X_train_index, X_test_index = train_test_split(
    data.index, test_size=0.2, random_state=6300, shuffle=True, stratify=data["vote"]
)

In [10]:
# Report the size and the target distribution of each partition.
for name, index in (("entrenamiento", X_train_index), ("testeo", X_test_index)):
    target_share = data.loc[index, "vote"].value_counts(normalize=True).to_frame()
    print(f"** Cantidad de datos en conjunto de {name}: {index.shape[0]}")
    print("** Distribución de la variable target:")
    print(f"{target_share}", end="\n\n")

** Cantidad de datos en conjunto de entrenamiento: 159
** Distribución de la variable target:
          proportion
vote                
positivo    0.559748
negativo    0.440252

** Cantidad de datos en conjunto de testeo: 40
** Distribución de la variable target:
          proportion
vote                
positivo        0.55
negativo        0.45



In [20]:
# Directory where the row labels of each partition are stored.
INDEX = os.path.join(MODELS_PATH, "index")
os.makedirs(INDEX, exist_ok=True)

# Explicit file-name -> index mapping: avoids eval() on variable names, which
# hides the dependency on these two objects and executes arbitrary strings.
split_indexes = {"X_train_index": X_train_index, "X_test_index": X_test_index}

for file, dataset in split_indexes.items():
    # One label per line, with no header row and no extra index column.
    dataset.to_series().to_csv(os.path.join(INDEX, f"{file}.csv"), header=False, index=False)

## _Encoding_ de variables

In [84]:
# Directory under MODELS_PATH where the fitted vectorizers are saved.
VECTORIZERS = os.path.join(MODELS_PATH, "vectorizers")
# exist_ok=True makes re-running this cell a no-op.
os.makedirs(VECTORIZERS, exist_ok=True)

### Predictoras

#### Etiquetas POS

Uso la clase `TfidfVectorizer` de `scikit-learn` porque me permite hacer el cálculo rápidamente.

La configuración utilizada (`norm="l1"` y `use_idf=False`) permite calcular la frecuencia relativa de cada etiqueta en el documento en cuestión: los conteos de cada documento se normalizan para que sumen 1.

Solo se utilizan las etiquetas de adjetivos, adverbios, nombres y verbos, aunque el cálculo se realiza considerando el total de las etiquetas.

In [81]:
# With IDF disabled and L1 normalisation, each row holds the relative frequency
# of every POS tag in that speech (the row sums to 1).
pos_vectorizer = TfidfVectorizer(
    norm="l1",
    use_idf=False,
    smooth_idf=False,
    sublinear_tf=False,
    # The default token_pattern only keeps tokens of 2+ word characters, so a
    # one-letter tag (e.g. the Universal Dependencies tag "X", if the tagger
    # emits it) would be dropped and left out of the denominator. Accept 1+
    # characters so the frequencies are relative to all tags.
    token_pattern=r"(?u)\b\w+\b",
)
# Fit on the training speeches only.
X = pos_vectorizer.fit_transform(data.loc[X_train_index, "speech_pos"])
# The vectorizer lowercases tokens; restore upper-case tag names and keep only
# the four tags used as features.
pos = (
    pd.DataFrame(
        X.toarray(),
        columns=[tag.upper() for tag in pos_vectorizer.get_feature_names_out()],
    )
    [["ADJ", "ADV", "NOUN", "VERB"]]
)
pos.head()

Unnamed: 0,ADJ,ADV,NOUN,VERB
0,0.108696,0.038043,0.217391,0.168478
1,0.065022,0.067265,0.217489,0.186099
2,0.0,0.0,0.666667,0.0
3,0.094304,0.073763,0.185808,0.167134
4,0.1,0.0,0.4,0.1


In [96]:
# Save the fitted POS vectorizer (compressed) under VECTORIZERS.
pos_vectorizer_path = os.path.join(VECTORIZERS, "pos_vectorizer.pkl")
# Assigning to `_` keeps the return value of joblib.dump out of the cell output.
_ = joblib.dump(pos_vectorizer, pos_vectorizer_path, compress=True)

#### Lemmas

In [None]:
# Directory under VISUALIZATIONS_PATH that holds the word-statistics CSVs read below.
# NOTE(review): this cell has no execution count (`In [None]`) although later cells
# read STATS — re-run the notebook top to bottom so it is defined before use.
STATS = os.path.join(VISUALIZATIONS_PATH, "stats")

##### Proporciones sin _stopwords_ (Zipf)

In [73]:
# Load the per-word proportions table from the STATS directory and preview it.
proportions_csv = os.path.join(STATS, "proporciones_sin_stopwords_zipf.csv")
proportions = pd.read_csv(proportions_csv)
proportions.head()

Unnamed: 0,word,total,diff,pos,neg
0,abajo,0.693147,-3.1e-05,3.9e-05,7e-05
1,abandonada,0.0,3.9e-05,3.9e-05,0.0
2,abandonado,0.0,3.9e-05,3.9e-05,0.0
3,abandonar,0.0,-7e-05,0.0,7e-05
4,abandono,0.0,-7e-05,0.0,7e-05


In [74]:
# Words at both extremes of `diff`: the 250 largest and the 250 smallest values.
# keep="all" retains every row tied at the cutoff, so a list can exceed 250 words.
proportions_pos_words = proportions.nlargest(250, "diff", keep="all")["word"].tolist()
proportions_neg_words = proportions.nsmallest(250, "diff", keep="all")["word"].tolist()

In [75]:
# Number of features: both word lists joined into a single vocabulary.
proportions_words = [*proportions_pos_words, *proportions_neg_words]
len(proportions_words)

505

In [93]:
# Count, for every training speech, the occurrences of each selected word;
# the vocabulary is fixed up front, so the columns follow `proportions_words`.
proportions_vectorizer = CountVectorizer(vocabulary=proportions_words)
X = proportions_vectorizer.fit_transform(data.loc[X_train_index, "speech_lemmas"])
proportions_lemmas = pd.DataFrame(
    X.toarray(), columns=proportions_vectorizer.get_feature_names_out()
)
proportions_lemmas.head()

Unnamed: 0,embarazo,decidir,sociedad,esa,debate,seguir,abortar,decisión,estado,clandestinidad,...,presentación,respaldo,rey,supresión,técnico,ministro,organismo,resto,utilizar,veto
0,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,2,0,0,0,0,10,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# Save the proportions-based vectorizer (compressed) under VECTORIZERS.
proportions_vectorizer_path = os.path.join(VECTORIZERS, "proportions_vectorizer.pkl")
# Assigning to `_` keeps the return value of joblib.dump out of the cell output.
_ = joblib.dump(proportions_vectorizer, proportions_vectorizer_path, compress=True)

##### Ratio de _log-odds_

In [76]:
# Load the per-word log-odds table from the STATS directory and preview it.
log_odds_csv = os.path.join(STATS, "log_odds.csv")
log_odds = pd.read_csv(log_odds_csv)
log_odds.head()

Unnamed: 0,word,total,diff,pos,neg
0,abajo,0.693147,-0.615795,1.9e-05,3.5e-05
1,abandonada,0.0,inf,1.9e-05,0.0
2,abandonado,0.0,inf,1.9e-05,0.0
3,abandonar,0.0,-inf,0.0,3.5e-05
4,abandono,0.0,-inf,0.0,3.5e-05


In [77]:
# (rows, columns) of the full log-odds table, before dropping infinite values.
log_odds.shape

(4781, 5)

In [78]:
# Drop the words whose `diff` is +inf or -inf.
diff_is_infinite = log_odds["diff"].isin([math.inf, -math.inf])
log_odds_not_inf = log_odds[~diff_is_infinite]
log_odds_not_inf.shape

(1881, 5)

In [79]:
# Rank by `diff` and break ties with `total`; take 200 words from each end.
# keep="all" retains every row tied at the cutoff, so a list can exceed 200 words.
ranking_columns = ["diff", "total"]
logodds_pos_words = log_odds_not_inf.nlargest(200, ranking_columns, keep="all")["word"].tolist()
logodds_neg_words = log_odds_not_inf.nsmallest(200, ranking_columns, keep="all")["word"].tolist()

In [80]:
# Number of features: both word lists joined into a single vocabulary.
logodds_words = [*logodds_pos_words, *logodds_neg_words]
len(logodds_words)

480

In [99]:
# Count, for every training speech, the occurrences of each selected word;
# the vocabulary is fixed up front, so the columns follow `logodds_words`.
logodds_vectorizer = CountVectorizer(vocabulary=logodds_words)
X = logodds_vectorizer.fit_transform(data.loc[X_train_index, "speech_lemmas"])
logodds_lemmas = pd.DataFrame(
    X.toarray(), columns=logodds_vectorizer.get_feature_names_out()
)
logodds_lemmas.head()

Unnamed: 0,banca,varón,lucha,maternidad,compañera,siglo,abortar,gestar,criminalización,colectivo,...,tono,tradición,transmisión,tremendamente,vacunación,velocidad,verdadero,vestidura,ángela,énfasis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Save the log-odds-based vectorizer (compressed) under VECTORIZERS.
logodds_vectorizer_path = os.path.join(VECTORIZERS, "logodds_vectorizer.pkl")
# Assigning to `_` keeps the return value of joblib.dump out of the cell output.
_ = joblib.dump(logodds_vectorizer, logodds_vectorizer_path, compress=True)

### Variable _target_

In [102]:
# Learn the vote -> integer mapping, then store the encoded vote in a new
# `target` column.
le = LabelEncoder().fit(data["vote"])
data["target"] = le.transform(data["vote"])

In [103]:
# Show which integer the encoder assigned to each vote category.
for value in data["vote"].unique():
    encoded = le.transform([value])[0]
    print(f"Categoría {value} ---> {encoded}")

Categoría positivo ---> 1
Categoría negativo ---> 0


In [105]:
# Save the fitted label encoder directly under MODELS_PATH (no compression requested here).
le_path = os.path.join(MODELS_PATH, "labelencoder.pkl")
# Assigning to `_` keeps the return value of joblib.dump out of the cell output.
_ = joblib.dump(le, le_path)