In [31]:
import ast
import os
import re
from string import punctuation

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# The project root is taken to be the parent of the current working
# directory; the dataset lives under <project root>/data.
here = os.getcwd()
project_path = os.path.split(here)[0]
data_path = os.path.join(
    project_path,
    "data",
    "session_speech.csv",
)

In [12]:
# Load the dataset, parsing each "speech" cell from its textual Python-literal
# form back into an object. ast.literal_eval only accepts literals (lists,
# strings, numbers, ...), whereas the built-in eval would execute any
# expression found in the CSV file.
data = pd.read_csv(data_path, converters={"speech": ast.literal_eval})
data.columns

Index(['name', 'vote', 'speech'], dtype='object')

In [13]:
# One row per speech fragment; rows without a speech and rows whose vote is
# "abstención" or "ausente" are discarded.
data = (
    data
    .explode(column="speech")
    .loc[
        lambda df: df["speech"].notna()
        & ~df["vote"].isin(["abstención", "ausente"])
    ]
)
data.shape

(199, 3)

In [18]:
# Relative frequency of each vote label.
data["vote"].value_counts(normalize=True)

vote
positivo    0.557789
negativo    0.442211
Name: proportion, dtype: float64

In [36]:
# Features are the speech texts; the target is the vote label encoded as
# integers by a LabelEncoder.
X = data["speech"]

le = LabelEncoder()
y = le.fit_transform(data["vote"])

In [37]:
# Separator characters: ASCII punctuation plus a few typographic marks.
# `punctuation` is passed through re.escape so that none of its characters
# (notably the backslash followed by "]") is interpreted as regex syntax
# inside the character class.
_PUNCTUATION_RE = re.compile(f"[{re.escape(punctuation)}“”¿¡−…]")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess(text: str) -> str:
    """Normalize a raw text before tokenization.

    The text is lowercased, every punctuation character and every run of
    digits is replaced by a space, and whitespace runs are collapsed into a
    single space with leading/trailing whitespace removed.

    Args:
        text: Raw input text.

    Returns:
        The normalized text; an empty string if nothing but punctuation,
        digits or whitespace was present.
    """
    text = text.lower()
    text = _PUNCTUATION_RE.sub(" ", text)
    text = _DIGITS_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

In [38]:
# TF-IDF features feeding a multinomial naive Bayes classifier.
nb_pipeline = Pipeline([
    (
        'tfidf',
        # `lowercase` is not set here: supplying a custom `preprocessor`
        # overrides the vectorizer's own lowercasing stage, and `preprocess`
        # already lowercases the text.
        TfidfVectorizer(preprocessor=preprocess)
    ),
    (
        'clf',
        MultinomialNB()
    )])

# Hyper-parameter grid. Each key must appear exactly once: a repeated key in a
# dict literal silently keeps only its last value, so a second
# "tfidf__min_df" entry would discard the first without any error.
nb_parameters = {
    "tfidf__min_df": [0.05, 0.1],
    "tfidf__norm": ["l1", "l2"],
    "tfidf__smooth_idf": [False, True],
    "tfidf__sublinear_tf": [False, True],
    "clf__alpha": [0.01, 0.1, 1.0]
}

# Exhaustive search over the grid, scored by macro-averaged F1 with 5-fold
# cross-validation; training scores are kept alongside the test scores.
nb_gridsearch = GridSearchCV(
    nb_pipeline,
    nb_parameters,
    scoring='f1_macro',
    cv=5,
    return_train_score=True,
    n_jobs=-1
)
nb_gridsearch

In [39]:
# Fit the grid search on the speech texts and their vote labels, both passed
# as plain Python lists.
nb_gridsearch.fit(data["speech"].tolist(), data["vote"].tolist())

In [102]:
# Build a tidy table of the cross-validation results: the per-parameter
# columns are dropped, each parameter combination becomes a readable
# "name=value-name=value" index label, rows are ordered by test rank and
# columns alphabetically.
nb_gridsearch_cv_results = pd.DataFrame(nb_gridsearch.cv_results_)
param_cols = nb_gridsearch_cv_results.filter(regex="param_").columns.tolist()
nb_gridsearch_cv_results = (
    nb_gridsearch_cv_results
    .drop(columns=param_cols)
    .assign(
        params=lambda df: df["params"].apply(
            lambda combo: "-".join(
                [f"{name}={value}" for name, value in combo.items()]
            )
        )
    )
    .set_index("params")
    .sort_values(by="rank_test_score")
    .pipe(lambda df: df[df.columns.sort_values().tolist()])
)

In [None]:
# Builds a TF-IDF + logistic regression pipeline, fits it, and extracts the
# fitted vocabulary and the first row of the classifier's coefficients.
#
# NOTE(review): this cell has no execution count and uses names that are not
# defined or imported in the visible part of this notebook: clean_text,
# tokenizer, best_parameters_log, LogisticRegression, X_train_text, y_train.
# Confirm they are defined elsewhere, otherwise the cell fails on a fresh
# kernel.
# NOTE(review): stop_words="english" is used here, while the vote labels shown
# earlier in the notebook are Spanish — confirm an English stop-word list is
# intended for this corpus.
pipeline_FI = Pipeline([('tfidf', TfidfVectorizer(preprocessor=clean_text, 
                                               tokenizer=tokenizer, 
                                               stop_words="english", 
               
                                               ngram_range = (1,1),
                                               min_df      = best_parameters_log["tfidf__min_df"])),
                     ('clf', LogisticRegression(random_state=0, 
                                                multi_class='ovr',
                                                solver = 'lbfgs',
                                                penalty = best_parameters_log["clf__penalty"]))])

# Fit the whole pipeline; `m` is the fitted pipeline itself.
m = pipeline_FI.fit(X_train_text, y_train)

# m[0] is the fitted vectorizer step and m[1] the fitted classifier step.
# NOTE(review): per the scikit-learn docs, vocabulary_ is a term -> column
# index mapping, so its iteration order is presumably not aligned with the
# coefficient array — verify before pairing features with weights.
features = m[0].vocabulary_
weights  = m[1].coef_[0]