# Experiment: Run hyperparameter tuning (HPT) with GridSearchCV to build a Naive Bayes model

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to project code are picked up without a restart.
# NOTE(review): the recorded output of this cell shows the extension failing to
# load (UnicodeDecodeError while reading urllib/parse.py) — confirm the environment.
%load_ext autoreload
%autoreload 2

Failed to read module file 'C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\urllib\parse.py' for module 'urllib.parse': UnicodeDecodeError
Traceback (most recent call last):
  File "c:\Users\manuelalberto.romero\Documents\repos\dslabs\dslab-nlp-pc\.venv\Lib\site-packages\IPython\core\extensions.py", line 62, in load_extension
    return self._load_extension(module_str)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\manuelalberto.romero\Documents\repos\dslabs\dslab-nlp-pc\.venv\Lib\site-packages\IPython\core\extensions.py", line 77, in _load_extension
    mod = import_module(module_str)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2800.0_x64__qbz5n2kfra8p0\Lib\importlib\__init__.py", line 90, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen i

In [2]:
import os
import string
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from src import utils


# Parameters

In [3]:
# Experiment settings
# NOTE(review): RND_SEED and PCT_TEST are not referenced by any cell visible in
# this notebook — confirm they are still needed.
RND_SEED = 123
PCT_TEST = 0.2
K_FOLD = 3  # passed as `cv` to GridSearchCV

EXPERIMENT = "exp01_hpt_nb"  # experiment id; also the name of the output folder

# Folders
path_interim = os.path.join("data", "interim")
path_experiment = os.path.join(path_interim, EXPERIMENT)

# Input file (read from path_interim)
file_train = "train.csv"

# Output file (written to path_experiment)
file_exp = "df_exp_summary.csv"

In [4]:


# Prepare the experiment's output folder through the project helper. The recorded
# output shows it creating the folder; presumably it also empties the folder when
# it already exists (inferred from the helper's name — confirm in src.utils).
utils.create_or_clean_folder(path_experiment)

Creating the folder: data\interim\exp01_hpt_nb


# Load data

In [5]:
# Training set location: <path_interim>/<file_train>
path_data_train = os.path.join(path_interim, file_train)
df_train = pd.read_csv(filepath_or_buffer=path_data_train)

# Preview the first rows (rich display of the cell's last expression)
df_train.head(5)

Unnamed: 0,x_text,y_is_nf
0,Respuestas coherentes e idénticas ante entrada...,0
1,Gestión de usuarios: Todos los administradores...,0
2,Añadir numero de una revista. Para ello debemo...,0
3,Un usuario registrado visualiza la tabla de en...,0
4,Como usuario quiero poder ordenar las listas d...,0


# Build Pipeline

In [13]:
# Helper Cell: Tokenization and stemming in Spanish
import typing
import string


# Spanish stop words, loaded once and kept as a frozenset: the corpus is no longer
# re-read for every document and each membership test is O(1) instead of a list scan.
_STOPWORDS_ES = frozenset(nltk.corpus.stopwords.words('spanish'))


def tokenizer_stemmer_es(text) -> typing.List[str]:
    """Tokenize Spanish text, drop punctuation and stop words, and stem the rest.

    Args:
        text: The document to process, as a single string.

    Returns:
        The Spanish Snowball stems of the tokens that survive filtering, in
        their original order.
    """
    # Cheap to build, so it stays local to the call.
    stemmer = SnowballStemmer("spanish")

    # word_tokenize defaults to the English Punkt model for sentence splitting;
    # the input is Spanish, so request the Spanish model explicitly.
    tokens = word_tokenize(text, language="spanish")

    # `word not in string.punctuation` is a substring test against the ASCII
    # punctuation string (unchanged from the original filter). Stop words are
    # matched case-insensitively.
    clean_words = [
        word
        for word in tokens
        if word not in string.punctuation and word.lower() not in _STOPWORDS_ES
    ]  # list[str]
    return [stemmer.stem(word) for word in clean_words]  # list[str]


# NOTE(review): stopwords_es is not referenced by tokenizer_stemmer_es nor by any
# other cell visible in this notebook — confirm it is still needed.
stopwords_es = nltk.corpus.stopwords.words('spanish')

# Smoke-test the tokenizer on the training row labelled 0
example = df_train.loc[0, "x_text"]
ex_stem = tokenizer_stemmer_es(example)

# The f-string "=" specifier prints the expression text followed by its repr
print(f"{example=}")
print(f"{ex_stem=}")


example='Respuestas coherentes e idénticas ante entradas de audio o texto: Los usuarios tienen la posibilidad de escuchar la respuesta mediante voz, esta ha de ser entendida e idéntica a la respuesta por escrito.'
ex_stem=['respuest', 'coherent', 'ident', 'entrad', 'audi', 'text', 'usuari', 'posibil', 'escuch', 'respuest', 'mediant', 'voz', 'ser', 'entend', 'ident', 'respuest', 'escrit']


In [14]:
# Binary (presence/absence) bag-of-words over stemmed unigrams.
# NOTE(review): scikit-learn runs its preprocessing step (lowercase, strip_accents)
# BEFORE calling a custom tokenizer, so tokenizer_stemmer_es receives lowercased,
# accent-stripped text here; accented entries of the stop-word list presumably
# can no longer match — confirm whether that is intended.
tfbin_unigrams = CountVectorizer(
    strip_accents="ascii",
    lowercase=True,
    tokenizer=tokenizer_stemmer_es,
    # token_pattern is ignored when a tokenizer is supplied; setting it to None
    # silences scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    ngram_range=(1, 1),
    binary=True,
)

# Bernoulli Naive Bayes classifier with default hyperparameters
clf_nbber = BernoulliNB()

# Create the pipeline: feature extraction ('fte') followed by the classifier ('clf').
# The step names are the prefixes used in the hyperparameter grid (e.g. 'fte__min_df').
skl_pl = Pipeline([
    ('fte', tfbin_unigrams),
    ('clf', clf_nbber),
])


# Cross validate the model

Use the GridSearchCV object, even when only a single configuration is evaluated,
in order to keep the experiment scheme easily comparable across experiments.
You could also use other CV methods.

Remember to always use the same number of CV folds and the same CV metric in
every experiment!


In [15]:


# Raw text feature and binary target
X_train, y_train = df_train['x_text'], df_train['y_is_nf']

# Search space over the CountVectorizer step ('fte'): 3 x 3 x 2 = 18 candidates
param_grid = dict(
    fte__max_features=[64, 128, None],
    fte__max_df=[0.95, 0.5, 0.25],
    fte__min_df=[1, 3],
)

grid_search = GridSearchCV(
    estimator=skl_pl,
    param_grid=param_grid,
    scoring='f1',
    cv=K_FOLD,
    n_jobs=-1,
)

# Cross-validate every candidate on the training data, then report the best mean CV score
grid_search.fit(X_train, y_train)
print(f"{grid_search.best_score_=}")



grid_search.best_score_=np.float64(0.7630102355407572)


In [16]:
# One row per evaluated configuration, tagged with the experiment id
df_exp_summary = pd.DataFrame(grid_search.cv_results_).assign(experiment_id=EXPERIMENT)
df_exp_summary

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_fte__max_df,param_fte__max_features,param_fte__min_df,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,experiment_id
0,1.983928,0.036843,0.814413,0.02758,0.95,64.0,1,"{'fte__max_df': 0.95, 'fte__max_features': 64,...",0.692308,0.72,0.72,0.710769,0.013054,14,exp01_hpt_nb
1,1.981797,0.07571,0.783901,0.025374,0.95,64.0,3,"{'fte__max_df': 0.95, 'fte__max_features': 64,...",0.692308,0.72,0.705882,0.706063,0.011306,15,exp01_hpt_nb
2,2.125473,0.02065,0.791282,0.021634,0.95,128.0,1,"{'fte__max_df': 0.95, 'fte__max_features': 128...",0.75,0.769231,0.740741,0.753324,0.011866,6,exp01_hpt_nb
3,1.95817,0.022857,0.795039,0.035407,0.95,128.0,3,"{'fte__max_df': 0.95, 'fte__max_features': 128...",0.734694,0.769231,0.727273,0.743732,0.018283,9,exp01_hpt_nb
4,2.038421,0.007168,0.806612,0.020358,0.95,,1,"{'fte__max_df': 0.95, 'fte__max_features': Non...",0.484848,0.717949,0.615385,0.606061,0.095391,18,exp01_hpt_nb
5,1.85462,0.092638,0.8707,0.047484,0.95,,3,"{'fte__max_df': 0.95, 'fte__max_features': Non...",0.727273,0.769231,0.763636,0.75338,0.018601,5,exp01_hpt_nb
6,1.832264,0.080353,0.863705,0.016454,0.5,64.0,1,"{'fte__max_df': 0.5, 'fte__max_features': 64, ...",0.716981,0.734694,0.734694,0.72879,0.00835,10,exp01_hpt_nb
7,1.859736,0.036732,0.936765,0.063601,0.5,64.0,3,"{'fte__max_df': 0.5, 'fte__max_features': 64, ...",0.716981,0.734694,0.734694,0.72879,0.00835,10,exp01_hpt_nb
8,1.927503,0.078724,0.942633,0.058768,0.5,128.0,1,"{'fte__max_df': 0.5, 'fte__max_features': 128,...",0.75,0.754717,0.784314,0.76301,0.015186,1,exp01_hpt_nb
9,1.999015,0.037303,0.862761,0.032204,0.5,128.0,3,"{'fte__max_df': 0.5, 'fte__max_features': 128,...",0.75,0.754717,0.777778,0.760832,0.012137,3,exp01_hpt_nb


# Diagnose the model

In [17]:
# Inspect the document-term matrix (DTM) built by the selected pipeline.
# best_estimator_ is the pipeline refit by GridSearchCV with the best-scoring
# configuration of the grid.
skl_pl_fitted = grid_search.best_estimator_

# Fitted CountVectorizer step of that pipeline
skl_pl_fte = skl_pl_fitted.named_steps["fte"]

# transform() yields the DTM; its column count is the size of the fitted vocabulary
dtm_train = skl_pl_fte.transform(X_train)
print(f"{dtm_train.shape=}")

dtm_train.shape=(311, 128)


In [18]:
# Predict on the training data with the refit pipeline and score with F1
y_hats_train = skl_pl_fitted.predict(X_train)
f1_score_train = f1_score(y_train, y_hats_train)

# Compare against the CV metric: a large gap would hint at overfitting
print(f"{f1_score_train=}")

f1_score_train=0.8098159509202454


# Write Experiments Results

In [19]:
# Persist the CV summary as CSV (without the DataFrame index) in the experiment folder
df_exp_summary.to_csv(os.path.join(path_experiment, file_exp), index=False)

# Other experiment results and artifacts that may be useful can be saved here as well