# Experiment: Run HPT with GridSearch to build a Naive Bayes model

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import string
import pandas as pd

from sklearn.feature_extraction.text import ...
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ...
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from src import utils


# Parameters

In [None]:
# Identifier of this experiment; it names the output folder and tags the
# results table.
EXPERIMENT = "exp01_hpt_nb"

# Project-wide settings. K_FOLD is the number of cross-validation folds;
# RND_SEED and PCT_TEST are not used by the cells shown in this notebook.
K_FOLD = 3
PCT_TEST = 0.2
RND_SEED = 123

# File names: training data (input) and experiment summary (output).
file_train = "train.csv"
file_exp = "df_exp_summary.csv"

# Folders: interim data lives in data/interim; this experiment writes to
# data/interim/<EXPERIMENT>.
path_interim = os.path.join("data", "interim")
path_experiment = os.path.join(path_interim, EXPERIMENT)

In [None]:


# Prepare the experiment output folder before anything is written to it.
# NOTE(review): presumably creates the folder, or empties it if it already
# exists -- confirm against src.utils.create_or_clean_folder.
utils.create_or_clean_folder(path_experiment)

# Load data

In [None]:
# Full path of the training CSV inside the interim data folder.
path_data_train = os.path.join(path_interim, file_train)

# TODO(exercise): complete the pandas call that reads `path_data_train`.
df_train = pd....
df_train.head()

# Build Pipeline

In [None]:
# Helper Cell: Tokenization and stemming in Spanish
import typing
import string


# Built once, at definition time. The tokenizer is called once per document
# (and again for every cross-validation fit), so reloading the stopword corpus
# and rebuilding the stemmer on each call is wasted work. Frozensets give O(1)
# membership tests instead of scanning a list.
_STOPWORDS_ES = frozenset(nltk.corpus.stopwords.words('spanish'))
_STEMMER_ES = SnowballStemmer("spanish")
_PUNCTUATION = frozenset(string.punctuation)


def tokenizer_stemmer_es(text) -> typing.List[str]:
    """Tokenize Spanish text, drop punctuation and stopwords, stem the rest.

    Args:
        text: Raw document to tokenize.

    Returns:
        The Spanish Snowball stems of the remaining tokens, in document order.
    """
    return [
        _STEMMER_ES.stem(word)
        for word in word_tokenize(text)
        # Drop tokens made up only of punctuation characters. A substring
        # test (`word in string.punctuation`) would let multi-character
        # tokens such as "..." or "``" through as features.
        if not _PUNCTUATION.issuperset(word)
        # Stopwords are matched case-insensitively.
        and word.lower() not in _STOPWORDS_ES
    ]


# Spanish stopword list from NLTK. Not referenced again in the cells shown
# in this notebook.
stopwords_es = nltk.corpus.stopwords.words('spanish')

# Sanity check: run the tokenizer on the "x_text" value of the row labelled 0
# and print the raw text next to its stems.
example = df_train.loc[0, "x_text"]
ex_stem = tokenizer_stemmer_es(example)

print(f"{example=}")
print(f"{ex_stem=}")


In [None]:
# Choose appropriate instances of XXXVectorizer and BernoulliXXX
# TODO(exercise): replace the XXXVectorizer / BernoulliXXX placeholders with
# classes from sklearn.feature_extraction.text and sklearn.naive_bayes.
# NOTE(review): scikit-learn text vectorizers apply `lowercase` and
# `strip_accents` before calling a custom `tokenizer`, so accented Spanish
# stopwords may no longer match inside tokenizer_stemmer_es -- confirm.
tfbin_unigrams = XXXVectorizer(
    strip_accents="ascii",
    lowercase=True,
    tokenizer=tokenizer_stemmer_es,
    ngram_range=(1, 1),  # unigrams only
    binary=True,  # presumably presence/absence instead of counts -- confirm for the chosen class
)


clf_nbber = BernoulliXXX()

# Create the pipeline
# The step name 'fte' is the prefix used by the `fte__*` keys of param_grid.
skl_pl = Pipeline([
    ('fte', tfbin_unigrams),
    ('clf', clf_nbber)
])


# Cross validate the model

Use the GridSearchCV object with only a single configuration,
so that the experiment scheme stays easily comparable across experiments.
You could also use other CV methods.

Remember to always use the same number of CV folds and the same CV metric in
every experiment!


In [None]:
# TODO(exercise): select the input texts and the target labels from df_train.
# NOTE(review): the text column is presumably "x_text" (used in the tokenizer
# example); the label column is not visible here -- confirm.
X_train = ...
y_train = ...

# Change at will
param_grid = {
    # pipelinestep__parameter: [list of parameter values]
    # Check the documentation to see which parameters are worth trying
    # and which values they expect
    'fte__max_features': ...,
    'fte__max_df': ...,
    'fte__min_df': ...,
}

# NOTE(review): scoring='f1' is scikit-learn's binary F1 scorer; this assumes
# y_train holds binary labels -- confirm.
grid_search = GridSearchCV(
    skl_pl,
    param_grid,
    cv=K_FOLD,  # maintain the same number of folds across the project
    scoring='f1',  # maintain the same scoring function (cv metric) across the project
    n_jobs=-1
    )

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)
print(f"{grid_search.best_score_=}")

In [None]:
# Turn the grid-search CV results into a table and tag every row with the
# experiment identifier.
df_exp_summary = pd.DataFrame(grid_search.cv_results_).assign(
    experiment_id=EXPERIMENT
)
df_exp_summary

# Diagnose the model

In [None]:
# Check DTM (document-term matrix) dimensions
skl_pl_fitted = grid_search.best_estimator_  
# Only one model is fit, as only one HPT configuration is passed

# Access the vectorizer step ('fte') of the fitted pipeline
skl_pl_fte = skl_pl_fitted.named_steps['fte']

# Get DTM with transform()
# TODO(exercise): transform the training texts with `skl_pl_fte`.
dtm_train = ...
print(f"{dtm_train.shape=}")  # columns: Number of terms in the vocabulary

In [None]:
# Check training predictions and scoring
# (F1 on the training data, to be compared with the cross-validated score)

# TODO(exercise): predict on the training data with the fitted pipeline.
y_hats_train = ...  # get preds with predict()
# TODO(exercise): pass the true training labels and the predictions above.
f1_score_train = f1_score(
    y_true=...,
    y_pred=...
)

print(f"{f1_score_train=}")  # Is it comparable to the CV metric?

# Write Experiment Results

In [None]:
# Save the experiment summary as a CSV file inside the experiment folder,
# without writing the DataFrame index as a column.
df_exp_summary.to_csv(
    os.path.join(path_experiment, file_exp),
    index=False
)

# Other experiment results and artifacts may be useful too