# Experiment: Run HPT with GridSearch to build a GBT model

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import string
import pandas as pd
from sklearn.feature_extraction.text import XXXVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from src import utils


# Parameters

In [3]:
RND_SEED = 123
PCT_TEST = 0.2
K_FOLD = 3

EXPERIMENT = "exp02_hpt_gbt"

# Paths
path_interim = os.path.join("data", "interim")
path_experiment =  os.path.join(path_interim, EXPERIMENT)

# Input
file_train = "train.csv"


# Output
file_exp = "df_exp_summary.csv"

In [4]:


utils.create_or_clean_folder(path_experiment)



Creating the folder: data\interim\exp02_hpt_gbt


# Load data

In [None]:
path_data_train = os.path.join(path_interim, file_train)

df_train = pd....
df_train.head()

Unnamed: 0,x_text,y_is_nf
0,Respuestas coherentes e idénticas ante entrada...,0
1,Gestión de usuarios: Todos los administradores...,0
2,Añadir numero de una revista. Para ello debemo...,0
3,Un usuario registrado visualiza la tabla de en...,0
4,Como usuario quiero poder ordenar las listas d...,0


# Build Pipeline

In [6]:
# Helper Cell: Tokenization and stemming in Spanish
import typing

class SpanishStemTokenizer:
    def __init__(self):
        self.stemmer = SnowballStemmer("spanish")

    def __call__(self, text) -> typing.List[str]:
        return [self.stemmer.stem(word) for word in word_tokenize(text) if word not in string.punctuation]

# Do not forget to preprocesss stopwords
tokenizer_es = SpanishStemTokenizer()
stopwords_es = nltk.corpus.stopwords.words('spanish')

stopwords_es_tok = list(set([tokenizer_es(term.lower())[0] for term in stopwords_es]))

example = df_train.loc[0, "x_text"]
ex_stem = tokenizer_es(example)

print(f"{example=}")
print(f"{ex_stem=}")


example='Respuestas coherentes e idénticas ante entradas de audio o texto: Los usuarios tienen la posibilidad de escuchar la respuesta mediante voz, esta ha de ser entendida e idéntica a la respuesta por escrito.'
ex_stem=['respuest', 'coherent', 'e', 'ident', 'ante', 'entrad', 'de', 'audi', 'o', 'text', 'los', 'usuari', 'tien', 'la', 'posibil', 'de', 'escuch', 'la', 'respuest', 'mediant', 'voz', 'esta', 'ha', 'de', 'ser', 'entend', 'e', 'ident', 'a', 'la', 'respuest', 'por', 'escrit']


In [None]:
tfidf_unigrams = XXXVectorizer(
    strip_accents="ascii",
    lowercase=True,
    tokenizer=SpanishStemTokenizer(),
    stop_words=stopwords_es_tok,
    analyzer="word",
    ngram_range=(1, 1),
)


clf = GradientBoostingClassifier(
    n_estimators=2000,  # Many boosting rounds  so early stoping takes place
    validation_fraction=0.2,  # Early stopping
    random_state=RND_SEED)

# Create the pipeline
skl_pl = Pipeline([
    ('fte', tfidf_unigrams),
    ('clf', clf)
])


# GridSearch

GridSearchCV will run a set of Cross Validation experiments for you.
It will run for every combination of hiperparameters in the `param_grid`
and run a Cross Validation job for each.


Remember to use always the same number of CV Folds and the same CV metric on 
every experiment!


In [8]:
X_train = df_train['x_text']
y_train = df_train['y_is_nf']


param_grid = {
    'fte__max_features': [64, 128, None],
    'fte__max_df': [0.95, 0.5, 0.25],
    'fte__min_df': [1, 3],
    'clf__max_depth': [3, 5]
}

grid_search = GridSearchCV(
    skl_pl,
    param_grid,
    cv=K_FOLD,
    scoring='f1',
    n_jobs=-1
    )

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)
print(f"{grid_search.best_score_=}")



grid_search.best_score_=np.float64(0.7221633085896076)


In [9]:
df_exp_summary = pd.DataFrame(
    grid_search.cv_results_
)

df_exp_summary["experiment_id"] = EXPERIMENT
df_exp_summary.sort_values(ascending=True, by="rank_test_score").head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_fte__max_df,param_fte__max_features,param_fte__min_df,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,experiment_id
14,8.453087,0.582729,0.370237,0.038653,3,0.25,128,1,"{'clf__max_depth': 3, 'fte__max_df': 0.25, 'ft...",0.731707,0.73913,0.695652,0.722163,0.01899,1,exp02_hpt_gbt
8,7.789417,0.520585,0.269382,0.024915,3,0.5,128,1,"{'clf__max_depth': 3, 'fte__max_df': 0.5, 'fte...",0.731707,0.73913,0.695652,0.722163,0.01899,1,exp02_hpt_gbt
9,7.747724,0.495518,0.276371,0.011045,3,0.5,128,3,"{'clf__max_depth': 3, 'fte__max_df': 0.5, 'fte...",0.731707,0.73913,0.680851,0.71723,0.025901,3,exp02_hpt_gbt
15,9.068501,0.342614,0.369439,0.050795,3,0.25,128,3,"{'clf__max_depth': 3, 'fte__max_df': 0.25, 'ft...",0.731707,0.73913,0.680851,0.71723,0.025901,3,exp02_hpt_gbt
3,8.000706,0.553049,0.268279,0.014653,3,0.95,128,3,"{'clf__max_depth': 3, 'fte__max_df': 0.95, 'ft...",0.7,0.723404,0.708333,0.710579,0.009686,5,exp02_hpt_gbt


# Diagnose the model

In [10]:
# Check DTM dimensions
skl_pl_fitted = grid_search.best_estimator_  

# Access the Vectorizer part of the pipeline
skl_pl_fte = skl_pl_fitted.named_steps['fte']

# Get DTM with transform()
dtm_train = skl_pl_fte.transform(X_train)
print(f"{dtm_train.shape=}")  # columns: Number of terms in the vocabulary

dtm_train.shape=(311, 128)


In [11]:
# Check training predictions and scoring

y_hats_train = skl_pl_fitted.predict(X_train)
f1_score_train = f1_score(
    y_true=y_train,
    y_pred=y_hats_train
)

print(f"{f1_score_train=}")  # Is comparable to CV metric?

f1_score_train=1.0


# Write Experiments Results

In [12]:
df_exp_summary.to_csv(
    os.path.join(path_experiment, file_exp),
    index=False
)

# other experiments results and artifacts maybe useful