# Hyperparameter Tuning

The objective of this notebook is to fine-tune the hyperparameters of the best configuration identified during the model induction phase. To do this, the main hyperparameters of the TF-IDF transformer and of the SVM classifier are tuned with a grid search. Performance is evaluated using hierarchical classification metrics.<br>

**Source file:** select_202425091103-translated.csv<br>
**Destination file:** select_202425091103-TFIDF LCPN SVM.pickle<br>  

In [None]:
import logging
# Configure the root logger so that only WARNING and more severe records are emitted.
logging.basicConfig(level=logging.WARNING)

In [None]:
# Names of the four label columns, ordered from level 1 to level 4.
cnpq = [f'cnpq_area_level_{level}' for level in range(1, 5)]

## Reading the dataset

In [None]:
import pandas as pd

In [None]:
# Load the translated dataset. Every column is read as a string (dtype=str) and
# NA detection is disabled (na_filter=False), so empty cells stay as '' instead of NaN.
df = pd.read_csv('../preprocessed/select_202425091103-translated.csv', dtype=str, na_filter=False)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded dataset.
df.shape

## Loading the Transformation Techniques

In [None]:
# %load ../src/embedding.py
import re
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer

class Normalizer(BaseEstimator, TransformerMixin):
    """Stateless text cleaner: strips noise, lower-cases, tokenizes and stems.

    ``fit`` learns nothing; ``transform`` maps each input document to a single
    string of space-separated Porter stems.
    """

    # Compiled once for the whole class instead of once per document.
    # Removes digit runs, tokens starting with "http" and <...> tag-like spans.
    _NOISE_PATTERN = re.compile(r'\d+|http\S+|<.*?>', re.IGNORECASE)

    def __init__(self):
        self.stemmer = PorterStemmer()

    def pre_processing(self, doc):
        """Return ``doc`` with the noise pattern removed, converted to lower case."""
        return self._NOISE_PATTERN.sub('', doc).lower()

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the object can be used in a pipeline."""
        return self

    def transform(self, X):
        """Return a list with one cleaned, stemmed string per document in ``X``."""
        stem = self.stemmer.stem  # bind once; called for every token
        return [
            ' '.join(stem(word) for word in word_tokenize(self.pre_processing(doc)))
            for doc in X
        ]

class Lazy(BaseEstimator, TransformerMixin):
    """Wrapper around a text vectorizer that returns dense arrays.

    Parameters
    ----------
    vectorizer : callable
        Vectorizer class (or factory) called with the four keyword arguments
        below to build the wrapped vectorizer instance.
    ngram_range, max_df, min_df, max_features
        Forwarded unchanged to the wrapped vectorizer.

    The hyperparameters are also stored as attributes of the wrapper. Tools
    that tune an estimator through ``set_params`` only update those
    attributes, so the wrapped vectorizer is re-synchronized with them right
    before every fit; otherwise it would silently keep the values it was
    constructed with.
    """

    def __init__(self, vectorizer, ngram_range, max_df, min_df, max_features):
        self.vectorizer = vectorizer(ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features)
        self.ngram_range = ngram_range
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features

    def _sync_vectorizer(self):
        """Copy the wrapper's current hyperparameters onto the wrapped vectorizer."""
        self.vectorizer.set_params(
            ngram_range=self.ngram_range,
            max_df=self.max_df,
            min_df=self.min_df,
            max_features=self.max_features,
        )
        return self.vectorizer

    def fit(self, raw_documents, y=None):
        """Fit the wrapped vectorizer on ``raw_documents`` and return ``self``."""
        self._sync_vectorizer().fit(raw_documents, y)
        return self

    def transform(self, raw_documents):
        """Vectorize ``raw_documents`` and return the result as a dense array."""
        return self.vectorizer.transform(raw_documents).toarray()

    def fit_transform(self, raw_documents, y=None):
        """Fit on ``raw_documents`` and return their dense vectorization."""
        return self._sync_vectorizer().fit_transform(raw_documents, y).toarray()

class BoW(Lazy):
    """``Lazy`` vectorizer backed by ``CountVectorizer`` (bag-of-words counts)."""

    def __init__(self, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=9000):
        # Parameters stay explicit in the signature (no **kwargs) so that
        # get_params/set_params can discover them by introspection.
        options = dict(
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
        )
        super().__init__(CountVectorizer, **options)

class TFIDF(Lazy):
    """``Lazy`` vectorizer backed by ``TfidfVectorizer`` (TF-IDF weights)."""

    def __init__(self, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=9000):
        # Parameters stay explicit in the signature (no **kwargs) so that
        # get_params/set_params can discover them by introspection.
        options = dict(
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
        )
        super().__init__(TfidfVectorizer, **options)

class Embedding(BaseEstimator, TransformerMixin):
    """Transformer that maps texts to sentence embeddings.

    Parameters
    ----------
    model_name : str
        Name of the model directory under ``../models/``.

    NOTE(review): the model is loaded eagerly in ``__init__``; changing
    ``model_name`` afterwards (e.g. through ``set_params``) does not reload it.
    """

    def __init__(self, model_name):
        self.model_name = model_name
        self.model = SentenceTransformer('../models/' + model_name)

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Return an array with one embedding row per text in ``X``.

        All texts are handed to ``encode`` in a single call so the model can
        batch them internally, instead of running one forward pass per text.
        """
        return np.asarray(self.model.encode(list(X)))

class RoBERTa(Embedding):
    """``Embedding`` preset whose default model is ``all-distilroberta-v1``."""

    def __init__(self, model_name='all-distilroberta-v1'):
        # Passed positionally; Embedding takes the model name as its only argument.
        super().__init__(model_name)

class USE(Embedding):
    """``Embedding`` preset whose default model is ``distiluse-base-multilingual-cased-v1``."""

    def __init__(self, model_name='distiluse-base-multilingual-cased-v1'):
        # Passed positionally; Embedding takes the model name as its only argument.
        super().__init__(model_name)

## Loading the SVM wrapper

In [None]:
# %load ../src/svm.py
from sklearn.svm import SVC

class SVM(SVC):
    """``SVC`` restricted to three constructor parameters.

    Only ``C``, ``kernel`` and ``probability`` are exposed; probability
    estimates are enabled by default in this wrapper.
    """

    def __init__(self, C=1.0, kernel='rbf', probability=True):
        super().__init__(
            C=C,
            kernel=kernel,
            probability=probability,
        )

## Defining the hyperparameters

In [None]:
from hiclass import LocalClassifierPerNode
from hiclass.metrics import f1
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split

In [None]:
# Three-stage pipeline: text normalization -> TF-IDF features -> one local SVM
# classifier per node of the label hierarchy.
pipeline = Pipeline(
    steps=[
        ('', Normalizer()),  # this step is registered under an empty name
        ('TFIDF', TFIDF()),
        ('LCPN', LocalClassifierPerNode(SVM())),
    ]
)

In [None]:
# Search grid. Keys follow the "<pipeline step>__<parameter>" convention, so
# they are valid identifiers and can be written as keyword arguments.
parameters = dict(
    # TF-IDF vectorizer settings
    TFIDF__max_df=[0.75, 1.0],
    TFIDF__min_df=[1, 3],
    TFIDF__max_features=[5000, 7000, 9000],
    TFIDF__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # Settings of the SVM used as local classifier
    LCPN__local_classifier__C=[0.1, 1, 10],
    LCPN__local_classifier__kernel=['linear', 'rbf'],
)

## Splitting training and test sets

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
# Inputs come from the 'all' column, targets from the four cnpq level columns.
X_train, X_test, y_train, y_test = train_test_split(
    df['all'].to_numpy(),
    df[cnpq].to_numpy(),
    random_state=42,
    test_size=0.30,
)

## Running the Tuning

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation, scored with
# the hiclass f1 metric and run on 20 parallel jobs.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=make_scorer(f1),
    cv=5,
    n_jobs=20,
    verbose=1,
)

In [None]:
# Run the grid search on the training split only; the test split is not used here.
grid_search.fit(X_train, y_train)

In [None]:
# Report the best parameter combination and its cross-validation score.
# (The printed labels are Portuguese for "Best parameters" and "Best cross-validation score".)
print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor cross-validation score:", grid_search.best_score_)