# Búsqueda de hiperparámetros con SetFit

Instalamos e importamos las dependencias

In [None]:
!pip install setfit
!pip install setfit[optuna]

In [None]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from optuna.visualization.matplotlib import plot_param_importances

Cargamos el conjunto de datos de entrenamiento y validación

In [4]:
# Read the train/validation CSV files into one DatasetDict.
data_files = {
    "train": "train.csv",
    "validation": "validation.csv",
}
dataset = load_dataset("csv", data_files=data_files)

# Show the available splits, their columns and row counts.
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dacc9e1b9b6e8fc3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dacc9e1b9b6e8fc3/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither'],
        num_rows: 117
    })
    validation: Dataset({
        features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither'],
        num_rows: 51
    })
})

In [5]:
# Target classes: every training column except the text itself and the
# three columns that are left out of this experiment.
labels = [
    column
    for column in dataset["train"].features
    if column not in ("text", "obligation", "right", "neither")
]
# Index <-> name lookup tables, following the order of `labels`.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}
labels

['service',
 'metric',
 'objective',
 'remedy',
 'claim',
 'exception',
 'definition']

In [6]:
def encode_labels(record):
  """Gather the per-class columns of one row into a single "labels" list.

  Values are read from `record` in the order given by the module-level
  `labels` list.
  """
  values = []
  for name in labels:
    values.append(record[name])
  return {"labels": values}

# Apply encode_labels to every split; `dataset` is rebound to the mapped result.
dataset = dataset.map(encode_labels)

  0%|          | 0/117 [00:00<?, ?ex/s]

  0%|          | 0/51 [00:00<?, ?ex/s]

In [7]:
# Training split of the mapped dataset.
train_ds = dataset["train"]
train_ds

Dataset({
    features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither', 'labels'],
    num_rows: 117
})

In [8]:
# Validation split of the mapped dataset, used as the trainer's evaluation set.
eval_ds = dataset["validation"]
eval_ds

Dataset({
    features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither', 'labels'],
    num_rows: 51
})

Seleccionamos el modelo a entrenar con el framework SetFit

In [15]:
# Identifier of the pretrained checkpoint passed to SetFitModel.from_pretrained.
model_id = "sentence-transformers/all-MiniLM-L6-v2"

In [16]:
# Model initialisation function

def make_model(params=None):
  """Build a fresh SetFitModel for a training run or a search trial.

  Args:
    params: Optional dict of hyperparameters. Only the
      "multi_target_strategy" key is read; when `params` is None, empty,
      or lacks that key, "one-vs-rest" is used.

  Returns:
    The object returned by `SetFitModel.from_pretrained` for `model_id`.
  """
  # `.get` with a default keeps this usable with a search space that does
  # not tune the strategy (plain indexing would raise KeyError there).
  multi_target_strategy = (params or {}).get("multi_target_strategy", "one-vs-rest")
  return SetFitModel.from_pretrained(
      model_id, multi_target_strategy=multi_target_strategy
  )

Fine-tuning con multi-label SetFitModel empleando la estrategia one-vs-rest

In [21]:
# Trainer that obtains its model through `make_model` rather than a prebuilt instance.
trainer = SetFitTrainer(
    model_init=make_model,
    loss_class=CosineSimilarityLoss,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    num_iterations=80,
    num_epochs=3,
    # Dataset column name -> name the trainer is told to use for it.
    column_mapping={"text": "text", "labels": "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Realizamos la búsqueda de hiperparámetros estableciendo rangos u opciones para probar el entrenamiento del modelo

In [22]:
def hyperparameter_search_function(trial):
  """Define the search space sampled for one trial.

  Args:
    trial: Object exposing `suggest_float` and `suggest_categorical`
      (presumably an Optuna trial when invoked by the trainer — TODO confirm).

  Returns:
    Dict with the sampled "learning_rate", "batch_size" and
    "multi_target_strategy" values.
  """
  # Sampled in the same order as the keys of the returned dict.
  learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
  batch_size = trial.suggest_categorical("batch_size", [4, 8, 16, 32])
  strategy = trial.suggest_categorical(
      "multi_target_strategy",
      ["one-vs-rest", "multi-output", "classifier-chain"],
  )
  return {
      "learning_rate": learning_rate,
      "batch_size": batch_size,
      "multi_target_strategy": strategy,
  }

La función hyperparameter_search realizará 10 intentos con diferentes combinaciones de los hiperparámetros dados

In [None]:
# Run 10 search trials over the space defined by hyperparameter_search_function.
# NOTE: in the recorded run each completed trial took roughly 25-50 minutes
# (see the timestamps in the log output of this cell).
best = trainer.hyperparameter_search(hyperparameter_search_function, n_trials=10)

[32m[I 2023-02-05 15:50:09,490][0m A new study created in memory with name: no-name-00320c19-9467-4389-917d-b82e4729cb46[0m
Trial: {'learning_rate': 0.0002955532316677312, 'batch_size': 4, 'multi_target_strategy': 'classifier-chain'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 18480
  Total train batch size = 4


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[32m[I 2023-02-05 16:40:05,497][0m Trial 0 finished with value: 0.19607843137254902 and parameters: {'learning_rate': 0.0002955532316677312, 'batch_size': 4, 'multi_target_strategy': 'classifier-chain'}. Best is trial 0 with value: 0.19607843137254902.[0m
Trial: {'learning_rate': 0.000783884335275157, 'batch_size': 32, 'multi_target_strategy': 'classifier-chain'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 2310
  Total train batch size = 32


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[32m[I 2023-02-05 17:09:30,067][0m Trial 1 finished with value: 0.19607843137254902 and parameters: {'learning_rate': 0.000783884335275157, 'batch_size': 32, 'multi_target_strategy': 'classifier-chain'}. Best is trial 0 with value: 0.19607843137254902.[0m
Trial: {'learning_rate': 6.29879548797332e-05, 'batch_size': 32, 'multi_target_strategy': 'one-vs-rest'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 2310
  Total train batch size = 32


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[32m[I 2023-02-05 17:38:53,017][0m Trial 2 finished with value: 0.47058823529411764 and parameters: {'learning_rate': 6.29879548797332e-05, 'batch_size': 32, 'multi_target_strategy': 'one-vs-rest'}. Best is trial 2 with value: 0.47058823529411764.[0m
Trial: {'learning_rate': 0.0009094581711289811, 'batch_size': 4, 'multi_target_strategy': 'classifier-chain'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 18480
  Total train batch size = 4


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Iteration:   0%|          | 0/18480 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[32m[I 2023-02-05 18:26:50,960][0m Trial 3 finished with value: 0.19607843137254902 and parameters: {'learning_rate': 0.0009094581711289811, 'batch_size': 4, 'multi_target_strategy': 'classifier-chain'}. Best is trial 2 with value: 0.47058823529411764.[0m
Trial: {'learning_rate': 1.91071342626869e-05, 'batch_size': 16, 'multi_target_strategy': 'one-vs-rest'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 4620
  Total train batch size = 16


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4620 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****
[32m[I 2023-02-05 18:52:25,140][0m Trial 4 finished with value: 0.5294117647058824 and parameters: {'learning_rate': 1.91071342626869e-05, 'batch_size': 16, 'multi_target_strategy': 'one-vs-rest'}. Best is trial 4 with value: 0.5294117647058824.[0m
Trial: {'learning_rate': 0.000682196666261844, 'batch_size': 32, 'multi_target_strategy': 'one-vs-rest'}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24640
  Num epochs = 3
  Total optimization steps = 2310
  Total train batch size = 32


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2310 [00:00<?, ?it/s]

Después de aproximadamente 3 horas, se superó el límite de uso de GPU (rate limit) del entorno de ejecución que estábamos utilizando y la ejecución se detuvo tras completar 5 intentos de entrenamiento. Es decir, para terminar los 10 intentos habríamos necesitado unas 6 horas.

Una vez hubiera terminado esta ejecución, podríamos ver la mejor combinación obtenida imprimiendo la variable best

In [None]:
# Display the result returned by the hyperparameter search.
best

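Podríamos mostrar gráficamente la importancia de cada hiperparámetro con `plot_param_importances`. La documentación enlazada describe la versión Plotly (https://optuna.readthedocs.io/en/stable/reference/visualization/generated/optuna.visualization.plot_param_importances.html); en este notebook usamos la variante de `optuna.visualization.matplotlib`.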

In [None]:
# Plot hyperparameter importances from the object stored in `best.backend`
# (presumably the underlying Optuna study — TODO confirm).
plot_param_importances(best.backend)

Y por último aplicar los mejores hiperparámetros obtenidos como entrenamiento óptimo del modelo

In [None]:
# Configure the trainer with the best hyperparameters found, then run the final training.
trainer.apply_hyperparameters(best.hyperparameters, final_model=True)
trainer.train()

Para la realización de este notebook se han tomado como referencia algunos de los recursos disponibles. Por ejemplo:

- **SetFit - Efficient Few-shot Learning with Sentence Transformers**: https://github.com/huggingface/setfit
- **SETFIT - HYPER Parameter Optimization for SBERT Text Classification (SBERT 45)**: https://www.youtube.com/watch?v=j1_szOni5-0&ab_channel=code_your_own_AI