
# Aplicación de Few Shot con paraphrase-MiniLM-L3-v2



Instalamos e importamos dependencias

In [None]:
!pip install setfit
!pip install huggingface-hub==0.11.0


In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from huggingface_hub import notebook_login

Nos conectamos con Hugging Face para poder subir el modelo

In [4]:
# Authenticate against the Hugging Face Hub (the token is stored locally) so
# the trained model can be pushed later in the notebook.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


Cargamos el conjunto de datos de entrenamiento y validación

In [5]:
# Read the local CSV files into a DatasetDict keyed by split name.
data_files = dict(train="train.csv", validation="validation.csv")
dataset = load_dataset("csv", data_files=data_files)

dataset



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither'],
        num_rows: 117
    })
    validation: Dataset({
        features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither'],
        num_rows: 51
    })
})

In [6]:
# Target columns: every feature except the raw text and the three columns
# that are deliberately left out of this classifier.
labels = [
    column
    for column in dataset["train"].features
    if column not in ("text", "obligation", "right", "neither")
]

# Lookup tables between a label name and its position in `labels`.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}
labels

['service',
 'metric',
 'objective',
 'remedy',
 'claim',
 'exception',
 'definition']

In [7]:
def encode_labels(record):
  """Collect the target-column values of one row into a single list.

  Args:
    record: Mapping from column name to value for one dataset example.

  Returns:
    A dict with one key, "labels", whose value lists ``record[name]`` for
    every ``name`` in the module-level ``labels`` list, in that order.
  """
  encoded = []
  for label_name in labels:
    encoded.append(record[label_name])
  return {"labels": encoded}

# Apply encode_labels to every row of every split; each split gains a
# "labels" column holding the per-row list of target values.
dataset = dataset.map(encode_labels)



In [8]:
# Training split, now including the derived "labels" column.
train_ds = dataset["train"]
train_ds

Dataset({
    features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither', 'labels'],
    num_rows: 117
})

In [9]:
# Validation split, now including the derived "labels" column.
eval_ds = dataset["validation"]
eval_ds

Dataset({
    features: ['text', 'service', 'metric', 'objective', 'remedy', 'claim', 'exception', 'definition', 'obligation', 'right', 'neither', 'labels'],
    num_rows: 51
})

Descargamos el modelo que vamos a entrenar con el framework SetFit

In [10]:
# Pretrained sentence-transformers checkpoint used as the starting point.
model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"

# "one-vs-rest" selects SetFit's multi-target head, so a text can receive
# several labels at once.
model = SetFitModel.from_pretrained(
    model_id,
    multi_target_strategy="one-vs-rest",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Fine-tuning del SetFitModel multietiqueta empleando la estrategia one-vs-rest

In [14]:
# Configure the SetFit fine-tuning run on the train/validation splits.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # Dataset column name -> column name the trainer expects.
    column_mapping=dict(text="text", labels="label"),
    loss_class=CosineSimilarityLoss,
    learning_rate=2e-5,
    batch_size=12,
    num_epochs=3,
    num_iterations=50,
)

In [15]:
# Run the fine-tuning configured on the trainer; progress is logged below.
trainer.train()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 15400
  Num epochs = 3
  Total optimization steps = 3852
  Total train batch size = 12


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3852 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3852 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3852 [00:00<?, ?it/s]

Evaluación del modelo

In [16]:
# Score the trained model on the validation split. No metric was passed to the
# trainer, so the result is reported under the "accuracy" key.
# NOTE(review): with multi-label targets this is presumably exact-match
# (subset) accuracy, which is a strict measure -- confirm against the SetFit docs.
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.5490196078431373}

Subir modelo entrenado a Hugging Face

In [18]:
# Upload the trained model to this Hugging Face Hub repository; relies on the
# notebook_login() authentication performed earlier.
trainer.push_to_hub('marmolpen3/paraphrase-MiniLM-L3-v2-sla')

Cloning https://huggingface.co/marmolpen3/paraphrase-MiniLM-L3-v2-sla into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/66.4M [00:00<?, ?B/s]

Upload file model_head.pkl: 100%|##########| 24.3k/24.3k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/marmolpen3/paraphrase-MiniLM-L3-v2-sla
   68ab13a..c581110  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/marmolpen3/paraphrase-MiniLM-L3-v2-sla
   68ab13a..c581110  main -> main



'https://huggingface.co/marmolpen3/paraphrase-MiniLM-L3-v2-sla/commit/c58111056ac88d7829d4f02db4a259fd30e05c8c'

Inferencia de los datos de test para su clasificación

In [21]:
# Load the test CSV as its own "test" split; it only provides a "text" column.
data_file = dict(test="test.csv")
test_data = load_dataset("csv", data_files=data_file)

test_data



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-72cb7f53614c2e84/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-72cb7f53614c2e84/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 31
    })
})

In [22]:
# Pull the raw sentences out of the test split, then classify them in one call.
# Each row of the result holds one 0/1 flag per entry of `labels`.
test_texts = test_data["test"][:]["text"]
preds = model(test_texts)

preds

array([[1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0]])

Resultados obtenidos

In [24]:
# Translate each row of 0/1 flags into the names of the labels that are set.
[
    [label_name for label_name, is_set in zip(labels, row) if is_set]
    for row in preds
]

[['service'],
 ['service', 'metric'],
 ['service'],
 ['service', 'metric', 'objective'],
 ['claim'],
 ['service', 'claim'],
 ['metric'],
 ['metric'],
 ['metric', 'objective'],
 ['claim'],
 ['claim'],
 ['claim'],
 ['claim', 'exception'],
 ['service', 'claim', 'exception'],
 ['service', 'claim'],
 ['service', 'definition'],
 ['remedy', 'claim'],
 ['definition'],
 ['definition'],
 ['service'],
 ['service'],
 ['service'],
 ['service'],
 ['definition'],
 ['service', 'definition'],
 ['service'],
 ['metric', 'definition'],
 ['service'],
 ['service'],
 ['service', 'metric'],
 ['metric']]