Imports

In [None]:
!pip install transformers
!pip install datasets
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

In [3]:
import numpy as np
import pandas as pd
from sklearn import metrics 
import transformers
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction


Lectura de los csv como Datasets

In [None]:
data_files = {"train": "train.csv", "validation": "validation.csv"}
dataset = load_dataset("csv", data_files=data_files)

dataset

In [5]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'service': Value(dtype='int64', id=None),
 'metric': Value(dtype='int64', id=None),
 'objective': Value(dtype='int64', id=None),
 'remedy': Value(dtype='int64', id=None),
 'claim': Value(dtype='int64', id=None),
 'exception': Value(dtype='int64', id=None),
 'definition': Value(dtype='int64', id=None),
 'obligation': Value(dtype='int64', id=None),
 'right': Value(dtype='int64', id=None),
 'neither': Value(dtype='int64', id=None)}

In [6]:
example = dataset['train'][:10]
example

{'text': ['Last Updated: May 25, 2022 This Amazon Compute Service Level Agreement (this “SLA”) is a policy governing the use of Amazon Elastic Compute Cloud (“Amazon EC2”)* and applies separately to each account using Amazon EC2.',
  'In the event of a conflict between the terms of this SLA and the terms of the AWS Customer Agreement or other agreement with us governing your use of our Services (the “Agreement”), the terms and conditions of this SLA apply, but only to the extent of such conflict.',
  'Capitalized terms used herein but not defined herein shall have the meanings set forth in the Agreement.',
  '*For purposes of this SLA, Amazon EC2 includes any Amazon Elastic Graphics, Amazon Elastic Inference, and Elastic IP Address resources purchased with the relevant Amazon EC2 instance(s).',
  'SLAs AWS makes two SLA commitments for Amazon EC2: (1) a Region-Level SLA that governs Amazon EC2 deployed across multiple AZs or regions, and (2) an Instance-Level SLA that governs Amazon EC

In [7]:
labels = [label for label in dataset['train'].features.keys() if label not in ['text', 'obligation', 'right', 'neither']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['service',
 'metric',
 'objective',
 'remedy',
 'claim',
 'exception',
 'definition']

In [8]:
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,text,service,metric,objective,remedy,claim,exception,definition,obligation,right,neither
0,"Last Updated: May 25, 2022 This Amazon Compute...",1,1,0,0,0,0,0,0,0,1
1,In the event of a conflict between the terms o...,1,0,0,0,0,0,1,0,0,1
2,Capitalized terms used herein but not defined ...,1,0,0,0,0,0,0,0,0,1
3,"*For purposes of this SLA, Amazon EC2 includes...",1,1,0,0,0,0,0,0,0,1
4,SLAs AWS makes two SLA commitments for Amazon ...,1,1,0,0,0,0,0,0,0,1


In [9]:
# Contar el número de veces que cada etiqueta toma el valor 1
freq = df[labels].sum()

# Divide las frecuencias entre el número total de filas
freq = freq / len(df)
print(freq*100)

service       36.752137
metric        29.059829
objective     28.205128
remedy        10.256410
claim         28.205128
exception      8.547009
definition    22.222222
dtype: float64


Preprocess data

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [11]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 117
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 51
    })
})

El tokenizer nos devuelve: 
  - **input_ids**: indica el orden de los inputs tokenizados que vamos a proporcionar al modelo
  - **token_type_ids**: forma de indicar al modelo de lenguaje natural el contexto de un token dentro de una frase o enunciado. Por ejemplo, en un modelo BERT, se utilizan dos tipos de tokens para indicar que una determinada secuencia de tokens proviene del primer segmento y otra para indicar que proviene del segundo.
  - **attention_mask**: es un vector que se usa en modelos de procesamiento de lenguaje natural como BERT para indicar a la red neuronal cuáles tokens en el input deben ser considerados para predecir la salida. Por ejemplo, si un token es un token de relleno agregado para que el input tenga un tamaño fijo, la máscara de atención marcará ese token como 0, lo que indica que no debe ser considerado por la red neuronal.

In [12]:
example = encoded_dataset['train'][0]
print(example)

{'input_ids': [101, 2197, 7172, 1024, 2089, 2423, 1010, 16798, 2475, 2023, 9733, 24134, 2326, 2504, 3820, 1006, 2023, 1523, 22889, 2050, 1524, 1007, 2003, 1037, 3343, 8677, 1996, 2224, 1997, 9733, 21274, 24134, 6112, 1006, 1523, 9733, 14925, 2475, 1524, 1007, 1008, 1998, 12033, 10329, 2000, 2169, 4070, 2478, 9733, 14925, 2475, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1,

In [13]:
example = encoded_dataset['validation'][0]
print(example)

{'input_ids': [101, 22889, 2050, 2005, 24296, 13970, 5677, 7159, 2229, 2326, 1006, 17712, 2015, 1007, 2197, 7172, 1024, 2233, 12609, 2005, 6304, 2040, 2031, 4156, 2019, 24296, 13970, 5677, 7159, 2229, 2326, 1006, 17712, 2015, 1007, 2039, 7292, 22889, 2050, 1010, 2057, 11302, 2039, 7292, 1997, 5585, 1012, 5345, 1003, 2005, 1996, 13970, 5677, 7159, 2229, 17928, 8241, 2005, 17712, 2015, 12906, 2008, 2224, 24296, 11343, 10019, 1998, 5585, 1012, 1023, 1003, 2005, 17712, 2015, 12906, 2008, 2079, 2025, 2224, 24296, 11343, 10019, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
tokenizer.decode(example['input_ids'])

'[CLS] last updated : may 25, 2022 this amazon compute service level agreement ( this “ sla ” ) is a policy governing the use of amazon elastic compute cloud ( “ amazon ec2 ” ) * and applies separately to each account using amazon ec2. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
example['labels']

[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [None]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['service', 'metric']

In [None]:
encoded_dataset.set_format("torch")

##### Tratar con las clases desequilibradas
Lo que hacemos para balancear es asignar pesos a cada clase de manera inversamente proporcional, de forma que mientras menor frecuencia tenga la clase, mayor será su peso.

In [None]:
freq

service       0.367521
metric        0.290598
objective     0.282051
remedy        0.102564
claim         0.282051
exception     0.085470
definition    0.222222
dtype: float64

In [None]:
class_weights = 1 - freq / len(df)

Convertimos la lista de pesos en un tensor de floats

In [None]:
class_weights = torch.tensor(class_weights, dtype=torch.float)
class_weights

tensor([0.9969, 0.9975, 0.9976, 0.9991, 0.9976, 0.9993, 0.9981])

Estos pesos los usaremos más adelante para tratar de afinar los resultados del modelo

Definir modelo Bert

Usaremos el preentrenado con la configuración por defecto

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Entrenamos el modelo

Usaremos el método de entrenamiento de la API de Hugging face. Requiere dos cosas.

In [None]:
batch_size = 1
metric_name = "f1"

In [None]:
args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [None]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [None]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  2197,  7172,  1024,  2089,  2423,  1010, 16798,  2475,  2023,
         9733, 24134,  2326,  2504,  3820,  1006,  2023,  1523, 22889,  2050,
         1524,  1007,  2003,  1037,  3343,  8677,  1996,  2224,  1997,  9733,
        21274, 24134,  6112,  1006,  1523,  9733, 14925,  2475,  1524,  1007,
         1008,  1998, 12033, 10329,  2000,  2169,  4070,  2478,  9733, 14925,
         2475,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.6643, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0120, -0.1574,  0.2412, -0.2226, -0.1759, -0.0436, -0.4365]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 117
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 585
  Number of trainable parameters = 109487623
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.424799,0.344086,0.602076,0.196078
2,No log,0.355922,0.6,0.737024,0.431373
3,No log,0.308086,0.625,0.741782,0.470588
4,No log,0.299891,0.650407,0.768166,0.470588
5,0.349800,0.292461,0.666667,0.777249,0.509804


***** Running Evaluation *****
  Num examples = 51
  Batch size = 1
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-117
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-117/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-117/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-117/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-117/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 51
  Batch size = 1
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-234
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-234/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-234/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-234/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-234/

TrainOutput(global_step=585, training_loss=0.32597469264625484, metrics={'train_runtime': 2181.8043, 'train_samples_per_second': 0.268, 'train_steps_per_second': 0.268, 'total_flos': 38481719328000.0, 'train_loss': 0.32597469264625484, 'epoch': 5.0})

Evaluate

After training, we evaluate our model on the validation set.

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 51
  Batch size = 1


{'eval_loss': 0.2924607992172241,
 'eval_f1': 0.6666666666666666,
 'eval_roc_auc': 0.777249134948097,
 'eval_accuracy': 0.5098039215686274,
 'eval_runtime': 113.0399,
 'eval_samples_per_second': 0.451,
 'eval_steps_per_second': 0.451,
 'epoch': 5.0}

Inference

In [None]:
text = "With respect to a Cloud Service listed above for which the Availability Service Level Agreement under this subsection applies, Oracle will use commercially reasonable efforts to have each such Service available with a Monthly Uptime Percentage (as defined below) of at least 99.9% during any calendar month (the “Service Commitment”)."

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [None]:
logits = outputs.logits
logits.shape

torch.Size([1, 7])

In [None]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
print(probs)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

tensor([0.4455, 0.8627, 0.9357, 0.5216, 0.1901, 0.1635, 0.1718],
       grad_fn=<SigmoidBackward0>)
['metric', 'objective', 'remedy']
