In [None]:
!pip install datasets evaluate transformers[sentencepiece] accelerate huggingface_hub

In [2]:
import evaluate
import numpy as np
from torch import nn
from scipy.special import expit
from datasets import load_dataset
from sklearn.metrics import f1_score
from huggingface_hub import notebook_login
from transformers.trainer_utils import get_last_checkpoint
from transformers import (Trainer, AutoConfig, TrainingArguments,
                          AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, EvalPrediction)

In [3]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# loading datasets
train_dataset = load_dataset("lex_glue", "unfair_tos", split="train")
eval_dataset = load_dataset("lex_glue", "unfair_tos", split="validation")
predict_dataset = load_dataset("lex_glue", "unfair_tos", split="test")

# Labels
label_list = list(range(8))
num_labels = len(label_list)

# loading model and tokenizer
checkpoint = "nlpaueb/legal-bert-base-uncased"

config = AutoConfig.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        finetuning_task="unfair_toc",
    )

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# preprocessing data
def preprocess_function(examples):
  # Tokenize the texts
  batch = tokenizer(
      examples["text"],
      max_length = 128,
      truncation=True,)
  batch["labels"] = [[1 if label in labels else 0 for label in label_list] for labels in examples["labels"]]
  return batch

train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on train dataset")

eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on validation dataset")

predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                desc="Running tokenizer on prediction dataset")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
train_dataset.features

{'text': Value(dtype='string', id=None),
 'labels': Sequence(feature=ClassLabel(names=['Limitation of liability', 'Unilateral termination', 'Unilateral change', 'Content removal', 'Contract by using', 'Choice of law', 'Jurisdiction', 'Arbitration'], id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [7]:
# Compute Metrics
def compute_metrics(p: EvalPrediction):
    # Fix gold labels
    y_true = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
    y_true[:, :-1] = p.label_ids
    y_true[:, -1] = (np.sum(p.label_ids, axis=1) == 0).astype('int32')
    # Fix predictions
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = (expit(logits) > 0.5).astype('int32')
    y_pred = np.zeros((p.label_ids.shape[0], p.label_ids.shape[1] + 1), dtype=np.int32)
    y_pred[:, :-1] = preds
    y_pred[:, -1] = (np.sum(preds, axis=1) == 0).astype('int32')
    # Compute scores
    macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division=0)
    micro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='micro', zero_division=0)
    return {'macro-f1': macro_f1, 'micro-f1': micro_f1}

In [8]:
default_args = {
    "output_dir": "tmp",
    "log_level": "error",
    "report_to": "none",
}

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    fp16=True,
    fp16_full_eval=True,
    num_train_epochs=20,
    learning_rate=3e-5,
    **default_args,
)

trainer = MultilabelTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [9]:
trainer.train()



Epoch,Training Loss,Validation Loss,Macro-f1,Micro-f1
1,0.0939,0.037644,0.446537,0.923178
2,0.0422,0.028878,0.630738,0.929989
3,0.0215,0.030882,0.592471,0.916939
4,0.0197,0.026498,0.746782,0.942358
5,0.0124,0.027103,0.748604,0.947414
6,0.0082,0.042017,0.703134,0.925005
7,0.0052,0.032327,0.746362,0.93731
8,0.0049,0.029133,0.772515,0.947919
9,0.0033,0.030577,0.752595,0.947231
10,0.0032,0.032623,0.765845,0.950327




TrainOutput(global_step=13840, training_loss=0.010073507424293226, metrics={'train_runtime': 1970.8081, 'train_samples_per_second': 56.139, 'train_steps_per_second': 7.023, 'total_flos': 4428122154404736.0, 'train_loss': 0.010073507424293226, 'epoch': 20.0})

In [10]:
metrics = trainer.evaluate(eval_dataset=eval_dataset)
metrics["eval_samples"] = len(eval_dataset)
metrics

{'eval_loss': 0.026495959609746933,
 'eval_macro-f1': 0.746781969724428,
 'eval_micro-f1': 0.9423580786026201,
 'eval_runtime': 5.6588,
 'eval_samples_per_second': 402.028,
 'eval_steps_per_second': 50.364,
 'epoch': 20.0,
 'eval_samples': 2275}

In [11]:
predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
metrics["predict_samples"] = len(predict_dataset)
metrics

{'predict_loss': 0.02494034171104431,
 'predict_macro-f1': 0.7697002786038069,
 'predict_micro-f1': 0.946555452579549,
 'predict_runtime': 4.0109,
 'predict_samples_per_second': 400.653,
 'predict_steps_per_second': 50.113,
 'predict_samples': 1607}

In [None]:
notebook_login()

In [None]:
# pushing model and tokenizer to the hub
model.push_to_hub("udot-mk2")
tokenizer.push_to_hub("udot-mk2")