### Fine-tuning DeBERTa to classify PII

Building off of the public DeBERTa baseline posted on forums: https://www.kaggle.com/code/valentinwerner/915-deberta3base-training.

In [1]:
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 1024
OUTPUT_DIR = "output"

In [2]:
!pip install seqeval evaluate -q

import json
import argparse
from itertools import chain
from functools import partial
import pdb

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

2024-03-08 21:01:17.678362: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 21:01:17.678461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 21:01:17.842596: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### 👷 🔨Load and preprocess data

In [3]:
train = json.load(open('/kaggle/input/pii-detection-removal-from-educational-data/train.json'))

# downsampling of negative examples -- exclude points with all negative tokens
p=[]
n=[]
for d in train:
    if any(np.array(d["labels"]) != "O"): p.append(d)
    else: n.append(d)    

print(f'Original training data: {len(train)}')
print(f'Downsampled training data: (+): {len(p)}; (-): {len(n)}')

# version 1: train on full data
# data = train

# version 2: only train on downsampled data to compare recall
data = p

Original training data: 6807
Downsampled training data: (+): 945; (-): 5862


**TODO**: External data seems to improve performance, consider adding it

```
external = json.load(open("/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json"))
print("external datapoints: ", len(external))

moredata = json.load(open("/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json"))
print("moredata datapoints: ", len(moredata))

data = moredata+external+p+n[:len(n)//3]
print("combined: ", len(data))
```

In [4]:
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label_list = ['O',
  'B-NAME_STUDENT',
  'I-NAME_STUDENT',
  'B-EMAIL',
  'I-EMAIL',
  'B-USERNAME',
  'I-USERNAME',
  'B-ID_NUM',
  'I-ID_NUM',
  'B-PHONE_NUM',
  'I-PHONE_NUM',
  'B-URL_PERSONAL',
  'I-URL_PERSONAL',
  'B-STREET_ADDRESS',
  'I-STREET_ADDRESS'
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

🔪 **Tokenization**

In [5]:
def tokenize(example):
    text = []
    labels = []
    for t, l, ws in zip(
        example["tokens"], example["data_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        # interesting that every _letter_ is being assigned a label
        labels.extend([l] * len(t))
        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=TRAINING_MAX_LENGTH)

    labels = np.array(labels)
    text = "".join(text)
    token_labels = []  # aligned labels to each tokenized token

    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        # this is where that every-letter-has-a-label is used
        # essentially, we're picking labels from the start_idx of each word
        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)
    # classifier should use these `labels`, not `data_labels`
    return {**tokenized, "labels": token_labels, "length": length}

In [6]:
# construct Dataset object from loaded json
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "data_labels": [x["labels"] for x in data]
})

# initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# run tokenizer on dataset
ds = ds.map(tokenize, num_proc=3)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



    

#0:   0%|          | 0/315 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#1:   0%|          | 0/315 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#2:   0%|          | 0/315 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# inspecting data
x = ds[0]
print('-'*20, 'BEFORE TOKENIZING', '-'*20)
for t, l in zip(x['tokens'], x['data_labels']):
    if l != 'O':
        print((t, l))
        
print('\n', '-'*20, 'AFTER TOKENIZING', '-'*20)
tokens = tokenizer.convert_ids_to_tokens(x['input_ids'])
for t, l in zip(tokens, x['labels']):
    if id2label[l] != 'O':
        print((t, id2label[l]))

-------------------- BEFORE TOKENIZING --------------------
('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')
('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')
('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')

 -------------------- AFTER TOKENIZING --------------------
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')


In [8]:
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

### Metrics to evaluate on

In [9]:
from seqeval.metrics import recall_score, precision_score, f1_score, classification_report

def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    # stupid snippet of nested comprehensions that's all over kaggle
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f1_score
    }
    return results

### Fit model

In [10]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
).to(DEVICE)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="no",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type='cosine',
    metric_for_best_model='f1',
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels)
)

In [12]:
%%time
trainer.train()



Step,Training Loss
20,1.8563
40,0.0505
60,0.0301
80,0.0168
100,0.0105
120,0.0082
140,0.005
160,0.0063


CPU times: user 9min 46s, sys: 3min 42s, total: 13min 28s
Wall time: 9min 21s


TrainOutput(global_step=177, training_loss=0.22455766439269492, metrics={'train_runtime': 561.1072, 'train_samples_per_second': 5.053, 'train_steps_per_second': 0.315, 'total_flos': 1455840775675872.0, 'train_loss': 0.22455766439269492, 'epoch': 2.97})

In [13]:
trainer.save_model("deberta3base_1024-pii-finetuned")
tokenizer.save_pretrained("deberta3base_1024-pii-finetuned")

('deberta3base_1024-pii-finetuned/tokenizer_config.json',
 'deberta3base_1024-pii-finetuned/special_tokens_map.json',
 'deberta3base_1024-pii-finetuned/spm.model',
 'deberta3base_1024-pii-finetuned/added_tokens.json',
 'deberta3base_1024-pii-finetuned/tokenizer.json')