In [1]:
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 1024
OUTPUT_DIR = "output"

In [2]:
!pip install seqeval evaluate -q

In [3]:
import json
import pandas as pd
import numpy as np
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features

from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

2024-09-15 18:10:20.891860: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-15 18:10:20.891963: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-15 18:10:21.019187: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data (pre)processing

## Get data

In [4]:
train_data = json.load(open('/kaggle/input/pii-detection-removal-from-educational-data/train.json'))

In [5]:
len(train_data)

6807

## Mapping

In [6]:
# Map labels to ids
all_labels = sorted(list(set(chain(*[x["labels"] for x in train_data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

target = [item for item in all_labels if item != 'O']

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [7]:
def rebuild_text(data):
    
    text, labels = [], []
    
    for tok, lab, ws in zip(
        data["tokens"], data["provided_labels"], data["trailing_whitespace"]
    ):
        # append each token to the reconstructed text and the label for each token's character
        text.append(tok)
        labels.extend([lab] * len(tok))
        
        # add space in text if whitespace and label "O"
        if ws:
            text.append(" ")
            labels.append("O")
            
    return text, labels

In [8]:
# Prepare data to be fed to the model & attribute labels to new token format
def tokenize(data, tokenizer, label2id, max_length):
    
    text, labels = rebuild_text(data)
    text = "".join(text)
    labels = np.array(labels)
    token_labels = []
    
    # returns a dictionary-like object containing tokenized inputs and offsets mapping (represents the mapping between the tokens and their corresponding positions in the original text)
    tokenized = tokenizer(text, return_offsets_mapping=True, max_length=max_length)
    
    for start_idx, end_idx in tokenized.offset_mapping:
        
        # if CLS tokens
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue
            
        # if token starts with ws
        if text[start_idx].isspace():
            start_idx += 1
            
        token_labels.append(label2id[labels[start_idx]])
        
    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [9]:
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [10]:
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in train_data],
    "document": [str(x["document"]) for x in train_data],
    "tokens": [x["tokens"] for x in train_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in train_data],
    "provided_labels": [x["labels"] for x in train_data],
})

In [11]:
# tokenize each row in the dataset
ds = ds.map(tokenize, fn_kwargs={"tokenizer":tokenizer, "label2id":label2id, "max_length":TRAINING_MAX_LENGTH}, num_proc=3)

    

#0:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#1:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 

#2:   0%|          | 0/2269 [00:00<?, ?ex/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
# Compare tokens and labels for original dataset and new tokenization
x = ds[0]

for t,l in zip(x["tokens"], x["provided_labels"]):
    if l != "O":
        print((t,l))

print("*"*100)

for t, l in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    if id2label[l] != "O":
        print((t,id2label[l]))

('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')
('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')
('Nathalie', 'B-NAME_STUDENT')
('Sylla', 'I-NAME_STUDENT')
****************************************************************************************************
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')
('N', 'B-NAME_STUDENT')
('atha', 'B-NAME_STUDENT')
('lie', 'B-NAME_STUDENT')
('▁S', 'I-NAME_STUDENT')
('ylla', 'I-NAME_STUDENT')


In [13]:
def compute_metrics(p, all_labels):
    # p is a tuple containing preds and true labels
    predictions, labels = p
    # preds are in form of probs for each label for each token => we take the highest one
    predictions = np.argmax(predictions, axis=2)

    # Remove special tokens from preds and labels
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    # Compute metrics using sklearn and own formula
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    # Store metrics and return
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f1_score
    }
    
    return results

In [14]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Creates a collator object (tailored for token classification tasks)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

## Training

In [16]:
# Define training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="no",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type='cosine',
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01
)

In [17]:
# Define trainer object (responsible for orchestrating the training process)
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
%%time
trainer.train()

Step,Training Loss
20,3.2791
40,1.4492
60,0.0578
80,0.0088
100,0.0069
120,0.0073
140,0.0149
160,0.0112
180,0.0123
200,0.0083


CPU times: user 17min 54s, sys: 4min 35s, total: 22min 30s
Wall time: 22min 30s


TrainOutput(global_step=851, training_loss=0.11622859703674562, metrics={'train_runtime': 1349.9694, 'train_samples_per_second': 5.042, 'train_steps_per_second': 0.63, 'total_flos': 3161498795311008.0, 'train_loss': 0.11622859703674562, 'epoch': 1.0})

## Save model

In [19]:
trainer.save_model("deberta3base_1024")
tokenizer.save_pretrained("deberta3base_1024")

('deberta3base_1024/tokenizer_config.json',
 'deberta3base_1024/special_tokens_map.json',
 'deberta3base_1024/spm.model',
 'deberta3base_1024/added_tokens.json',
 'deberta3base_1024/tokenizer.json')

# Inference

In [34]:
INFERENCE_MAX_LENGTH = 2048
model_path = '/kaggle/working/deberta3base_1024'

In [35]:
import json
import pandas as pd
import numpy as np
from itertools import chain
from pathlib import Path

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import Dataset

In [37]:
test_data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))

In [38]:
def tokenize(data, tokenizer):
    
    text, token_map = [], []
    idx = 0
    
    for tok, ws in zip(data["tokens"], data["trailing_whitespace"]):
        
        text.append(tok)
        token_map.extend([idx] * len(tok))
        
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
        
    return {**tokenized, "token_map": token_map}

In [39]:
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in test_data],
    "document": [x["document"] for x in test_data],
    "tokens": [x["tokens"] for x in test_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in test_data],
})

In [40]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [41]:
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

In [42]:
model = AutoModelForTokenClassification.from_pretrained(model_path)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=1, 
    report_to="none",
)

In [43]:
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [44]:
predictions = trainer.predict(ds).predictions
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis = 2).reshape(predictions.shape[0],predictions.shape[1],1)

In [45]:
config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]

In [46]:
preds = predictions.argmax(-1)
preds_without_O = pred_softmax[:,:,:12].argmax(-1)
O_preds = pred_softmax[:,:,12]

In [47]:
threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O, preds)

In [48]:
triplets = []
document, token, label, token_str = [], [], [], []

In [49]:
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0: continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map): break
        
        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])

            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)


In [50]:
df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})
df["row_id"] = list(range(len(df)))
display(df.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
