### This code is a modification of multiple other notebooks from the competition
Further documentation will be added later.

This notebook trains a PII detection model based on a NER approach. We train our transformer model on both the original essays and LLM-generated ones to achieve a higher F5 score.
The data is readily accessible on the Kaggle competition page if you want to train the model.

In [None]:
!pip install -q  seqeval evaluate
!pip install -U -q datasets accelerate 


In [None]:

import json
import numpy as np

# Load the competition training essays; the file is closed as soon as it is parsed.
with open("./data/train.json", encoding="utf-8") as f:
    data = json.load(f)

# Downsampling of negative examples: split the essays by whether they carry any PII label.
p = []  # positive samples: at least one token label differs from "O"
n = []  # negative samples: every token label is "O"
for d in data:
    # A plain generator also copes with an empty label list, unlike a NumPy comparison.
    if any(label != "O" for label in d["labels"]):
        p.append(d)
    else:
        n.append(d)
print("original datapoints: ", len(data))

with open("./data/pii_dataset_fixed.json", encoding="utf-8") as f:
    external = json.load(f)
print("external datapoints: ", len(external))

with open("./data/moredata_dataset_fixed.json", encoding="utf-8") as f:
    moredata = json.load(f)
print("moredata datapoints: ", len(moredata))

# Keep both external sets and every positive essay, but only the first third of the negatives.
train_data = moredata + external + p + n[:len(n) // 3]
# Report the size of the combined set (not of the raw competition file).
print("combined: ", len(train_data))

In [None]:
from transformers import AutoTokenizer
import numpy as np
import random
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one essay and align its word-level labels with the sub-word tokens.

    The essay text is rebuilt from ``example["tokens"]`` and
    ``example["trailing_whitespace"]`` while a per-character label list is
    built in parallel, so that every tokenizer offset can be mapped back to
    the label of the word it starts in.

    Args:
        example: Mapping with the parallel sequences "tokens",
            "provided_labels" and "trailing_whitespace".
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, max_length=..., truncation=True)``;
            its result must expose ``input_ids`` and ``offset_mapping`` and
            support ``**`` unpacking.
        label2id: Mapping from label string to integer id; must contain "O".
        max_length: Maximum sequence length handed to the tokenizer.

    Returns:
        dict: Everything the tokenizer returned plus "labels" (one label id
        per token) and "length" (number of input ids).
    """
    # Rebuild the text from the word tokens, tracking one label per character.
    pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(token)
        char_labels.extend([label] * len(token))

        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)

    # Actual tokenization. Truncation is requested explicitly instead of
    # relying on the tokenizer's fallback when only max_length is given.
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
    )

    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Special tokens are reported with a (0, 0) span; they are labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token that starts with whitespace takes the label of the next
        # character. The bounds check keeps a trailing whitespace token from
        # indexing past the end of the text.
        if text[start_idx].isspace() and start_idx + 1 < len(char_labels):
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
    }

def compute_metrics(p, all_labels):
    """Compute recall, precision and the F-beta score (beta = 5) for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain class ids; positions whose label id is -100 are
            ignored.
        all_labels: Sequence mapping a class id to its label string.

    Returns:
        dict: Keys "recall", "precision" and "f1". The value stored under
        "f1" is the F5 score; the key name is kept so existing callers that
        look up "f1" keep working. It is 0.0 when both precision and recall
        are 0.
    """
    scores, label_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label id -100) and map the ids back to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5; guard the division so an all-wrong (or empty)
    # prediction set yields 0.0 instead of raising ZeroDivisionError.
    beta_sq = 5 * 5
    denominator = beta_sq * precision + recall
    f5 = (1 + beta_sq) * recall * precision / denominator if denominator > 0 else 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f5,
    }



In [None]:
# Collect every distinct label string that occurs in the combined training data
# and show it as the cell output.
unique_labels = {
    label
    for item in train_data
    for label in item["labels"]
}
unique_labels

In [None]:
import json
import numpy as np
from functools import partial
from datasets import load_metric, Dataset
from transformers import (
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification,
    AutoTokenizer
)
from tokenizers import AddedToken
import evaluate
from itertools import chain

    
# train_data = train_data[:1000]
# Extract and set up label mappings from the combined training data, so every
# label that occurs in any of the training essays has an id.
unique_labels = sorted({label for x in train_data for label in x["labels"]})
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}


# Convert the training data into a dataset. This is built from `train_data`
# (not the raw `data`), so the external essays and the negative downsampling
# prepared earlier actually take effect.
# NOTE(review): assumes the external JSON records expose the same keys as the
# competition file - verify against the external datasets.
dataset = Dataset.from_dict({
    "full_text": [x["full_text"] for x in train_data],
    "document": [str(x["document"]) for x in train_data],
    "tokens": [x["tokens"] for x in train_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in train_data],
    "provided_labels": [x["labels"] for x in train_data],
})

# Training configuration: output folder, sequence length limit, base checkpoint
OUTPUT_DIR = "output"
MAX_LENGTH = 1024
TRAIN_MODEL_PATH = "microsoft/deberta-v3-base"

# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(TRAIN_MODEL_PATH)

# Tokenize every essay and attach its token-level label ids
ds = dataset.map(
    tokenize,
    num_proc=3,
    fn_kwargs=dict(tokenizer=tokenizer, label2id=label2id, max_length=MAX_LENGTH),
)


# Token-classification model whose head is sized to the label set
model = AutoModelForTokenClassification.from_pretrained(
    TRAIN_MODEL_PATH,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(unique_labels),
    ignore_mismatched_sizes=True,
)

# Batch collator for token classification (pad_to_multiple_of=16)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)


# Hyper-parameters for one training epoch with evaluation switched off
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # bookkeeping
    report_to="none",
    logging_steps=20,
    save_total_limit=1,
    # optimisation: 4 samples per device x 2 accumulation steps per update
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    # evaluation is disabled; the metric settings are still passed through
    do_eval=False,
    evaluation_strategy="no",
    metric_for_best_model="f1",
    greater_is_better=True,
)


# Wire model, data, collator and metric function into the Trainer
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=ds,
    compute_metrics=partial(compute_metrics, all_labels=unique_labels),
)

# Run the fine-tuning
trainer.train()


In [None]:
# Write the fine-tuned model and its tokenizer side by side into OUTPUT_DIR
model.save_pretrained(save_directory=OUTPUT_DIR)
tokenizer.save_pretrained(save_directory=OUTPUT_DIR)


In [None]:
## inference

import json
import argparse
from itertools import chain
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import pandas as pd
from pathlib import Path


def tokenize(example, tokenizer, max_length):
    """Tokenize one essay for inference and record which word each character belongs to.

    Args:
        example: Mapping with the parallel sequences "tokens" and
            "trailing_whitespace".
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, max_length=..., truncation=True)``;
            its result must support ``**`` unpacking.
        max_length: Maximum sequence length handed to the tokenizer.

    Returns:
        dict: Everything the tokenizer returned plus "token_map", a list with
        one entry per character of the rebuilt text: the index of the word the
        character belongs to, or -1 for an inserted trailing space.
    """
    pieces, token_map = [], []
    for idx, (token, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(token)
        token_map.extend([idx] * len(token))
        if has_space:
            pieces.append(" ")
            token_map.append(-1)

    # Truncation is requested explicitly instead of relying on the tokenizer's
    # fallback when only max_length is given.
    tokenized = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
    )
    return {**tokenized, "token_map": token_map}

def load_data(file_path):
    """Read a JSON file of essay records and wrap the needed fields in a Dataset.

    Args:
        file_path: Path of a JSON file containing a list of records, each with
            the keys "full_text", "document", "tokens" and
            "trailing_whitespace".

    Returns:
        Dataset: One row per record with exactly those four columns.
    """
    # The context manager closes the file handle once the JSON is parsed.
    with open(file_path, encoding="utf-8") as f:
        records = json.load(f)

    columns = ("full_text", "document", "tokens", "trailing_whitespace")
    return Dataset.from_dict(
        {column: [record[column] for record in records] for column in columns}
    )

def create_dataframe(preds, ds, id2label):
    """Turn per-token class predictions into one row per predicted non-"O" word.

    Args:
        preds: One sequence of predicted class ids per document. It may be
            longer than the document's offsets; surplus entries are ignored.
        ds: Object indexable by column name that provides the parallel
            columns "token_map", "offset_mapping", "tokens" and "document".
        id2label: Mapping from the *string* form of a class id to its label.

    Returns:
        pandas.DataFrame: Columns "document", "token" (word index), "label"
        and "token_str". A (document, label, word index) combination appears
        at most once.
    """
    # De-duplication is keyed per document. A set gives O(1) membership tests,
    # and including the document keeps an identical (label, word index) hit in
    # another document from being dropped.
    seen = set()
    document, token, label, token_str = [], [], [], []

    for p, token_map, offsets, tokens, doc in zip(
        preds, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
    ):
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            # Special tokens are reported with a (0, 0) span.
            if start_idx + end_idx == 0:
                continue

            label_pred = id2label[str(token_pred)]

            # Sub-word starts on an inserted space: move to the next character.
            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n": skip characters that belong to whitespace-only words
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            # Ran off the end of the text: nothing left to map in this document.
            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred == "O" or token_id == -1:
                continue

            key = (doc, label_pred, token_id)
            if key in seen:
                continue
            seen.add(key)

            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])

    return pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })

# Locations of the fine-tuned checkpoint and of the test essays
# model_path = "/kaggle/input/pii-data-detection-baseline/output/checkpoint-240"
model_path = "/kaggle/working/output"
max_length = 2048 # Define max_length as needed
test_file_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
ds = load_data(test_file_path)

# Tokenize the test essays with the checkpoint's tokenizer, then load the model and collator
tokenizer = AutoTokenizer.from_pretrained(model_path)
ds = ds.map(
    tokenize,
    num_proc=2,
    fn_kwargs=dict(tokenizer=tokenizer, max_length=max_length),
)
model = AutoModelForTokenClassification.from_pretrained(model_path)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# The Trainer is used here only as a batched prediction loop
args = TrainingArguments(output_dir=".", report_to="none", per_device_eval_batch_size=4)
trainer = Trainer(args=args, model=model, tokenizer=tokenizer, data_collator=collator)

# Predict, then store the raw scores and the tokenized dataset on disk
predictions = trainer.predict(ds).predictions
np.save("preds.npy", predictions)
ds.to_parquet("test_ds.pq")
    
# Post-processing Predictions
# id2label is read from the checkpoint's config.json; JSON object keys are
# strings, which matches the str(...) lookup done in create_dataframe.
with open(Path(model_path) / "config.json") as f:
    config = json.load(f)
id2label = config["id2label"]
# Reload the stored scores and reduce them to one class id per token.
preds = np.load("preds.npy").argmax(-1)
ds = Dataset.from_parquet("test_ds.pq")


# Create the results dataframe and dump it into the submission.csv file
df = create_dataframe(preds, ds, id2label)
df["row_id"] = list(range(len(df)))
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)
display(df.head(50))


In [None]:

# import pandas as pd
# import plotly.express as px

# # Flatten the dataset for analysis
# flattened_data = [(label, i) for sample_labels in ds["labels"] for i, label in enumerate(sample_labels) if label != "O"]

# # Create a DataFrame
# df = pd.DataFrame(flattened_data, columns=["label", "position"])

# # Define group thresholds
# bins = [0, 50, 100, 200, 500, 1000, 2000, 10000]
# df['range'] = pd.cut(df['position'], bins, right=False)

# # Convert Interval objects to strings
# df['range'] = df['range'].astype(str)

# # Group and count
# grouped_df = df.groupby(['label', 'range']).size().reset_index(name='count')

# # Plot
# px.scatter(grouped_df, x="range", y="count", color="label", log_y=True, height=1000)
