In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import sys, os
import json
import numpy as np
import pandas as pd
from functools import partial
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
from datasets import Dataset
from sklearn.model_selection import train_test_split

%aimport preprocess

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
# Input data files, resolved relative to the notebook's working directory.
file_train = 'train.json'
file_test = 'test.json'

# Maximum tokenized sequence length; passed to the tokenizer as `max_length`,
# so longer documents are truncated to this many tokens.
MAXLEN = 512

In [4]:
# Load the training documents; later cells read the `document`, `full_text`,
# `tokens`, `trailing_whitespace` and `labels` columns of this frame.
df = pd.read_json(file_train)

### Preprocessing

**Step 1**: Convert the `labels` column (BIO tag strings) into integer class ids.

In [5]:
# PII entity types, in the order that fixes their class ids.
_ENTITY_TYPES = (
    'NAME_STUDENT',
    'EMAIL',
    'USERNAME',
    'ID_NUM',
    'PHONE_NUM',
    'URL_PERSONAL',
    'STREET_ADDRESS',
)

# 'O' is id 0; each entity then contributes a B-/I- pair, so every B- tag has
# an odd id and its matching I- tag is the next (even) id.
label_list = ['O'] + [
    f'{prefix}-{entity}'
    for entity in _ENTITY_TYPES
    for prefix in ('B', 'I')
]

# Lookup tables in both directions between tag string and class id.
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))

In [6]:
def create_mapped_labels(labels, mapping=None):
    """Convert a sequence of BIO tag strings into their integer class ids.

    Parameters
    ----------
    labels : iterable of str
        Tag strings for one document, e.g. ``['O', 'B-EMAIL', 'O']``.
    mapping : dict, optional
        Tag string -> class id lookup. Defaults to the notebook-level
        ``label2id`` table.

    Returns
    -------
    list of int
        One class id per input tag, in the same order.

    Raises
    ------
    KeyError
        If a tag is not present in ``mapping``. Failing loudly avoids
        silently writing NaN into the training labels.
    """
    if mapping is None:
        mapping = label2id
    # A direct dict lookup replaces the throwaway DataFrame round-trip.
    return [mapping[tag] for tag in labels]

In [7]:
# Integer-encoded copy of `labels` (one class id per token, via
# create_mapped_labels) -- friendlier for classifiers than the tag strings.
df['labels_cat'] = df['labels'].apply(create_mapped_labels)

In [8]:
# Downsample because there are too many non-PII examples: keep only documents
# that contain at least one tag other than 'O'.
# (The mask is named `has_pii` rather than `filter` so the builtin `filter`
# is not shadowed for the rest of the kernel session.)
has_pii = df['labels'].apply(lambda tags: any(tag != 'O' for tag in tags))
downsampled_df = df[has_pii]

# 90/10 train/validation split; the fixed random_state keeps it reproducible.
train, valid = train_test_split(downsampled_df, test_size=0.1, shuffle=True, random_state=22124)

def create_dataset(df):
    """Build a `datasets.Dataset` from the columns of `df` used downstream.

    The columns `document`, `full_text`, `tokens`, `trailing_whitespace`,
    `labels` and `labels_cat` are copied, in that order, into plain Python
    lists (one entry per row) and handed to `Dataset.from_dict`.
    """
    column_names = (
        'document',
        'full_text',
        'tokens',
        'trailing_whitespace',
        'labels',
        'labels_cat',
    )
    columns = {name: list(df[name]) for name in column_names}
    return Dataset.from_dict(columns)

# Wrap both splits as `datasets.Dataset` objects so they can be tokenized with `.map`.
train_ds = create_dataset(train)
valid_ds = create_dataset(valid)

#### Some preprocessing helper functions

In [49]:
import pdb

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-word token.

    Parameters
    ----------
    labels : sequence of int
        Class id of each original word.
    word_ids : sequence of int or None
        For each token, the index of the word it came from, or None for
        tokens that belong to no word (special tokens).

    Returns
    -------
    list of int
        One entry per token: -100 for tokens without a word; the word's label
        for the first token of a word; for continuation tokens the word's
        label with B-XXX turned into I-XXX. That conversion relies on B- ids
        being odd and the matching I- id being B + 1.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        starts_new_word = word_id != previous_word
        previous_word = word_id

        if word_id is None:
            # Token belongs to no word: use the sentinel value.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if not starts_new_word and tag % 2 == 1:
            # Continuation of a word tagged B-XXX: switch to I-XXX.
            tag += 1
        aligned.append(tag)

    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and re-align their BIO labels.

    Sub-word tokenization changes the number of tokens per document, so the
    word-level ids in `examples["labels_cat"]` are expanded token by token
    with `align_labels_with_tokens`: continuation pieces of a word are tagged
    I-, and tokens without a word ([CLS], [SEP], etc.) get the sentinel value.

    Uses the notebook-level `tokenizer` and `MAXLEN`. Returns the tokenizer
    output with an added "labels" entry holding one aligned list per example.

    NOTE(review): `padding=True` pads at tokenization time; the data collator
    also pads at batch time, so this may be redundant -- confirm.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAXLEN,
        padding=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["labels_cat"])
    ]
    return encoded

### Fine-tune the token-classification model

In [126]:
# MODEL = 'microsoft/deberta-v3-small'
# tokenizer = AutoTokenizer.from_pretrained(MODEL)

tokenizer_config.json: 100%|█| 52.0/52.0 [00:00<00:00, 144kB/s]
config.json: 100%|████████████| 578/578 [00:00<00:00, 1.11MB/s]
spm.model: 100%|██████████| 2.46M/2.46M [00:00<00:00, 13.0MB/s]


In [35]:
# Checkpoint name used for both the tokenizer here and the model loaded later.
MODEL = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [50]:
# batched=True: tokenize_and_align_labels receives a batch of examples per call
# (it indexes examples["tokens"] / examples["labels_cat"] as lists of documents).
tokenized_train = train_ds.map(tokenize_and_align_labels, batched=True)
tokenized_valid = valid_ds.map(tokenize_and_align_labels, batched=True)

Map: 100%|███████████| 850/850 [00:00<00:00, 888.21 examples/s]
Map: 100%|█████████████| 95/95 [00:00<00:00, 872.42 examples/s]


In [30]:
# Collator handed to the Trainer below; it pads each batch (inputs and labels)
# to a length that is a multiple of 16.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

##### Define eval metrics before starting finetuning

In [13]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, label_list, beta=5):
    """Compute entity-level recall, precision and F-beta for an evaluation pass.

    Parameters
    ----------
    p : tuple
        ``(predictions, labels)``; ``predictions`` is arg-maxed over axis 2 to
        get one class id per token, and ``labels`` holds the gold class ids
        with -100 marking positions to ignore.
    label_list : sequence of str
        Tag string for each class id.
    beta : float, optional
        Weight of recall relative to precision in the F-beta score.
        Defaults to 5, the value previously hard-coded here.

    Returns
    -------
    dict
        ``{'recall', 'precision', 'f1'}``. The ``'f1'`` key is kept for
        backward compatibility, but its value is the F-beta score (F5 by
        default), not the balanced F1.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Drop ignored positions (gold label -100) and map ids back to tag strings.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, label_ids):
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    # With no correct entity predictions both scores are 0; report 0.0 rather
    # than dividing by zero.
    if denominator > 0:
        f_beta = (1 + beta_sq) * recall * precision / denominator
    else:
        f_beta = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f_beta,
    }

**Finally**, the model

In [14]:
import torch

In [31]:
# Pick the best available accelerator instead of assuming Apple-Silicon MPS,
# so the notebook also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Original name kept as an alias in case other cells still refer to it.
mps_device = device

# Token-classification head sized to the label set; id2label / label2id are
# stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
).to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Fine-tuning hyper-parameters. Evaluation and checkpointing both happen once
# per epoch, and load_best_model_at_end=True asks the Trainer to load the best
# checkpoint when training finishes. report_to='none' turns off
# experiment-tracking integrations.
training_args = TrainingArguments(
    output_dir='output',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'
)

In [51]:
# compute_metrics takes an extra `label_list` argument, so it is bound here
# with functools.partial before being handed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=partial(compute_metrics, label_list=label_list)
)

# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Recall,Precision,F1
1,0.0672,0.010827,0.805755,0.682927,0.80022
2,0.0058,0.009806,0.741007,0.735714,0.740802
3,0.0032,0.009322,0.834532,0.84058,0.834763


TrainOutput(global_step=2550, training_loss=0.018512891536834192, metrics={'train_runtime': 284.2429, 'train_samples_per_second': 8.971, 'train_steps_per_second': 8.971, 'total_flos': 333243717580800.0, 'train_loss': 0.018512891536834192, 'epoch': 3.0})

In [53]:
# Save the fine-tuned model to a local directory (relative to the working directory).
trainer.save_model('distilbert-finetuned-downsampled-512')

In [56]:
!ls

Untitled.ipynb                       preprocess.py
[1m[36m__pycache__[m[m                          sample_submission.csv
[1m[36mdistilbert-finetuned-downsampled-512[m[m scratch.ipynb
distilbert_and_deberta.ipynb         test.json
[1m[36moutput[m[m                               train.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
