## Imports

In [1]:
!python --version

Python 3.10.12


In [2]:
import torch
import transformers

from transformers.utils import send_example_telemetry
send_example_telemetry("token_classification_notebook", framework="pytorch")

from datasets import load_dataset, load_metric

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
task = "ner"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

## Load Dataset

In [6]:
datasets = load_dataset("wnut_17")

In [7]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [8]:
label_list = datasets["train"].features["ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [9]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))
    
show_random_elements(datasets["train"])

Unnamed: 0,id,tokens,ner_tags
0,2588,"[NASCAR, 's, Popular, Regional, Series, ,, ACT, Return, to, Help, Celebrate, 25th, ..., -, RaceDayCT, http://t.co/fYjrBS1QW0, #nascar]","[B-corporation, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,392,"[about, to, get, dress, hangin, wit, the, big, sis, Abby, tonite]","[O, O, O, O, O, O, O, O, O, B-person, O]"
2,2187,"[@XboxSupport, after, making, any, changes, to, an, avatar, whole, system, chugs, along, and, when, booting, up, a, game, it, freeze, the, system, up, ., 250g, 360-s]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,1261,"[Halo, Reach, was, a, bit, crap, tonight]","[B-product, I-product, O, O, O, O, O]"
4,2050,"[I, just, took, "", After, getting, trampled, at, a, Justin, Bieber, concert, ,, yoiu, wake, up, and, ..."", and, got, :, Part, 6, :), !, Try, it, :, http://tinyurl.com/26zeju5]","[O, O, O, O, O, O, O, O, O, B-person, I-person, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5,181,"[FRIDAY, Sept, ., 17, http://goo.gl/fb/83ib3]","[O, O, O, O, O]"
6,2231,"[#MedicalJobs, CT/Rad, Tech, -, PRN, :, TX-Fort, Worth, ,, When, physicians, own, the, hospital, ,, the, latest, advances, in, medical, s, ..., http://bit.ly/cVPhNE]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
7,3276,"[Since, Saturday, night, ., http://t.co/hTdDzwfEkG]","[O, O, O, O, O]"
8,753,"[RT, @therealadamwest, :, Sunday, is, my, birthday, ., 82, years, young, !, What, should, I, wear, ?, (, don't, say, birthday, suit, !, )]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
9,3168,"[-If, you, value, your, life, ,, if, you, have, any, hope, of, seeing, tomorrow, ,, there, 's, something, you, never, EVER, put, in, a, trap, -And, what, would, that, be, ?, -ME]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


## Preprocessing

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [12]:
tokenizer("Hello, my name is Neel")

{'input_ids': [101, 7592, 1010, 2026, 2171, 2003, 7663, 2140, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
ex = datasets["train"][0]
print(ex)

{'id': '0', 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]}


In [14]:
ex_tokenized = tokenizer(ex["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(ex_tokenized["input_ids"])

print(ex["tokens"], len(ex["tokens"]))
print(tokens, len(tokens))

['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] 27
['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] 34


### Align and Tokenize Labels

In [15]:
# if below is True, label all sub-words following the first sub-word, else assign -100 to all following sub-words
label_all_tokens = False

In [16]:
def tokenize_and_align_label(ex):
    # tokenize all examples, truncate if exceeding max model size and accept pre-split words
    ex_tokenized = tokenizer(ex["tokens"], truncation=True, is_split_into_words=True)
    
    labels = []
    for i, label in enumerate(ex["ner_tags"]):
        word_ids = ex_tokenized.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            # set the labels of special tokens (CLS, SEP) that have a word id of None to -100
            if word_idx is None:
                label_ids.append(-100)
            # set the label of the first token of each word
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # for other tokens, either set -100 or the current label depending on the label_all_tokens flag
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                
            previous_word_idx = word_idx      
        labels.append(label_ids) 
    ex_tokenized["labels"] = labels
    
    return ex_tokenized

In [17]:
# apply function to all sentences in dataset, use map function
tokenized_datasets = datasets.map(tokenize_and_align_label, batched=True)

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

## Fine-tuning the `distilbert` Model

In [18]:
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [19]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# establishing the base model
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_list), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model_name = "distilbert-finetuned-ner"

# establishing training arguments
training_args = TrainingArguments(
    output_dir=model_name,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

### Create a Data Collator to Pad Tokens and Training Metrics

In [21]:
from transformers import DataCollatorForTokenClassification

# establishing data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
import evaluate
import numpy as np

# establishing seqeval metric
metric = evaluate.load("seqeval")

# post-processing for metric results generated
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### Training

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.278026,0.6,0.250232,0.353172,0.938352
2,No log,0.251375,0.538897,0.353105,0.426652,0.943055
3,0.191000,0.257625,0.545332,0.373494,0.443344,0.945406
4,0.191000,0.2716,0.544098,0.37164,0.44163,0.945535


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=852, training_loss=0.13877330923304312, metrics={'train_runtime': 2262.0758, 'train_samples_per_second': 6.002, 'train_steps_per_second': 0.377, 'total_flos': 184068639256200.0, 'train_loss': 0.13877330923304312, 'epoch': 4.0})

### Evaluation

In [52]:
trainer.evaluate()

{'eval_loss': 0.27652886509895325,
 'eval_precision': 0.5822050290135397,
 'eval_recall': 0.2789620018535681,
 'eval_f1': 0.37719298245614036,
 'eval_accuracy': 0.9410884528237357,
 'eval_runtime': 70.6462,
 'eval_samples_per_second': 18.218,
 'eval_steps_per_second': 1.147,
 'epoch': 2.0}

In [25]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'corporation': {'precision': 0.35714285714285715,
  'recall': 0.29411764705882354,
  'f1': 0.3225806451612903,
  'number': 34},
 'creative-work': {'precision': 0.2727272727272727,
  'recall': 0.02857142857142857,
  'f1': 0.05172413793103448,
  'number': 105},
 'group': {'precision': 0.36363636363636365,
  'recall': 0.20512820512820512,
  'f1': 0.2622950819672131,
  'number': 39},
 'location': {'precision': 0.4375,
  'recall': 0.5675675675675675,
  'f1': 0.4941176470588235,
  'number': 74},
 'person': {'precision': 0.7695961995249406,
  'recall': 0.6893617021276596,
  'f1': 0.7272727272727273,
  'number': 470},
 'product': {'precision': 0.4827586206896552,
  'recall': 0.12280701754385964,
  'f1': 0.1958041958041958,
  'number': 114},
 'overall_precision': 0.6606260296540363,
 'overall_recall': 0.4796650717703349,
 'overall_f1': 0.5557865557865559,
 'overall_accuracy': 0.9504736474028864}

In [26]:
trainer.push_to_hub(
    commit_message="training: 4 epochs"
)

'https://huggingface.co/neelgokhale/distilbert-finetuned-ner/tree/main/'