# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m


You will need to set up git: adapt your email and name in the following cell.

In [None]:
!git config --global user.email "prathisuper@gmail.com"
!git config --global user.name "Prathik Jain"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Load the "fabner_simple" configuration of the DFKI-SLT/fabner dataset
# (train / validation / test splits, see the summary printed further down).
raw_datasets = load_dataset("DFKI-SLT/fabner", "fabner_simple")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/866k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9435 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2183 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9435
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2183
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2064
    })
})

In [None]:
raw_datasets["train"][0]["tokens"]

['Revealed',
 'the',
 'location-specific',
 'flow',
 'patterns',
 'and',
 'quantified',
 'the',
 'speeds',
 'of',
 'various',
 'types',
 'of',
 'flow',
 '.']

In [None]:
raw_datasets["train"][0]["ner_tags"]

[0, 0, 0, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'MATE', 'MANP', 'MACEQ', 'APPL', 'FEAT', 'PRO', 'CHAR', 'PARA', 'ENAT', 'CONPRI', 'MANS', 'BIOP'], id=None), length=-1, id=None)

In [None]:
label_names = ner_feature.feature.names
label_names

['O',
 'MATE',
 'MANP',
 'MACEQ',
 'APPL',
 'FEAT',
 'PRO',
 'CHAR',
 'PARA',
 'ENAT',
 'CONPRI',
 'MANS',
 'BIOP']

In [None]:
# Print the first training sentence with each word's label name underneath.
# Every column is as wide as the longer of the word and its label.
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    word_cells.append(word.ljust(max_length))
    label_cells.append(full_label.ljust(max_length))

# Each column is followed by exactly one space, the last column included.
line1 = "".join(cell + " " for cell in word_cells)
line2 = "".join(cell + " " for cell in label_cells)

print(line1)
print(line2)

Revealed the location-specific flow   patterns and quantified the speeds of various types of flow . 
O        O   O                 CONPRI CONPRI   O   O          O   O      O  O       O     O  O    O 


In [None]:
from transformers import AutoTokenizer

# Hub id of the checkpoint to start from; the same variable is reused further
# down when the model weights are loaded.
model_checkpoint = "kalexa2/fabner-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
tokenizer.is_fast

True

In [None]:
# The sample is a list of words rather than one string, hence
# is_split_into_words=True. tokens() lists the resulting pieces, including the
# special tokens added at both ends (see the output below).
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Rev',
 '##eal',
 '##ed',
 'the',
 'location',
 '-',
 'specific',
 'flow',
 'patterns',
 'and',
 'q',
 '##uant',
 '##ified',
 'the',
 'speeds',
 'of',
 'various',
 'types',
 'of',
 'flow',
 '.',
 '[SEP]']

In [None]:
inputs.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 3,
 4,
 5,
 6,
 6,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 None]

In [None]:
def align_labels_with_tokens(labels, word_ids, b_to_i=False):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: Iterable giving, for every token, the index of the word it
            was produced from, or ``None`` for special tokens.
        b_to_i: Enable only for label sets laid out as
            ``O, B-X, I-X, B-Y, I-Y, ...`` (odd id = ``B-``, following even
            id = ``I-``). Continuation tokens of a word labelled ``B-X`` then
            receive the ``I-X`` id. The default ``False`` is required for flat
            label sets such as the one in this notebook
            (``O, MATE, MANP, MACEQ, ...``), where adding 1 to an odd id would
            silently turn the label into a different entity type.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        special tokens, otherwise the (possibly converted) label of the word
        the token belongs to.
    """
    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: -100 marks the position as ignored.
            new_labels.append(-100)
        else:
            label = labels[word_id]
            # Only continuation tokens of a B-/I- scheme are ever rewritten.
            if b_to_i and word_id == previous_word and label % 2 == 1:
                label += 1
            new_labels.append(label)
        previous_word = word_id

    return new_labels

In [None]:
# Sanity check on the first training sample: print the word-level labels, then
# the token-level labels produced by align_labels_with_tokens for the
# tokenization computed above.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of label ids per sentence).

    Returns:
        The tokenizer output for the batch, extended with a "labels" entry
        that holds, per sentence, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(index) maps the tokens of sentence `index` back to its words.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(index))
        for index, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Apply the tokenization/alignment function to every split, in batches.
# The columns of the raw dataset are removed, so only what
# tokenize_and_align_labels returns is kept.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/9435 [00:00<?, ? examples/s]

Map:   0%|          | 0/2183 [00:00<?, ? examples/s]

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two training samples into one batch. In the output below
# the shorter sample's labels are padded with -100 up to the longer length.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,   10,   10,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100, -100,
         -100],
        [-100,    0,    0,    0,    0,    0,   10,   10,   10,   10,   10,    0,
            0,    0,    0,    0,    0,    2,    0,    0,    0,    0,    2,    0,
         -100]])

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 0, 0, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, -100]


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=52365abaf6fcca3661c7ad2d5620c057b13b580a3226fa3a5d60d67618f9444e
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'CONPRI',
 'CONPRI',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
predictions = labels.copy()
# NOTE(review): labels[2] is already "O" for this sample (see the list above),
# so this assignment changes nothing and the scores below are trivially 1.0.
predictions[2] = "O"
# NOTE(review): the recorded result is keyed 'ONPRI' rather than 'CONPRI'; it
# looks like the metric treats the first character of a bare label name as a
# tagging-scheme prefix — confirm against the seqeval documentation.
metric.compute(predictions=[predictions], references=[labels])



{'ONPRI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy with seqeval.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits`` carries the class
            scores on its last axis; ``labels`` holds the reference label ids,
            with ``-100`` marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's ``overall_*`` values.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    def to_tag(label_id):
        # seqeval expects IOB-style tags ("B-X", "I-X", "O"). Given a bare name
        # it reads the first character as the prefix: the demo cell above
        # reports "CONPRI" as 'ONPRI', and names starting with B/I/E/S can be
        # taken for chunk markers. Prefixing with "I-" keeps the full type name.
        name = label_names[label_id]
        if name == "O" or name[:2] in ("B-", "I-"):
            return name
        return f"I-{name}"

    # Remove ignored index (special tokens) and convert to seqeval tags
    true_labels = [[to_tag(l) for l in label if l != -100] for label in labels]
    true_predictions = [
        [to_tag(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [None]:
# Two-way lookup between class ids and label names, passed to the model config.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint with a classification head sized from id2label/label2id.
# The warning printed below shows the checkpoint's classifier has 49 outputs
# while this label set has 13; ignore_mismatched_sizes=True lets loading
# proceed, with the mismatched classifier weights newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at kalexa2/fabner-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([49]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([49, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

13

In [None]:
# NOTE(review): notebook_login() was already imported and called near the top
# of this notebook; this cell repeats the login before the Hub upload.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments

# Trainer configuration: evaluate and save once per epoch, train for 3 epochs
# at lr 2e-5 with weight decay 0.01, and upload to the Hub.
# NOTE(review): the first argument is spelled like the source checkpoint id
# ("kalexa2/fabner-ner"), while the recorded push further down went to
# 'PrathikJain/fabner-ner' — confirm the intended output/repo naming.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "kalexa2/fabner-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)



In [None]:
from transformers import Trainer

# Quick smoke run rather than a full fine-tune: only the first 100 training
# samples and the first 20 validation samples are used.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].select(range(100)),
    eval_dataset=tokenized_datasets["validation"].select(range(20)),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.368231,0.814159,0.836364,0.825112,0.904472
2,No log,0.377746,0.823009,0.845455,0.834081,0.904472
3,No log,0.380193,0.823009,0.845455,0.834081,0.904472


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=39, training_loss=0.08955646172547951, metrics={'train_runtime': 230.7391, 'train_samples_per_second': 1.3, 'train_steps_per_second': 0.169, 'total_flos': 4456361021400.0, 'train_loss': 0.08955646172547951, 'epoch': 3.0})

In [None]:
# Upload the result of the Trainer run; the returned CommitInfo (below) shows
# the repository and commit that were created.
trainer.push_to_hub(commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/PrathikJain/fabner-ner/commit/e1ab3b3d7a93dec23d47fb142d3e8963a1df4f9f', commit_message='Training complete', commit_description='', oid='e1ab3b3d7a93dec23d47fb142d3e8963a1df4f9f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from torch.utils.data import DataLoader

# Dataloaders over the full train and validation splits for the custom
# training loop; only the training loader is shuffled.
# NOTE(review): both names are assigned again in the training-loop cell
# further down, where they are built from small subsets instead.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [None]:
# Reload the source checkpoint so the custom loop does not continue from the
# weights already fine-tuned by the Trainer run above.
# The checkpoint's classifier head has 49 outputs while id2label defines 13
# labels (see the warning printed by the first load), so the size mismatch has
# to be allowed explicitly here as well; without the flag from_pretrained
# fails with a size-mismatch error.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

In [None]:
from torch.optim import AdamW

# Optimizer for the custom loop, using the same 2e-5 learning rate as the
# TrainingArguments above.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the returned
# (wrapped) objects replace the originals and are what the loop must use.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

# Linear schedule without warmup, spanning
# num_train_epochs * len(train_dataloader) optimizer steps.
# NOTE(review): train_dataloader at this point iterates the full training
# split; a later cell rebuilds it over a subset, so check that this schedule
# length matches the loop that actually consumes the scheduler.
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

# Build the full Hub repository id for this model name; the recorded output
# shows it resolves under the logged-in account's namespace.
model_name = "fabner-ner"
repo_name = get_full_repo_name(model_name)
repo_name

'PrathikJain/fabner-ner'

In [None]:
# Clone the Hub repository into the local folder `output_dir`; the training
# loop saves the model and tokenizer into this folder and pushes from it.
output_dir = "fabner-ner"
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/PrathikJain/fabner-ner into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/249M [00:00<?, ?B/s]

Download file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725162818.6213c8095d0f.521.0: 100%|#######…

Download file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163306.6213c8095d0f.521.2: 100%|#######…

Download file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163658.6213c8095d0f.521.3: 100%|#######…

Download file training_args.bin: 100%|##########| 4.99k/4.99k [00:00<?, ?B/s]

Download file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163060.6213c8095d0f.521.1: 100%|#######…

Clean file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163306.6213c8095d0f.521.2:  20%|#9        …

Clean file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725162818.6213c8095d0f.521.0:  20%|#9        …

Clean file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163658.6213c8095d0f.521.3:  15%|#4        …

Clean file runs/Sep01_03-53-30_6213c8095d0f/events.out.tfevents.1725163060.6213c8095d0f.521.1:  20%|#9        …

Clean file training_args.bin:  20%|##        | 1.00k/4.99k [00:00<?, ?B/s]

Clean file model.safetensors:   0%|          | 1.00k/249M [00:00<?, ?B/s]

In [None]:
def postprocess(predictions, labels):
    """Convert batched id tensors into tag lists suitable for seqeval.

    Args:
        predictions: Tensor of predicted label ids, one row per sequence.
        labels: Tensor of reference label ids laid out like ``predictions``;
            positions holding ``-100`` are dropped from both outputs.

    Returns:
        ``(true_labels, true_predictions)`` — references first, predictions
        second. Each is a list (one entry per sequence) of tag strings.
    """
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    def to_tag(label_id):
        # seqeval expects IOB-style tags ("B-X", "I-X", "O"). Given a bare name
        # it reads the first character as the prefix: the metric demo earlier
        # in this notebook reports "CONPRI" as 'ONPRI', and names starting with
        # B/I/E/S can be taken for chunk markers. "I-" keeps the full type name.
        name = label_names[label_id]
        if name == "O" or name[:2] in ("B-", "I-"):
            return name
        return f"I-{name}"

    # Remove ignored index (special tokens) and convert to seqeval tags
    true_labels = [[to_tag(l) for l in label if l != -100] for label in labels]
    true_predictions = [
        [to_tag(p) for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch

# Custom training loop on a small subset: train, evaluate with seqeval, then
# save and push to the Hub after every epoch.
num_train_epochs = 2
small_train_dataset = tokenized_datasets["train"].select(range(100))  # First 100 samples
small_eval_dataset = tokenized_datasets["validation"].select(range(20))  # First 20 samples

train_dataloader = DataLoader(small_train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8, collate_fn=data_collator)
# These loaders replace the ones that were given to accelerator.prepare
# earlier, so they must be prepared as well; otherwise their batches are not
# placed on the device the prepared model lives on.
train_dataloader, eval_dataloader = accelerator.prepare(train_dataloader, eval_dataloader)

# Adjust the number of training steps accordingly
num_training_steps = len(train_dataloader) * num_train_epochs

# The scheduler created earlier was sized for the full training split and a
# different epoch count; rebuild it so the linear decay ends with this loop.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (references, predictions) in that order; unpacking
        # them the other way round would swap the two when feeding the metric.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False: the upload does not hold up the next epoch.
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )


  0%|          | 0/26 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


epoch 0: {'precision': 0.8363636363636363, 'recall': 0.8363636363636363, 'f1': 0.8363636363636363, 'accuracy': 0.9085365853658537}


  _warn_prf(average, modifier, msg_start, len(result))


epoch 1: {'precision': 0.8, 'recall': 0.8380952380952381, 'f1': 0.8186046511627908, 'accuracy': 0.8943089430894309}


In [None]:
# Final save after the loop: wait for all processes, unwrap the model from its
# Accelerate wrapper and write it to output_dir.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
# "fabner-ner" is also the local output_dir the training loop saved the model
# and tokenizer into. Note that this reassigns model_checkpoint, which earlier
# cells used for the source checkpoint id.
model_checkpoint = "fabner-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Biofabrication through AM emerged in the recent years as a new alternative to fabricate tissues.")

[{'entity_group': 'MANP',
  'score': 0.9905567,
  'word': 'Biofabrication',
  'start': 0,
  'end': 14},
 {'entity_group': 'MANP',
  'score': 0.9966037,
  'word': 'AM',
  'start': 23,
  'end': 25},
 {'entity_group': 'MATE',
  'score': 0.9813712,
  'word': 'as',
  'start': 54,
  'end': 56},
 {'entity_group': 'MANP',
  'score': 0.9954963,
  'word': 'fabricate',
  'start': 78,
  'end': 87}]