https://huggingface.co/docs/transformers/en/tasks/token_classification

In [1]:
import os
import warnings

# Runtime switches, exported here before the later cell that imports
# TensorFlow / transformers.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "AUTOGRAPH_VERBOSITY": "1",
        # "TF_CPP_MIN_LOG_LEVEL": "2",
    }
)

# Keep the notebook output readable: FutureWarning first, then every other
# warning category is silenced as well.
warnings.simplefilter("ignore", FutureWarning)
warnings.filterwarnings("ignore")

#!pip3 install -qU tensorflow tf_keras transformers datasets evaluate seqeval sentencepiece

In [2]:
#from huggingface_hub import notebook_login
#notebook_login()

In [3]:
import tensorflow as tf

from datasets import load_dataset
from transformers import (
    create_optimizer,
    pipeline,
    AutoTokenizer,
    BigBirdTokenizerFast,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TFAutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)

from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

In [4]:
# Download (or reuse the cached copy of) the WNUT-17 dataset.
wnut = load_dataset(path="wnut_17")

# Peek at the first training example: an id, its tokens and their tag ids.
wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [5]:
# Tag names as declared by the dataset; list position == integer tag id.
label_list = wnut["train"].features["ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [6]:
# Integer id -> tag name, built from a single ordered tag list so the two
# directions of the mapping cannot get out of sync.
id2label = dict(
    enumerate(
        [
            "O",
            "B-corporation",
            "I-corporation",
            "B-creative-work",
            "I-creative-work",
            "B-group",
            "I-group",
            "B-location",
            "I-location",
            "B-person",
            "I-person",
            "B-product",
            "I-product",
        ]
    )
)

# Tag name -> integer id (exact inverse of id2label).
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

In [7]:
import evaluate
import numpy as np

# Entity-level metric implementation used by compute_metrics below.
seqeval = evaluate.load("seqeval")

# Human-readable tags of the first training example (handy for eyeballing).
first_example_tag_ids = wnut["train"][0]["ner_tags"]
labels = [label_list[tag_id] for tag_id in first_example_tag_ids]


def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2 (presumably
            ``(batch, sequence, num_labels)`` -- confirm against the caller);
            ``labels`` holds the reference tag ids, where ``-100`` marks
            positions to be skipped.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    class_scores, reference_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions that carry a real label (-100 == ignore).
        scored_pairs = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in scored_pairs])
        true_labels.append([label_list[reference_id] for _, reference_id in scored_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

def tokenize_and_align_labels(examples, tokenizer_override=None):
    """Tokenize pre-split words and align word-level NER tags to the tokens.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (one list of words
            per example) and a ``"ner_tags"`` entry (one list of tag ids per
            example, indexed by word position).
        tokenizer_override: optional tokenizer to use instead of the
            notebook-level ``tokenizer`` variable. It is called as
            ``tok(words, truncation=True, is_split_into_words=True)`` and its
            result must offer ``word_ids(batch_index=i)`` and item
            assignment. When left as ``None`` the notebook-level
            ``tokenizer`` is used, so existing
            ``dataset.map(tokenize_and_align_labels, batched=True)`` calls
            keep working unchanged.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one list per
        example holding the word's tag id on the first token of each word and
        ``-100`` on special tokens and on the remaining tokens of a word.
    """
    # The global is only looked up when no explicit tokenizer is supplied.
    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override
    tokenized_inputs = active_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no source word: mark them as ignored.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only the first token of a given word carries the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation token of the same word: ignored.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
# Backend switch: False fine-tunes DistilBERT with TensorFlow/Keras,
# True fine-tunes BigBird with the PyTorch Trainer.
USE_PYTORCH = False

# Derived from the label map so the classification head always matches the tag set.
num_labels = len(id2label)
# Single source of truth for the batch size (datasets, Trainer and LR schedule).
batch_size = 16

if USE_PYTORCH:
    # PyTorch
    num_train_epochs = 2
    model_path = 'google/bigbird-roberta-base'
    model_checkpoint = 'bigbird-roberta-ner' # marksusol/bigbird-roberta-ner
    model_output_dir = 'bigbird-roberta-ner'
    tokenizer = BigBirdTokenizerFast.from_pretrained(model_path)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=num_labels, id2label=id2label, label2id=label2id
    )

    # tokenize_and_align_labels reads the `tokenizer` assigned just above.
    tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        dataloader_num_workers=2,
        #dataloader_prefetch_factor=1,
        #num_workers=1,  # enable multiprocessing.
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # push_to_hub=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_wnut["train"],
        # Checkpoint selection (load_best_model_at_end) must not look at the
        # test split; use "validation" like the TensorFlow branch does.
        eval_dataset=tokenized_wnut["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    tokenizer.save_pretrained(model_checkpoint)
    model.save_pretrained(model_checkpoint)
else:
    # TensorFlow
    num_train_epochs = 3
    model_path = 'distilbert/distilbert-base-uncased'
    model_checkpoint = 'distilbert-ner' # marksusol/distilbert-ner
    model_output_dir = 'distilbert-ner'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

    model = TFAutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=num_labels, id2label=id2label, label2id=label2id
    )

    # tokenize_and_align_labels reads the `tokenizer` assigned just above.
    tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

    tf_train_set = model.prepare_tf_dataset(
        tokenized_wnut["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        tokenized_wnut["validation"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    # The schedule length is tied to the same batch size / epoch count that
    # is handed to fit() below, so the two cannot drift apart.
    num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs
    optimizer, lr_schedule = create_optimizer(
        init_lr=2e-5,
        num_train_steps=num_train_steps,
        weight_decay_rate=0.01,
        num_warmup_steps=0,
    )

    model.compile(optimizer=optimizer)  # No loss argument!
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

    #push_to_hub_callback = PushToHubCallback(
    #    output_dir=model_output_dir,
    #    tokenizer=tokenizer,
    #)

    callbacks = [metric_callback] #, push_to_hub_callback]
    model.fit(
        x=tf_train_set,
        validation_data=tf_validation_set,
        epochs=num_train_epochs,
        callbacks=callbacks,
    )

    tokenizer.save_pretrained(model_checkpoint)
    model.save_pretrained(model_checkpoint)


2024-03-09 20:12:19.471748: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-03-09 20:12:19.471776: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-03-09 20:12:19.471783: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-03-09 20:12:19.471799: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-09 20:12:19.471807: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_n

Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2024-03-09 20:12:34.101450: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




2024-03-09 20:29:54.419965: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 2/3


  _warn_prf(average, modifier, msg_start, len(result))




2024-03-09 20:57:43.253027: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 3/3


2024-03-09 21:18:57.887945: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Inference

In [9]:
# Sample sentence used to smoke-test the fine-tuned checkpoint.
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."

# Loads the checkpoint directory written by the training cell
# (hub-hosted alternative: model="marksusol/bigbird-roberta-ner").
classifier = pipeline(task="ner", model=model_checkpoint)
classifier(text)

Some layers from the model checkpoint at distilbert-ner were not used when initializing TFDistilBertForTokenClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-ner and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no pr

[{'entity': 'B-location',
  'score': 0.18775739,
  'index': 1,
  'word': 'the',
  'start': 0,
  'end': 3},
 {'entity': 'B-location',
  'score': 0.25161478,
  'index': 2,
  'word': 'golden',
  'start': 4,
  'end': 10},
 {'entity': 'I-location',
  'score': 0.24408132,
  'index': 3,
  'word': 'state',
  'start': 11,
  'end': 16},
 {'entity': 'I-location',
  'score': 0.190604,
  'index': 4,
  'word': 'warriors',
  'start': 17,
  'end': 25},
 {'entity': 'B-location',
  'score': 0.32201427,
  'index': 13,
  'word': 'san',
  'start': 80,
  'end': 83},
 {'entity': 'B-location',
  'score': 0.23932636,
  'index': 14,
  'word': 'francisco',
  'start': 84,
  'end': 93},
 {'entity': 'B-location',
  'score': 0.102141924,
  'index': 15,
  'word': '.',
  'start': 93,
  'end': 94}]

In [10]:
# What the pipeline above does, spelled out step by step.
# Flip the flag to run it; AutoTokenizer, TFAutoModelForTokenClassification
# and tf come from the import cell at the top of the notebook.
RUN_MANUAL_INFERENCE = False

if RUN_MANUAL_INFERENCE:
    # Dedicated names: rebinding `tokenizer` / `model` here would silently
    # change what the training helpers and cells above operate on.
    #inference_tokenizer = AutoTokenizer.from_pretrained("marksusol/bigbird-roberta-ner")
    inference_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    inputs = inference_tokenizer(text, return_tensors="tf")

    #inference_model = TFAutoModelForTokenClassification.from_pretrained("marksusol/bigbird-roberta-ner")
    inference_model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)
    logits = inference_model(**inputs).logits

    # Highest-scoring class id per token, mapped back to its tag name.
    predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
    predicted_token_class = [
        inference_model.config.id2label[t]
        for t in predicted_token_class_ids[0].numpy().tolist()
    ]
    # A bare expression nested inside `if` is not echoed by the notebook,
    # so the result is printed explicitly.
    print(predicted_token_class)