In [1]:
import os

import numpy as np
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

2023-09-07 22:28:29.773032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-07 22:28:29.904404: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-09-07 22:28:30.310262: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-07 22:28:30.310346: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Load the "conll2003" dataset through `datasets.load_dataset`; later cells
# index the result by split name ("train", "validation", "test").
dataset = load_dataset("conll2003")

In [3]:
# Ordered NER tag names read from the train split's "ner_tags" feature.
# compute_metrics indexes this list with integer label ids, so the position of
# a name in the list is its label id.
label_names = dataset["train"].features["ner_tags"].feature.names

In [4]:
# Tokenizer for the same "bert-base-uncased" checkpoint that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
def tokenize_adjust_labels(all_samples_per_split):
    """Tokenize pre-split sentences and expand word-level NER tags to token level.

    Args:
        all_samples_per_split: A batch with a "tokens" entry (one list of words
            per sentence) and a parallel "ner_tags" entry (one integer tag per
            word of that sentence).

    Returns:
        The encoding returned by ``tokenizer.batch_encode_plus`` with an extra
        "labels" entry: one list per sentence, with one label per token.

        * Tokens that have no source word (word id ``None``) are labelled
          -100, the ignored index that ``compute_metrics`` filters out.
        * Every token that belongs to a word, including continuation
          sub-tokens, receives that word's tag.
    """
    tokenized_samples = tokenizer.batch_encode_plus(
        all_samples_per_split["tokens"],
        is_split_into_words=True
    )
    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        word_tags = all_samples_per_split["ner_tags"][k]
        # Look the tag up by the word id itself instead of counting how many
        # times the word id changed: if a word yields no tokens at all, the
        # word ids skip an index and a running counter would shift every
        # following label of the sentence onto the wrong word.
        adjusted_label_ids = [
            -100 if wid is None else word_tags[wid]
            for wid in tokenized_samples.word_ids(batch_index=k)
        ]
        total_adjusted_labels.append(adjusted_label_ids)

    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [6]:
# Run tokenization + label alignment over the dataset in batches. The sample
# printed further down shows the result keeps the original columns and gains
# input_ids / token_type_ids / attention_mask / labels.
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)

In [7]:
# Token-classification collator built from the tokenizer; it is handed to the
# Trainer below as its data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [8]:
# "seqeval" metric object; compute_metrics calls metric.compute(...) on it.
# NOTE(review): the cell output echoes this line, which looks like the tail of
# a warning -- presumably a `load_metric` deprecation notice. Confirm, and
# consider migrating if the `evaluate` package is an acceptable dependency.
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [9]:
def compute_metrics(p):
    """Convert raw predictions into a flat dictionary of seqeval scores.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over axis 2 to one label id per position; ``labels``
            holds the reference ids, where -100 marks positions to ignore.

    Returns:
        A dict containing the four ``overall_*`` scores reported by the metric
        plus one ``"<key>_f1"`` entry for every other key in the metric result.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 (ignored index, e.g. special tokens) are
            # dropped from both the predictions and the references.
            if gold_id == -100:
                continue
            kept_predictions.append(label_names[predicted_id])
            kept_labels.append(label_names[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    flattened_results = {key: results[key] for key in overall_keys}

    # Every remaining key of the metric result contributes only its "f1" value.
    for key, value in results.items():
        if key in flattened_results:
            continue
        flattened_results[key + "_f1"] = value["f1"]

    return flattened_results


In [10]:
# Load "bert-base-uncased" with a token-classification head that has one output
# per NER tag. The warning printed under this cell reports that
# classifier.weight / classifier.bias are newly initialized, so the model has
# to be fine-tuned before its predictions are meaningful.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(label_names)
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tuning configuration passed to the Trainer.
# - evaluation_strategy="steps" with logging_steps=1000: the training log below
#   shows one evaluation row every 1000 steps.
# - save_strategy='no': checkpoint saving is switched off here; the weights are
#   written explicitly with torch.save in a later cell.
# NOTE(review): run_name says "ep_10" while num_train_epochs is 7 -- confirm
# which of the two is intended.
training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps = 1000,
    run_name = "ep_10_tokenized_11",
    save_strategy='no'
)

In [12]:
# Wire everything together: the model, the training arguments, the tokenized
# train split for fitting, the tokenized validation split for the periodic
# evaluations, the collator, and compute_metrics for the reported scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [13]:
# Run fine-tuning. As the last expression of the cell, the returned TrainOutput
# is displayed after the per-evaluation log table.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc F1,Misc F1,Org F1,Per F1
1000,0.1146,0.059676,0.92622,0.938136,0.93214,0.984781,0.963782,0.82631,0.903443,0.968786
2000,0.0215,0.061686,0.933576,0.94496,0.939234,0.985591,0.963855,0.839664,0.915352,0.975473
3000,0.0101,0.065469,0.935266,0.94552,0.940365,0.985988,0.963273,0.84668,0.918686,0.973793


TrainOutput(global_step=3073, training_loss=0.047812584126922776, metrics={'train_runtime': 294.6037, 'train_samples_per_second': 333.625, 'train_steps_per_second': 10.431, 'total_flos': 2641560961318200.0, 'train_loss': 0.047812584126922776, 'epoch': 7.0})

In [14]:
# Spot-check one processed validation example: in the output, "labels" carries
# -100 at the first and last token positions and the word tags in between.
tokenized_dataset["validation"][0]

{'id': '0',
 'tokens': ['CRICKET',
  '-',
  'LEICESTERSHIRE',
  'TAKE',
  'OVER',
  'AT',
  'TOP',
  'AFTER',
  'INNINGS',
  'VICTORY',
  '.'],
 'pos_tags': [22, 8, 22, 22, 15, 22, 22, 22, 22, 21, 7],
 'chunk_tags': [11, 0, 11, 12, 13, 11, 12, 12, 12, 12, 0],
 'ner_tags': [0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0],
 'input_ids': [101,
  4533,
  1011,
  20034,
  2202,
  2058,
  2012,
  2327,
  2044,
  7202,
  3377,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, -100]}

In [33]:
# Persist the fine-tuned weights (state_dict only), then run the model on the
# test split.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; without this the cell fails on a fresh checkout.
os.makedirs("NER_BERT", exist_ok=True)
torch.save(model.state_dict(), "NER_BERT/NER_BERT.pt")
# `preds` holds whatever Trainer.predict returns for the test split.
# NOTE(review): this cell's execution count (In [33]) is out of sequence with
# the cells above -- re-run the notebook top to bottom before sharing.
preds = trainer.predict(tokenized_dataset["test"])