In [None]:
! pip install spacy
! pip install transformers
! pip install datasets
!pip show seqeval
!pip install -U seqeval
! pip install scikit-learn
! pip install evaluate

In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import evaluate

In [None]:
# Load the dataset
datasets = load_dataset("conll2003")

# Tag names in the dataset's own id order, so index i here is the tag that
# `ner_tags` encodes as i.
ner_label_names = datasets["train"].features["ner_tags"].feature.names

# Load the tokenizer and model
# The checkpoint id gets its own name so that `model` only ever refers to the
# model object (previously the same name held the string and then the model).
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ner_label_names),
    # Store the dataset's id <-> tag mapping in the model config so that the
    # labels reported by the fine-tuned model match the ids it was trained on.
    # NOTE(review): the checkpoint ships its own mapping; if its tag order
    # differs from the dataset's, the pretrained classification head starts out
    # mis-indexed and relies on fine-tuning to adapt -- confirm.
    id2label=dict(enumerate(ner_label_names)),
    label2id={name: idx for idx, name in enumerate(ner_label_names)},
)


# Tokenization and label-alignment function, adapted from the Hugging Face
# token-classification guide:
# https://huggingface.co/docs/transformers/en/tasks/token_classification
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tags per sentence).

    Returns:
        The tokenizer output with an added "labels" entry. For every sentence,
        the first token of each word carries that word's tag; positions with no
        word id and repeated positions of the same word are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # Word id of every token in this sentence (None where there is no word).
        current_ids = list(encoded.word_ids(batch_index=row))
        # Same sequence shifted right by one: the word id seen just before each token.
        preceding_ids = [None] + current_ids[:-1]
        aligned_labels.append([
            word_tags[current]
            if current is not None and current != preceding
            else -100
            for current, preceding in zip(current_ids, preceding_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded

# Tokenize every split and attach token-aligned labels; the original columns are
# dropped so only the tokenizer outputs and "labels" remain.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, remove_columns=datasets["train"].column_names)

# Initialise the data collator with padding
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, return_tensors="pt")

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialise Trainer with the new data collator
# Per-epoch evaluation uses the validation split so that the test split is only
# touched once, for the final report after training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Train the model with the adjusted settings
trainer.train()


# Tag names in dataset id order; used to turn predicted/gold ids back into strings.
label_list = datasets["train"].features["ner_tags"].feature.names

# Adapted from: Tunstall, Lewis; Werra, Leandro von; Wolf, Thomas. Natural Language Processing with Transformers
def align_predictions(predictions, label_ids, label_names=None):
    """Turn model scores and padded label ids into per-sequence tag strings.

    Positions whose label id is -100 are skipped in both outputs, so the two
    returned lists stay position-aligned.

    Args:
        predictions: array-like of class scores; the predicted class is the
            argmax over axis 2.
        label_ids: one row of integer label ids per sequence, with -100 marking
            positions to ignore.
        label_names: optional sequence mapping a class id to its tag string.
            When omitted, the notebook-level `label_list` is used.

    Returns:
        A tuple `(preds_list_str, labels_list_str)` of two lists, each holding
        one list of tag strings per sequence.
    """
    names = label_list if label_names is None else label_names
    pred_ids = np.argmax(predictions, axis=2)

    preds_list_str = []
    labels_list_str = []
    for seq_pred_ids, seq_label_ids in zip(pred_ids, label_ids):
        seq_label_ids = np.asarray(seq_label_ids)
        # Keep only real labels; -100 marks padding / ignored positions.
        keep = seq_label_ids != -100
        # Only the positions covered by the label row are considered.
        kept_pred_ids = seq_pred_ids[: len(seq_label_ids)][keep]
        preds_list_str.append([names[class_id] for class_id in kept_pred_ids])
        labels_list_str.append([names[class_id] for class_id in seq_label_ids[keep]])

    return preds_list_str, labels_list_str


# Run the trained model on the test split and convert ids to tag strings.
# The output's fields are read by name rather than unpacked into `_`, which
# IPython uses for the last cell output.
test_output = trainer.predict(tokenized_datasets["test"])
predictions, labels = test_output.predictions, test_output.label_ids
preds_list, labels_list = align_predictions(predictions, labels)


In [None]:
# Collapse the per-sentence tag lists into flat, position-aligned token-level lists.
flat_labels = [tag for sentence_tags in labels_list for tag in sentence_tags]
flat_preds = [tag for sentence_tags in preds_list for tag in sentence_tags]

# Encode labels
# One encoder is fitted on gold tags followed by predicted tags, so both share
# a single integer id space.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(flat_labels + flat_preds)

# The first `split_index` codes are the gold tags; the remainder are the predictions.
split_index = len(flat_labels)
flat_encoded_labels, flat_encoded_preds = np.split(encoded_labels, [split_index])

# Compute metrics
print(
    classification_report(
        flat_encoded_labels,
        flat_encoded_preds,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)

In [None]:
# Score the predicted tag sequences against the gold sequences with seqeval,
# using strict matching under the IOB2 scheme.
metric = evaluate.load("seqeval")

results = metric.compute(
    references=labels_list,
    predictions=preds_list,
    mode="strict",
    scheme="IOB2",
)
print("Full entity evaluation Results:", results)

In [None]:
# Compute the confusion matrix
# Row/column order used for the plot.
custom_order = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# `labels=` makes scikit-learn build the matrix directly in `custom_order`, with
# one row/column per listed tag even if a tag never occurs. This replaces the
# earlier reindexing through the label encoder, which required every tag to be
# present in the encoded data.
new_cm = confusion_matrix(flat_labels, flat_preds, labels=custom_order)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(new_cm, annot=True, fmt="d", cmap='Blues', xticklabels=custom_order, yticklabels=custom_order, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Transformers Confusion Matrix')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()