# Evaluate the NER model

In preparation for this workshop, we have annotated a subset of 200 of those 2000 reviews and trained a NER model with it. Training the model requires access to a GPU (graphics processing unit), a piece of hardware built to perform many calculations in parallel, which makes it very useful both for playing demanding video games and for training and running models. Unfortunately, this machine does not have access to a GPU, and training the model takes too long anyway (you won't want to spend 10 minutes looking at the screen). So we have pretrained this model for you.

As explained, we have divided the annotated data into a "train", a "validation", and a "test" dataset. We have used the "train" and "validation" datasets to train the model and select its best version, but the model hasn't seen the "test" section of the data yet. We will load the model and test it to see how well it performs, and show a few examples of cases where it fails.

In [None]:
# LOAD THE DATA
# TODO - This is incorrect. Load the actual test set, already split into sentences
import json

test_annotations_path = "animals_100_annotated.json"

# JSON is UTF-8 by specification; pin the encoding so the load does not depend
# on the platform's default locale encoding (e.g. cp1252 on Windows).
with open(test_annotations_path, "r", encoding="utf-8") as f:
    test_ann = json.load(f)

# Keep only the first 30 annotated reviews as the (temporary) evaluation set.
test_ann = test_ann[:30]

# === Separate into sentences
import spacy

def extract_sentence_level_annotations(ann):
    """Split reviews into sentences and extract entity span offsets for each sentence.

    Args:
        ann: List of annotated reviews. Each item is read as
            ``item["data"]["Text"]`` (the review text) plus an optional
            ``item["annotations"]`` list whose entries hold a ``"result"``
            list of ``{"value": {"start": int, "end": int}}`` dicts with
            character offsets into the review text.

    Returns:
        A list with one dict per sentence:
        ``{"text": <stripped sentence>, "sent_spans": [(start, end), ...]}``.
        Offsets are relative to the stripped sentence text and end-exclusive,
        so ``text[start:end]`` is the annotated mention.
    """
    # Load model to split sentences
    nlp = spacy.load("en_core_web_sm")

    annotated_data = []
    for item in ann:
        review_text = item["data"]["Text"]

        # Collect the review-level spans once, instead of re-walking the
        # annotation tree for every sentence. Results without character
        # offsets (anything that is not a span label) are skipped.
        review_spans = []
        for annotation in item.get("annotations", []):
            for res in annotation.get("result", []):
                v = res.get("value", {})
                if "start" in v and "end" in v:
                    review_spans.append((v["start"], v["end"]))

        # Separate into sentences using Spacy
        doc = nlp(review_text)
        for sent in doc.sents:
            raw_text = sent.text
            text = raw_text.strip()

            # strip() may remove leading whitespace; offsets must be shifted
            # by the same amount or they no longer point at the mention.
            leading = len(raw_text) - len(raw_text.lstrip())
            text_start = sent.start_char + leading

            sent_spans = []
            for start, end in review_spans:
                # `end` is exclusive, so a mention that finishes exactly at
                # the sentence end (end == sent.end_char) belongs to it.
                if sent.start_char <= start and end <= sent.end_char:
                    # Clamp to the stripped text in case the annotation
                    # touched the whitespace that strip() removed.
                    rel_start = max(start - text_start, 0)
                    rel_end = min(end - text_start, len(text))
                    if rel_start < rel_end:
                        sent_spans.append((rel_start, rel_end))

            # Add to annotated data
            annotated_data.append({
                "text": text,
                "sent_spans": sent_spans,
            })

    # Return annotated data
    return annotated_data
    
# Build the sentence-level examples from the annotated reviews.
annotated_data = extract_sentence_level_annotations(test_ann)

# Print one annotated sentence as a sanity check. Most sentences carry no
# annotation, so skip ahead until the first one that does.
for example in annotated_data:
    if example["sent_spans"] == []:
        # No annotated animal in this sentence: keep looking.
        continue

    sent, spans = example["text"], example["sent_spans"]
    print("Sentence:\t", sent)
    print("Spans:\t", spans)
    # Slice each (start, end) pair out of the sentence to show the mentions.
    print("Animals:\t", ", ".join([sent[start:end] for start, end in spans]))

    # A single example is enough, so stop here.
    break


In [None]:
# LOAD THE MODEL AND THE TOKENIZER
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Directory (or hub id) holding the pretrained animal NER checkpoint.
LOAD_MODEL_PATH = "animal-ner-model"

# The token-classification model and its tokenizer are both restored from the
# same path; the two loads are independent of each other.
model = AutoModelForTokenClassification.from_pretrained(LOAD_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(LOAD_MODEL_PATH)

In [None]:
# PREPARE DATA FOR EVALUATING
from datasets import Dataset

# === Define label schema
# BIO tags: "O" is outside any entity, "B-ANIMAL" opens an animal mention and
# "I-ANIMAL" continues it. A label's id is its position in this list.
label_list = ["O", "B-ANIMAL", "I-ANIMAL"]
label_to_id = dict(zip(label_list, range(len(label_list))))

# === Convert into a Dataset for evaluation
# Wrap the list of per-sentence dicts ("text", "sent_spans") in a Dataset so
# it can be processed with Dataset.map further down.
dataset = Dataset.from_list(annotated_data)

# === Tokenize + align labels
def align_tokens_with_labels(example):
    """Tokenize one sentence and assign a BIO label id to every token.

    Args:
        example: Dict with ``"text"`` (the sentence) and ``"sent_spans"``
            (a list of ``(start, end)`` character offsets into the text,
            end-exclusive).

    Returns:
        The tokenizer encoding, padded/truncated to 128 tokens, extended with
        ``"labels"`` (one entry per token; -100 for tokens whose offset is
        empty) and ``"text"`` (the original sentence). The temporary
        ``"offset_mapping"`` entry is removed.
    """
    text = example["text"]
    spans = example["sent_spans"]

    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    labels = []
    for tok_start, tok_end in encoding["offset_mapping"]:
        # Tokens with an empty offset do not map to any character of the text
        # (presumably the special and padding tokens); mark them with -100.
        if tok_start == tok_end:
            labels.append(-100)
            continue

        label = "O"
        for span_start, span_end in spans:
            # Skip spans that share no character with this token.
            if tok_end <= span_start or tok_start >= span_end:
                continue
            # The token covering the first character of the span opens the
            # entity, even when it starts slightly before the span; any later
            # overlapping token continues it. Requiring an exact start match
            # would leave such an entity without a B- tag.
            label = "B-ANIMAL" if tok_start <= span_start else "I-ANIMAL"
            break
        labels.append(label_to_id[label])

    # Offsets were only needed for the alignment above.
    encoding.pop("offset_mapping")
    encoding["labels"] = labels
    encoding["text"] = text
    return encoding

# Apply the tokenization + label alignment to the dataset via Dataset.map.
tokenized_dataset = dataset.map(align_tokens_with_labels)

In [None]:
# Inspect one sample from the tokenized dataset
def inspect_dataset(tokenized_dataset, i=3) -> None:
    """Print the text, tokens and labels of one tokenized example.

    Args:
        tokenized_dataset: Indexable dataset whose rows hold ``"text"``,
            ``"input_ids"`` and ``"labels"`` (as written by
            ``align_tokens_with_labels``).
        i: Index of the example to show. Change it to get other examples.
    """
    sample = tokenized_dataset[i]

    print("Original Text:")
    # Read the text from the sample itself, so the output always matches the
    # dataset that was passed in rather than a module-level global.
    print(sample["text"])
    print("\nTokenized Tokens:")
    print(tokenizer.convert_ids_to_tokens(sample["input_ids"]))
    print("\nLabels:")
    print(list(sample["labels"]))

    # Decode label ids back to strings (optional)
    id_to_label = {v: k for k, v in label_to_id.items()}
    decoded_labels = [
        # -100 marks tokens skipped during alignment; any other unknown id is
        # shown as "PAD".
        "IGN" if label_id == -100 else id_to_label.get(label_id, "PAD")
        for label_id in sample["labels"]
    ]
    print("\nDecoded Labels:")
    print(decoded_labels)

# Show the default example (i=3); pass another index to inspect a different one.
inspect_dataset(tokenized_dataset)

In [None]:
# EVALUATE THE DATASET
# TODO - I don't know how to do that without the trainer. Ask ChatGPT
from tqdm import tqdm

batch_size = 16

# Process reviews in batches
for i in tqdm(range(0, len(tokenized_dataset), batch_size), desc="Finding animals in text"):
    batch_sents = tokenized_dataset[i:i+batch_size]

    # Tokenize all sentences in the batch with padding, truncation, and offset mapping
    encodings = tokenizer(
        batch_sents,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
        return_offsets_mapping=True
    )

    # Get input tensors
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    offset_mappings = encodings["offset_mapping"]

    # Run model inference without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    # Get predicted label indices
    predictions = torch.argmax(logits, dim=-1)

    for sent_idx, sentence in enumerate(sentences):
        # Get model output for this sentence
        preds = predictions[sent_idx]           # predicted labels (as IDs) for each token
        input_ids_sent = input_ids[sent_idx]    # the token IDs for the sentence (what was passed into the model).
        offsets = offset_mappings[sent_idx]     # for each token, this tells use where the original string of the token came from