# Evaluate/Test a Trained Model

In [None]:
#!pip install seqeval "torch>=1.6.0" transformers datasets

In [1]:
import os
import sys
# Put <parent of the working directory>/src/training at the front of the
# import search path so modules stored there can be imported by name.
parent = os.path.split(os.getcwd())[0]

sys.path[:0] = [f'{parent}/src/training']

In [56]:

import torch
from datasets import load_metric
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import SequentialSampler
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification,default_data_collator,DataCollatorWithPadding

from preprocess_utils import load_ner_dataset, tokenize_dataset
from train_utils import prepare_compute_metrics

In [60]:
from types import SimpleNamespace

# Evaluation settings, gathered in a plain namespace so they can be read
# with attribute access (args.dataset, args.batch_size, ...).
args = SimpleNamespace()
args.model_name_or_path = 'elastic/distilbert-base-uncased-finetuned-conll03-english'
args.dataset = 'conll2003'
args.batch_size = 4
args.pad_to_max_length = False


In [67]:
def evaluate(args):
    """Evaluate a token-classification checkpoint on the dataset's test split.

    The model is run over the tokenized test split one example at a time.
    Predicted class ids and gold label ids are converted to label strings
    and accumulated in the seqeval metric; the final scores are printed and
    returned.

    Args:
        args: Namespace-like object providing ``model_name_or_path`` (model
            id or local path), ``dataset`` (name handed to
            ``load_ner_dataset``) and ``pad_to_max_length`` (bool).

    Returns:
        The value returned by the ``compute_metrics`` callable built by
        ``prepare_compute_metrics``.
    """
    # Load tokenizer and model; the prefix space is enabled only for
    # checkpoints whose name contains "roberta".
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        use_fast=True,
        add_prefix_space="roberta" in args.model_name_or_path.lower(),
    )
    model = AutoModelForTokenClassification.from_pretrained(args.model_name_or_path)

    # Load and tokenize the test split.
    datasets, _num_labels, label_to_id, label_list = load_ner_dataset(args.dataset)
    padding = "max_length" if args.pad_to_max_length else False
    dataset = tokenize_dataset(
        dataset=datasets["test"],
        tokenizer=tokenizer,
        padding=padding,
        label_to_id=label_to_id,
    )

    metric = load_metric("seqeval")
    compute_metrics = prepare_compute_metrics(metric, label_list)

    # NOTE(review): assumes tokenize_dataset marks positions that must not be
    # scored (special tokens, sub-word continuations, padding) with -100, the
    # usual Hugging Face convention -- confirm against preprocess_utils.
    ignore_index = -100

    model.eval()
    for example in dataset:
        # Each dataset item is a single sequence; the model expects a leading
        # batch dimension, so add one.
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0)
        with torch.no_grad():
            # Keyword arguments avoid depending on the positional order of
            # the model's forward() signature.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Reduce logits to one predicted class id per token.
        predicted_ids = outputs.logits.argmax(dim=-1).squeeze(0).tolist()
        label_ids = [int(label_id) for label_id in example["labels"]]

        # seqeval scores sequences of label strings, not label ids, so map
        # ids through label_list and drop ignored positions.
        # NOTE(review): assumes label_list[i] is the label string for id i.
        scored = [
            (pred_id, label_id)
            for pred_id, label_id in zip(predicted_ids, label_ids)
            if label_id != ignore_index
        ]
        metric.add(
            predictions=[label_list[pred_id] for pred_id, _ in scored],
            references=[label_list[label_id] for _, label_id in scored],
        )

    result = compute_metrics()
    print(result)
    return result


In [None]:
# Run the evaluation using the settings stored in `args`.
evaluate(args)