**Load the Fine-Tuned Model**

In [7]:
import torch
from transformers import BertForTokenClassification, BertTokenizer, Trainer, TrainingArguments
import evaluate  
import numpy as np

In [8]:
# Load the fine-tuned model and tokenizer
model = BertForTokenClassification.from_pretrained("./models/fine_tuned_model")
tokenizer = BertTokenizer.from_pretrained("./models/fine_tuned_model")

In [9]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

**Define Functions to Prepare Test Data for Evaluation**

In [10]:
# Function to load CoNLL test data (same as in previous tasks)
def load_conll_data(file_path):
    sentences = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        label = []
        for line in f:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    token, entity = parts
                    sentence.append(token)
                    label.append(entity)
    return sentences, labels

# Load the test data
test_data_path = './data/labeled/test_data.conll'
test_sentences, test_labels = load_conll_data(test_data_path)

# Tokenize the test sentences
tokenized_test_inputs = tokenizer(test_sentences, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")


**Make Predictions and Align Labels**

In [16]:
# Tokenize the test sentences using the fast tokenizer
tokenized_test_inputs = tokenizer(test_sentences, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

# Align labels with the tokenized inputs
def align_labels_with_tokens(labels, tokenized_inputs):
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_label = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        aligned_labels.append(aligned_label)
    return aligned_labels

# Convert string labels to numeric labels
def convert_labels_to_ids(labels, label2id):
    numeric_labels = []
    for label_list in labels:
        numeric_label_list = [label2id.get(label, -100) for label in label_list]
        numeric_labels.append(numeric_label_list)
    return numeric_labels

# Create label-to-id and id-to-label mappings (same mappings as Task 3)
unique_labels = set(label for label_list in test_labels for label in label_list)
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Align test labels with the tokenized inputs
aligned_test_labels = align_labels_with_tokens(test_labels, tokenized_test_inputs)

# Convert aligned test labels to numeric IDs
numeric_test_labels = convert_labels_to_ids(aligned_test_labels, label2id)

# Define the NERDataset class (as in Task 3)
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create the test dataset
test_dataset = NERDataset(tokenized_test_inputs, numeric_test_labels)

# Define a function to compute evaluation metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [[id2label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

# Load evaluation arguments
evaluation_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=16,
    logging_dir="./logs"
)

# Initialize the Trainer with evaluation dataset
trainer = Trainer(
    model=model,
    args=evaluation_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

100%|██████████| 1/1 [00:01<00:00,  1.67s/it]

Evaluation Results: {'eval_loss': 4.736898899078369, 'eval_model_preparation_time': 0.0, 'eval_precision': 0.012328767123287671, 'eval_recall': 1.0, 'eval_f1': 0.02435723951285521, 'eval_accuracy': 0.012328767123287671, 'eval_runtime': 5.6434, 'eval_samples_per_second': 1.063, 'eval_steps_per_second': 0.177}



