# Week 41: Model Evaluation on Custom Test Set


## 1. Setup and Configuration

### 1.1 Environment Detection

In [61]:
# Environment detection
import os
import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Running in local environment")

Running in Google Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.2 Library Imports

In [62]:
!pip install seqeval



In [None]:
# Core Python libraries
import re
import os
import sys
import json
import warnings
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any, Optional

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine Learning and NLP
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import nltk
from nltk.corpus import stopwords

# Transformers
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    DataCollatorForTokenClassification
)
from datasets import Dataset as HFDataset

# Sklearn metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    confusion_matrix,
    classification_report
)

# Sequence evaluation
from seqeval.metrics import classification_report as seqeval_report
from seqeval.metrics import f1_score as seqeval_f1

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# PyTorch setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"PyTorch device: {device}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Setup stopwords for English
stop_words = set(stopwords.words('english'))

print("All libraries imported successfully")

PyTorch device: cpu
âœ“ All libraries imported successfully


### 1.3 Load Custom Test Data

In [None]:
# Load custom test set
if IN_COLAB:
    TEST_FILE = "/content/drive/MyDrive/Colab_Notebooks/NLP/test.json"
else:
    TEST_FILE = "./questions.json"

print(f"Loading test data from: {TEST_FILE}")

with open(TEST_FILE, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

df_test = pd.DataFrame(test_data)

print(f"\nLoaded {len(df_test)} test questions")
print(f"\nLanguage distribution:")
print(df_test['lang'].value_counts())
print(f"\nAnswerability distribution:")
print(df_test['answerable'].value_counts())

# Display sample questions
print("\n" + "="*70)
print("SAMPLE TEST QUESTIONS")
print("="*70)
for i in range(min(3, len(df_test))):
    row = df_test.iloc[i]
    print(f"\nQuestion {i+1} ({row['lang']}):")
    print(f"  Q: {row['question']}")
    print(f"  A: {row['answer'] if row['answerable'] else 'Not answerable'}")
    print(f"  Context: {row['context'][:100]}...")

Loading test data from: /content/drive/MyDrive/Colab_Notebooks/NLP/test.json

âœ“ Loaded 30 test questions

Language distribution:
lang
en    24
el     6
Name: count, dtype: int64

Answerability distribution:
answerable
True     22
False     8
Name: count, dtype: int64

SAMPLE TEST QUESTIONS

Question 1 (en):
  Q: Who wrote the novel '1984'?
  A: George Orwell
  Context: 1984 is a dystopian novel published in 1949 by English writer George Orwell. It depicts a totalitari...

Question 2 (en):
  Q: What is the capital city of Japan?
  A: Tokyo
  Context: Japan is an island country in East Asia. Its capital and largest city is Tokyo, known for its modern...

Question 3 (en):
  Q: When did World War II end?
  A: 1945
  Context: World War II was a global conflict that lasted from 1939 to 1945. It ended with the unconditional su...


---

## 2. Week 36: Rule-Based Answerability Classifier

Using TF-IDF weighted keyword overlap to classify questions as answerable or not.

### 2.1 Load Translation Model

In [None]:
# Initialize translation model for multilingual questions
device_id = 0 if torch.cuda.is_available() else -1
TARGET_LANG = 'eng_Latn'

print("Loading NLLB translation model...")
translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    device=device_id,
    torch_dtype=torch.float16 if device_id == 0 else torch.float32
)

print("Translation model loaded successfully")

Loading NLLB translation model...


Device set to use cpu


âœ“ Translation model loaded successfully


### 2.2 Define Rule-Based Classifier

In [66]:
# Language code mapping for NLLB
NLLB_LANG_CODES = {
    'ar': 'arb_Arab',
    'ko': 'kor_Hang',
    'te': 'tel_Telu',
    'en': 'eng_Latn',
    'el': 'ell_Grek',
}

class CrossLingualAnswerabilityClassifier:
    """Rule-based classifier using TF-IDF weighted keyword overlap."""

    def __init__(self, threshold=0.1, min_keyword_length=3):
        self.threshold = threshold
        self.min_keyword_length = min_keyword_length
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

    def extract_keywords(self, question_translated):
        """Extract meaningful keywords from translated question."""
        words = nltk.word_tokenize(question_translated.lower())
        keywords = [
            word for word in words
            if (word.isalnum() and
                len(word) > self.min_keyword_length and
                word not in stop_words)
        ]
        return keywords

    def compute_overlap_score(self, keywords, context):
        """Compute TF-IDF weighted overlap score."""
        if not keywords:
            return 0.0

        question_text = ' '.join(keywords)
        documents = [question_text, context.lower()]

        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(documents)
            vocab = self.tfidf_vectorizer.vocabulary_

            overlap_score = 0.0
            context_words = set(nltk.word_tokenize(context.lower()))

            for keyword in keywords:
                if keyword in vocab and keyword in context_words:
                    keyword_idx = vocab[keyword]
                    tfidf_score = tfidf_matrix[0, keyword_idx]
                    overlap_score += tfidf_score

            return overlap_score

        except Exception:
            # Fallback: simple overlap
            context_words = set(nltk.word_tokenize(context.lower()))
            simple_overlap = sum(1 for kw in keywords if kw in context_words)
            return simple_overlap / len(keywords) if keywords else 0.0

    def predict_single(self, question, context, language_code):
        """Predict answerability for a single question-context pair."""
        try:
            # Translate question if needed
            if language_code == 'en':
                translated_q = question
                translated_ctx = context
            else:
                src = NLLB_LANG_CODES.get(language_code, 'eng_Latn')
                translation = translator(question, src_lang=src, tgt_lang=TARGET_LANG)
                translated_q = translation[0]['translation_text'] if isinstance(translation, list) else translation['translation_text']
                tc = translator(context, src_lang=src, tgt_lang=TARGET_LANG)
                translated_ctx = tc[0]['translation_text'] if isinstance(tc, list) else tc['translation_text']

            # Extract keywords and compute score
            keywords = self.extract_keywords(translated)
            score = self.compute_overlap_score(keywords, context)
            prediction = score > self.threshold

            return prediction, score, translated, keywords

        except Exception as e:
            print(f"Warning: prediction failed - {e}")
            return False, 0.0, question, []

print("âœ“ Rule-based classifier defined")

âœ“ Rule-based classifier defined


### 2.3 Evaluate W36 Classifier on Test Set

In [None]:
print("\n" + "="*70)
print("WEEK 36: RULE-BASED ANSWERABILITY CLASSIFIER")
print("="*70)

# Initialize classifier
w36_classifier = CrossLingualAnswerabilityClassifier(threshold=0.1)

# Make predictions
predictions = []
true_labels = []

for idx, row in df_test.iterrows():
    pred, score, translated, keywords = w36_classifier.predict_single(
        row['question'],
        row['context'],
        row['lang']
    )
    predictions.append(pred)
    true_labels.append(row['answerable'])

# Calculate metrics
w36_accuracy = accuracy_score(true_labels, predictions)
w36_precision = precision_score(true_labels, predictions, zero_division=0)
w36_recall = recall_score(true_labels, predictions, zero_division=0)
w36_f1 = f1_score(true_labels, predictions, zero_division=0)

# Display results
print(f"\n Results on {len(df_test)} test questions:")
print(f"  Accuracy:  {w36_accuracy:.4f}")
print(f"  Precision: {w36_precision:.4f}")
print(f"  Recall:    {w36_recall:.4f}")
print(f"  F1-score:  {w36_f1:.4f}")

# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
print(f"\nConfusion Matrix:")
print(f"  TN: {cm[0,0]:3d}  FP: {cm[0,1]:3d}")
print(f"  FN: {cm[1,0]:3d}  TP: {cm[1,1]:3d}")

# Show some examples
print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)
for i in range(min(5, len(df_test))):
    row = df_test.iloc[i]
    pred = predictions[i]
    true = true_labels[i]
    correct = "âœ“" if pred == true else "âœ—"

    print(f"\nExample {i+1} {correct}:")
    print(f"  Question: {row['question'][:80]}...")
    print(f"  Language: {row['lang']}")
    print(f"  Predicted: {'Answerable' if pred else 'Not answerable'}")
    print(f"  True:      {'Answerable' if true else 'Not answerable'}")


WEEK 36: RULE-BASED ANSWERABILITY CLASSIFIER

ðŸ“Š Results on 30 test questions:
  Accuracy:  0.3000
  Precision: 1.0000
  Recall:    0.0455
  F1-score:  0.0870

Confusion Matrix:
  TN:   8  FP:   0
  FN:  21  TP:   1

EXAMPLE PREDICTIONS

Example 1 âœ—:
  Question: Who wrote the novel '1984'?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 2 âœ—:
  Question: What is the capital city of Japan?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 3 âœ—:
  Question: When did World War II end?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 4 âœ—:
  Question: Which planet is known as the Red Planet?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 5 âœ—:
  Question: Who painted the Mona Lisa?...
  Language: en
  Predicted: Not answerable
  True:      Answerable


---

## 3. Week 38: Trained Answerability Classifier (XLM-RoBERTa)

Using a fine-tuned XLM-RoBERTa model for multilingual answerability classification.

### 3.1 Load Pre-trained Model

Note: This assumes the model was previously trained and saved. If not available, this section will train a new model.

In [68]:
!pip install sacremoses



In [69]:
# Configuration
MAX_CONTEXT_CHARS = 1500
MAX_SEQ_LEN_TRANSFORMER = 512

# Model path
if IN_COLAB:
    W38_MODEL_PATH = "/content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc/models/xlmr_answerability_ar"
else:
    W38_MODEL_PATH = "./models/xlmr_answerability"

# Try to load saved model
if os.path.exists(W38_MODEL_PATH):
    print(f"Loading pre-trained model from {W38_MODEL_PATH}...")
    w38_tokenizer = AutoTokenizer.from_pretrained(W38_MODEL_PATH)
    w38_model = AutoModelForSequenceClassification.from_pretrained(W38_MODEL_PATH)
    w38_model.to(device)
    w38_model.eval()
    print("âœ“ Model loaded successfully")
else:
    print(f"âš  Model not found at {W38_MODEL_PATH}")
    print("  Using base XLM-R model (not fine-tuned)")
    w38_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    w38_model = AutoModelForSequenceClassification.from_pretrained(
        "xlm-roberta-base",
        num_labels=2
    )
    w38_model.to(device)
    w38_model.eval()

Loading pre-trained model from /content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc/models/xlmr_answerability_ar...
âœ“ Model loaded successfully


### 3.2 Evaluate W38 Classifier on Test Set

In [None]:
print("\n" + "="*70)
print("WEEK 38: XLM-RoBERTa ANSWERABILITY CLASSIFIER")
print("="*70)

# Prepare test data
df_test_copy = df_test.copy()
df_test_copy["context_trunc"] = df_test_copy["context"].str[:MAX_CONTEXT_CHARS]
df_test_copy["text"] = df_test_copy["question"] + " [SEP] " + df_test_copy["context_trunc"]

# Tokenize
test_encodings = w38_tokenizer(
    df_test_copy["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_SEQ_LEN_TRANSFORMER,
    return_tensors="pt"
)

# Make predictions
w38_predictions = []
w38_true_labels = df_test_copy["answerable"].astype(int).tolist()

print(f"Making predictions on {len(df_test)} test examples...")

with torch.no_grad():
    for i in range(0, len(test_encodings["input_ids"]), 16):
        batch_input_ids = test_encodings["input_ids"][i:i+16].to(device)
        batch_attention_mask = test_encodings["attention_mask"][i:i+16].to(device)

        outputs = w38_model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask
        )

        logits = outputs.logits
        batch_preds = torch.argmax(logits, dim=1).cpu().numpy()
        w38_predictions.extend(batch_preds.tolist())

# Calculate metrics
w38_accuracy = accuracy_score(w38_true_labels, w38_predictions)
w38_precision = precision_score(w38_true_labels, w38_predictions, zero_division=0)
w38_recall = recall_score(w38_true_labels, w38_predictions, zero_division=0)
w38_f1 = f1_score(w38_true_labels, w38_predictions, zero_division=0)

# Display results
print(f"\nResults on {len(df_test)} test questions:")
print(f"  Accuracy:  {w38_accuracy:.4f}")
print(f"  Precision: {w38_precision:.4f}")
print(f"  Recall:    {w38_recall:.4f}")
print(f"  F1-score:  {w38_f1:.4f}")

# Confusion matrix
cm = confusion_matrix(w38_true_labels, w38_predictions)
print(f"\nConfusion Matrix:")
print(f"  TN: {cm[0,0]:3d}  FP: {cm[0,1]:3d}")
print(f"  FN: {cm[1,0]:3d}  TP: {cm[1,1]:3d}")

# Show some examples
print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)
for i in range(min(5, len(df_test))):
    row = df_test.iloc[i]
    pred = w38_predictions[i]
    true = w38_true_labels[i]
    correct = "âœ“" if pred == true else "âœ—"

    print(f"\nExample {i+1} {correct}:")
    print(f"  Question: {row['question'][:80]}...")
    print(f"  Language: {row['lang']}")
    print(f"  Predicted: {'Answerable' if pred else 'Not answerable'}")
    print(f"  True:      {'Answerable' if true else 'Not answerable'}")


WEEK 38: XLM-RoBERTa ANSWERABILITY CLASSIFIER
Making predictions on 30 test examples...

ðŸ“Š Results on 30 test questions:
  Accuracy:  0.4667
  Precision: 1.0000
  Recall:    0.2727
  F1-score:  0.4286

Confusion Matrix:
  TN:   8  FP:   0
  FN:  16  TP:   6

EXAMPLE PREDICTIONS

Example 1 âœ“:
  Question: Who wrote the novel '1984'?...
  Language: en
  Predicted: Answerable
  True:      Answerable

Example 2 âœ—:
  Question: What is the capital city of Japan?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 3 âœ—:
  Question: When did World War II end?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 4 âœ—:
  Question: Which planet is known as the Red Planet?...
  Language: en
  Predicted: Not answerable
  True:      Answerable

Example 5 âœ“:
  Question: Who painted the Mona Lisa?...
  Language: en
  Predicted: Answerable
  True:      Answerable


---

## 3. Week 39: mT5 model


In [None]:

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load base mT5 TyDiQA model
MT5 = "mrm8488/mT5-small-finetuned-tydiqa-for-xqa"
tok = AutoTokenizer.from_pretrained(MT5)
mt5 = AutoModelForSeq2SeqLM.from_pretrained(MT5).to(device).eval()


# Simple normalization + metrics (EM, token-F1)
def normalize(s):
    if s is None: return ""
    s = s.strip().lower()
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    s = s.translate(str.maketrans("", "", string.punctuation))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def exact_match(p, g): return 1.0 if normalize(p) == normalize(g) else 0.0

def token_f1(p, g):
    p, g = normalize(p).split(), normalize(g).split()
    if not p and not g: return 1.0
    if not p or not g:  return 0.0
    overlap = len(set(p) & set(g))
    if overlap == 0: return 0.0
    prec, rec = overlap/len(set(p)), overlap/len(set(g))
    return 2*prec*rec/(prec+rec)

# Evaluate
preds, golds, ems, f1s = [], [], [], []
print("\nEvaluating Week-39 base mT5 (Model 1 style) on EN/EL test set...")
for _, row in df_test.iterrows():
    q, ctx = str(row["question"]), str(row["context"])
    is_ans = bool(row.get("answerable", False))
    gold = str(row.get("answer", "")) if is_ans else ""

    
    prompt = f"question: {q} context: {ctx[:400]}"
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        out = mt5.generate(**inputs, max_new_tokens=64, num_beams=4, early_stopping=True)
    pred = tok.decode(out[0], skip_special_tokens=True)

    preds.append(pred)
    golds.append(gold)
    ems.append(exact_match(pred if is_ans else "", gold))
    f1s.append(token_f1(pred if is_ans else "", gold))

w39_em = float(np.mean(ems))
w39_f1 = float(np.mean(f1s))
print(f"\nW39 base mT5 on custom test â€” EM: {w39_em:.4f}  |  F1: {w39_f1:.4f}")

for i in range(min(5, len(df_test))):
    row = df_test.iloc[i]
    print(f"\nExample {i+1} [{row.get('lang')}]: {row['question'][:100]}...")
    print("Pred:", preds[i])
    print("Gold:", golds[i])

Some weights of the model checkpoint at mrm8488/mT5-small-finetuned-tydiqa-for-xqa were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.3.layer.2.DenseReluDense.wi.weight', 'decoder.block.4.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.2.DenseReluDense.wi.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.7.layer.2.DenseReluDense.wi.weight', 'encoder.block.0.layer.1.DenseReluDense.wi.weight', 'encoder.block.1.layer.1.DenseReluDense.wi.weight', 'encoder.block.2.layer.1.DenseReluDense.wi.weight', 'encoder.block.3.layer.1.DenseReluDense.wi.weight', 'encoder.block.4.layer.1.DenseReluDense.wi.weight', 'encoder.block.5.layer.1.DenseReluDense.wi.weight', 'encoder.block.6.layer.1.DenseReluDense.wi.weight', 'encoder.block.7.layer.1.DenseReluDense.wi.weight']
- This IS expected if yo


Evaluating Week-39 base mT5 (Model 1 style) on EN/EL test set...

W39 base mT5 on custom test â€” EM: 0.4000  |  F1: 0.5086

Example 1 [en]: Who wrote the novel '1984'?...
Pred: George Orwell. The The the the The The The The The The The The The The The The the the the The The The The the fiction the the the the The The The The The 1984 is
Gold: George Orwell

Example 2 [en]: What is the capital city of Japan?...
Pred: Tokyo
Gold: Tokyo

Example 3 [en]: When did World War II end?...
Pred: the the the the the the the the of the Axis powers
Gold: 1945

Example 4 [en]: Which planet is known as the Red Planet?...
Pred: the Sun and is the fourth planet from the the Sun and is the fourth planet from the the Sun and is the fourth planet from the the the the the Sun and is the four the planet from the the the the the the the the the the Sun and is the the four the planet from the the the
Gold: Mars

Example 5 [en]: Who painted the Mona Lisa?...
Pred: the the the the The The The Thomas
Gold: Le

---

## 4. Week 40: Sequence Labeling for Answer Extraction

Using BIO tagging with XLM-RoBERTa for extractive question answering.

### 4.1 Define Helper Functions

In [None]:
def char_to_token_labels(context, answer_start, answer_text, tokenizer):
    """
    Convert character-level answer indices to token-level BIO labels.
    """
    # Tokenize context
    encoding = tokenizer(context, add_special_tokens=False, return_offsets_mapping=True)
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    offsets = encoding["offset_mapping"]

    # Initialize all labels as "O" (outside)
    labels = ["O"] * len(tokens)

    # If answerable, find answer span
    if answer_start >= 0 and answer_text:
        answer_end = answer_start + len(answer_text)
        found_start = False

        for i, (start, end) in enumerate(offsets):
            # Check if token overlaps with answer
            if start < answer_end and end > answer_start:
                if not found_start:
                    labels[i] = "B-ANS"  # First token of answer
                    found_start = True
                else:
                    labels[i] = "I-ANS"  # Inside answer

    return tokens, labels


def extract_answer_from_bio(tokens, labels, tokenizer):
    """
    Extract answer text from BIO labels.
    """
    answer_tokens = []
    for token, label in zip(tokens, labels):
        if label.startswith("B-") or label.startswith("I-"):
            answer_tokens.append(token)

    if not answer_tokens:
        return ""

    # Decode tokens to text
    answer_text = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer_text.strip()


def calculate_f1_token_overlap(pred_text, gold_text):
    """
    Calculate F1 score based on token overlap.
    """
    pred_tokens = set(pred_text.lower().split())
    gold_tokens = set(gold_text.lower().split())

    if len(pred_tokens) == 0 and len(gold_tokens) == 0:
        return 1.0
    elif len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0

    overlap = len(pred_tokens & gold_tokens)
    precision = overlap / len(pred_tokens) if len(pred_tokens) > 0 else 0
    recall = overlap / len(gold_tokens) if len(gold_tokens) > 0 else 0

    if precision + recall == 0:
        return 0.0

    f1 = 2 * precision * recall / (precision + recall)
    return f1

print("Helper functions defined")

âœ“ Helper functions defined


### 4.2 Load Pre-trained Sequence Labeling Model

In [None]:
# BIO label mappings
LABEL_MAP = {"O": 0, "B-ANS": 1, "I-ANS": 2}
ID2LABEL = {v: k for k, v in LABEL_MAP.items()}

# Model path
if IN_COLAB:
    W40_MODEL_PATH = "/content/drive/MyDrive/NLP-Project/models/xlmr_bio_tagger"
else:
    W40_MODEL_PATH = "./models/xlmr_bio_tagger"

# Try to load saved model
if os.path.exists(W40_MODEL_PATH):
    print(f"Loading pre-trained BIO tagger from {W40_MODEL_PATH}...")
    w40_tokenizer = AutoTokenizer.from_pretrained(W40_MODEL_PATH)
    w40_model = AutoModelForTokenClassification.from_pretrained(W40_MODEL_PATH)
    w40_model.to(device)
    w40_model.eval()
    print("Model loaded successfully")
else:
    print(f"Model not found at {W40_MODEL_PATH}")
    print("  Using base XLM-R model (not fine-tuned)")
    w40_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    w40_model = AutoModelForTokenClassification.from_pretrained(
        "xlm-roberta-base",
        num_labels=len(LABEL_MAP),
        id2label=ID2LABEL,
        label2id=LABEL_MAP
    )
    w40_model.to(device)
    w40_model.eval()

Loading pre-trained BIO tagger from /content/drive/MyDrive/NLP-Project/models/xlmr_bio_tagger...
âœ“ Model loaded successfully


### 4.3 Evaluate W40 Sequence Labeler on Test Set

In [None]:
print("\n" + "="*70)
print("WEEK 40: SEQUENCE LABELING FOR ANSWER EXTRACTION")
print("="*70)

# Prepare data: need to add answer_start field
print("Preparing test data with answer positions...")

# For questions.json, we need to find answer_start
for idx, row in df_test.iterrows():
    if row['answerable'] and row['answer']:
        # Find the answer in the context
        answer_pos = row['context'].find(row['answer'])
        if answer_pos == -1:
            # Try case-insensitive search
            answer_pos = row['context'].lower().find(row['answer'].lower())
        df_test.loc[idx, 'answer_start'] = answer_pos if answer_pos >= 0 else 0
    else:
        df_test.loc[idx, 'answer_start'] = -1

# Convert to int
df_test['answer_start'] = df_test['answer_start'].fillna(-1).astype(int)

# Make predictions
w40_pred_answers = []
w40_gold_answers = []
w40_f1_scores = []
w40_exact_matches = []

print(f"\nMaking predictions on {len(df_test)} test examples...")

for idx, row in df_test.iterrows():
    # Tokenize context
    tokens, gold_labels = char_to_token_labels(
        row['context'],
        row['answer_start'],
        row['answer'] if row['answerable'] else "",
        w40_tokenizer
    )

    # Encode for model
    encoding = w40_tokenizer(
        row['context'],
        truncation=True,
        max_length=512,
        return_tensors="pt",
        add_special_tokens=False
    )

    # Predict
    with torch.no_grad():
        outputs = w40_model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device)
        )
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)

    # Convert predictions to labels
    pred_label_ids = predictions[0].cpu().numpy()[:len(tokens)]
    pred_labels = [ID2LABEL[id] for id in pred_label_ids]

    # Extract answer
    pred_answer = extract_answer_from_bio(tokens, pred_labels, w40_tokenizer)

    if row['answerable']:
        ctx_lower = row['context'].lower()
        gold_candidates = [row.get('answer', ''), row.get('answer_inlang', '')]
        gold_answer = ""
        for cand in gold_candidates:
            if cand and cand.lower() in ctx_lower:
                gold_answer = cand
                break
        if not gold_answer:
            gold_answer = row.get('answer', '')
    else:
        gold_answer = ""

    w40_pred_answers.append(pred_answer)
    w40_gold_answers.append(gold_answer)

    # Calculate metrics
    exact_match = 1.0 if pred_answer.strip() == gold_answer.strip() else 0.0
    f1 = calculate_f1_token_overlap(pred_answer, gold_answer)

    w40_exact_matches.append(exact_match)
    w40_f1_scores.append(f1)

# Calculate overall metrics
w40_em = np.mean(w40_exact_matches)
w40_f1_avg = np.mean(w40_f1_scores)

# Display results
print(f"\nResults on {len(df_test)} test questions:")
print(f"  Exact Match (EM): {w40_em:.4f}")
print(f"  F1-score:         {w40_f1_avg:.4f}")

# Show some examples
print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)
for i in range(min(5, len(df_test))):
    row = df_test.iloc[i]
    pred = w40_pred_answers[i]
    gold = w40_gold_answers[i]
    em = w40_exact_matches[i]
    f1 = w40_f1_scores[i]
    correct = "âœ“" if em == 1.0 else "âœ—"

    print(f"\nExample {i+1} {correct}:")
    print(f"  Question: {row['question'][:80]}...")
    print(f"  Language: {row['lang']}")
    print(f"  Predicted: '{pred}'")
    print(f"  Gold:      '{gold}'")
    print(f"  EM: {em:.2f}, F1: {f1:.4f}")


WEEK 40: SEQUENCE LABELING FOR ANSWER EXTRACTION
Preparing test data with answer positions...

Making predictions on 30 test examples...

ðŸ“Š Results on 30 test questions:
  Exact Match (EM): 0.2667
  F1-score:         0.2667

EXAMPLE PREDICTIONS

Example 1 âœ—:
  Question: Who wrote the novel '1984'?...
  Language: en
  Predicted: ''
  Gold:      'George Orwell'
  EM: 0.00, F1: 0.0000

Example 2 âœ“:
  Question: What is the capital city of Japan?...
  Language: en
  Predicted: 'Tokyo'
  Gold:      'Tokyo'
  EM: 1.00, F1: 1.0000

Example 3 âœ—:
  Question: When did World War II end?...
  Language: en
  Predicted: ''
  Gold:      '1945'
  EM: 0.00, F1: 0.0000

Example 4 âœ—:
  Question: Which planet is known as the Red Planet?...
  Language: en
  Predicted: ''
  Gold:      'Mars'
  EM: 0.00, F1: 0.0000

Example 5 âœ—:
  Question: Who painted the Mona Lisa?...
  Language: en
  Predicted: ''
  Gold:      'Leonardo da Vinci'
  EM: 0.00, F1: 0.0000


---

## 5. Summary and Comparison

Comparing all models on the custom test set.

### 5.1 Results Table

In [74]:
print("\n" + "="*70)
print("FINAL RESULTS SUMMARY")
print("="*70)

# Create results summary
results_summary = []

# Week 36 results
results_summary.append({
    'Week': 'W36',
    'Model': 'Rule-based TF-IDF',
    'Task': 'Answerability',
    'Accuracy': f'{w36_accuracy:.4f}',
    'Precision': f'{w36_precision:.4f}',
    'Recall': f'{w36_recall:.4f}',
    'F1': f'{w36_f1:.4f}',
    'EM': '-',
})

# Week 38 results
results_summary.append({
    'Week': 'W38',
    'Model': 'XLM-RoBERTa (fine-tuned)',
    'Task': 'Answerability',
    'Accuracy': f'{w38_accuracy:.4f}',
    'Precision': f'{w38_precision:.4f}',
    'Recall': f'{w38_recall:.4f}',
    'F1': f'{w38_f1:.4f}',
    'EM': '-',
})

# Week 40 results
results_summary.append({
    'Week': 'W40',
    'Model': 'XLM-R BIO Tagger',
    'Task': 'Answer Extraction',
    'Accuracy': '-',
    'Precision': '-',
    'Recall': '-',
    'F1': f'{w40_f1_avg:.4f}',
    'EM': f'{w40_em:.4f}',
})

# Display as DataFrame
results_df = pd.DataFrame(results_summary)
print("\n")
print(results_df.to_string(index=False))

# Visual comparison
print("\n" + "="*70)
print("KEY OBSERVATIONS")
print("="*70)
print(f"\nTotal test questions: {len(df_test)}")
print(f"\nAnswerability Classification (W36 vs W38):")
print(f"  W36 Rule-based:    F1={w36_f1:.4f}, Accuracy={w36_accuracy:.4f}")
print(f"  W38 XLM-RoBERTa:   F1={w38_f1:.4f}, Accuracy={w38_accuracy:.4f}")
if w38_f1 > w36_f1:
    print(f"  â†’ XLM-RoBERTa performs better by {(w38_f1-w36_f1):.4f} F1 points")
else:
    print(f"  â†’ Rule-based performs better by {(w36_f1-w38_f1):.4f} F1 points")

print(f"\nAnswer Extraction (W40):")
print(f"  XLM-R BIO Tagger:  F1={w40_f1_avg:.4f}, EM={w40_em:.4f}")


FINAL RESULTS SUMMARY


Week                    Model              Task Accuracy Precision Recall     F1     EM
 W36        Rule-based TF-IDF     Answerability   0.3000    1.0000 0.0455 0.0870      -
 W38 XLM-RoBERTa (fine-tuned)     Answerability   0.4667    1.0000 0.2727 0.4286      -
 W40         XLM-R BIO Tagger Answer Extraction        -         -      - 0.2667 0.2667

KEY OBSERVATIONS

Total test questions: 30

Answerability Classification (W36 vs W38):
  W36 Rule-based:    F1=0.0870, Accuracy=0.3000
  W38 XLM-RoBERTa:   F1=0.4286, Accuracy=0.4667
  â†’ XLM-RoBERTa performs better by 0.3416 F1 points

Answer Extraction (W40):
  XLM-R BIO Tagger:  F1=0.2667, EM=0.2667
