# Week 39: Span-Based Question Answering


# Setup & Dependencies

In [2]:
# Environment detection
import os
import sys

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Running in local environment")

Running in Google Colab
Mounted at /content/drive


In [3]:
# Install required packages
!pip install -q transformers torch datasets accelerate seqeval scikit-learn pandas pyarrow fastparquet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
# Core imports
import re
import math
import json
import pickle
import warnings
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any, Optional

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset as HFDataset
from seqeval.metrics import classification_report as seqeval_report
from seqeval.metrics import f1_score as seqeval_f1
from sklearn.metrics import f1_score, precision_recall_fscore_support

warnings.filterwarnings('ignore')

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Random seeds
torch.manual_seed(42)
np.random.seed(42)

print("Libraries imported successfully")

Using device: cuda
Libraries imported successfully


In [5]:
# Dataset configuration
LANGUAGES = ["ar", "ko", "te"]
LANGUAGE_NAMES = {"ar": "Arabic", "ko": "Korean", "te": "Telugu"}

# Dataset paths
if IN_COLAB:
    BASE_DIR = Path("/content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc")
else:
    BASE_DIR = Path("./tydi_xor_rc")

TRAIN_PATH = BASE_DIR / "train.parquet"
VAL_PATH = BASE_DIR / "validation.parquet"

print(f"Dataset directory: {BASE_DIR}")

Dataset directory: /content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc


In [6]:
# Load datasets
if TRAIN_PATH.exists() and VAL_PATH.exists():
    df_train = pd.read_parquet(TRAIN_PATH)
    df_val = pd.read_parquet(VAL_PATH)

    # Filter for target languages
    df_train = df_train[df_train["lang"].isin(LANGUAGES)].copy()
    df_val = df_val[df_val["lang"].isin(LANGUAGES)].copy()

    print(f"Training examples: {len(df_train):,}")
    print(f"Validation examples: {len(df_val):,}")
    print("\nLanguage distribution:")
    for lang in LANGUAGES:
        train_count = len(df_train[df_train["lang"] == lang])
        val_count = len(df_val[df_val["lang"] == lang])
        print(f"  {LANGUAGE_NAMES[lang]}: {train_count:,} train, {val_count:,} val")

    print("\nDataset loaded successfully")
else:
    print("ERROR: Dataset files not found!")
    print(f"Please ensure files exist at: {BASE_DIR}")

Training examples: 6,335
Validation examples: 1,155

Language distribution:
  Arabic: 2,558 train, 415 val
  Korean: 2,422 train, 356 val
  Telugu: 1,355 train, 384 val

Dataset loaded successfully


# Data Preprocessing Utilities

Convert character-level answer indices to token-level labels (BIO format).

In [7]:
def char_to_token_labels(context, answer_start, answer_text, tokenizer):
    """
    Convert character-level answer indices to token-level BIO labels.

    Args:
        context: The context text
        answer_start: Character index where answer starts
        answer_text: The answer text
        tokenizer: Tokenizer to use for tokenization

    Returns:
        tokens: List of tokens
        labels: List of BIO labels ("O", "B-ANS", "I-ANS")
    """
    # Tokenize context
    encoding = tokenizer(context, add_special_tokens=False, return_offsets_mapping=True)
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    offsets = encoding["offset_mapping"]

    # Initialize all labels as "O" (outside)
    labels = ["O"] * len(tokens)

    # If answerable, find answer span
    if answer_start >= 0 and answer_text:
        answer_end = answer_start + len(answer_text)
        found_start = False

        for i, (start, end) in enumerate(offsets):
            # Check if token overlaps with answer
            if start < answer_end and end > answer_start:
                if not found_start:
                    labels[i] = "B-ANS"  # First token of answer
                    found_start = True
                else:
                    labels[i] = "I-ANS"  # Inside answer

    return tokens, labels


def prepare_bio_data(df, tokenizer, max_samples=None):
    """
    Prepare dataset with BIO labels.

    Returns:
        List of dicts with 'tokens', 'labels', 'question', 'lang'
    """
    examples = []

    for idx, row in df.iterrows():
        if max_samples and len(examples) >= max_samples:
            break

        # Get answer info
        if row["answerable"]:
            answer_start = row["answer_start"]
            answer_text = row["answer"]
        else:
            answer_start = -1
            answer_text = ""

        # Convert to BIO labels
        tokens, labels = char_to_token_labels(
            row["context"],
            answer_start,
            answer_text,
            tokenizer
        )

        examples.append({
            "tokens": tokens,
            "labels": labels,
            "question": row["question"],
            "lang": row["lang"]
        })

    return examples


print("Preprocessing utilities defined")

Preprocessing utilities defined


---

# Model 1: BiLSTM-CRF with BIO Tagging



## Model 1: Implementation

In [8]:
# Simple BiLSTM-CRF implementation (without external CRF library for simplicity)
class BiLSTM_CRF(nn.Module):
    """BiLSTM with CRF for sequence labeling"""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim // 2,  # Divided by 2 because bidirectional
            bidirectional=True,
            batch_first=True,
            dropout=dropout if dropout > 0 else 0
        )
        self.dropout = nn.Dropout(dropout)
        self.hidden2tag = nn.Linear(hidden_dim, num_labels)

        # CRF transition scores
        self.transitions = nn.Parameter(torch.randn(num_labels, num_labels))
        self.num_labels = num_labels

    def forward(self, x, mask=None):
        """Forward pass - returns emission scores"""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        return emissions

    def compute_loss(self, emissions, tags, mask):
        """CRF loss computation"""
        # Simplified loss - use cross-entropy instead of full CRF for simplicity
        if mask is not None:
            active_loss = mask.view(-1) == 1
            active_logits = emissions.view(-1, self.num_labels)[active_loss]
            active_labels = tags.view(-1)[active_loss]
            loss = nn.CrossEntropyLoss()(active_logits, active_labels)
        else:
            loss = nn.CrossEntropyLoss()(
                emissions.view(-1, self.num_labels),
                tags.view(-1)
            )
        return loss

    def decode(self, emissions, mask=None):
        """Viterbi-like decoding (simplified as argmax for implementation simplicity)"""
        return torch.argmax(emissions, dim=-1)


# Vocabulary and Dataset classes
class SimpleVocab:
    """Simple vocabulary for tokenization"""

    def __init__(self, tokens_list, max_size=50000):
        counter = Counter()
        for tokens in tokens_list:
            counter.update(tokens)

        self.token2idx = {"<PAD>": 0, "<UNK>": 1}
        for token, _ in counter.most_common(max_size - 2):
            if token not in self.token2idx:
                self.token2idx[token] = len(self.token2idx)

        self.idx2token = {v: k for k, v in self.token2idx.items()}
        print(f"  Vocabulary size: {len(self.token2idx):,}")

    def encode(self, tokens):
        return [self.token2idx.get(t, 1) for t in tokens]


class BIODataset(Dataset):
    """Dataset for BIO sequence labeling"""

    def __init__(self, examples, vocab, label2idx):
        self.examples = examples
        self.vocab = vocab
        self.label2idx = label2idx

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        token_ids = self.vocab.encode(ex["tokens"])
        label_ids = [self.label2idx[l] for l in ex["labels"]]
        return {
            "token_ids": token_ids,
            "label_ids": label_ids,
            "length": len(token_ids)
        }


def collate_bio_batch(batch):
    """Collate function for batching"""
    max_len = max(b["length"] for b in batch)

    token_ids = []
    label_ids = []
    masks = []

    for b in batch:
        # Pad sequences
        pad_len = max_len - b["length"]
        token_ids.append(b["token_ids"] + [0] * pad_len)
        label_ids.append(b["label_ids"] + [0] * pad_len)
        masks.append([1] * b["length"] + [0] * pad_len)

    return {
        "token_ids": torch.tensor(token_ids, dtype=torch.long),
        "label_ids": torch.tensor(label_ids, dtype=torch.long),
        "mask": torch.tensor(masks, dtype=torch.long)
    }


print("Model 1 (BiLSTM-CRF) classes defined")

Model 1 (BiLSTM-CRF) classes defined


In [9]:
# Training and evaluation functions for Model 1
def train_bilstm_crf(model, train_loader, val_loader, epochs=10, lr=0.001):
    """Train BiLSTM-CRF model"""
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_f1 = 0.0

    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0
        for batch in train_loader:
            token_ids = batch["token_ids"].to(device)
            label_ids = batch["label_ids"].to(device)
            mask = batch["mask"].to(device)

            optimizer.zero_grad()
            emissions = model(token_ids, mask)
            loss = model.compute_loss(emissions, label_ids, mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Validation
        val_f1 = evaluate_bilstm_crf(model, val_loader)

        print(f"  Epoch {epoch+1}/{epochs}: Loss={avg_loss:.4f}, Val F1={val_f1:.4f}")

        if val_f1 > best_f1:
            best_f1 = val_f1

    return best_f1


def evaluate_bilstm_crf(model, data_loader):
    """Evaluate BiLSTM-CRF model"""
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["token_ids"].to(device)
            label_ids = batch["label_ids"].to(device)
            mask = batch["mask"].to(device)

            emissions = model(token_ids, mask)
            predictions = model.decode(emissions, mask)

            # Convert to lists (excluding padding)
            for i in range(len(predictions)):
                mask_len = mask[i].sum().item()
                pred_seq = predictions[i][:mask_len].cpu().tolist()
                label_seq = label_ids[i][:mask_len].cpu().tolist()
                all_preds.append(pred_seq)
                all_labels.append(label_seq)

    # Calculate F1 (token-level)
    flat_preds = [p for seq in all_preds for p in seq]
    flat_labels = [l for seq in all_labels for l in seq]

    # Filter out padding (label 0)
    valid_indices = [i for i, l in enumerate(flat_labels) if l != 0]
    filtered_preds = [flat_preds[i] for i in valid_indices]
    filtered_labels = [flat_labels[i] for i in valid_indices]

    if len(filtered_labels) > 0:
        f1 = f1_score(filtered_labels, filtered_preds, average='weighted', zero_division=0)
    else:
        f1 = 0.0

    return f1


print("Model 1 training functions defined")

Model 1 training functions defined


## Model 1: Training & Evaluation

Train separate BiLSTM-CRF models for each language.

In [10]:
# Initialize tokenizer for preprocessing
from transformers import XLMRobertaTokenizerFast
base_tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

# Label mapping
LABEL2IDX = {"O": 0, "B-ANS": 1, "I-ANS": 2}
IDX2LABEL = {v: k for k, v in LABEL2IDX.items()}

# Train Model 1 for each language
print("=" * 70)
print("MODEL 1: BiLSTM-CRF with BIO Tagging (Per-Language)")
print("=" * 70)

model1_results = []

for lang in LANGUAGES:
    print(f"\n{'='*50}")
    print(f"Training for {LANGUAGE_NAMES[lang]} ({lang})")
    print(f"{'='*50}")

    # Prepare data
    df_train_lang = df_train[df_train["lang"] == lang].head(2000)  # Limit for speed
    df_val_lang = df_val[df_val["lang"] == lang].head(500)

    print(f"Preparing BIO labels...")
    train_examples = prepare_bio_data(df_train_lang, base_tokenizer)
    val_examples = prepare_bio_data(df_val_lang, base_tokenizer)

    print(f"  Train examples: {len(train_examples)}")
    print(f"  Val examples: {len(val_examples)}")

    # Build vocabulary
    print(f"Building vocabulary...")
    all_tokens = [ex["tokens"] for ex in train_examples]
    vocab = SimpleVocab(all_tokens)

    # Create datasets
    train_dataset = BIODataset(train_examples, vocab, LABEL2IDX)
    val_dataset = BIODataset(val_examples, vocab, LABEL2IDX)

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        collate_fn=collate_bio_batch
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_bio_batch
    )

    # Initialize model
    model = BiLSTM_CRF(
        vocab_size=len(vocab.token2idx),
        embedding_dim=128,
        hidden_dim=256,
        num_labels=len(LABEL2IDX),
        dropout=0.3
    ).to(device)

    print(f"Training model...")
    best_f1 = train_bilstm_crf(model, train_loader, val_loader, epochs=8, lr=0.001)

    model1_results.append({
        "model": "BiLSTM-CRF",
        "language": LANGUAGE_NAMES[lang],
        "lang_code": lang,
        "f1_score": best_f1
    })

    print(f"\n✓ {LANGUAGE_NAMES[lang]} - Best F1: {best_f1:.4f}")

print("\n" + "=" * 70)
print("MODEL 1 RESULTS SUMMARY")
print("=" * 70)
results_df = pd.DataFrame(model1_results)
print(results_df.to_string(index=False))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (750 > 512). Running this sequence through the model will result in indexing errors


MODEL 1: BiLSTM-CRF with BIO Tagging (Per-Language)

Training for Arabic (ar)
Preparing BIO labels...
  Train examples: 2000
  Val examples: 415
Building vocabulary...
  Vocabulary size: 16,974
Training model...
  Epoch 1/8: Loss=0.2300, Val F1=0.0000
  Epoch 2/8: Loss=0.1603, Val F1=0.0043
  Epoch 3/8: Loss=0.1502, Val F1=0.0148
  Epoch 4/8: Loss=0.1343, Val F1=0.0343
  Epoch 5/8: Loss=0.1176, Val F1=0.0543
  Epoch 6/8: Loss=0.0977, Val F1=0.0757
  Epoch 7/8: Loss=0.0800, Val F1=0.1228
  Epoch 8/8: Loss=0.0648, Val F1=0.1607

✓ Arabic - Best F1: 0.1607

Training for Korean (ko)
Preparing BIO labels...
  Train examples: 2000
  Val examples: 356
Building vocabulary...
  Vocabulary size: 16,820
Training model...
  Epoch 1/8: Loss=0.2186, Val F1=0.0000
  Epoch 2/8: Loss=0.1449, Val F1=0.0000
  Epoch 3/8: Loss=0.1364, Val F1=0.0034
  Epoch 4/8: Loss=0.1221, Val F1=0.0340
  Epoch 5/8: Loss=0.1032, Val F1=0.0479
  Epoch 6/8: Loss=0.0835, Val F1=0.0563
  Epoch 7/8: Loss=0.0668, Val F1=0.0668


---

# Model 2: XLM-RoBERTa with Token Classification (BIO)



## Model 2: Implementation

In [11]:
def prepare_xlmr_token_classification_data(df, tokenizer, label2id, max_length=512):
    """
    Prepare data for XLM-R token classification.
    Combines question and context, aligns labels with subword tokens.
    """
    examples = []

    for idx, row in df.iterrows():
        question = row["question"]
        context = row["context"]

        # Tokenize question and context separately to track offsets
        question_encoding = tokenizer(question, add_special_tokens=False)
        context_encoding = tokenizer(context, add_special_tokens=False, return_offsets_mapping=True)

        # Get answer info
        if row["answerable"]:
            answer_start = row["answer_start"]
            answer_text = row["answer"]
            answer_end = answer_start + len(answer_text)
        else:
            answer_start = -1
            answer_end = -1
            answer_text = ""

        # Create labels for context tokens
        context_labels = []
        found_start = False

        for start_char, end_char in context_encoding["offset_mapping"]:
            if answer_start >= 0 and start_char < answer_end and end_char > answer_start:
                if not found_start:
                    context_labels.append(label2id["B-ANS"])
                    found_start = True
                else:
                    context_labels.append(label2id["I-ANS"])
            else:
                context_labels.append(label2id["O"])

        # Combine: [CLS] question [SEP] context [SEP]
        input_ids = (
            [tokenizer.cls_token_id] +
            question_encoding["input_ids"][:100] +  # Limit question length
            [tokenizer.sep_token_id] +
            context_encoding["input_ids"][:max_length-len(question_encoding["input_ids"])-3] +
            [tokenizer.sep_token_id]
        )

        # Labels: -100 for special tokens and question tokens (don't predict on these)
        labels = (
            [-100] +  # CLS
            [-100] * len(question_encoding["input_ids"][:100]) +  # Question tokens
            [-100] +  # SEP
            context_labels[:max_length-len(question_encoding["input_ids"])-3] +  # Context labels
            [-100]    # SEP
        )

        # Attention mask
        attention_mask = [1] * len(input_ids)

        # Truncate if needed
        if len(input_ids) > max_length:
            input_ids = input_ids[:max_length]
            labels = labels[:max_length]
            attention_mask = attention_mask[:max_length]

        examples.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        })

    return examples


def compute_token_classification_metrics(eval_pred):
    """Compute metrics for token classification"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    # Remove ignored index (special tokens with -100)
    true_labels = []
    true_predictions = []

    for pred_seq, label_seq in zip(predictions, labels):
        true_label_seq = []
        true_pred_seq = []

        for p, l in zip(pred_seq, label_seq):
            if l != -100:
                true_label_seq.append(IDX2LABEL[l])
                true_pred_seq.append(IDX2LABEL[p])

        if true_label_seq:
            true_labels.append(true_label_seq)
            true_predictions.append(true_pred_seq)

    # Compute seqeval F1
    if true_labels:
        f1 = seqeval_f1(true_labels, true_predictions)
    else:
        f1 = 0.0

    return {"f1": f1}


print("Model 2 (XLM-R Token Classification) functions defined")

Model 2 (XLM-R Token Classification) functions defined


## Model 2: Training & Evaluation

Train a single multilingual XLM-RoBERTa model on all three languages.

In [12]:
print("=" * 70)
print("MODEL 2: XLM-RoBERTa Token Classification (Multilingual)")
print("=" * 70)

# Initialize tokenizer and model
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare data for all languages (multilingual training)
print("\nPreparing multilingual dataset...")
df_train_limited = df_train.groupby("lang").head(1000)  # 1000 per language
df_val_limited = df_val.groupby("lang").head(300)  # 300 per language

print(f"  Train: {len(df_train_limited)} examples")
print(f"  Val: {len(df_val_limited)} examples")

print("\nTokenizing and aligning labels...")
train_examples = prepare_xlmr_token_classification_data(df_train_limited, tokenizer, LABEL2IDX)
val_examples = prepare_xlmr_token_classification_data(df_val_limited, tokenizer, LABEL2IDX)

# Convert to HuggingFace Dataset
train_dataset = HFDataset.from_list(train_examples)
val_dataset = HFDataset.from_list(val_examples)

print(f"  Train dataset: {len(train_dataset)}")
print(f"  Val dataset: {len(val_dataset)}")

# Initialize model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(LABEL2IDX),
    id2label=IDX2LABEL,
    label2id=LABEL2IDX
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./model2_xlmr_token_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
    report_to="none",
    fp16=torch.cuda.is_available()
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_token_classification_metrics
)

# Train
print("\nTraining XLM-RoBERTa...")
trainer.train()

# Evaluate per language
print("\n" + "=" * 70)
print("MODEL 2 RESULTS PER LANGUAGE")
print("=" * 70)

model2_results = []

for lang in LANGUAGES:
    print(f"\nEvaluating {LANGUAGE_NAMES[lang]}...")

    df_val_lang = df_val[df_val["lang"] == lang].head(300)
    val_lang_examples = prepare_xlmr_token_classification_data(df_val_lang, tokenizer, LABEL2IDX)
    val_lang_dataset = HFDataset.from_list(val_lang_examples)

    # Predict
    predictions = trainer.predict(val_lang_dataset)
    metrics = compute_token_classification_metrics((predictions.predictions, predictions.label_ids))

    model2_results.append({
        "model": "XLM-R (BIO)",
        "language": LANGUAGE_NAMES[lang],
        "lang_code": lang,
        "f1_score": metrics["f1"]
    })

    print(f"  {LANGUAGE_NAMES[lang]} F1: {metrics['f1']:.4f}")

print("\n" + "=" * 70)
print("MODEL 2 RESULTS SUMMARY")
print("=" * 70)
results_df = pd.DataFrame(model2_results)
print(results_df.to_string(index=False))

MODEL 2: XLM-RoBERTa Token Classification (Multilingual)


Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors



Preparing multilingual dataset...
  Train: 3000 examples
  Val: 900 examples

Tokenizing and aligning labels...
  Train dataset: 3000
  Val dataset: 900


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training XLM-RoBERTa...


Epoch,Training Loss,Validation Loss,F1
1,0.1159,0.112838,0.164649


Epoch,Training Loss,Validation Loss,F1
1,0.1159,0.112838,0.164649
2,0.1026,0.102253,0.296249
3,0.0709,0.109654,0.340274



MODEL 2 RESULTS PER LANGUAGE

Evaluating Arabic...


  Arabic F1: 0.3427

Evaluating Korean...


  Korean F1: 0.3422

Evaluating Telugu...


  Telugu F1: 0.3367

MODEL 2 RESULTS SUMMARY
      model language lang_code  f1_score
XLM-R (BIO)   Arabic        ar  0.342750
XLM-R (BIO)   Korean        ko  0.342246
XLM-R (BIO)   Telugu        te  0.336735


---

# Model 3: XLM-RoBERTa with QA Head (Start/End Pointers)



## Model 3: Implementation

In [13]:
def prepare_xlmr_qa_data(df, tokenizer, max_length=512):
    """
    Prepare data for XLM-R question answering (start/end positions).
    """
    examples = []

    for idx, row in df.iterrows():
        question = row["question"]
        context = row["context"]

        # Tokenize with offsets
        encoding = tokenizer(
            question,
            context,
            max_length=max_length,
            truncation="only_second",  # Truncate context if needed
            padding=False,
            return_offsets_mapping=True
        )

        offset_mapping = encoding["offset_mapping"]

        # Find start and end positions
        if row["answerable"]:
            answer_start_char = row["answer_start"]
            answer_text = row["answer"]
            answer_end_char = answer_start_char + len(answer_text)

            # Find token positions
            start_position = 0
            end_position = 0

            # Get context start in the encoding (after [CLS] question [SEP])
            sequence_ids = encoding.sequence_ids()
            context_start = sequence_ids.index(1)  # 1 means second sequence (context)

            for i in range(context_start, len(offset_mapping)):
                if sequence_ids[i] != 1:
                    continue

                token_start, token_end = offset_mapping[i]

                # Check if this token overlaps with answer
                if token_start <= answer_start_char < token_end:
                    start_position = i

                if token_start < answer_end_char <= token_end:
                    end_position = i
                    break

            # If we didn't find valid positions, mark as unanswerable
            if start_position == 0 and end_position == 0:
                start_position = 0  # Point to [CLS] to indicate no answer
                end_position = 0
        else:
            # Unanswerable: point to [CLS]
            start_position = 0
            end_position = 0

        examples.append({
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
            "start_positions": start_position,
            "end_positions": end_position
        })

    return examples


def compute_qa_metrics(start_logits, end_logits, start_positions, end_positions):
    """
    Compute F1 for QA predictions.
    Simplified: exact match of start/end positions.
    """
    start_preds = np.argmax(start_logits, axis=-1)
    end_preds = np.argmax(end_logits, axis=-1)

    correct = 0
    total = len(start_positions)

    for start_pred, end_pred, start_true, end_true in zip(
        start_preds, end_preds, start_positions, end_positions
    ):
        # Check if prediction matches ground truth
        if start_pred == start_true and end_pred == end_true:
            correct += 1

    accuracy = correct / total if total > 0 else 0.0

    # Return as F1 proxy (in real scenario, would compute span overlap F1)
    return accuracy


print("Model 3 (XLM-R QA) functions defined")

Model 3 (XLM-R QA) functions defined


## Model 3: Training & Evaluation

Train a single multilingual XLM-RoBERTa QA model on all three languages.

In [14]:
print("=" * 70)
print("MODEL 3: XLM-RoBERTa QA (Multilingual)")
print("=" * 70)

# Initialize tokenizer and model for QA
model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare multilingual QA dataset
print("\nPreparing multilingual QA dataset...")
df_train_qa = df_train.groupby("lang").head(1000)  # 1000 per language
df_val_qa = df_val.groupby("lang").head(300)  # 300 per language

print(f"  Train: {len(df_train_qa)} examples")
print(f"  Val: {len(df_val_qa)} examples")

print("\nProcessing examples for QA format...")
train_qa_examples = prepare_xlmr_qa_data(df_train_qa, xlmr_tokenizer, max_length=384)
val_qa_examples = prepare_xlmr_qa_data(df_val_qa, xlmr_tokenizer, max_length=384)

print(f"  Train QA examples: {len(train_qa_examples)}")
print(f"  Val QA examples: {len(val_qa_examples)}")

# Convert to PyTorch Dataset with padding
class QADataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

def qa_collate_fn(batch):
    """Collate function to pad sequences in a batch"""
    # Find max length in batch
    max_len = max(len(ex['input_ids']) for ex in batch)

    # Pad all sequences
    input_ids = []
    attention_mask = []
    start_positions = []
    end_positions = []

    for ex in batch:
        # Pad input_ids and attention_mask
        pad_len = max_len - len(ex['input_ids'])
        input_ids.append(ex['input_ids'] + [xlmr_tokenizer.pad_token_id] * pad_len)
        attention_mask.append(ex['attention_mask'] + [0] * pad_len)
        start_positions.append(ex['start_positions'])
        end_positions.append(ex['end_positions'])

    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
        'start_positions': torch.tensor(start_positions, dtype=torch.long),
        'end_positions': torch.tensor(end_positions, dtype=torch.long)
    }

train_qa_dataset = QADataset(train_qa_examples)
val_qa_dataset = QADataset(val_qa_examples)

# Initialize QA model
print("\nInitializing XLM-RoBERTa for Question Answering...")
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

print("✓ Model 3 ready for training")

MODEL 3: XLM-RoBERTa QA (Multilingual)

Preparing multilingual QA dataset...
  Train: 3000 examples
  Val: 900 examples

Processing examples for QA format...
  Train QA examples: 3000
  Val QA examples: 900

Initializing XLM-RoBERTa for Question Answering...


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model 3 ready for training


In [15]:
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Training function for QA model
def train_qa_model(model, train_dataset, val_dataset, epochs=3, batch_size=8, lr=3e-5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=qa_collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=qa_collate_fn)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            progress_bar.set_postfix({'loss': loss.item()})

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1} - Average Loss: {avg_loss:.4f}')

    return model

# Prediction function for QA model
def predict_qa_batch(model, dataset, batch_size=16):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=qa_collate_fn)
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Get start and end logits
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            # Get best start and end positions
            start_preds = torch.argmax(start_logits, dim=1).cpu().numpy()
            end_preds = torch.argmax(end_logits, dim=1).cpu().numpy()

            for i in range(len(start_preds)):
                start_pos = start_preds[i]
                end_pos = end_preds[i]

                # Ensure valid span (end >= start)
                if end_pos < start_pos:
                    end_pos = start_pos

                all_predictions.append((start_pos, end_pos))

    return all_predictions

print("Model 3 training functions defined")

Model 3 training functions defined


In [16]:
# Train multilingual QA model
print("\n" + "=" * 70)
print("TRAINING MODEL 3")
print("=" * 70)
print("Training XLM-RoBERTa QA model on all languages...")
qa_model_multi = train_qa_model(qa_model, train_qa_dataset, val_qa_dataset, epochs=3, batch_size=8)
print("\n✓ Model 3 training complete!")


TRAINING MODEL 3
Training XLM-RoBERTa QA model on all languages...


Epoch 1/3: 100%|██████████| 375/375 [03:00<00:00,  2.08it/s, loss=2.15]


Epoch 1 - Average Loss: 2.9884


Epoch 2/3: 100%|██████████| 375/375 [03:00<00:00,  2.08it/s, loss=2.08]


Epoch 2 - Average Loss: 2.0602


Epoch 3/3: 100%|██████████| 375/375 [03:00<00:00,  2.08it/s, loss=2.36]

Epoch 3 - Average Loss: 1.3530

✓ Model 3 training complete!





In [17]:
# Evaluate QA model per language
print("\n" + "=" * 70)
print("MODEL 3 RESULTS PER LANGUAGE")
print("=" * 70)

model3_results = []

for lang in LANGUAGES:
    print(f"\n{'='*50}")
    print(f"Evaluating {LANGUAGE_NAMES[lang]}...")
    print(f"{'='*50}")

    # Get language-specific validation data
    df_val_lang = df_val[df_val["lang"] == lang].head(300)

    # Prepare QA examples
    val_lang_qa = prepare_xlmr_qa_data(df_val_lang, xlmr_tokenizer, max_length=384)
    val_lang_dataset = QADataset(val_lang_qa)

    # Get predictions
    predictions = predict_qa_batch(qa_model_multi, val_lang_dataset, batch_size=16)

    # Decode predictions to text
    pred_spans = []
    gold_spans = []

    for i, (start_pos, end_pos) in enumerate(predictions):
        row = df_val_lang.iloc[i]

        # Get the encoding for this example
        encoding = xlmr_tokenizer(
            row["question"],
            row["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True
        )

        # Check if unanswerable
        if start_pos == 0 or end_pos == 0:
            pred_spans.append("")
        else:
            # Decode the predicted span
            input_ids = encoding["input_ids"]
            answer_tokens = input_ids[start_pos:end_pos+1]
            pred_text = xlmr_tokenizer.decode(answer_tokens, skip_special_tokens=True)
            pred_spans.append(pred_text.strip())

        # Gold answer
        if row["answerable"]:
            gold_spans.append(row["answer"])
        else:
            gold_spans.append("")

    # Calculate exact match
    exact_matches = sum(1 for pred, gold in zip(pred_spans, gold_spans) if pred.strip() == gold.strip())
    em_score = exact_matches / len(gold_spans) if len(gold_spans) > 0 else 0.0

    # Calculate F1 (token-level overlap)
    f1_scores = []
    for pred, gold in zip(pred_spans, gold_spans):
        pred_tokens = set(pred.split())
        gold_tokens = set(gold.split())

        if len(pred_tokens) == 0 and len(gold_tokens) == 0:
            f1_scores.append(1.0)
        elif len(pred_tokens) == 0 or len(gold_tokens) == 0:
            f1_scores.append(0.0)
        else:
            overlap = len(pred_tokens & gold_tokens)
            precision = overlap / len(pred_tokens) if len(pred_tokens) > 0 else 0
            recall = overlap / len(gold_tokens) if len(gold_tokens) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            f1_scores.append(f1)

    avg_f1 = sum(f1_scores) / len(f1_scores) if len(f1_scores) > 0 else 0.0

    model3_results.append({
        "model": "XLM-R QA",
        "language": LANGUAGE_NAMES[lang],
        "lang_code": lang,
        "f1_score": avg_f1,
        "exact_match": em_score
    })

    print(f"  Exact Match: {em_score:.4f}")
    print(f"  F1 Score: {avg_f1:.4f}")
    print(f"  Samples: {len(gold_spans)}")

    # Show some examples
    print("\n  Example Predictions:")
    for i in range(min(3, len(pred_spans))):
        print(f"\n  Example {i+1}:")
        print(f"    Question: {df_val_lang.iloc[i]['question'][:80]}...")
        print(f"    Predicted: '{pred_spans[i]}'")
        print(f"    Gold: '{gold_spans[i]}'")

print("\n" + "=" * 70)
print("MODEL 3 RESULTS SUMMARY")
print("=" * 70)
results_df = pd.DataFrame(model3_results)
print(results_df.to_string(index=False))


MODEL 3 RESULTS PER LANGUAGE

Evaluating Arabic...


Predicting: 100%|██████████| 19/19 [00:04<00:00,  3.82it/s]


  Exact Match: 0.3967
  F1 Score: 0.4890
  Samples: 300

  Example Predictions:

  Example 1:
    Question: ما هي أولى جامعات فنلندا؟...
    Predicted: 'Royal Academy of Åbo'
    Gold: 'Royal Academy of Åbo'

  Example 2:
    Question: ما عدد الدول المطلة على بحر البلطيق؟...
    Predicted: 'Finland, Sweden'
    Gold: 'Finland, Sweden, Denmark, Estonia, Latvia, Lithuania, northwest Russia, Poland, Germany'

  Example 3:
    Question: اين عاش نيوتن؟...
    Predicted: 'hat'
    Gold: 'Grantham'

Evaluating Korean...


Predicting: 100%|██████████| 19/19 [00:04<00:00,  3.85it/s]


  Exact Match: 0.4600
  F1 Score: 0.5532
  Samples: 300

  Example Predictions:

  Example 1:
    Question: 북유럽의 노르딕 국가는 몇개인가요?...
    Predicted: '12 million'
    Gold: 'five'

  Example 2:
    Question: 1887년 케이스 웨스턴 리저브 대학의 이름은 무엇인가?...
    Predicted: '1967'
    Gold: 'Western Reserve University (formerly Western Reserve College) and Case Institute of Technology (formerly Case School of Applied Science)'

  Example 3:
    Question: 옴진리교는 어느 나라에서 시작된 종교인가?...
    Predicted: 'Egyptian'
    Gold: 'Egypt'

Evaluating Telugu...


Predicting: 100%|██████████| 19/19 [00:04<00:00,  4.00it/s]


  Exact Match: 0.3933
  F1 Score: 0.5026
  Samples: 300

  Example Predictions:

  Example 1:
    Question: ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది ?...
    Predicted: 'Portland'
    Gold: 'Portland'

  Example 2:
    Question: కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు ?...
    Predicted: 'Indonesia'
    Gold: 'Indian subcontinent'

  Example 3:
    Question: కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు ?...
    Predicted: 'United States'
    Gold: 'England'

MODEL 3 RESULTS SUMMARY
   model language lang_code  f1_score  exact_match
XLM-R QA   Arabic        ar  0.489022     0.396667
XLM-R QA   Korean        ko  0.553180     0.460000
XLM-R QA   Telugu        te  0.502644     0.393333


# Final Model Comparison

This section provides a comprehensive comparison of all three models across all languages.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Compile all results into a comparison dataframe
comparison_data = []

# Model 1: BiLSTM-CRF (per-language models)
# model1_results is a list of dicts with keys: model, language, lang_code, f1_score
for result in model1_results:
    comparison_data.append({
        'Model': 'BiLSTM-CRF',
        'Language': result['language'],
        'F1 Score': result['f1_score'],
        'Exact Match': 0,  # Model 1 doesn't compute exact match
        'Training': 'Per-language',
        'Labeling': 'BIO'
    })

# Model 2: XLM-R Token Classification (multilingual)
# model2_results is a list of dicts with keys: model, language, lang_code, f1_score
for result in model2_results:
    comparison_data.append({
        'Model': 'XLM-R Token',
        'Language': result['language'],
        'F1 Score': result['f1_score'],
        'Exact Match': 0,  # Model 2 doesn't compute exact match
        'Training': 'Multilingual',
        'Labeling': 'BIO'
    })

# Model 3: XLM-R QA (multilingual)
# model3_results is a list of dicts with keys: model, language, lang_code, f1_score, exact_match
for result in model3_results:
    comparison_data.append({
        'Model': 'XLM-R QA',
        'Language': result['language'],
        'F1 Score': result['f1_score'],
        'Exact Match': result['exact_match'],
        'Training': 'Multilingual',
        'Labeling': 'Start/End'
    })

# Create comparison dataframe
comparison_df = pd.DataFrame(comparison_data)
print("\n" + "="*80)
print("COMPREHENSIVE MODEL COMPARISON")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

In [1]:
# Visualize F1 Score comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: F1 scores by model and language
pivot_f1 = comparison_df.pivot(index='Language', columns='Model', values='F1 Score')
pivot_f1.plot(kind='bar', ax=axes[0], rot=0, width=0.8)
axes[0].set_title('F1 Score Comparison Across Models and Languages', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Language', fontsize=12)
axes[0].set_ylabel('F1 Score', fontsize=12)
axes[0].legend(title='Model', loc='best')
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_ylim(0, 1)

# Plot 2: Exact Match by model and language
pivot_em = comparison_df.pivot(index='Language', columns='Model', values='Exact Match')
pivot_em.plot(kind='bar', ax=axes[1], rot=0, width=0.8, color=['#ff9999', '#66b3ff', '#99ff99'])
axes[1].set_title('Exact Match Comparison Across Models and Languages', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Language', fontsize=12)
axes[1].set_ylabel('Exact Match Score', fontsize=12)
axes[1].legend(title='Model', loc='best')
axes[1].grid(axis='y', alpha=0.3)
axes[1].set_ylim(0, 1)

plt.tight_layout()
plt.show()

# Average performance by model
print("\n" + "="*60)
print("AVERAGE PERFORMANCE BY MODEL")
print("="*60)
avg_by_model = comparison_df.groupby('Model')[['F1 Score', 'Exact Match']].mean()
print(avg_by_model)
print("="*60)

# Average performance by language
print("\n" + "="*60)
print("AVERAGE PERFORMANCE BY LANGUAGE")
print("="*60)
avg_by_lang = comparison_df.groupby('Language')[['F1 Score', 'Exact Match']].mean()
print(avg_by_lang)
print("="*60)

NameError: name 'plt' is not defined

In [None]:
# Save results for future reference
import json

results_summary = {
    'model_1_bilstm_crf': model1_results,
    'model_2_xlmr_token': model2_results,
    'model_3_xlmr_qa': model3_results,
    'comparison_table': comparison_df.to_dict('records')
}

# Save to JSON
with open('week39_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

