# 03 - Data Preparation: SQuAD v1.1

**Thesis Section Reference:** Chapter 3.6 - Tasks and Datasets

This notebook prepares the SQuAD v1.1 extractive QA dataset:
1. Load SQuAD v1.1 dataset
2. Create subsets for FAST MODE
3. Tokenize for causal LM training (generative QA)
4. Save processed datasets (plus answer-free prompts for sequence-level KD)

## Task Description
- **Dataset:** SQuAD v1.1 (Stanford Question Answering Dataset)
- **Task:** Extractive Question Answering
- **Metrics:** Exact Match (EM), F1
- **Note:** The official SQuAD test set is hidden (not publicly released), so the validation split is used as the test set

In [None]:
# Standard setup
# Resolves the project root, makes `src/` importable, loads environment
# variables and the experiment config, and seeds the run.
import os
import sys
from pathlib import Path

# When the kernel's working directory is `notebooks/`, the project root is its
# parent; otherwise the current working directory is treated as the root.
ROOT_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
# Must happen before the `config` / `utils_seed` imports below
# (presumably both modules live under ROOT_DIR / "src" — confirm).
sys.path.insert(0, str(ROOT_DIR / "src"))

from dotenv import load_dotenv
# Load variables from the project-level .env file; a later cell reads
# STUDENT_S1 through os.getenv.
load_dotenv(ROOT_DIR / ".env")

from config import load_config
from utils_seed import set_seed

config = load_config(str(ROOT_DIR / "configs" / "experiment.yaml"))
# NOTE(review): presumably creates the output directories named in the config — confirm.
config.ensure_dirs()

# The first configured seed is used for every step of data preparation.
SEED = config.get_seeds()[0]
set_seed(SEED)

print(f"Mode: {'FAST' if config.fast_mode else 'FULL'}")
print(f"Seed: {SEED}")

In [None]:
# Check if data already exists
# The cache is reused only when it is complete and matches the current mode:
#   * both dataset directories exist,
#   * the metadata file exists and parses (it is the last artifact the save
#     cell writes, so its presence marks a finished run, and the cached-load
#     cell opens it unconditionally),
#   * the recorded `fast_mode` equals the current `config.fast_mode`, so a
#     FAST subset is never silently reused for a FULL run (or vice versa).
import json

DATA_DIR = ROOT_DIR / "results" / "processed_data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

squad_train_path = DATA_DIR / "squad_train"
squad_val_path = DATA_DIR / "squad_validation"
squad_metadata_path = DATA_DIR / "squad_metadata.json"

cached_metadata = None
if squad_train_path.exists() and squad_val_path.exists() and squad_metadata_path.is_file():
    try:
        with open(squad_metadata_path, "r") as f:
            cached_metadata = json.load(f)
    except (OSError, ValueError):
        # Unreadable or truncated metadata: treat the cache as incomplete.
        cached_metadata = None
    if not isinstance(cached_metadata, dict):
        cached_metadata = None

if cached_metadata is None:
    if squad_train_path.exists() or squad_val_path.exists():
        print("SQuAD cache is incomplete, will re-process...")
    else:
        print("SQuAD data not found, will process...")
    SKIP_PROCESSING = False
elif cached_metadata.get("fast_mode") != config.fast_mode:
    print("SQuAD cache was built in a different mode, will re-process...")
    SKIP_PROCESSING = False
else:
    print("✓ SQuAD data already exists, loading from cache...")
    SKIP_PROCESSING = True

In [None]:
# Load SQuAD dataset
from datasets import load_dataset

if not SKIP_PROCESSING:
    print("Loading SQuAD v1.1...")

    # Download (or reuse) the dataset inside the project-local HF cache
    raw_dataset = load_dataset("squad", cache_dir=str(ROOT_DIR / "hf_cache"))

    print(f"\nDataset structure:")
    print(raw_dataset)

    # Preview the first training record, one field per line
    print(f"\nSample example:")
    ex = raw_dataset["train"][0]
    preview_fields = (
        ("ID", ex["id"]),
        ("Question", ex["question"]),
        ("Context", f"{ex['context'][:200]}..."),
        ("Answers", ex["answers"]),
    )
    for field_label, field_value in preview_fields:
        print(f"  {field_label}: {field_value}")

In [None]:
# Create subsets based on mode
if not SKIP_PROCESSING:
    train_size = config.get_subset_size("squad", "train")
    val_size = config.get_subset_size("squad", "validation")

    def _take_subset(split, size):
        """Return a seeded random subset of `split` with at most `size` rows.

        `size=None` means "no limit" and returns the split unchanged. The
        requested size is clamped to the split length so an oversized request
        cannot make `select` raise an IndexError.
        """
        if size is None:
            return split
        return split.shuffle(seed=SEED).select(range(min(size, len(split))))

    # Each split is handled independently: a limit on only one of them no
    # longer crashes (None passed to min) or gets silently ignored.
    if train_size is None and val_size is None:
        print("FULL MODE: Using complete dataset")
    else:
        print(f"FAST MODE: Subsetting to {train_size} train, {val_size} validation examples")

    train_dataset = _take_subset(raw_dataset["train"], train_size)
    val_dataset = _take_subset(raw_dataset["validation"], val_size)

    print(f"\nFinal sizes:")
    print(f"  Train: {len(train_dataset)}")
    print(f"  Validation: {len(val_dataset)}")

In [None]:
# Load tokenizer
# Loads the tokenizer used for all SQuAD tokenization below and makes sure it
# has a padding token.
from transformers import AutoTokenizer

if not SKIP_PROCESSING:
    # The STUDENT_S1 environment variable, when set, overrides the model name
    # from the experiment config.
    tokenizer_name = os.getenv("STUDENT_S1", config.student_s1.name)
    
    print(f"Loading tokenizer: {tokenizer_name}")
    
    # NOTE(review): trust_remote_code=True lets transformers execute Python
    # code shipped in the model repository — confirm the source is trusted.
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        cache_dir=str(ROOT_DIR / "hf_cache")
    )
    
    # Tokenizers without a pad token reuse EOS for padding. After this, pad
    # and EOS share one id, so padding cannot be told apart from a real EOS
    # by token id alone.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    
    print(f"  Vocab size: {tokenizer.vocab_size}")

In [None]:
# Define prompt template for generative QA
from data_squad import create_squad_prompt

if not SKIP_PROCESSING:
    max_length = config.get_max_length("squad")
    print(f"Max sequence length: {max_length}")

    # Build the answer-free prompt for the first training example
    first_example = train_dataset[0]
    prompt_preview = create_squad_prompt(
        first_example["question"], first_example["context"], include_answer=False
    )

    # Print the header, a divider, the first 500 characters, and a closing divider
    divider = "-" * 40
    print(
        f"\nExample prompt (truncated):",
        divider,
        prompt_preview[:500],
        "...",
        divider,
        sep="\n",
    )
    print(f"\nExpected answer: {first_example['answers']['text'][0]}")

In [None]:
# Tokenize dataset
from data_squad import tokenize_squad_for_lm

if not SKIP_PROCESSING:
    print("Tokenizing datasets...")

    # Batch-level adapter handed to Dataset.map
    def tokenize_fn(examples):
        return tokenize_squad_for_lm(examples, tokenizer, max_length=max_length, include_labels=True)

    # Raw text columns dropped after tokenization
    raw_columns = ["title", "context", "question", "answers"]

    def _tokenize_split(split, split_name):
        """Announce and tokenize one split, dropping the raw text columns."""
        print(f"  Tokenizing {split_name} split...")
        return split.map(
            tokenize_fn,
            batched=True,
            remove_columns=raw_columns,
            desc=f"Tokenizing {split_name}",
        )

    tokenized_train = _tokenize_split(train_dataset, "train")
    tokenized_val = _tokenize_split(val_dataset, "validation")

    print(f"\nTokenized dataset features:")
    print(f"  {tokenized_train.column_names}")

In [None]:
# Verify tokenization
if not SKIP_PROCESSING:
    print("Verifying tokenization...")

    sample = tokenized_train[0]

    # Decode input
    decoded = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    print(f"\nSample decoded (truncated):")
    print(decoded[:400])
    print("...")

    # Check gold answers are preserved
    if "gold_answers" in sample:
        print(f"\nGold answers: {sample['gold_answers']}")

    def _real_token_count(row):
        """Count the non-padding tokens of one tokenized row.

        Filtering on `pad_token_id` alone is unreliable here: the tokenizer
        cell may have aliased the pad token to EOS, in which case genuine EOS
        tokens would be dropped as padding. When the row carries an
        `attention_mask`, its sum is used instead (assumes the usual
        1 = real token convention — confirm against tokenize_squad_for_lm).
        Without a mask the original pad-id filter is used as a fallback.
        """
        if "attention_mask" in row:
            return int(sum(row["attention_mask"]))
        return sum(1 for tok in row["input_ids"] if tok != tokenizer.pad_token_id)

    # Check sequence length distribution on at most the first 100 rows
    n_checked = min(100, len(tokenized_train))
    lengths = [_real_token_count(row) for row in tokenized_train.select(range(n_checked))]

    # The label reports how many rows were actually inspected
    print(f"\nSequence length stats (first {n_checked}):")
    print(f"  Mean: {sum(lengths)/len(lengths):.1f}")
    print(f"  Max: {max(lengths)}")
    print(f"  Min: {min(lengths)}")

In [None]:
# Save raw examples for KD2 (sequence-level KD needs prompts without answers)
import json

if not SKIP_PROCESSING:
    print("Saving raw examples for KD2 (sequence-level KD)...")

    def _build_prompt_records(split):
        """Return one record per example: id, answer-free prompt, gold answers.

        The prompt is built with `include_answer=False` so it can be fed to
        the teacher for generation; the gold answer texts are kept alongside.
        """
        return [
            {
                "id": ex["id"],
                "prompt": create_squad_prompt(
                    ex["question"],
                    ex["context"],
                    include_answer=False
                ),
                "gold_answers": ex["answers"]["text"],
            }
            for ex in split
        ]

    # Create prompts without answers for teacher generation
    train_prompts = _build_prompt_records(train_dataset)
    val_prompts = _build_prompt_records(val_dataset)

    # Save prompts (explicit encoding so the files do not depend on the locale)
    prompt_files = (
        ("squad_train_prompts.json", train_prompts),
        ("squad_val_prompts.json", val_prompts),
    )
    for prompt_filename, prompt_records in prompt_files:
        with open(DATA_DIR / prompt_filename, "w", encoding="utf-8") as f:
            json.dump(prompt_records, f)

    print(f"  Saved {len(train_prompts)} train prompts")
    print(f"  Saved {len(val_prompts)} validation prompts")

In [None]:
# Save processed datasets
if not SKIP_PROCESSING:
    print("Saving processed datasets...")

    # Write both tokenized splits to their cache directories
    for tokenized_split, target_dir in (
        (tokenized_train, squad_train_path),
        (tokenized_val, squad_val_path),
    ):
        tokenized_split.save_to_disk(str(target_dir))

    # Keep the tokenizer next to the data it produced
    tokenizer_path = DATA_DIR / "squad_tokenizer"
    tokenizer.save_pretrained(str(tokenizer_path))

    # Describe this run so later notebooks can inspect how the data was built
    metadata = dict(
        task="squad",
        train_size=len(tokenized_train),
        val_size=len(tokenized_val),
        max_length=max_length,
        tokenizer=tokenizer_name,
        fast_mode=config.fast_mode,
        seed=SEED,
        use_validation_as_test=True,
    )

    with open(DATA_DIR / "squad_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    print(f"\n✓ Saved to {DATA_DIR}")

In [None]:
# Load cached data if skipped
from datasets import load_from_disk
import json

if SKIP_PROCESSING:
    print("Loading cached SQuAD data...")

    # Restore both tokenized splits from their cache directories
    tokenized_train, tokenized_val = (
        load_from_disk(str(cache_dir)) for cache_dir in (squad_train_path, squad_val_path)
    )

    # Metadata written by the run that produced the cache
    metadata = json.loads((DATA_DIR / "squad_metadata.json").read_text())

    print(f"\nLoaded from cache:")
    for split_label, cached_split in (("Train", tokenized_train), ("Validation", tokenized_val)):
        print(f"  {split_label}: {len(cached_split)} examples")

In [None]:
# Summary
# Runs on both paths (fresh processing and cache load): it only needs
# `config`, `DATA_DIR`, `tokenized_train` and `tokenized_val`, which either
# path defines. The file list below is static text; this cell does not check
# that the listed files exist.
print("=" * 60)
print("SQUAD DATA PREPARATION COMPLETE")
print("=" * 60)
print(f"""
Dataset: SQuAD v1.1 (Extractive QA)
Mode: {'FAST' if config.fast_mode else 'FULL'}

Sizes:
  Train: {len(tokenized_train)} examples
  Validation: {len(tokenized_val)} examples

Files saved to: {DATA_DIR}
  - squad_train/
  - squad_validation/
  - squad_train_prompts.json (for KD2)
  - squad_val_prompts.json (for KD2)

Next Steps:
  1. Run 04_teacher_cache_outputs.ipynb to cache teacher outputs
  2. Run 05_train_baseline_and_kd1.ipynb for training
""")