# Deep Past Challenge — Curriculum ByT5-Base v2

**Model**: `notninja/byt5-base-akkadian` (warm-start, already fine-tuned on general Akkadian)

**Strategy**: Skip Phase 1 entirely — warm-start model already covers general Akkadian.
- Phase 2: Old Assyrian specialization (~15K samples, 10 epochs)
- Phase 3 (optional): Old Assyrian + remaining lexicon entries (2 epochs)

**Key improvements over v1**:
- Warm-start from `notninja/byt5-base-akkadian` instead of `google/byt5-small`
- MAX_SOURCE_LENGTH = 768 (covers 97%+ of competition val inputs at byte level)
- Zero-shot baseline eval before training
- Decoding hyperparameter search (length_penalty, num_beams) on competition val
- fp16=True for T4 tensor core acceleration

**Metric**: sqrt(BLEU * chrF++)

In [None]:
!pip install -q sacrebleu

In [None]:
import pandas as pd
import numpy as np
import torch
import gc
import shutil
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from datasets import Dataset as HFDataset
from sacrebleu.metrics import BLEU, CHRF
import warnings
warnings.filterwarnings('ignore')

# Pick the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    # The previous diagnostic printed torch.cuda.is_available() inside this
    # branch, so it was always True. Report the real compute capability instead;
    # fp16 tensor cores were introduced with sm_70 (Volta).
    cc_major, cc_minor = torch.cuda.get_device_capability(0)
    print(f"Compute capability: {cc_major}.{cc_minor}")
    print(f"fp16 tensor cores (sm_70+): {cc_major >= 7}")

## Configuration

In [None]:
# Output location. ASSEMBLED_DIR and COMPETITION_DIR are auto-detected in the next cell.
OUTPUT_DIR = Path('/kaggle/working')

# Warm-start checkpoint (already fine-tuned on general Akkadian, so Phase 1 is skipped)
# and the task prefix prepended to every source string.
MODEL_NAME = 'notninja/byt5-base-akkadian'
PREFIX = 'translate Akkadian to English: '

# Byte-level sequence budgets for ByT5.
# Original author's note: competition val inputs average 435 chars (max ~921);
# UTF-8 diacritics take 1-2 bytes per char, so 768 bytes covers ~97% of inputs.
MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH = 768, 512

# Phase 2 — Old Assyrian specialization
P2_EPOCHS, P2_BATCH_SIZE, P2_GRAD_ACCUM = 10, 8, 4
P2_LR, P2_WARMUP = 1e-5, 0.05

# Phase 3 — Old Assyrian + lexicon (optional)
P3_EPOCHS, P3_BATCH_SIZE, P3_GRAD_ACCUM = 2, 8, 4
P3_LR, P3_WARMUP = 5e-6, 0.05

# Echo the effective configuration in one go.
print("\n".join([
    "Configuration:",
    f"  Model: {MODEL_NAME}",
    f"  MAX_SOURCE_LENGTH: {MAX_SOURCE_LENGTH}",
    f"  MAX_TARGET_LENGTH: {MAX_TARGET_LENGTH}",
    f"  Phase 2: {P2_EPOCHS} epochs, batch={P2_BATCH_SIZE}, grad_accum={P2_GRAD_ACCUM}, lr={P2_LR}",
    f"  Phase 3: {P3_EPOCHS} epochs, batch={P3_BATCH_SIZE}, grad_accum={P3_GRAD_ACCUM}, lr={P3_LR}",
]))

## Load Data

In [None]:
import os

# Diagnostic: print up to three levels of /kaggle/input/ so the mounted
# datasets are visible in the notebook log.
print("Contents of /kaggle/input/:")
if not os.path.exists('/kaggle/input'):
    print("  /kaggle/input does not exist!")
else:
    for top_name in sorted(os.listdir('/kaggle/input')):
        top_path = os.path.join('/kaggle/input', top_name)
        if not os.path.isdir(top_path):
            print(f"  {top_name}  ({os.path.getsize(top_path)} bytes)")
            continue

        print(f"  {top_name}/")
        for mid_name in sorted(os.listdir(top_path)):
            mid_path = os.path.join(top_path, mid_name)
            if not os.path.isdir(mid_path):
                print(f"    {mid_name}  ({os.path.getsize(mid_path)} bytes)")
                continue

            print(f"    {mid_name}/")
            # Only the first ten entries of a second-level folder are shown.
            for leaf_name in sorted(os.listdir(mid_path))[:10]:
                leaf_path = os.path.join(mid_path, leaf_name)
                if os.path.isdir(leaf_path):
                    children = os.listdir(leaf_path)
                    print(f"      {leaf_name}/ ({len(children)} items): {children[:8]}")
                else:
                    print(f"      {leaf_name}  ({os.path.getsize(leaf_path)} bytes)")

# Auto-detect assembled dataset path
def find_file(filename, base='/kaggle/input'):
    """Return the directory that directly contains ``filename``.

    Walks ``base`` top-down and returns the first matching directory as a
    ``Path`` (the directory, not the file itself). Sub-directories are visited
    in sorted order so the result is reproducible when several datasets
    contain a file with the same name.

    Args:
        filename: Exact file name to look for (no globbing).
        base: Root directory of the search.

    Returns:
        ``Path`` of the containing directory, or ``None`` when nothing matches
        (including when ``base`` does not exist).
    """
    for root, dirs, files in os.walk(base):
        # In-place sort steers os.walk's top-down traversal deterministically.
        dirs.sort()
        if filename in files:
            return Path(root)
    return None

# Directory containing the assembled parquet splits (located via train.parquet).
ASSEMBLED_DIR = find_file('train.parquet')
if ASSEMBLED_DIR is None:
    raise FileNotFoundError("Cannot find train.parquet under /kaggle/input/")
print(f"\nAssembled data at: {ASSEMBLED_DIR}")
assembled_listing = list(ASSEMBLED_DIR.iterdir())
print(f"  Files: {assembled_listing}")

# Directory containing the competition files (located via test.csv).
COMPETITION_DIR = find_file('test.csv')
if COMPETITION_DIR is None:
    raise FileNotFoundError("Cannot find test.csv under /kaggle/input/")
print(f"Competition data at: {COMPETITION_DIR}")
competition_listing = list(COMPETITION_DIR.iterdir())
print(f"  Files: {competition_listing}")

In [None]:
# Assembled dataset splits, read from the auto-detected ASSEMBLED_DIR.
train_df = pd.read_parquet(ASSEMBLED_DIR / 'train.parquet')
val_df = pd.read_parquet(ASSEMBLED_DIR / 'val.parquet')
comp_df = pd.read_parquet(ASSEMBLED_DIR / 'val_competition.parquet')

# Competition test set (rows to predict for the submission).
test_df = pd.read_csv(COMPETITION_DIR / 'test.csv')

print(f"Full train: {len(train_df):,}")
print(f"Val: {len(val_df):,}")
print(f"Competition val: {len(comp_df)}")
print(f"Test: {len(test_df)}")
print(f"\nDialects in train: {train_df['dialect'].value_counts().to_dict()}")
print(f"Quality in train: {train_df['quality'].value_counts().to_dict()}")

# Phase 2 filter: rows whose dialect is Old Assyrian.
p2_train = train_df[train_df['dialect'] == 'old_assyrian'].reset_index(drop=True)
print(f"\nPhase 2 (Old Assyrian) samples: {len(p2_train):,}")

# Phase 3 filter: Old Assyrian rows plus lexicon-quality rows from other
# dialects (the ~oa_mask keeps OA lexicon rows from being counted twice).
oa_mask = train_df['dialect'] == 'old_assyrian'
lex_mask = (train_df['quality'] == 'lexicon') & (~oa_mask)
p3_train = pd.concat([train_df[oa_mask], train_df[lex_mask]], ignore_index=True)
print(f"Phase 3 (OA + lexicon) samples: {len(p3_train):,}")

# Byte length of each prefixed competition-val source, as ByT5 will see it.
comp_df['src_bytes'] = comp_df['transliteration'].apply(lambda x: len((PREFIX + str(x)).encode('utf-8')))
print(f"\nCompetition val source byte lengths:")
print(f"  mean={comp_df['src_bytes'].mean():.0f}, median={comp_df['src_bytes'].median():.0f}, "
      f"max={comp_df['src_bytes'].max()}, p95={comp_df['src_bytes'].quantile(0.95):.0f}")
# The ByT5 tokenizer appends one EOS token after the bytes, so a source is only
# left untruncated when bytes + 1 <= MAX_SOURCE_LENGTH. Comparing raw bytes
# against the limit counted the exactly-at-limit inputs as covered.
fits_without_truncation = (comp_df['src_bytes'] + 1) <= MAX_SOURCE_LENGTH
print(f"  Covered by MAX_SOURCE_LENGTH={MAX_SOURCE_LENGTH}: "
      f"{fits_without_truncation.mean()*100:.1f}%")

## Helper Functions

In [None]:
def preprocess_function(examples, tokenizer, prefix, max_source_length, max_target_length):
    """Turn a batch of transliteration/translation pairs into model features.

    Sources get ``prefix`` prepended and are truncated to ``max_source_length``;
    targets are truncated to ``max_target_length`` and stored under ``labels``.
    Values are passed through ``str`` before tokenizing.
    """
    source_texts = []
    for raw_source in examples['transliteration']:
        source_texts.append(prefix + str(raw_source))
    target_texts = list(map(str, examples['translation']))

    features = tokenizer(source_texts, max_length=max_source_length, truncation=True)
    target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)
    features['labels'] = target_encoding['input_ids']
    return features


def score_predictions(predictions, references, prefix=""):
    """Score hypotheses against single references.

    Returns a dict with corpus BLEU, chrF++ (``word_order=2``) and their
    geometric mean, keyed ``bleu`` / ``chrf`` / ``geo_mean``. When ``prefix``
    is non-empty each key is prefixed with ``"<prefix>_"``.
    """
    bleu_metric = BLEU()
    chrf_metric = CHRF(word_order=2)
    reference_streams = [references]

    bleu_value = bleu_metric.corpus_score(predictions, reference_streams).score
    chrf_value = chrf_metric.corpus_score(predictions, reference_streams).score
    # Clamp at zero before the square root.
    combined = np.sqrt(max(bleu_value, 0) * max(chrf_value, 0))

    key_prefix = f"{prefix}_" if prefix else ""
    return {
        f"{key_prefix}bleu": bleu_value,
        f"{key_prefix}chrf": chrf_value,
        f"{key_prefix}geo_mean": combined,
    }


def create_compute_metrics(tokenizer):
    """Build the ``compute_metrics`` callable handed to the Trainer.

    The returned function decodes generated ids and label ids with
    ``tokenizer`` and reports corpus BLEU, chrF++ (``word_order=2``) and their
    geometric mean under the keys ``bleu``, ``chrf`` and ``geo_mean``.
    """
    bleu = BLEU()
    chrf = CHRF(word_order=2)

    def compute_metrics(predictions_and_labels):
        preds, labels = predictions_and_labels
        # Some model outputs arrive as a tuple; the generated ids come first.
        if isinstance(preds, tuple):
            preds = preds[0]
        # -100 is the ignore/padding marker and cannot be decoded, so map it to
        # the pad token for predictions as well as labels.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        b = bleu.corpus_score(decoded_preds, [decoded_labels]).score
        c = chrf.corpus_score(decoded_preds, [decoded_labels]).score
        return {'bleu': b, 'chrf': c, 'geo_mean': np.sqrt(max(b, 0) * max(c, 0))}

    return compute_metrics


class FullValCallback(TrainerCallback):
    """Score the model on the full validation set after every evaluation.

    On each ``on_evaluate`` event the callback runs ``trainer_ref.predict`` on
    ``full_val_dataset``, decodes the output, scores it against
    ``full_val_refs`` with ``score_predictions`` (keys prefixed ``full_val_``),
    prints the scores and merges them into the latest ``state.log_history``
    entry when one exists.
    """

    def __init__(self, trainer_ref, full_val_dataset, full_val_refs, tokenizer):
        self.trainer_ref = trainer_ref
        self.full_val_dataset = full_val_dataset
        self.full_val_refs = full_val_refs
        self.tokenizer = tokenizer

    def on_evaluate(self, args, state, control, **kwargs):
        output = self.trainer_ref.predict(self.full_val_dataset)
        token_ids = output.predictions
        # Predictions may arrive as a tuple; the generated ids come first.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 padding cannot be decoded; swap it for the pad token.
        token_ids = np.where(token_ids != -100, token_ids, self.tokenizer.pad_token_id)
        decoded = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        metrics = score_predictions(decoded, self.full_val_refs, prefix="full_val")
        for k, v in metrics.items():
            print(f"  {k}: {v:.4f}")
        # Nothing to attach to if no log entry has been written yet.
        if state.log_history:
            state.log_history[-1].update(metrics)


def generate_predictions(model, tokenizer, texts, prefix, max_source_length,
                          max_target_length, num_beams=5, length_penalty=1.0,
                          no_repeat_ngram_size=3, batch_size=8):
    """Translate ``texts`` with beam search, ``batch_size`` items at a time.

    Args:
        model: Seq2seq model exposing ``eval()``, ``generate()`` and ``device``.
        tokenizer: Tokenizer used to encode sources and decode outputs.
        texts: Iterable of source strings (each is passed through ``str``).
        prefix: Task prefix prepended to every source.
        max_source_length: Truncation length for encoded sources.
        max_target_length: ``max_length`` passed to ``generate``.
        num_beams, length_penalty, no_repeat_ngram_size: Forwarded to ``generate``.
        batch_size: Number of sources encoded and generated per step.

    Returns:
        List of decoded strings, one per input, in input order (empty list for
        empty input).
    """
    model.eval()
    # Send inputs to wherever the model lives rather than to a notebook-level
    # global, so the helper works for any model placement.
    target_device = model.device
    all_preds = []
    inputs = [prefix + str(t) for t in texts]
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i + batch_size]
        enc = tokenizer(
            batch,
            max_length=max_source_length,
            truncation=True,
            padding=True,
            return_tensors='pt'
        ).to(target_device)
        with torch.no_grad():
            out = model.generate(
                input_ids=enc['input_ids'],
                attention_mask=enc['attention_mask'],
                max_length=max_target_length,
                num_beams=num_beams,
                length_penalty=length_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                early_stopping=True,
            )
        decoded = tokenizer.batch_decode(out, skip_special_tokens=True)
        all_preds.extend(decoded)
    return all_preds


print("Helper functions defined.")

## Zero-Shot Eval — Warm-Start Baseline

Before any fine-tuning, evaluate `notninja/byt5-base-akkadian` on the competition val set to establish a baseline that the fine-tuned phases can be compared against.

In [None]:
# Load the untouched warm-start checkpoint.
print(f"Loading warm-start model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_zs = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model_zs.to(device)
print(f"Parameters: {model_zs.num_parameters():,}")

# Translate the competition val set with the unmodified model.
print("\nRunning zero-shot predictions on competition val...")
zs_sources = comp_df['transliteration'].tolist()
zs_preds = generate_predictions(
    model_zs,
    tokenizer,
    zs_sources,
    PREFIX,
    MAX_SOURCE_LENGTH,
    MAX_TARGET_LENGTH,
    num_beams=5,
    length_penalty=1.0,
)
comp_refs = comp_df['translation'].tolist()
zs_metrics = score_predictions(zs_preds, comp_refs, prefix="zeroshot")

print("\nZero-shot baseline (competition val):")
for metric_name in zs_metrics:
    print(f"  {metric_name}: {zs_metrics[metric_name]:.4f}")

# Show at most three examples for a quick sanity check.
print("\nSample zero-shot translations:")
for sample_idx, hypothesis in enumerate(zs_preds[:3]):
    source_text = comp_df['transliteration'].iloc[sample_idx]
    print(f"\n--- Sample {sample_idx} ---")
    print(f"Source: {source_text[:120]}")
    print(f"Pred:   {hypothesis[:200]}")
    print(f"Ref:    {comp_refs[sample_idx][:200]}")

# Keep a copy of the zero-shot scores for the final summary.
baseline_zeroshot = dict(zs_metrics)

# Drop the model and release cached GPU memory before training starts.
del model_zs
gc.collect()
torch.cuda.empty_cache()
print("\nModel freed from GPU.")

## Phase 2: Old Assyrian Specialization

Fine-tune the warm-start model on ~15K Old Assyrian dialect pairs. This is the competition domain.
Estimated time: 2-3 hours on T4.

In [None]:
# Phase 2: fine-tune the warm-start checkpoint on the Old Assyrian subset
# (p2_train). The competition val set is the Trainer's eval set and drives
# best-checkpoint selection; the full val set is scored by FullValCallback.
print(f"Phase 2 training samples: {len(p2_train):,}")
print(f"Competition val samples: {len(comp_df)}")
print(f"Full val samples: {len(val_df):,}")

# Intermediate checkpoints (deleted at the end) vs. the exported best model.
p2_checkpoint_dir = OUTPUT_DIR / 'phase2_checkpoints'
p2_best_dir = OUTPUT_DIR / 'phase2_best'

# Load model fresh from warm-start checkpoint
print(f"\nLoading model from: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_p2 = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model_p2.to(device)
print(f"Parameters: {model_p2.num_parameters():,}")

# Build HuggingFace datasets; only the two text columns are kept.
# comp_data, full_val_data, full_val_refs and remove_cols are reused by the
# Phase 3 cell.
train_data_p2 = p2_train[['transliteration', 'translation']].reset_index(drop=True)
comp_data = comp_df[['transliteration', 'translation']].reset_index(drop=True)
full_val_data = val_df[['transliteration', 'translation']].reset_index(drop=True)
full_val_refs = full_val_data['translation'].tolist()

# Bind the tokenizer, prefix and length limits so .map() only passes the batch.
preprocess_fn = lambda x: preprocess_function(
    x, tokenizer, PREFIX, MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH
)
remove_cols = ['transliteration', 'translation']

print("Tokenizing datasets...")
train_dataset_p2 = HFDataset.from_pandas(train_data_p2).map(
    preprocess_fn, batched=True, remove_columns=remove_cols
)
eval_dataset_p2 = HFDataset.from_pandas(comp_data).map(
    preprocess_fn, batched=True, remove_columns=remove_cols
)
full_val_dataset_p2 = HFDataset.from_pandas(full_val_data).map(
    preprocess_fn, batched=True, remove_columns=remove_cols
)
print("Tokenization complete.")

# Training arguments.
# - Save and eval both happen once per epoch, which load_best_model_at_end needs
#   in order to compare checkpoints.
# - metric_for_best_model='geo_mean' refers to the key returned by compute_metrics.
# - NOTE(review): no generation_num_beams is passed, so in-training eval uses the
#   Trainer's default generation settings, while the zero-shot baseline and the
#   final decoding call generate_predictions with beam search — confirm the
#   scores are meant to be compared directly.
# - NOTE(review): T5-family checkpoints are reported to be prone to overflow
#   under fp16 — confirm the training loss stays finite.
training_args_p2 = Seq2SeqTrainingArguments(
    output_dir=str(p2_checkpoint_dir),
    save_strategy='epoch',
    eval_strategy='epoch',
    learning_rate=P2_LR,
    per_device_train_batch_size=P2_BATCH_SIZE,
    per_device_eval_batch_size=P2_BATCH_SIZE,
    gradient_accumulation_steps=P2_GRAD_ACCUM,
    num_train_epochs=P2_EPOCHS,
    warmup_ratio=P2_WARMUP,
    weight_decay=0.01,
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LENGTH,
    fp16=torch.cuda.is_available(),   # T4 tensor cores: use fp16, not bf16
    load_best_model_at_end=True,
    metric_for_best_model='geo_mean',
    greater_is_better=True,
    save_total_limit=2,
    report_to='none',
)

# Pads each batch dynamically; label padding uses -100.
data_collator_p2 = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model_p2, padding=True, label_pad_token_id=-100
)

trainer_p2 = Seq2SeqTrainer(
    model=model_p2,
    args=training_args_p2,
    train_dataset=train_dataset_p2,
    eval_dataset=eval_dataset_p2,
    data_collator=data_collator_p2,
    compute_metrics=create_compute_metrics(tokenizer),
    processing_class=tokenizer,
)

# The callback needs the trainer itself, so it is attached after construction.
trainer_p2.add_callback(
    FullValCallback(trainer_p2, full_val_dataset_p2, full_val_refs, tokenizer)
)

print("\nStarting Phase 2 training...")
trainer_p2.train()

# This evaluate() call also fires FullValCallback.on_evaluate once more.
print("\nFinal Phase 2 eval on competition val:")
p2_results = trainer_p2.evaluate()
for k, v in p2_results.items():
    if isinstance(v, float):
        print(f"  {k}: {v:.4f}")

# Save best checkpoint (model_p2 is expected to hold the best weights here
# because load_best_model_at_end=True — verify against the Trainer logs).
model_p2.save_pretrained(p2_best_dir)
tokenizer.save_pretrained(p2_best_dir)
print(f"\nPhase 2 best model saved to: {p2_best_dir}")

# Clean up intermediate checkpoints
if p2_checkpoint_dir.exists():
    shutil.rmtree(p2_checkpoint_dir)
    print(f"Cleaned up checkpoints: {p2_checkpoint_dir}")

# Store metrics for summary; a key missing from the results becomes NaN.
p2_competition_metrics = {
    'bleu': p2_results.get('eval_bleu', float('nan')),
    'chrf': p2_results.get('eval_chrf', float('nan')),
    'geo_mean': p2_results.get('eval_geo_mean', float('nan')),
}

# Free GPU memory before optional Phase 3
del model_p2
gc.collect()
torch.cuda.empty_cache()
print("Phase 2 complete. GPU memory freed.")

## Phase 3 (Optional): Old Assyrian + Lexicon

Continue from Phase 2 best, training on Old Assyrian + remaining lexicon entries for 2 epochs at a lower LR.
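This may improve rare-word coverage. **To skip Phase 3 (e.g. if Phase 2 results are already good or disk space is tight), set `RUN_PHASE3 = False` in the cell below instead of skipping the cell — the cell also defines `FINAL_MODEL_DIR`, which the later cells need.**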

In [None]:
# Phase 3 (optional): continue from the Phase 2 export on Old Assyrian +
# lexicon rows (p3_train), then pick whichever phase scored the higher
# competition-val geo_mean as the final model.
# Relies on names created by the Phase 2 cell: p2_best_dir, comp_data,
# full_val_data, full_val_refs, remove_cols and p2_competition_metrics.
RUN_PHASE3 = True   # Set to False to skip Phase 3

# Defaults used when Phase 3 is skipped; later cells read these names.
p3_results = None
p3_competition_metrics = None
p3_best_dir = OUTPUT_DIR / 'phase3_best'
FINAL_MODEL_DIR = p2_best_dir  # Default: use Phase 2 unless Phase 3 improves

if RUN_PHASE3:
    print(f"Phase 3 training samples: {len(p3_train):,}")
    p3_checkpoint_dir = OUTPUT_DIR / 'phase3_checkpoints'

    # Resume from Phase 2 best
    print(f"\nLoading Phase 2 best model from: {p2_best_dir}")
    tokenizer = AutoTokenizer.from_pretrained(p2_best_dir)
    model_p3 = AutoModelForSeq2SeqLM.from_pretrained(p2_best_dir)
    model_p3.to(device)

    train_data_p3 = p3_train[['transliteration', 'translation']].reset_index(drop=True)
    # Same preprocessing as Phase 2, bound to the tokenizer reloaded above.
    preprocess_fn3 = lambda x: preprocess_function(
        x, tokenizer, PREFIX, MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH
    )

    print("Tokenizing Phase 3 datasets...")
    train_dataset_p3 = HFDataset.from_pandas(train_data_p3).map(
        preprocess_fn3, batched=True, remove_columns=remove_cols
    )
    eval_dataset_p3 = HFDataset.from_pandas(comp_data).map(
        preprocess_fn3, batched=True, remove_columns=remove_cols
    )
    full_val_dataset_p3 = HFDataset.from_pandas(full_val_data).map(
        preprocess_fn3, batched=True, remove_columns=remove_cols
    )

    # Same settings as Phase 2 apart from the P3_* hyperparameters and the
    # checkpoint directory.
    training_args_p3 = Seq2SeqTrainingArguments(
        output_dir=str(p3_checkpoint_dir),
        save_strategy='epoch',
        eval_strategy='epoch',
        learning_rate=P3_LR,
        per_device_train_batch_size=P3_BATCH_SIZE,
        per_device_eval_batch_size=P3_BATCH_SIZE,
        gradient_accumulation_steps=P3_GRAD_ACCUM,
        num_train_epochs=P3_EPOCHS,
        warmup_ratio=P3_WARMUP,
        weight_decay=0.01,
        logging_steps=50,
        predict_with_generate=True,
        generation_max_length=MAX_TARGET_LENGTH,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model='geo_mean',
        greater_is_better=True,
        save_total_limit=2,
        report_to='none',
    )

    data_collator_p3 = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model_p3, padding=True, label_pad_token_id=-100
    )

    trainer_p3 = Seq2SeqTrainer(
        model=model_p3,
        args=training_args_p3,
        train_dataset=train_dataset_p3,
        eval_dataset=eval_dataset_p3,
        data_collator=data_collator_p3,
        compute_metrics=create_compute_metrics(tokenizer),
        processing_class=tokenizer,
    )
    # Attached after construction because the callback needs the trainer.
    trainer_p3.add_callback(
        FullValCallback(trainer_p3, full_val_dataset_p3, full_val_refs, tokenizer)
    )

    print("\nStarting Phase 3 training...")
    trainer_p3.train()

    print("\nFinal Phase 3 eval on competition val:")
    p3_results = trainer_p3.evaluate()
    for k, v in p3_results.items():
        if isinstance(v, float):
            print(f"  {k}: {v:.4f}")

    # A key missing from the results becomes NaN.
    p3_competition_metrics = {
        'bleu': p3_results.get('eval_bleu', float('nan')),
        'chrf': p3_results.get('eval_chrf', float('nan')),
        'geo_mean': p3_results.get('eval_geo_mean', float('nan')),
    }

    # Save Phase 3 best (written even when Phase 2 ends up being selected below).
    model_p3.save_pretrained(p3_best_dir)
    tokenizer.save_pretrained(p3_best_dir)
    print(f"Phase 3 best model saved to: {p3_best_dir}")

    if p3_checkpoint_dir.exists():
        shutil.rmtree(p3_checkpoint_dir)
        print(f"Cleaned up Phase 3 checkpoints.")

    # Choose best final model based on competition geo_mean; ties go to Phase 3.
    # NOTE(review): if either value is NaN the >= comparison is False, so
    # Phase 2 is selected and the "Phase 2 better" message is printed.
    p2_geo = p2_competition_metrics['geo_mean']
    p3_geo = p3_competition_metrics['geo_mean']
    if p3_geo >= p2_geo:
        FINAL_MODEL_DIR = p3_best_dir
        print(f"\nPhase 3 better ({p3_geo:.4f} >= {p2_geo:.4f}). Using Phase 3 for submission.")
    else:
        FINAL_MODEL_DIR = p2_best_dir
        print(f"\nPhase 2 better ({p2_geo:.4f} > {p3_geo:.4f}). Using Phase 2 for submission.")

    # Release the model before the decoding search reloads the chosen checkpoint.
    del model_p3
    gc.collect()
    torch.cuda.empty_cache()
    print("Phase 3 complete. GPU memory freed.")
else:
    print("Phase 3 skipped. Using Phase 2 model for submission.")

print(f"\nFinal model directory: {FINAL_MODEL_DIR}")

## Decoding Hyperparameter Search

Try combinations of `length_penalty` and `num_beams` on competition val to find the best decoding config.

In [None]:
# Reload whichever checkpoint was selected as the final model.
print(f"Loading final model from: {FINAL_MODEL_DIR}")
tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL_DIR)
model_final = AutoModelForSeq2SeqLM.from_pretrained(FINAL_MODEL_DIR)
model_final.to(device)
print(f"Model loaded ({model_final.num_parameters():,} parameters)")

comp_trans = comp_df['transliteration'].tolist()
comp_refs_list = comp_df['translation'].tolist()

# Search space: every (length_penalty, num_beams) pair, penalties outermost.
length_penalties = [0.6, 0.8, 1.0, 1.2]
beam_sizes = [5, 8]
search_grid = [(lp, nb) for lp in length_penalties for nb in beam_sizes]

decode_results = []

print("\nDecoding hyperparameter search on competition val:")
print(f"{'length_penalty':>15} {'num_beams':>10} {'BLEU':>8} {'chrF++':>8} {'geo_mean':>10}")
print("-" * 55)

# Running best; the first configuration reaching a given score is kept.
best_geo = -1.0
best_config = {'length_penalty': 1.0, 'num_beams': 5}

for lp, nb in search_grid:
    hypotheses = generate_predictions(
        model_final,
        tokenizer,
        comp_trans,
        PREFIX,
        MAX_SOURCE_LENGTH,
        MAX_TARGET_LENGTH,
        num_beams=nb,
        length_penalty=lp,
        no_repeat_ngram_size=3,
        batch_size=8,
    )
    scores = score_predictions(hypotheses, comp_refs_list)

    row = {'length_penalty': lp, 'num_beams': nb}
    row.update(scores)
    decode_results.append(row)

    print(f"{lp:>15.1f} {nb:>10} {scores['bleu']:>8.2f} {scores['chrf']:>8.2f} {scores['geo_mean']:>10.4f}")
    if not scores['geo_mean'] > best_geo:
        continue
    best_geo = scores['geo_mean']
    best_config = {'length_penalty': lp, 'num_beams': nb}

print("\n" + "=" * 55)
print(f"Best config: length_penalty={best_config['length_penalty']}, num_beams={best_config['num_beams']}")
print(f"Best competition geo_mean: {best_geo:.4f}")

# Full table of results, best first.
decode_df = pd.DataFrame(decode_results)
print("\nFull decode search results:")
print(decode_df.sort_values('geo_mean', ascending=False).to_string(index=False))

## Generate Test Predictions

Use the best decoding config found above to generate predictions on the 4 competition test samples.

In [None]:
# Decode the test set with the configuration chosen by the search above.
chosen_beams = best_config['num_beams']
chosen_penalty = best_config['length_penalty']
print(f"Generating test predictions with best config:")
print(f"  num_beams={chosen_beams}, length_penalty={chosen_penalty}")
print(f"  MAX_SOURCE_LENGTH={MAX_SOURCE_LENGTH}, MAX_TARGET_LENGTH={MAX_TARGET_LENGTH}")

test_sources = test_df['transliteration'].tolist()
test_preds = generate_predictions(
    model_final,
    tokenizer,
    test_sources,
    PREFIX,
    MAX_SOURCE_LENGTH,
    MAX_TARGET_LENGTH,
    num_beams=chosen_beams,
    length_penalty=chosen_penalty,
    no_repeat_ngram_size=3,
    batch_size=4,
)

# Print each source next to its translation, truncated for readability.
print("\nTest predictions:")
for position, translated in enumerate(test_preds):
    row_id = test_df['id'].iloc[position]
    source_text = str(test_df['transliteration'].iloc[position])
    print(f"\n=== Sample {position} (id={row_id}) ===")
    print(f"Source:      {source_text[:150]}")
    print(f"Translation: {translated[:300]}")

## Create Submission

In [None]:
# Two-column submission: the test ids next to the generated translations.
submission_path = OUTPUT_DIR / 'submission.csv'
submission_columns = {'id': test_df['id'], 'translation': test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv(submission_path, index=False)
print(f"Submission saved to: {submission_path}")
print(f"Shape: {submission.shape}")
# A trailing bare expression makes the notebook render the frame.
submission

## Summary

In [None]:
# Final report. Results from optional or skipped cells are looked up through
# the notebook namespace, so a section whose cell was not run is omitted (or
# shown as skipped / n/a) instead of raising NameError.
_ns = globals()

print("=" * 60)
print("TRAINING SUMMARY — Deep Past Challenge v2")
print("=" * 60)
print(f"Model: {MODEL_NAME}")
print(f"MAX_SOURCE_LENGTH: {MAX_SOURCE_LENGTH}  MAX_TARGET_LENGTH: {MAX_TARGET_LENGTH}")
print()

# Zero-shot baseline
_zs = _ns.get('baseline_zeroshot')
if _zs is not None:
    print("Zero-shot (competition val — before fine-tuning):")
    print(f"  BLEU={_zs['zeroshot_bleu']:.2f}  chrF++={_zs['zeroshot_chrf']:.2f}  geo_mean={_zs['zeroshot_geo_mean']:.4f}")
    print()

# Phase 2 results
_p2 = _ns.get('p2_competition_metrics')
if _p2:
    print(f"Phase 2 — Old Assyrian ({P2_EPOCHS} epochs, lr={P2_LR}):")
    print(f"  BLEU={_p2['bleu']:.2f}  chrF++={_p2['chrf']:.2f}  geo_mean={_p2['geo_mean']:.4f}")
    print()

# Phase 3 results (if run)
_p3 = _ns.get('p3_competition_metrics')
if _p3 is not None:
    print(f"Phase 3 — OA + Lexicon ({P3_EPOCHS} epochs, lr={P3_LR}):")
    print(f"  BLEU={_p3['bleu']:.2f}  chrF++={_p3['chrf']:.2f}  geo_mean={_p3['geo_mean']:.4f}")
    print()
else:
    print("Phase 3: skipped")
    print()

# Best decoding config
if 'best_config' in _ns and 'best_geo' in _ns:
    print(f"Best decoding config: num_beams={best_config['num_beams']}, length_penalty={best_config['length_penalty']}")
    print(f"Best competition geo_mean (post-decode search): {best_geo:.4f}")
    print()

# Final model used
print(f"Final model: {_ns.get('FINAL_MODEL_DIR', 'n/a')}")
print(f"Submission: {_ns.get('submission_path', 'n/a')}")
print("=" * 60)