# Deep Past Challenge - Curriculum ByT5 Training

**Model**: google/byt5-small (300M params, byte-level tokenization)
**Data**: 161K assembled Akkadian-English pairs
**Strategy**: 2-phase curriculum learning
- Phase 1: All gold-quality pairs (~126K) — general Akkadian
- Phase 2: Old Assyrian dialect only (~15K) — competition domain

**Metric**: √(BLEU × chrF++)

In [None]:
!pip install -q sacrebleu

In [None]:
import pandas as pd
import numpy as np
import torch
import gc
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from datasets import Dataset as HFDataset
from sacrebleu.metrics import BLEU, CHRF
import warnings
warnings.filterwarnings('ignore')

# Pick the accelerator once; later cells move models and tensors to `device`.
_cuda_ok = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ok else torch.device('cpu')
print(f"Device: {device}")
if _cuda_ok:
    # Report the name and total memory of GPU 0.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {_gpu_props.total_memory / 1e9:.1f} GB")

## Configuration

In [None]:
# Paths
# ASSEMBLED_DIR holds the train/val/val_competition parquet splits,
# COMPETITION_DIR holds the competition test.csv, and OUTPUT_DIR receives
# checkpoints and the saved best model of each phase.
ASSEMBLED_DIR = Path('/kaggle/input/akkadian-assembled-161k')
COMPETITION_DIR = Path('/kaggle/input/deep-past-initiative-machine-translation')
OUTPUT_DIR = Path('/kaggle/working')

# Model
# PREFIX is prepended to every source text, both for training and inference.
MODEL_NAME = 'google/byt5-small'
PREFIX = 'translate Akkadian to English: '

# Sequence lengths (ByT5 byte-level)
# Used as tokenizer truncation limits for sources and targets respectively;
# MAX_TARGET_LENGTH is also the generation cap for the test predictions.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 256

# Phase 1: General Akkadian (all gold data)
# Per-device batch 16 with 2 accumulation steps -> 32 examples per optimizer
# step per device. P1_WARMUP is passed as the warmup *ratio*.
P1_EPOCHS = 5
P1_BATCH_SIZE = 16
P1_GRAD_ACCUM = 2
P1_LR = 5e-5
P1_WARMUP = 0.1

# Phase 2: Old Assyrian specialization
# Per-device batch 8 with 4 accumulation steps -> still 32 examples per
# optimizer step per device, at a 5x lower learning rate than Phase 1.
P2_EPOCHS = 10
P2_BATCH_SIZE = 8
P2_GRAD_ACCUM = 4
P2_LR = 1e-5
P2_WARMUP = 0.05

## Load Data

In [None]:
# Assembled dataset: the three parquet splits are read in a fixed order
# (train, general validation, competition-style validation).
train_df, val_df, comp_df = (
    pd.read_parquet(ASSEMBLED_DIR / f'{split_name}.parquet')
    for split_name in ('train', 'val', 'val_competition')
)

# Competition test set
test_df = pd.read_csv(COMPETITION_DIR / 'test.csv')

# Row counts per split, then the dialect / quality breakdown of the train set.
print(f"Full train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Competition val: {len(comp_df)}")
print(f"Test: {len(test_df)}")
dialect_counts = train_df['dialect'].value_counts().to_dict()
quality_counts = train_df['quality'].value_counts().to_dict()
print(f"\nDialects: {dialect_counts}")
print(f"Quality: {quality_counts}")

## Helper Functions

In [None]:
def preprocess_function(examples, tokenizer, prefix, max_source_length, max_target_length):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: Mapping with 'transliteration' (sources) and 'translation'
            (targets) columns; every value is passed through ``str``.
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'.
        prefix: Task prefix prepended to each source text.
        max_source_length: Truncation limit for the prefixed sources.
        max_target_length: Truncation limit for the targets.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under 'labels'.
    """
    source_texts = [prefix + s for s in map(str, examples['transliteration'])]
    target_texts = list(map(str, examples['translation']))

    # Sources and targets are truncated independently, each to its own limit.
    encoded = tokenizer(source_texts, max_length=max_source_length, truncation=True)
    encoded_targets = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    encoded['labels'] = encoded_targets['input_ids']
    return encoded


def score_predictions(predictions, references, prefix=""):
    """Compute corpus BLEU, chrF++ and their geometric mean.

    Args:
        predictions: Iterable of hypothesis texts, one per example.
        references: Iterable of reference texts, aligned with ``predictions``.
        prefix: Optional key prefix; when non-empty, keys become
            ``"<prefix>_bleu"``, ``"<prefix>_chrf"``, ``"<prefix>_geo_mean"``.

    Returns:
        Dict with the three scores under the (optionally prefixed) keys.

    Raises:
        ValueError: If the number of predictions and references differ.
    """
    # Coerce to str the same way preprocess_function does, so a non-string
    # cell (e.g. a missing translation read from a dataframe) is scored
    # instead of crashing the metric.
    hypotheses = [str(p) for p in predictions]
    refs = [str(r) for r in references]
    if len(hypotheses) != len(refs):
        raise ValueError(
            f"Got {len(hypotheses)} predictions but {len(refs)} references"
        )

    bleu = BLEU()
    chrf = CHRF(word_order=2)  # word_order=2 selects chrF++
    bleu_score = bleu.corpus_score(hypotheses, [refs]).score
    chrf_score = chrf.corpus_score(hypotheses, [refs]).score
    # Clamp at zero so the square root is always defined.
    geo_mean = np.sqrt(max(bleu_score, 0) * max(chrf_score, 0))
    p = f"{prefix}_" if prefix else ""
    return {f"{p}bleu": bleu_score, f"{p}chrf": chrf_score, f"{p}geo_mean": geo_mean}


def create_compute_metrics(tokenizer):
    """Build the ``compute_metrics`` callback handed to the Trainer.

    Args:
        tokenizer: Tokenizer used to decode generated ids and label ids.

    Returns:
        A function mapping ``(predictions, labels)`` to a dict with the keys
        'bleu', 'chrf' and 'geo_mean'.
    """
    pad_id = tokenizer.pad_token_id

    def compute_metrics(predictions_and_labels):
        preds, labels = predictions_and_labels
        # Some model outputs arrive as a tuple whose first item is the ids.
        if isinstance(preds, tuple):
            preds = preds[0]
        # -100 is the ignore/padding sentinel and is not a decodable id, so
        # swap it for the pad token in BOTH arrays before decoding.
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Share the scoring code with the full-validation callback so both
        # report the same metric definition.
        return score_predictions(decoded_preds, decoded_labels)

    return compute_metrics


class FullValCallback(TrainerCallback):
    """Score predictions on the full validation set after each eval.

    The Trainer's own evaluation runs on the competition validation split;
    this callback additionally predicts on the full validation set, prints
    the ``full_val_*`` scores and attaches them to the latest log entry.
    """

    def __init__(self, trainer, full_val_dataset, full_val_refs, tokenizer):
        """Store the trainer, tokenized full-val set, its references and tokenizer."""
        self.trainer = trainer
        self.full_val_dataset = full_val_dataset
        self.full_val_refs = full_val_refs
        self.tokenizer = tokenizer

    def on_evaluate(self, args, state, control, **kwargs):
        """Predict on the full validation set and record the resulting scores."""
        preds = self.trainer.predict(self.full_val_dataset)
        token_ids = preds.predictions
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 padding cannot be decoded; map it to the pad token first.
        token_ids = np.where(token_ids != -100, token_ids, self.tokenizer.pad_token_id)
        decoded = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        metrics = score_predictions(decoded, self.full_val_refs, prefix="full_val")
        for k, v in metrics.items():
            print(f"  {k}: {v:.4f}")
        # Attach to the most recent log entry when one exists; otherwise start
        # the history with these metrics rather than raising IndexError.
        if state.log_history:
            state.log_history[-1].update(metrics)
        else:
            state.log_history.append(dict(metrics))

## Training Function

In [None]:
def run_phase(phase, train_data_df, comp_data_df, val_data_df,
              model_path, epochs, batch_size, grad_accum, lr, warmup,
              tokenizer_obj=None):
    """Run a single training phase.

    Args:
        phase: Phase number (1 = general Akkadian, otherwise Old Assyrian);
            used for the banner and for naming the output directories.
        train_data_df: Training dataframe with 'transliteration' and
            'translation' columns.
        comp_data_df: Competition validation dataframe (same columns); this is
            the Trainer's eval set and drives best-checkpoint selection.
        val_data_df: Full validation dataframe (same columns); scored by
            FullValCallback after every evaluation.
        model_path: Hub name or local directory to load the model (and, if
            ``tokenizer_obj`` is None, the tokenizer) from.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for both training and evaluation.
        grad_accum: Gradient accumulation steps.
        lr: Learning rate.
        warmup: Warmup ratio.
        tokenizer_obj: Optional pre-loaded tokenizer to reuse.

    Returns:
        Tuple ``(model, tokenizer, best_dir, results)`` where ``results`` is
        the dict returned by the final ``trainer.evaluate()`` call.
    """
    import shutil

    print(f"\n{'='*60}")
    print(f"PHASE {phase}: {'General Akkadian' if phase == 1 else 'Old Assyrian Specialization'}")
    print(f"{'='*60}")

    checkpoint_dir = OUTPUT_DIR / f'phase{phase}_checkpoints'
    best_dir = OUTPUT_DIR / f'phase{phase}_best'

    # Load model
    print(f"Loading model from: {model_path}")
    # Explicit None check: do not rely on the truthiness of a tokenizer object.
    if tokenizer_obj is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    else:
        tokenizer = tokenizer_obj
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    model.to(device)
    print(f"Parameters: {model.num_parameters():,}")

    # Prepare datasets
    train_data = train_data_df[['transliteration', 'translation']].reset_index(drop=True)
    comp_data = comp_data_df[['transliteration', 'translation']].reset_index(drop=True)
    full_val_data = val_data_df[['transliteration', 'translation']].reset_index(drop=True)
    full_val_refs = full_val_data['translation'].tolist()

    print(f"Train: {len(train_data)}, Eval (competition): {len(comp_data)}, Full val: {len(full_val_data)}")

    train_dataset = HFDataset.from_pandas(train_data)
    eval_dataset = HFDataset.from_pandas(comp_data)
    full_val_dataset = HFDataset.from_pandas(full_val_data)

    # Tokenize
    def preprocess_fn(batch):
        return preprocess_function(
            batch, tokenizer, PREFIX, MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH
        )

    remove_cols = ['transliteration', 'translation']
    train_dataset = train_dataset.map(preprocess_fn, batched=True, remove_columns=remove_cols)
    eval_dataset = eval_dataset.map(preprocess_fn, batched=True, remove_columns=remove_cols)
    full_val_dataset = full_val_dataset.map(preprocess_fn, batched=True, remove_columns=remove_cols)

    # Training args
    # Decode with the same budget the labels are truncated to and that the
    # test-time generation uses, so checkpoint selection reflects inference
    # conditions instead of allowing outputs up to the (longer) source limit.
    gen_max_length = MAX_TARGET_LENGTH
    training_args = Seq2SeqTrainingArguments(
        output_dir=str(checkpoint_dir),
        save_strategy='epoch',
        eval_strategy='epoch',
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum,
        num_train_epochs=epochs,
        warmup_ratio=warmup,
        weight_decay=0.01,
        logging_steps=50,
        predict_with_generate=True,
        generation_max_length=gen_max_length,
        # NOTE(review): T5-family checkpoints are often reported to be
        # numerically unstable under fp16; consider bf16 where the GPU
        # supports it -- confirm on the target hardware.
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model='geo_mean',
        greater_is_better=True,
        save_total_limit=2,
        report_to='none',
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer, model=model, padding=True, label_pad_token_id=-100
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=create_compute_metrics(tokenizer),
        processing_class=tokenizer,
    )

    trainer.add_callback(FullValCallback(trainer, full_val_dataset, full_val_refs, tokenizer))

    # Train
    print("\nStarting training...")
    trainer.train()

    # Final eval
    print("\nFinal validation...")
    results = trainer.evaluate()
    print("Competition val results:")
    for k, v in results.items():
        print(f"  {k}: {v:.4f}")

    # Save best (relies on load_best_model_at_end having restored the
    # best-scoring checkpoint into `model` once training finished).
    model.save_pretrained(best_dir)
    tokenizer.save_pretrained(best_dir)
    print(f"Best model saved to: {best_dir}")

    # Cleanup checkpoints to save disk
    if checkpoint_dir.exists():
        shutil.rmtree(checkpoint_dir)
        print(f"Cleaned up checkpoints: {checkpoint_dir}")

    return model, tokenizer, best_dir, results

## Phase 1: General Akkadian (all gold data)

In [None]:
# Phase 1 trains on every gold-quality row, whatever its dialect.
gold_mask = train_df['quality'] == 'gold'
p1_train = train_df[gold_mask].reset_index(drop=True)
print(f"Phase 1 training samples: {len(p1_train)}")

# Hyperparameters for this phase, taken from the configuration cell.
p1_hparams = dict(
    epochs=P1_EPOCHS,
    batch_size=P1_BATCH_SIZE,
    grad_accum=P1_GRAD_ACCUM,
    lr=P1_LR,
    warmup=P1_WARMUP,
)

# Start from the pretrained checkpoint named by MODEL_NAME.
model_p1, tokenizer_p1, p1_best_dir, p1_results = run_phase(
    phase=1,
    train_data_df=p1_train,
    comp_data_df=comp_df,
    val_data_df=val_df,
    model_path=MODEL_NAME,
    **p1_hparams,
)

# Free memory: Phase 2 reloads the weights from p1_best_dir, so the in-memory
# Phase 1 model is no longer needed.
del model_p1
gc.collect()
torch.cuda.empty_cache()

## Phase 2: Old Assyrian Specialization

In [None]:
# Phase 2 narrows the training data to rows tagged with the Old Assyrian dialect.
old_assyrian_mask = train_df['dialect'] == 'old_assyrian'
p2_train = train_df[old_assyrian_mask].reset_index(drop=True)
print(f"Phase 2 training samples: {len(p2_train)}")

# Hyperparameters for this phase, taken from the configuration cell.
p2_hparams = dict(
    epochs=P2_EPOCHS,
    batch_size=P2_BATCH_SIZE,
    grad_accum=P2_GRAD_ACCUM,
    lr=P2_LR,
    warmup=P2_WARMUP,
)

model_p2, tokenizer_p2, p2_best_dir, p2_results = run_phase(
    phase=2,
    train_data_df=p2_train,
    comp_data_df=comp_df,
    val_data_df=val_df,
    model_path=str(p1_best_dir),  # Resume from Phase 1
    **p2_hparams,
)

# The Phase 1 export has been loaded into Phase 2 already; remove it to save disk.
import shutil
if p1_best_dir.exists():
    shutil.rmtree(p1_best_dir)
    print(f"Cleaned up Phase 1 model: {p1_best_dir}")

## Generate Predictions & Submit

In [None]:
# Generate test predictions with the Phase 2 model
print("Generating test predictions...")

# Rows per generate() call. Beam search over the whole test set in a single
# batch can exhaust GPU memory, so the inputs are processed in chunks.
TEST_BATCH_SIZE = 16

test_inputs = [PREFIX + str(t) for t in test_df['transliteration']]

model_p2.eval()
predictions = []
for batch_start in range(0, len(test_inputs), TEST_BATCH_SIZE):
    batch_texts = test_inputs[batch_start:batch_start + TEST_BATCH_SIZE]
    batch_enc = tokenizer_p2(
        batch_texts,
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        padding=True,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        # No no_repeat_ngram_size here: the model is byte-level, so a 3-gram
        # ban would forbid any 3-byte sequence (e.g. "the") from appearing
        # twice in one translation and corrupt otherwise valid output.
        batch_outputs = model_p2.generate(
            input_ids=batch_enc['input_ids'],
            attention_mask=batch_enc['attention_mask'],
            max_length=MAX_TARGET_LENGTH,
            num_beams=5,
            early_stopping=True,
        )

    predictions.extend(
        tokenizer_p2.batch_decode(batch_outputs, skip_special_tokens=True)
    )

print("\nPredictions:")
for i, (src, pred) in enumerate(zip(test_df['transliteration'], predictions)):
    print(f"\n=== Sample {i} ===")
    # str() mirrors how the inputs were built, so a non-string cell cannot
    # break the preview slice.
    print(f"Source: {str(src)[:120]}...")
    print(f"Translation: {pred[:300]}")

In [None]:
# Create submission: one row per test id, paired with its predicted translation.
submission = test_df[['id']].assign(translation=predictions)
submission.to_csv('submission.csv', index=False)
print("Submission saved!")
# Trailing bare expression so the notebook renders the table.
submission

In [None]:
# Summary
print("\n" + "="*60)
print("TRAINING SUMMARY")
print("="*60)
for phase_no, phase_results in ((1, p1_results), (2, p2_results)):
    geo_mean = phase_results.get('eval_geo_mean')
    # Format only real numbers: applying ':.4f' to the 'N/A' fallback string
    # would raise ValueError instead of printing the placeholder.
    shown = f"{geo_mean:.4f}" if geo_mean is not None else "N/A"
    print(f"Phase {phase_no} — competition geo_mean: {shown}")