# Deep Past Challenge - Baseline Model

**Task**: Translate Akkadian transliterations to English

**Approach**: Fine-tune ByT5-small (byte-level T5)

**Evaluation**: Geometric mean of BLEU and chrF++

## 1. Setup

In [None]:
# Install dependencies
!pip install -q transformers datasets sacrebleu accelerate sentencepiece

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset as HFDataset
import sacrebleu
from sacrebleu.metrics import BLEU, CHRF
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2. Load Data

In [None]:
# Directory holding the raw CSV files
DATA_DIR = Path('../data/raw')

# Read the three CSV files in a fixed order: train, test, sample submission
train_df, test_df, sample_sub = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Report row counts and column names of the two data splits
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nTrain columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

In [None]:
# Preview training data: the first three rows of train_df are the cell's
# result, so the notebook renders them as a table
train_df.head(3)

In [None]:
# Preview test data: the leading rows of test_df (head() with its default
# row count) are the cell's result
test_df.head()

In [None]:
# Length statistics, measured in characters via Series.str.len(),
# stored as two new columns on train_df
source_lengths = train_df['transliteration'].str.len()
target_lengths = train_df['translation'].str.len()
train_df['src_len'] = source_lengths
train_df['tgt_len'] = target_lengths

# Summaries (count, mean, quartiles, ...) for each side of the pairs
print("Source (Akkadian) length stats:")
print(train_df['src_len'].describe())
print("\nTarget (English) length stats:")
print(train_df['tgt_len'].describe())

## 3. Prepare Dataset

In [None]:
# Model selection - ByT5 works at byte level (no learned subword vocabulary),
# which suits the special characters found in transliterations
MODEL_NAME = "google/byt5-small"

# Fetch the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Same two report lines as before, emitted with a single print call
print(f"Model: {MODEL_NAME}\nVocab size: {tokenizer.vocab_size}")

In [None]:
# Prepare data for training
# Add task prefix for T5-style models
PREFIX = "translate Akkadian to English: "

def preprocess_function(examples):
    """Tokenize a batch of transliteration/translation pairs for training.

    Args:
        examples: Batch mapping with 'transliteration' and 'translation'
            entries (sequences of strings), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed source texts, with the token
        ids of the targets stored under the 'labels' key. Sequences are
        truncated to 512 tokens but left unpadded.
    """
    inputs = [PREFIX + text for text in examples['transliteration']]
    targets = examples['translation']

    # Deliberately no padding here. Padding the targets to max_length fills
    # the labels with pad_token_id, which the loss does NOT ignore (only -100
    # is ignored), so the model would be trained to predict padding.
    # DataCollatorForSeq2Seq pads every batch dynamically and pads labels
    # with -100, which is also much cheaper than always padding to 512.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=512, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Create train/val split
from sklearn.model_selection import train_test_split

# The two text columns that form a sentence pair
PAIR_COLUMNS = ['transliteration', 'translation']

# Hold out 10% of the pairs for validation; the fixed seed keeps the split stable
train_data, val_data = train_test_split(train_df[PAIR_COLUMNS], test_size=0.1, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")


def _to_tokenized_dataset(frame):
    """Wrap a DataFrame in a HuggingFace dataset and swap its text columns for token ids."""
    dataset = HFDataset.from_pandas(frame.reset_index(drop=True))
    return dataset.map(preprocess_function, batched=True, remove_columns=PAIR_COLUMNS)


# Convert and tokenize both splits
train_dataset = _to_tokenized_dataset(train_data)
val_dataset = _to_tokenized_dataset(val_data)

## 4. Training

In [None]:
# Load the pretrained checkpoint and place it on the selected device
# (.to() returns the module itself, so the call can be chained)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

print(f"Model parameters: {model.num_parameters():,}")

In [None]:
# Scoring metrics
bleu = BLEU()
chrf = CHRF(word_order=2)  # chrF++

def compute_metrics(predictions_and_labels):
    """Score generated translations against the references.

    Args:
        predictions_and_labels: Pair ``(predictions, label_ids)`` of token-id
            arrays as handed over by the trainer during evaluation.

    Returns:
        Dict with corpus-level 'bleu', 'chrf' (chrF++) and 'geo_mean', the
        geometric mean of the two scores.
    """
    preds, labels = predictions_and_labels

    # Some configurations return a tuple whose first element holds the ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a token id. It appears in the labels and
    # can also appear in the predictions when the trainer pads batches of
    # different lengths together; decoding it would fail, so map it to the
    # pad token (dropped again by skip_special_tokens=True).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacrebleu expects a list of reference streams, hence the extra list
    bleu_score = bleu.corpus_score(decoded_preds, [decoded_labels]).score
    chrf_score = chrf.corpus_score(decoded_preds, [decoded_labels]).score

    # Geometric mean (competition metric)
    geo_mean = float(np.sqrt(bleu_score * chrf_score))

    return {
        'bleu': bleu_score,
        'chrf': chrf_score,
        'geo_mean': geo_mean
    }

In [None]:
# Training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (and later dropped the old name); pick whichever keyword the installed
# version accepts so the cell runs on both.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# T5-family checkpoints are known to overflow under fp16 mixed precision
# (NaN / zero loss), so use bf16 where the GPU supports it and full
# precision otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = Seq2SeqTrainingArguments(
    output_dir='../models/byt5-akkadian-baseline',
    **{_eval_strategy_key: 'epoch'},
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective train batch of 16 per device
    num_train_epochs=10,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=50,
    predict_with_generate=True,  # evaluate on generated text, needed for BLEU/chrF++
    generation_max_length=512,
    fp16=False,
    bf16=use_bf16,
    load_best_model_at_end=True,
    metric_for_best_model='geo_mean',
    greater_is_better=True,
    save_total_limit=2,
    report_to='none',
)

# Data collator: pads each batch to its longest sequence and pads labels
# with -100 so padded positions are excluded from the loss
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100
)

In [None]:
# Initialize trainer: gather the constructor arguments, then build it
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Train: run fine-tuning with the arguments, datasets and metric function
# that were passed to the trainer when it was constructed
trainer.train()

In [None]:
# Evaluate on the validation split and list every returned metric
validation_results = trainer.evaluate()
print("\nValidation Results:")
for metric_name in validation_results:
    metric_value = validation_results[metric_name]
    print(f"  {metric_name}: {metric_value:.4f}")

## 5. Generate Predictions

In [None]:
# Prepend the task prefix to every test transliteration
test_inputs = [PREFIX + text for text in test_df['transliteration']]

# Tokenize into PyTorch tensors (padded to the longest row, capped at 512
# tokens) and move them to the selected device
tokenizer_options = dict(
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
test_encodings = tokenizer(test_inputs, **tokenizer_options).to(device)

print(f"Test samples: {len(test_inputs)}")

In [None]:
# Generate translations
# Rows decoded per generate() call; beam search over the whole test set at
# once can exhaust memory, so work through it in slices.
GENERATION_BATCH_SIZE = 8

# Switch to inference mode (disables dropout)
model.eval()

num_test_rows = test_encodings['input_ids'].shape[0]
predictions = []
with torch.no_grad():
    for start in range(0, num_test_rows, GENERATION_BATCH_SIZE):
        stop = start + GENERATION_BATCH_SIZE
        # No no_repeat_ngram_size here: ByT5 tokens are single bytes, so a
        # 3-gram ban would forbid any 3-byte sequence (e.g. "the") from
        # occurring twice in one translation.
        outputs = model.generate(
            input_ids=test_encodings['input_ids'][start:stop],
            attention_mask=test_encodings['attention_mask'][start:stop],
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )
        # Decode this slice and keep the results in test-set order
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Show predictions
for i, (src, pred) in enumerate(zip(test_df['transliteration'], predictions)):
    print(f"\n--- Sample {i} ---")
    print(f"Source: {src[:100]}...")
    print(f"Translation: {pred[:200]}...")

## 6. Create Submission

In [None]:
# Create submission dataframe: one row per test id with its generated translation
submission = pd.DataFrame({
    'id': test_df['id'],
    'translation': predictions
})

# Save. to_csv does not create missing folders, so make sure the output
# directory exists first.
submission_path = Path('../submissions/baseline_byt5.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print("Submission saved!")
submission

In [None]:
# Print the first rows of the sample submission to compare its layout by eye
print("Sample submission format:", sample_sub.head(), sep="\n")

## 7. Save Model

In [None]:
# Write the in-memory model and its tokenizer to the same output folder
FINAL_MODEL_DIR = '../models/byt5-akkadian-baseline/final'
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_MODEL_DIR)
print("Model saved!")