# Deep Past Challenge - T5 Baseline

**Task**: Translate Akkadian to English  
**Model**: t5-small (Hugging Face `google-t5/t5-small`; ~60M parameters, versus ~300M for ByT5-small)  
**Metric**: Geometric mean of BLEU and chrF++

In [None]:
!pip install -q sacrebleu sentencepiece

In [None]:
import pandas as pd
import numpy as np
import torch
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sacrebleu.metrics import BLEU, CHRF
import warnings
warnings.filterwarnings('ignore')

# Use the GPU when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f"Device: {device}")
if has_cuda:
    # Report the card in slot 0 and its total memory in decimal gigabytes.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## Load Data

In [None]:
# Read the competition's training pairs and the test set to be translated.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/deep-past-initiative-machine-translation/train.csv',
        '/kaggle/input/deep-past-initiative-machine-translation/test.csv',
    )
)

n_train, n_test = len(train_df), len(test_df)
print(f"Train: {n_train}, Test: {n_test}")
# Leave a two-row preview as the cell's output.
train_df.head(2)

## Prepare Dataset

In [None]:
# --- Model / tokenization settings ---
MODEL_NAME = "t5-small"  # 60M params, much lighter than ByT5
MAX_LENGTH = 256  # token cap used for sources, targets and generation
PREFIX = "translate Akkadian to English: "  # task prompt prepended to every source

# Banner to confirm which model this notebook revision (v5) loads.
print(f">>> LOADING MODEL: {MODEL_NAME} <<<")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): this is a pretrained subword vocabulary; confirm that the
# diacritics used in the transliterations are not collapsed to <unk>.
vocab_size = tokenizer.vocab_size
print(f"Vocab size: {vocab_size}")

def preprocess(examples):
    """Tokenize a batch of transliteration/translation pairs.

    Args:
        examples: Batch mapping with 'transliteration' and 'translation'
            columns, as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed sources, plus a 'labels'
        entry holding the target token ids. Sequences are truncated to
        MAX_LENGTH and deliberately left unpadded.
    """
    inputs = [PREFIX + str(t) for t in examples['transliteration']]
    targets = [str(t) for t in examples['translation']]

    # Do not pad here. Padding the targets to max_length would write the real
    # pad token id into the labels, and the loss would then be computed on
    # padding: the data collator only masks (with -100) the padding that it
    # adds itself. Leaving sequences unpadded lets the collator pad each batch
    # to its own longest member, which also avoids 256-token batches of
    # mostly padding.
    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, truncation=True)
    labels = tokenizer(targets, max_length=MAX_LENGTH, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Hold out 10% of the parallel pairs for validation (seeded, so repeatable).
pair_columns = ['transliteration', 'translation']
train_data, val_data = train_test_split(train_df[pair_columns], test_size=0.1, random_state=42)


def _tokenized(frame):
    """Wrap a pandas split as a HF dataset and swap its text columns for token ids."""
    dataset = HFDataset.from_pandas(frame.reset_index(drop=True))
    return dataset.map(preprocess, batched=True, remove_columns=pair_columns)


train_dataset = _tokenized(train_data)
val_dataset = _tokenized(val_data)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")

## Training

In [None]:
# Load the pretrained seq2seq checkpoint named by MODEL_NAME and report its size.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
param_count = model.num_parameters()
print(f"Parameters: {param_count:,}")

In [None]:
# Scorers shared by every evaluation pass.
# word_order=2 adds word bigrams to the character n-grams, i.e. chrF++.
chrf = CHRF(word_order=2)
bleu = BLEU()

def compute_metrics(pred_labels):
    """Score generated translations against their references.

    Args:
        pred_labels: Pair ``(predictions, label_ids)`` of token-id arrays
            in which -100 marks positions to ignore.

    Returns:
        Dict with corpus-level 'bleu' and 'chrf' scores and 'geo_mean',
        the square root of their product (each floored at 0.001).
    """
    pad_id = tokenizer.pad_token_id

    def _to_text(token_ids):
        # -100 is not a vocabulary id; map it to padding so decoding works,
        # then let skip_special_tokens drop the padding again.
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        texts = tokenizer.batch_decode(cleaned, skip_special_tokens=True)
        return [text.strip() for text in texts]

    generated, label_ids = pred_labels
    hypotheses = _to_text(generated)
    references = _to_text(label_ids)

    # sacrebleu expects a list of reference streams; there is one stream here.
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score

    # Floor both scores so that a zero does not wipe out the product.
    geo_mean = np.sqrt(max(bleu_score, 0.001) * max(chrf_score, 0.001))

    return {'bleu': bleu_score, 'chrf': chrf_score, 'geo_mean': geo_mean}

In [None]:
# fp16 mixed precision needs a CUDA device, so request it only when one is
# available; this keeps the notebook runnable on the CPU fallback as well.
# NOTE(review): T5 checkpoints are reported to overflow under fp16 in some
# setups -- watch the logged loss for NaN/inf and confirm.
use_fp16 = torch.cuda.is_available()

args = Seq2SeqTrainingArguments(
    output_dir='./t5-akkadian',
    num_train_epochs=15,
    # Effective batch per device: 16 samples x 2 accumulation steps = 32.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=3e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule so that the best checkpoint
    # (highest validation geo_mean) can be restored when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='geo_mean',
    greater_is_better=True,
    # Validation decodes with generate() so BLEU/chrF score real output text.
    predict_with_generate=True,
    generation_max_length=MAX_LENGTH,
    fp16=use_fp16,
    report_to='none',
)

import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer=` keyword. Pass whichever name the installed
# Seq2SeqTrainer accepts, so the cell works on both sides of that change.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Pads every batch to its longest member; label padding uses -100, the
    # value the loss ignores.
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, label_pad_token_id=-100),
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell; the
# returned training summary is left as the cell's displayed value.
trainer.train()

In [None]:
# Score the trained model on the validation split and list every reported value.
results = trainer.evaluate()
print("\nValidation Results:")
for metric_name, metric_value in results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## Generate Predictions

In [None]:
# Release memory left over from training before running inference.
gc.collect()
torch.cuda.empty_cache()

# Number of sources decoded per generate() call. Beam search over the whole
# test set in a single batch makes peak memory grow with the test-set size.
GEN_BATCH_SIZE = 16

test_inputs = [PREFIX + str(t) for t in test_df['transliteration']]

model.to(device)
model.eval()

predictions = []
with torch.no_grad():
    for start in range(0, len(test_inputs), GEN_BATCH_SIZE):
        batch_texts = test_inputs[start:start + GEN_BATCH_SIZE]
        # padding=True pads only to the longest source in this batch.
        batch_enc = tokenizer(
            batch_texts,
            max_length=MAX_LENGTH,
            truncation=True,
            padding=True,
            return_tensors='pt',
        ).to(device)
        outputs = model.generate(
            input_ids=batch_enc['input_ids'],
            attention_mask=batch_enc['attention_mask'],
            max_length=MAX_LENGTH,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# Show each source (first 100 chars) next to its translation (first 300 chars).
for i, (src, pred) in enumerate(zip(test_df['transliteration'], predictions)):
    # Same str() coercion as for the model inputs: a cell may not hold a
    # string (e.g. a missing value), and slicing a non-string would raise.
    src_text = str(src)
    print(f"\n=== Sample {i} ===")
    print(f"Source: {src_text[:100]}...")
    print(f"Translation: {pred[:300]}")

## Create Submission

In [None]:
# Pair each test id with its generated translation and write the submission file.
submission_columns = {'id': test_df['id'], 'translation': predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Submission saved!")
# Leave the table as the cell's output for a final visual check.
submission