# Notebook 1: NLLB Teacher — Generate Translations

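Runs `phucthaiv02/akkadian-nllb-2` (3.3B, Akkadian-adapted NLLB) on the gold training transliterations, the competition validation set, and (when `test.csv` is found) the competition test set.
Saves the gold rows together with their NLLB translations as parquet for use by the ByT5 training notebook.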

**Output:** `/kaggle/working/gold_with_nllb.parquet`, plus `/kaggle/working/test_nllb_predictions.csv` and `/kaggle/working/submission.csv` when `test.csv` is found

In [None]:
!pip install -q sacrebleu

In [None]:
import os, gc, math, time, warnings
from pathlib import Path
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sacrebleu.metrics import BLEU, CHRF

# Query CUDA once and reuse the answer for both the device choice and the
# optional hardware report below.
has_cuda = torch.cuda.is_available()
device = 'cuda' if has_cuda else 'cpu'
print(f'Device: {device}')
if has_cuda:
    # Device index 0 is the GPU reported on; other GPUs (if any) are not listed.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'VRAM: {gpu_props.total_memory / 1e9:.1f} GB')

## Load Data

In [None]:
def find_file(filename, base='/kaggle/input'):
    """Locate the directory under ``base`` that directly contains ``filename``.

    The tree is walked top-down and sub-directories are visited in sorted
    order, so when several directories hold a file with the same name the
    result is the same on every run.

    Args:
        filename: Exact file name (not a path or pattern) to look for.
        base: Root directory of the search.

    Returns:
        A ``Path`` to the first directory containing ``filename``, or ``None``
        when no directory under ``base`` contains it (including when ``base``
        itself does not exist).
    """
    for root, dirs, files in os.walk(base):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # list in place controls the order in which it descends.
        dirs.sort()
        if filename in files:
            return Path(root)
    return None

# Print the top two levels of the input mount (at most 15 entries per
# directory) so a wrong dataset attachment is easy to spot.
print('Contents of /kaggle/input/')
for entry in sorted(os.listdir('/kaggle/input')):
    entry_path = os.path.join('/kaggle/input', entry)
    if not os.path.isdir(entry_path):
        continue
    print(f'  {entry}/')
    children = sorted(os.listdir(entry_path))
    for child in children[:15]:
        print(f'    {child}')

# The directory holding train.parquet is also expected to hold
# val_competition.parquet.
DATA_DIR = find_file('train.parquet')
if DATA_DIR is None:
    raise FileNotFoundError('Cannot find train.parquet under /kaggle/input/')
print(f'\nAssembled data at: {DATA_DIR}')

train_df = pd.read_parquet(DATA_DIR / 'train.parquet')
val_comp = pd.read_parquet(DATA_DIR / 'val_competition.parquet')

# test.csv is optional: downstream cells check test_df for None.
COMP_DIR = find_file('test.csv')
if COMP_DIR:
    test_df = pd.read_csv(COMP_DIR / 'test.csv')
else:
    test_df = None

# Keep only rows labelled 'gold' in the quality column.
is_gold = train_df['quality'] == 'gold'
gold_df = train_df[is_gold].reset_index(drop=True)

print(f'Gold training: {len(gold_df)}')
print(f'Competition val: {len(val_comp)}')
if test_df is not None:
    print(f'Test samples: {len(test_df)}')

## Load NLLB Teacher Model

In [None]:
NLLB_MODEL = 'phucthaiv02/akkadian-nllb-2'

print(f'Loading {NLLB_MODEL}...')
t0 = time.time()

# float16 halves the weight footprint on GPU. Without a GPU, fall back to
# float32: half-precision inference on CPU is slow or unsupported for some
# operators, depending on the installed PyTorch version.
nllb_on_gpu = torch.cuda.is_available()
nllb_dtype = torch.float16 if nllb_on_gpu else torch.float32

nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    NLLB_MODEL,
    torch_dtype=nllb_dtype,
    device_map='auto',
)
# Inference only: switch off dropout and other train-time behaviour.
nllb_model.eval()

n_params = sum(p.numel() for p in nllb_model.parameters()) / 1e9
print(f'Loaded in {time.time()-t0:.0f}s ({n_params:.1f}B params)')
if nllb_on_gpu:
    # Only meaningful when a CUDA device exists; reports the current device.
    print(f'GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB')

In [None]:
def nllb_translate_batch(texts, tokenizer, model, batch_size=8,
                         max_source=512, max_target=512, num_beams=4,
                         tgt_lang='eng_Latn'):
    """Translate an iterable of texts with a seq2seq model, batch by batch.

    Inputs are tokenized with truncation to ``max_source`` tokens, decoded
    with beam search, and the decoder is forced to start with the token id
    that ``tokenizer`` assigns to ``tgt_lang``. Progress is printed every
    100 batches and after the final batch.

    Args:
        texts: Iterable of source strings. It is materialized into a list,
            so generators and pandas Series are accepted as well as lists.
        tokenizer: Object providing ``__call__``, ``convert_tokens_to_ids``
            and ``batch_decode`` (e.g. a Hugging Face tokenizer).
        model: Object providing ``device`` and ``generate``.
        batch_size: Number of texts per ``generate`` call; must be >= 1.
        max_source: Maximum number of source tokens kept per text.
        max_target: ``max_length`` passed to ``generate``.
        num_beams: Beam width passed to ``generate``.
        tgt_lang: Language-code token forced as the first generated token.

    Returns:
        A list of decoded strings, one per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    texts = list(texts)
    total = len(texts)
    all_preds = []
    if total == 0:
        return all_preds

    n_batches = math.ceil(total / batch_size)
    tgt_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    t0 = time.time()

    for bn, start in enumerate(range(0, total, batch_size), start=1):
        batch = texts[start:start + batch_size]
        enc = tokenizer(
            batch, max_length=max_source, truncation=True,
            padding=True, return_tensors='pt'
        ).to(model.device)

        # Gradients are never needed for generation; skipping them saves memory.
        with torch.no_grad():
            out = model.generate(
                **enc,
                max_length=max_target,
                num_beams=num_beams,
                forced_bos_token_id=tgt_id,
                length_penalty=1.0,
                early_stopping=True,
            )
        all_preds.extend(tokenizer.batch_decode(out, skip_special_tokens=True))

        if bn % 100 == 0 or bn == n_batches:
            elapsed = time.time() - t0
            # A coarse clock can report zero elapsed time for very fast runs.
            rate = len(all_preds) / elapsed if elapsed > 0 else 0.0
            eta = (total - len(all_preds)) / rate if rate > 0 else 0
            print(f'  Batch {bn}/{n_batches} | {len(all_preds)}/{total} done | '
                  f'{rate:.1f} samples/s | ETA {eta/60:.0f}min')

    return all_preds

## Zero-Shot Eval on Competition Val

In [None]:
# Source transliterations and reference translations from the validation set.
comp_trans = val_comp['transliteration'].tolist()
comp_refs = val_comp['translation'].tolist()

print(f'NLLB zero-shot on competition val ({len(comp_trans)} samples)...')
nllb_comp_preds = nllb_translate_batch(comp_trans, nllb_tokenizer, nllb_model, batch_size=4)

# corpus_score takes a list of reference streams; there is a single one here.
ref_streams = [comp_refs]
b = BLEU().corpus_score(nllb_comp_preds, ref_streams).score
c = CHRF(word_order=2).corpus_score(nllb_comp_preds, ref_streams).score
# Geometric mean of the two scores, clamped at zero before the square root.
g = math.sqrt(max(b, 0) * max(c, 0))
print(f'\nNLLB zero-shot: BLEU={b:.2f}  chrF++={c:.2f}  geo_mean={g:.4f}')

# Show the first few source / hypothesis / reference triples for a sanity check.
for j, (src, hyp, ref) in enumerate(zip(comp_trans[:5], nllb_comp_preds[:5], comp_refs[:5])):
    print(f'\n[{j}] Src: {src[:120]}...')
    print(f'    NLLB: {hyp[:250]}')
    print(f'    Ref:  {ref[:250]}')

## Generate NLLB Translations for All Gold Data

In [None]:
# Translate every gold transliteration and time the whole pass.
print(f'Generating NLLB translations for {len(gold_df)} gold samples...')
t0 = time.time()

gold_trans = gold_df['transliteration'].tolist()
nllb_gold_preds = nllb_translate_batch(gold_trans, nllb_tokenizer, nllb_model, batch_size=8)

elapsed = time.time() - t0
print(f'\nDone in {elapsed/60:.1f} min ({elapsed/len(gold_trans):.2f}s/sample)')

# assign() returns a new frame, leaving gold_df itself untouched.
gold_out = gold_df.assign(nllb_translation=nllb_gold_preds)
gold_out.to_parquet('/kaggle/working/gold_with_nllb.parquet', index=False)
print(f'Saved gold_with_nllb.parquet ({len(gold_out)} rows)')

## Generate for Competition Test

In [None]:
if test_df is None:
    print('No test.csv found — skipping test generation')
else:
    test_trans = test_df['transliteration'].tolist()
    print(f'Generating NLLB translations for {len(test_trans)} test samples...')
    # Smaller batches and a longer source limit than the gold pass.
    nllb_test_preds = nllb_translate_batch(
        test_trans, nllb_tokenizer, nllb_model, batch_size=2,
        max_source=768, max_target=512
    )

    # Full test frame plus the model output, for the downstream notebook.
    test_out = test_df.assign(nllb_translation=nllb_test_preds)
    test_out.to_csv('/kaggle/working/test_nllb_predictions.csv', index=False)

    # Also save as a direct submission (NLLB-only baseline)
    nllb_sub = pd.DataFrame({'id': test_df['id'], 'translation': nllb_test_preds})
    nllb_sub.to_csv('/kaggle/working/submission.csv', index=False)
    print('Saved test predictions + NLLB-only submission')

    # Echo every source/prediction pair, truncated for readability.
    for j, (src, hyp) in enumerate(zip(test_trans, nllb_test_preds)):
        print(f'\n[{j}] {src[:100]}...')
        print(f'    → {hyp[:300]}')

## Summary

In [None]:
# List every regular file in the working directory with its size in MB.
print('=== Output Files ===')
working_files = (p for p in Path('/kaggle/working').glob('*') if p.is_file())
for out_path in working_files:
    size_mb = out_path.stat().st_size / 1e6
    print(f'  {out_path.name}  ({size_mb:.1f} MB)')

print(f'\n=== Next Step ===')
print('Upload gold_with_nllb.parquet as a Kaggle dataset,')
print('then run Notebook 2 (byt5_nllb_train) to fine-tune ByT5.')