In [1]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
DATA_PATH = "bbaw_egyptian_w_hieroglyphs_translated.csv" 
MODEL_NAME = "facebook/bart-base"

OUTPUT_DIR = "./bart_gardiner_en"
MAX_INPUT_LEN = 128
MAX_TARGET_LEN = 64

BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,transcription,translation,hieroglyphs,translation_en
0,"jr,j-pÍú•,t ·∏•Íú£,tj-Íú• ·∏´tm,w-bj,tj smr-wÍú•,tj jm,j-r...","Hereditary noble and prince, royal seal-bearer...",D21 Q3 D36 F4 D36 L2 X1 S19 S29 U23 T21 X1 G17...,"Hereditary noble and prince, royal seal-bearer..."
1,"j Íú•n·∏´.w.pl tp,j.pl-tÍú£ swÍú£.t =sn ·∏•r jz pn m-·∏´di...","O living ones, who are upon the earth, who sha...",M17 A26 S34 Aa1 G43 A1 Z3 h N17 N23 A1 Z2B S29...,"O living ones, who are upon the earth, who sha..."
2,"jnk m·∏•-jb-n-nswt m ·∏•w,t-n·πØr r æ-N·∏´n m pr Stj,t ...","I was a trusted one of the king in the temple,...",W24 V31 V22 F34 N35 M23 X1 N35 G17 R8 O6 X1 O1...,"I was a trusted one of the king in the temple,..."
3,"Íú•q jb.pl ·∏•r-sÍú£ mr,yt jm,j-r æ-Íú•·∏•Íú•,w-wr-m-pr-nsw...","A trusted one upon the landing place, great ov...",G35 F34 F34 F34 D2 Z1 Aa17 U6 D21 M17 M17 X1 N...,"A trusted one upon the landing place, great ov..."
4,"jw jriÃØ.n = jz m ·∏•z,t nswt sjqr.n wj ·∏•m =f m t...","I built a tomb through the favour of the king,...",M17 G43 D4 N35 M17 M40 O34 O1 Z1 G17 V28 W14 X...,"I built a tomb through the favour of the king,..."
...,...,...,...,...
35498,"wr‚∏Æ.pl n p·∏•,tj =k",Die Gro√üen ... werden fallen ... aufgrund dein...,G36 D21 A21 N35 F4 V31,The great ones will fall because of your strength
35499,"hmhm =k mj hh nsr.t m-sÍú£ ·∏´Íú£s,t nb.t",Dein Kriegsschrei folgt wie der Gluthauch des ...,O4 G17 O4 G17 A2 Z2 V31 W19 M17 O4 O4 Q7 N35 F...,Your war cry follows like the glowing breath o...
35500,"·∏´Íú£s,t nb.t twt.w m jb wÍú•.tj fkw =sn tÍú£ =sn m-mn,t",... jedes Fremdland ist eintr√§chtig eines einz...,N25 X1 Z1 V30 X1 X1 G43 X1 G43 Y1 Z2 Aa15 F34 ...,"Every foreign land is united in a single wish,..."
35501,r zj.tw ·πØÍú£w r fn·∏è =sn jn kÍú£ =k,... damit durch deinen Ka Atem f√ºr ihre Nasen ...,D21 O35 Z4A D54 X1 G43 P5 G43 Z2 D21 D20 O34 N...,


In [4]:
assert "hieroglyphs" in df.columns
assert "translation_en" in df.columns

# Basic cleaning
df["hieroglyphs"] = (
    df["hieroglyphs"]
    .astype(str)
    .str.upper()
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

df["translation_en"] = (
    df["translation_en"]
    .astype(str)
    .str.strip()
)

df = df.dropna()
df = df[df["hieroglyphs"] != ""]
df = df[df["translation_en"] != ""]

print(f"Dataset size after cleaning: {len(df)}")

Dataset size after cleaning: 35496


In [5]:
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42
)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

In [6]:
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# Optional: Add Gardiner codes to tokenizer vocab
unique_codes = set(
    " ".join(df["hieroglyphs"].tolist()).split()
)

tokenizer.add_tokens(list(unique_codes))
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


BartScaledWordEmbedding(52452, 768, padding_idx=1)

In [7]:
def preprocess(batch):
    inputs = [
        f"Translate hieroglyphs to English: {x}"
        for x in batch["hieroglyphs"]
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["translation_en"],
            max_length=MAX_TARGET_LEN,
            truncation=True,
            padding="max_length"
        )

    # üî¥ THIS LINE FIXES EVERYTHING
    labels_ids = labels["input_ids"]
    labels_ids = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels_ids
    ]

    model_inputs["labels"] = labels_ids
    return model_inputs

tokenized_ds = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28396/28396 [00:24<00:00, 1136.07 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3550/3550 [00:03<00:00, 1171.19 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3550/3550 [00:02<00:00, 1231.86 examples/s]


In [8]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

In [9]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="steps",
    eval_steps=2500,
    save_steps=2500,

    learning_rate=LR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,

    num_train_epochs=EPOCHS,
    weight_decay=0.01,


    eval_accumulation_steps=4,   # üî• FIX MEMORY GROWTH

    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=100,

    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    fp16=True,
    report_to="none"
)


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer
)
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
2500,3.0878,2.896055
5000,2.8605,2.681912
7500,2.5914,2.545922
10000,2.4571,2.433786
12500,2.3196,2.355024
15000,2.2387,2.29651
17500,2.2459,2.272027


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=17750, training_loss=2.6305291447169346, metrics={'train_runtime': 3714.1882, 'train_samples_per_second': 38.226, 'train_steps_per_second': 4.779, 'total_flos': 1.08212977926144e+16, 'train_loss': 2.6305291447169346, 'epoch': 5.0})

In [11]:
import torch
import evaluate
from tqdm import tqdm

bleu = evaluate.load("bleu")

model.eval()

preds = []
refs = []

for sample in tqdm(test_df.itertuples(), total=len(test_df)):
    text = f"Translate hieroglyphs to English: {sample.hieroglyphs}"

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True
    ).to(DEVICE)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=40,
            num_beams=1
        )

    pred = tokenizer.decode(output[0], skip_special_tokens=True)
    preds.append(pred)
    refs.append([sample.translation_en])

result = bleu.compute(predictions=preds, references=refs)
print("BLEU:", result["bleu"])


  0%|          | 0/3550 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3550/3550 [14:12<00:00,  4.16it/s]


BLEU: 0.08728294071672292


In [12]:
def token_accuracy(predictions, references):
    correct = 0
    total = 0

    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        min_len = min(len(pred_tokens), len(ref_tokens))

        for i in range(min_len):
            if pred_tokens[i] == ref_tokens[i]:
                correct += 1

        total += len(ref_tokens)

    return correct / total if total > 0 else 0.0
token_acc = token_accuracy(preds, [r[0] for r in refs])
print(f"Token Accuracy: {token_acc:.4f}")


Token Accuracy: 0.0683


In [13]:
test_metrics = trainer.evaluate(tokenized_ds["test"])
print("Test metrics:", test_metrics)

Test metrics: {'eval_loss': 2.298457622528076, 'eval_runtime': 24.9593, 'eval_samples_per_second': 142.232, 'eval_steps_per_second': 35.578, 'epoch': 5.0}


In [14]:
trainer.save_model(f"{OUTPUT_DIR}/final_model")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")

print("Model saved successfully.")


Model saved successfully.


In [27]:
def translate_gardiner(sequence: str):
    text = f"Translate hieroglyphs to English: {sequence.upper()}"

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=64,
            num_beams=1,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Example usage
example_sequence = "G36 D21 A21 N35 F4 V31"
translation = translate_gardiner(example_sequence)
print(f"Translation: {translation}")

Translation: The ruler of the two countries ,  Nefer-cheperu-Re-wa-en-re


In [28]:
matches = [
    (i, h, en)
    for i, (h, en) in enumerate(
        zip(dataset["train"]["hieroglyphs"],
            dataset["train"]["translation_en"])
    )
    if h == "G36 D21 A21 N35 F4 V31"
]

matches[:5]


[(8496,
  'G36 D21 A21 N35 F4 V31',
  'The great ones will fall because of your strength')]