In [None]:
%%bash
# Install the notebook's Python dependencies into the runtime.
# `set -e` aborts on the first failed install; without it the cell only reports
# the exit status of the last command, so earlier failures go unnoticed.
set -euo pipefail
# One resolver pass for all packages; the extras spec is quoted so the shell
# never treats `[torch]` as a glob pattern.
pip install -q \
    nltk \
    datasets \
    "transformers[torch]" \
    tokenizers \
    evaluate \
    sentencepiece \
    huggingface_hub \
    jiwer

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 kB 6.1 MB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 71.0 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3


In [None]:
import pandas as pd
import datasets
import torch
from huggingface_hub import login
from dotenv import load_dotenv
import os
from huggingface_hub import notebook_login
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
from collections import Counter
from jiwer import wer

In [None]:
from google.colab import userdata

In [None]:
# Fetch the Hugging Face access token from the Colab secret named HF_TOKEN
# and authenticate this session with it.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token)

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and the training
# output directory used below both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the parallel corpus from Drive and wrap it in a Hugging Face Dataset.
# NOTE(review): rows with missing values are not filtered here — confirm the CSV is clean.
df = pd.read_csv("/content/drive/MyDrive/transcription_to_hieroglyphs.csv")
dataset = datasets.Dataset.from_pandas(df)

In [None]:
# Show the dataset summary (column names and row count)
dataset

Dataset({
    features: ['transcription', 'hieroglyphs'],
    num_rows: 35252
})

In [None]:
# Base checkpoint to fine-tune
MODEL_NAME = "google-t5/t5-base"

# Tokenizer and model both come from the same pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Collator that batches the tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Task prefix prepended to every transliteration before tokenization.
# The exact same string must be used at inference time, since the model is
# fine-tuned on inputs that start with it.
prefix = "Convert this transliteration to a hieroglyphics: "

# Define the preprocessing function

def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: a batch mapping with a "transcription" column (source text)
            and a "hieroglyphs" column (target text).

    Returns:
        The tokenizer output for the prefixed source texts, with an extra
        "labels" entry holding the token ids of the targets.
    """
    # Source side: task prefix + transliteration, truncated to 128 tokens.
    prompts = []
    for transliteration in examples["transcription"]:
        prompts.append(prefix + transliteration)
    batch = tokenizer(prompts, max_length=128, truncation=True)

    # Target side: hieroglyph strings, truncated to 512 tokens.
    targets = tokenizer(
        text_target=examples["hieroglyphs"],
        max_length=512,
        truncation=True,
    )

    batch["labels"] = targets["input_ids"]
    return batch

In [None]:
# Tokenize the whole dataset in batches; the original text columns are kept
# alongside the new input_ids / attention_mask / labels columns.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/35252 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the examples for evaluation (fixed seed => reproducible split).
# The guard keeps this cell re-runnable: once `tokenized_dataset` is a
# DatasetDict it has already been split, and a DatasetDict cannot be split again.
if not isinstance(tokenized_dataset, datasets.DatasetDict):
    tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [None]:
# Show the train/test split summary (columns and row counts per split)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'hieroglyphs', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 31726
    })
    test: Dataset({
        features: ['transcription', 'hieroglyphs', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3526
    })
})

In [None]:
def _token_f1(pred_tokens, gold_tokens):
    """Order-free (multiset) F1 between two lists of whitespace tokens."""
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    precision = overlap / max(1, len(pred_tokens))
    recall = overlap / max(1, len(gold_tokens))
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)


def _token_wer(pred_tokens, gold_tokens):
    """Order-aware word error rate of the prediction against the gold tokens.

    An empty gold sequence is handled here instead of being passed to `wer`
    (some jiwer versions reject empty references — confirm for the installed
    version): it scores 0.0 when the prediction is empty too, otherwise 1.0.
    """
    if not gold_tokens:
        return 0.0 if not pred_tokens else 1.0
    return wer(" ".join(gold_tokens), " ".join(pred_tokens))


def compute_metrics(eval_preds):
    """Score decoded predictions against decoded labels.

    Args:
        eval_preds: a `(preds, labels)` pair. `preds` may be a tuple whose
            first item holds the predictions, and may be either token ids or
            3-D logits (reduced with argmax over the last axis).

    Returns:
        dict with
          - "token_f1":  mean per-example order-free token F1,
          - "token_cer": mean per-example word error rate over whitespace
            tokens (the key says "cer" for historical reasons; it is a WER),
          - "my_metric": 0.8 * token_f1 + 0.2 * (1 - token_cer).
    """
    preds, labels = eval_preds

    # Some trainer versions return a tuple (preds, ...)
    if isinstance(preds, tuple):
        preds = preds[0]

    # Logits (batch, seq_len, vocab_size) -> token ids (batch, seq_len)
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    pad_id = tokenizer.pad_token_id
    vocab_size = tokenizer.vocab_size

    # -100 marks ignored label positions; map it to pad so decoding works.
    labels = np.where(labels != -100, labels, pad_id)

    preds = preds.astype(np.int64)
    labels = labels.astype(np.int64)

    # Any id outside the tokenizer's range is replaced by pad so that
    # batch_decode never sees an id it cannot map back to a piece.
    preds = np.where((preds >= 0) & (preds < vocab_size), preds, pad_id)
    labels = np.where((labels >= 0) & (labels < vocab_size), labels, pad_id)

    pred_texts = tokenizer.batch_decode(preds, skip_special_tokens=True)
    gold_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    total_f1 = 0.0
    total_cer = 0.0
    n = max(1, len(gold_texts))  # avoid dividing by zero on an empty eval set

    for pred, gold in zip(pred_texts, gold_texts):
        pred_tokens = pred.strip().split()
        gold_tokens = gold.strip().split()

        total_f1 += _token_f1(pred_tokens, gold_tokens)
        total_cer += _token_wer(pred_tokens, gold_tokens)

    token_f1 = total_f1 / n
    token_cer = total_cer / n
    my_metric = 0.8 * token_f1 + 0.2 * (1.0 - token_cer)

    return {
        "token_f1": token_f1,
        "token_cer": token_cer,
        "my_metric": my_metric,
    }


# Training

In [None]:
# Global training hyperparameters and output location
L_RATE = 3e-4                        # learning rate
BATCH_SIZE = 8                       # per-device training batch size
PER_DEVICE_EVAL_BATCH = 4            # per-device evaluation batch size
WEIGHT_DECAY = 0.01                  # TODO: try a smaller value, or remove weight decay
SAVE_TOTAL_LIM = 3                   # maximum number of checkpoints kept on disk
NUM_EPOCHS = 3                       # TODO: increase the number of epochs
# Checkpoints and the final model are written under this Drive folder
OUT_DIR = "/content/drive/MyDrive/best_weights/t5_transliteration_to_hero"

In [None]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    eval_strategy="epoch",
    # Checkpoint on the same schedule as evaluation; load_best_model_at_end
    # needs the save and eval strategies to line up.
    save_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    # Without an explicit limit, evaluation generation falls back to the
    # model's generation config (library default max_length is 20), which cuts
    # predictions off long before a full target sequence and skews the metrics.
    # 512 matches the label truncation length used during preprocessing.
    generation_max_length=512,
    push_to_hub=False,
    # metric_for_best_model only takes effect together with this flag: the best
    # checkpoint by `my_metric` is reloaded when training ends, so a later
    # trainer.save_model() persists the best weights rather than the last ones.
    load_best_model_at_end=True,
    metric_for_best_model="my_metric",
    greater_is_better=True,
    # Disable logging integrations so training never stops at an interactive
    # Weights & Biases prompt (which breaks unattended "Run all").
    report_to="none",
)

In [None]:
# Wire the model, data splits, batching and metrics into the seq2seq trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Resume from the newest checkpoint in the output directory when there is one,
# otherwise start a fresh run. Passing `resume_from_checkpoint=True`
# unconditionally fails on a first run because no checkpoint exists yet.
from transformers.trainer_utils import get_last_checkpoint

_output_dir = trainer.args.output_dir
# get_last_checkpoint lists the folder, so only call it when the folder exists;
# it returns None when the folder holds no checkpoint sub-directory.
last_checkpoint = get_last_checkpoint(_output_dir) if os.path.isdir(_output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Token F1,Token Cer,My Metric
1,0.9772,0.82064,0.471348,0.726793,0.43172
2,0.7729,0.689057,0.494828,0.692263,0.45741
3,0.6796,0.653631,0.501657,0.68073,0.46518


TrainOutput(global_step=11898, training_loss=0.5383278261134355, metrics={'train_runtime': 2791.9062, 'train_samples_per_second': 34.091, 'train_steps_per_second': 4.262, 'total_flos': 1.074471230389248e+16, 'train_loss': 0.5383278261134355, 'epoch': 3.0})

In [None]:
# Persist the trainer's current model weights so they can be reloaded with from_pretrained
trainer.save_model()

In [None]:
# Store the tokenizer files next to the model so the folder is self-contained
tokenizer.save_pretrained(trainer.args.output_dir)

('/content/drive/MyDrive/best_weights/t5_transliteration_to_hero/tokenizer_config.json',
 '/content/drive/MyDrive/best_weights/t5_transliteration_to_hero/special_tokens_map.json',
 '/content/drive/MyDrive/best_weights/t5_transliteration_to_hero/spiece.model',
 '/content/drive/MyDrive/best_weights/t5_transliteration_to_hero/added_tokens.json')

# Try the fine-tuned model

In [None]:
# Reload the fine-tuned model and its tokenizer from the output directory
model_finetuned = T5ForConditionalGeneration.from_pretrained(OUT_DIR)
tokenizer_finetuned = T5Tokenizer.from_pretrained(OUT_DIR)

In [None]:
# Take the transliteration of held-out example #120 as the demo input
text = tokenized_dataset["test"][120]["transcription"]

In [None]:
# Source transliteration that will be fed to the model
text

'ḏd.t.n r =s jy,wj jr,t twy n.t Ḥr,w šps,t jm.t jr,t Ḥr,w'

In [None]:
# Gold hieroglyph sequence for the same held-out example
tokenized_dataset["test"][120]["hieroglyphs"]

'I10 D46 X1 N35 D21 O34 M18 M17 Z4 D54 Z7 Z4 D4 X1 Z1 X1 Z7 N35 X1 G5 G7 A51 v X1 Y1 Z11 G17 X1 D4 X1 Z1 G5 G7'

In [None]:
# Prepend the task prefix the model was trained with. The guard makes the cell
# idempotent: re-running it no longer stacks the prefix a second time.
if not text.startswith(prefix):
    text = prefix + text

In [None]:
# Tokenize the prefixed input into PyTorch tensors for generation.
# NOTE(review): training inputs were truncated at 128 tokens, while this call
# allows up to 512 — confirm the mismatch is intended.
enc = tokenizer_finetuned(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [None]:
# Generate the hieroglyph sequence with beam search (4 beams), capped at
# 256 newly generated tokens.
output_ids = model_finetuned.generate(
    **enc,
    max_new_tokens=256,
    num_beams=4
)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens
print(tokenizer_finetuned.decode(output_ids[0], skip_special_tokens=True))

I10 D46 X1 N35 D21 O34 M18 M17 Z7 Z4 D4 X1 Z1 X1 Z7 M17 M17 N35 X1 G5 G7 A51 S29 X1 B1 M17 G17 X1 D4 X1 Z1 G5 G7


In [None]:
# actual    : I10 D46 X1 N35 D21 O34 M18 M17 Z4 D54 Z7 Z4 D4 X1 Z1 X1 Z7 N35 X1 G5 G7 A51 v X1 Y1 Z11 G17 X1 D4 X1 Z1 G5 G7
# predicted : I10 D46 X1 N35 D21 O34 M18 M17 Z7 Z4 D4 X1 Z1 X1 Z7 M17 M17 N35 X1 G5 G7 A51 S29 X1 B1 M17 G17 X1 D4 X1 Z1 G5 G7