In [1]:
import pandas as pd
import datasets
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from dotenv import load_dotenv
import os
from transformers import AutoTokenizer
from huggingface_hub import notebook_login
from trl import SFTConfig, SFTTrainer

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# huggingface_hub reads the HF_TOKEN environment variable on its own, so only
# fall back to the interactive login widget when no token was provided. This
# keeps "Restart Kernel -> Run All" unattended when .env carries the token.
if not os.environ.get("HF_TOKEN"):
    notebook_login()

In [3]:
# Read the cleaned transcription/hieroglyph pairs from disk ...
df = pd.read_csv(filepath_or_buffer="../Cleaned_Data/transcription_to_hieroglyphs.csv")
# ... and wrap the frame in a Hugging Face Dataset for mapping and splitting.
dataset = datasets.Dataset.from_pandas(df=df)

In [4]:
# Show the Dataset summary (column names and row count) as the cell output.
dataset

Dataset({
    features: ['transcription', 'hieroglyphs'],
    num_rows: 35252
})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face checkpoint id shared by every model/tokenizer load in this notebook.
gemma_model = "google/gemma-3-270m-it"

# Load the checkpoint in bfloat16 with eager attention and automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    gemma_model,
    dtype=torch.bfloat16,
    attn_implementation="eager",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(gemma_model)

# Report where the weights ended up and in which precision.
print(f"Device: {base_model.device}")
print(f"DType: {base_model.dtype}")

In [6]:
def translate(sample):
    """Turn one dataset row into a three-turn chat example.

    Args:
        sample: Mapping that provides 'transcription' and 'hieroglyphs' entries.

    Returns:
        A dict with a single 'messages' key holding the system, user and
        assistant turns, in that order. Both row values are rendered to text
        through f-string formatting.
    """
    instruction = "Translate this transliteration to hieroglyphics: "
    source_text = f"{sample['transcription']}"
    target_text = f"{sample['hieroglyphs']}"

    turns = (
        ("system", instruction),
        ("user", source_text),
        ("assistant", target_text),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Convert every row to chat format and drop the raw columns so that only
# 'messages' remains. `column_names` is the plain list of columns, rather than
# a dict_keys view of the features mapping.
training_dataset = dataset.map(translate, remove_columns=dataset.column_names)

# Hold out 10% of the examples as the test split. The fixed seed makes the
# shuffle deterministic, so the same rows are held out after a kernel restart.
training_dataset_splits = training_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

Map:   0%|          | 0/35252 [00:00<?, ? examples/s]

In [7]:
# Print one converted example (index 20) to inspect the chat-message structure.
print(training_dataset[20])

{'messages': [{'content': 'Translate this transliteration to hieroglyphics: ', 'role': 'system'}, {'content': 'jy.n =j m nʾ,t =j hꜣi̯.n =j m spꜣ,t =j jri̯.n =j ḥzz.t nṯr =j mrr.t nṯr.pl =j nb.w', 'role': 'user'}, {'content': 'M18 N35 A1 G17 O49 A1 G1 O4 D54 N35 A1 G17 N24C X1 Z1 A1 D4 N35 M17 A1 V28 W14 O34 O34 X1 Y2 A40 M17 A1 U6 D21 D21 X1 R8 A40 Z2 M17 A1 N35 V30 G43', 'role': 'assistant'}]}


In [8]:
# Show the train/test DatasetDict as the cell output to check the split sizes.
training_dataset_splits

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 31726
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 3526
    })
})

# Training

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTConfig

In [10]:
# Directory that receives the trained LoRA adapters.
adapter_path = "../adapters/transliteration-to-hero-adapters"
tokenizer = AutoTokenizer.from_pretrained(gemma_model)

# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Attach adapters to every linear layer.
    target_modules="all-linear",
    # Adapter rank and scaling factor.
    r=16,
    lora_alpha=32,
    # Dropout on the adapter path; raising it (e.g. to 0.1) regularizes more
    # strongly, which works against overfitting rather than inducing it.
    lora_dropout=0.05,
    bias="none",
    # Besides the adapters, keep lm_head and embed_tokens trainable and store
    # them with the saved adapter.
    modules_to_save=["lm_head", "embed_tokens"],
)

In [11]:
args = SFTConfig(
    # --- Output and reporting ---
    output_dir=adapter_path,              # checkpoints/adapters are written here
    report_to="tensorboard",              # metrics go to TensorBoard
    # --- Per-epoch cadence for logging, evaluation and checkpoints ---
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- Training length and batch size ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # --- Optimizer and schedule ---
    optim="adamw_torch_fused",            # fused AdamW implementation
    learning_rate=5e-5,
    lr_scheduler_type="constant",         # learning rate stays fixed throughout
    weight_decay=0.01,                    # weight decay for regularization
    # --- Sequence handling and memory ---
    max_length=256,                       # cap on tokenized sequence length
    packing=False,                        # samples are not packed into shared sequences
    gradient_checkpointing=False,         # gradient checkpointing is turned off
)

In [12]:
# Load the checkpoint again, this time with the 4-bit quantization config, and
# rebind `base_model` to the quantized instance used for training.
base_model = AutoModelForCausalLM.from_pretrained(
    gemma_model,
    attn_implementation="eager",
    device_map="auto",
    quantization_config=bnb_config,
)
# Keep the model config's padding id in line with the tokenizer's.
base_model.config.pad_token_id = tokenizer.pad_token_id

Loading weights:   0%|          | 0/236 [00:00<?, ?it/s]

In [13]:
# Pull the two partitions out of the DatasetDict by name.
train_dataset = training_dataset_splits["train"]
test_dataset = training_dataset_splits["test"]

In [14]:
# Carve an evaluation set (10%) out of the training partition.
# The split always starts from training_dataset_splits['train'] rather than
# from `train_dataset` itself, so re-running this cell does not shrink the
# training set a little further each time. The intermediate DatasetDict gets
# its own name instead of temporarily living in `train_dataset`, and the fixed
# seed makes the shuffle deterministic across kernel restarts.
train_eval_splits = training_dataset_splits['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
eval_dataset = train_eval_splits['test']
train_dataset = train_eval_splits['train']

In [15]:
# Assemble the supervised fine-tuning trainer from the pieces prepared above.
trainer = SFTTrainer(
    model=base_model,               # quantized base checkpoint
    peft_config=lora_config,        # LoRA adapter settings
    args=args,                      # SFTConfig hyperparameters
    train_dataset=train_dataset,    # examples used for optimization
    eval_dataset=eval_dataset,      # examples used for per-epoch evaluation
)



Tokenizing train dataset:   0%|          | 0/28553 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/28553 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3173 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3173 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; logging, evaluation and checkpointing follow the per-epoch
# strategies configured in `args`.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.465123,1.308555,1.321567,3031140.0,0.642588
2,1.237214,1.208536,1.160613,6062280.0,0.675316


In [None]:
# Write the trained model artifacts to the adapter directory, then confirm the location.
trainer.save_model(output_dir=adapter_path)
print(f"LoRA adapters saved to {adapter_path}")