In [None]:
# Load model directly
import os
from transformers import Gemma3ForCausalLM, AutoTokenizer

# Set the MPS high-watermark ratio to "0.0" before any model weights are loaded
# (per the PyTorch MPS environment-variable docs this removes the upper
# allocation limit — confirm for the installed torch version).
os.environ.update({"PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0"})

# Tokenizer and model are both read from the same local checkpoint directory.
checkpoint_dir = "./models/gemma3-1b-it/transformers"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = Gemma3ForCausalLM.from_pretrained(
    checkpoint_dir,
    attn_implementation="eager",
)

In [None]:
from datasets import Dataset, load_dataset

# Load the JSON Lines training file with the generic "json" builder; the
# result is indexed by split name ("train") when the trainer is created.
train_file = "training/train.jsonl"
ds = load_dataset(path="json", data_files=train_file)

# Show the loaded dataset summary as the cell output.
ds

In [None]:
from peft import LoraConfig

# LoRA adapter configuration, later passed to the trainer as `peft_config`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    # Adapter hyperparameters.
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Also save lm_head and embed_tokens, because the special tokens are
    # trained along with the adapter.
    modules_to_save=["lm_head", "embed_tokens"],
)

In [None]:
from trl import SFTConfig

# Training configuration for supervised fine-tuning.
args = SFTConfig(
    output_dir="gemma-text-to-sql",         # directory to save and repository id
    max_seq_length=512,                     # max sequence length for model and packing of the dataset
    packing=False,                          # packing disabled: each sample stays its own sequence
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    # The plain "constant" schedule takes no warmup steps, so warmup_ratio
    # above would be silently ignored; "constant_with_warmup" applies the
    # warmup and then holds the learning rate constant.
    lr_scheduler_type="constant_with_warmup",
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": True,  # Add EOS token as separator token between examples
    },
)

In [None]:
from trl import SFTTrainer

# Assemble the SFT trainer from the pieces built in the earlier cells: the
# base model and its tokenizer, the LoRA config, the "train" split of the
# loaded dataset, and the training arguments.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    peft_config=peft_config,
    train_dataset=ds["train"],
    args=args,
)

In [None]:
# Start training. The config requests a checkpoint every epoch
# (save_strategy="epoch") and push_to_hub=True, so checkpoints are expected to
# land in the output directory and be uploaded to the Hub — the upload
# presumably requires an authenticated Hugging Face session; confirm before running.
trainer.train()

In [None]:
# Persist the final trained model through the trainer. No path is passed, so
# the destination is the trainer's default (presumably the configured
# output_dir, "gemma-text-to-sql" — confirm).
trainer.save_model()

In [None]:
# Free the memory again.
# `torch` is imported here because no earlier cell brings the name into scope
# (using it without an import raises NameError). `gc` is used to collect the
# dropped objects before the allocator caches are flushed.
import gc

import torch

del model
del trainer
gc.collect()

# Release cached allocator blocks on whichever accelerator backend is present.
# The notebook sets an MPS environment variable, so clearing only the CUDA
# cache would not free anything on Apple-silicon runs.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available() and hasattr(torch, "mps"):
    torch.mps.empty_cache()