### Finetuning Code

Packages: 

datasets
transformers
peft
LoRa

# 1. Imports

In [1]:
# JUPYTER NOTEBOOK CELL

##############################################
# 1. Imports
##############################################
import sys
import torch
import sentencepiece
print("Python executable:", sys.executable)
print("Python version:", sys.version)
print("PyTorch version:", torch.__version__)
print("MPS available?", torch.backends.mps.is_available())


Python executable: /Users/maximilianfuest/TUM/WS25/AI_Seminar/balluff-llm/balluff_venv/bin/python
Python version: 3.10.16 (main, Dec  3 2024, 17:27:57) [Clang 16.0.0 (clang-1600.0.26.4)]
PyTorch version: 2.6.0.dev20250103
MPS available? True


In [3]:

from datasets import load_dataset
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    Trainer,
    TrainingArguments,
)


# Import PEFT for LoRA

from peft import LoraConfig, get_peft_model, TaskType


In [4]:

##############################################
# 2. Basic config
##############################################
data_file = "/Users/maximilianfuest/TUM/WS25/AI_Seminar/balluff-llm/4GB_rpi5_LLM/data/validation_checklist_context.json"
# JSON lines with keys: "context", "prompt", "response"
# e.g., [{"context": "...", "prompt": "...", "response": "..."}]

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.6"
output_dir = "./tinyllama-finetuned-mps"

# Choose MPS if available, otherwise CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)


Using device: mps


In [None]:

##############################################
# 3. Load dataset
##############################################
dataset = load_dataset("json", data_files=data_file)
train_dataset = dataset["train"]

In [7]:

##############################################
# 4. Load tokenizer
##############################################
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# If the tokenizer doesn't define a pad token, reuse the EOS token.
# This avoids padding-related errors when training LLaMA-like models.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:

##############################################
# 5. Load base model in bfloat16 & configure memory-saving features
##############################################
model = LlamaForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16  # bfloat16 on MPS if supported
)

# Disable use_cache (reduces memory usage)
model.config.use_cache = False
# Gradient checkpointing for lower memory
model.config.gradient_checkpointing = True
model.enable_input_require_grads()

# Pad token configuration (avoid pad_token_id=None issues)
model.config.pad_token_id = tokenizer.pad_token_id

# Move base model to MPS (or CPU)
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): 

In [9]:

##############################################
# 6. Convert the model to LoRA (PEFT)
##############################################
# Define a LoRA configuration.
# Adjust 'target_modules' depending on model architecture 
# (often "q_proj", "v_proj" for LLaMA; check model code if needed).
lora_config = LoraConfig(
    r=8,                         # Rank of LoRA matrices
    lora_alpha=32,              # Scaling of LoRA
    lora_dropout=0.05,          # Dropout in LoRA layers
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # Common modules in LLaMA
)

# Wrap the base model with LoRA adapters
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [10]:

##############################################
# 7. Preprocessing function
##############################################
def preprocess_function(examples):
    """
    Expects JSON fields: "context", "prompt", "response"
    Merges context + prompt => input; response => labels
    """
    inputs = []
    for c, p in zip(examples["context"], examples["prompt"]):
        inputs.append(f"{c}\nPrompt: {p}\nResponse:")

    # Lower max_length if you still get OOM
    model_inputs = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=256
    )

    labels = tokenizer(
        examples["response"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = train_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 16/16 [00:00<00:00, 217.35 examples/s]


In [11]:

##############################################
# 8. Training Arguments
##############################################
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="no",
    save_strategy="steps",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=1,    # keep small for limited GPU memory
    gradient_accumulation_steps=1,    # accumulate gradients if needed
    learning_rate=2e-5,
    num_train_epochs=1,              # bump to 2-3+ for real training
    save_steps=50,
    weight_decay=0.01,
    fp16=False,                       # turn off regular fp16 in Trainer
    bf16=True,                        # enable bfloat16 if supported
    push_to_hub=False,
)




In [12]:

##############################################
# 9. Initialize Trainer with LoRA model
##############################################
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [13]:

##############################################
# 10. Train
##############################################
print("Starting LoRA training on Apple Silicon GPU (MPS) in bfloat16..." if device.type == "mps" else "Starting training on CPU...")
trainer.train()


Starting LoRA training on Apple Silicon GPU (MPS) in bfloat16...


 62%|██████▎   | 10/16 [00:14<00:06,  1.10s/it]

{'loss': 11.7711, 'grad_norm': 52.87956237792969, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.62}


100%|██████████| 16/16 [00:21<00:00,  1.33s/it]

{'train_runtime': 21.3055, 'train_samples_per_second': 0.751, 'train_steps_per_second': 0.751, 'train_loss': 11.37181568145752, 'epoch': 1.0}





TrainOutput(global_step=16, training_loss=11.37181568145752, metrics={'train_runtime': 21.3055, 'train_samples_per_second': 0.751, 'train_steps_per_second': 0.751, 'total_flos': 25451858755584.0, 'train_loss': 11.37181568145752, 'epoch': 1.0})

In [14]:

##############################################
# 11. Save results
##############################################
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("LoRA-adapted model and tokenizer saved to:", output_dir)

LoRA-adapted model and tokenizer saved to: ./tinyllama-finetuned-mps
