In [None]:
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu128
!pip install unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
import unsloth  # MUST BE FIRST

import time, torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

In [None]:
# The benchmark needs a CUDA device; stop immediately if none is visible.
if not torch.cuda.is_available():
    raise AssertionError

# Drop cached allocator blocks, then zero the peak-memory counters so that
# later peak readings only cover work done after this cell.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

def peak_vram():
    """Return the peak CUDA memory reserved by PyTorch, in GiB, rounded to 3 decimals."""
    bytes_per_gib = 1024**3
    peak_bytes = torch.cuda.max_memory_reserved()
    return round(peak_bytes / bytes_per_gib, 3)

def now():
    """Return a timestamp in seconds for measuring elapsed time.

    Uses the monotonic, high-resolution performance counter rather than the
    wall clock, so intervals are not distorted by system clock adjustments.
    Only the difference between two readings is meaningful; the value is not
    an epoch timestamp.
    """
    return time.perf_counter()

In [None]:
# Prompt template with two str.format placeholders: {instruction} and {input}.
# It ends right after the "### Response:" header, so the expected answer can
# be appended directly to the formatted prompt.
alpaca_prompt = """Below is an instruction.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

def prepare_dataset(n=200):
    """Build the training set from the first ``n`` rows of ``yahma/alpaca-cleaned``.

    Each row's ``instruction`` and ``input`` are substituted into
    ``alpaca_prompt`` and its ``output`` is appended. The returned dataset
    keeps only the resulting ``"text"`` column; the original columns are dropped.
    """
    subset = load_dataset("yahma/alpaca-cleaned", split="train").select(range(n))

    def to_text(row):
        prompt = alpaca_prompt.format(
            instruction=row["instruction"],
            input=row["input"],
        )
        return {"text": prompt + row["output"]}

    return subset.map(to_text, remove_columns=subset.column_names)

dataset = prepare_dataset()

In [None]:
# Timestamp taken before the model is loaded.
start = now()

# Load the 4-bit TinyLlama checkpoint together with its tokenizer.
loader_options = {
    "model_name": "unsloth/tinyllama-bnb-4bit",
    "max_seq_length": 1024,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)

# Attach LoRA adapters (rank 16, alpha 16, no dropout, no bias terms) to the
# four attention projection modules.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=attention_projections,
    lora_dropout=0,
    bias="none",
)


In [None]:
# Build the supervised fine-tuning trainer using the current TRL API:
# the tokenizer is passed as `processing_class`, and the name of the text
# column (`dataset_text_field`) is configured on SFTConfig rather than on
# the trainer itself.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        dataset_text_field="text",
        max_steps=50,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        output_dir="unsloth_out",
        report_to="none",
    ),
)

# Time only the training loop. Starting the clock here (instead of before the
# model was loaded in an earlier cell) keeps model download/load time and any
# pause between cell executions out of the reported "Train time".
# Synchronize so pending GPU work is not attributed to the wrong side of the
# measurement.
torch.cuda.synchronize()
train_start = now()
trainer.train()
torch.cuda.synchronize()
train_time = round(now() - train_start, 2)

# Peak reserved GPU memory observed up to the end of training.
train_vram = peak_vram()


In [None]:
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

inputs = tokenizer("Explain LoRA simply.", return_tensors="pt").to("cuda")
# Remember the prompt length: generate() returns prompt + continuation.
prompt_len = inputs["input_ids"].shape[-1]

# Synchronize around the timed region so the interval covers the whole
# generation rather than just the kernel launches.
torch.cuda.synchronize()
t0 = now()
out = model.generate(**inputs, max_new_tokens=128)
torch.cuda.synchronize()
t1 = now()

# Throughput counts only newly generated tokens; including the prompt tokens
# (as out.shape[-1] alone would) inflates the figure.
new_tokens = out.shape[-1] - prompt_len
tokens_per_sec = round(new_tokens/(t1 - t0), 2)

print("UNSLOTH RESULTS")
print("Train time (sec):", train_time)
print("Peak VRAM (GB):", train_vram)
print("Tokens/sec:", tokens_per_sec)


“Same dataset, same steps, same batch size.
Only the engine changed — Hugging Face vs. Unsloth.”