In [None]:
!pip -q install datasets accelerate transformers==4.56.2 trl==0.22.2 peft bitsandbytes

In [None]:
import time, torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

In [None]:
# Alpaca-style prompt template. The `{instruction}` and `{input}` placeholders
# are filled with str.format in prepare_dataset, and each example's output is
# appended directly after the trailing "### Response:" line.
alpaca_prompt = """Below is an instruction.

### Instruction:
{instruction}

### Input:
{input}

### Response:
"""

def prepare_dataset(n=200):
    """Return the first `n` Alpaca-cleaned training examples as prompt+answer text.

    Args:
        n: Number of leading rows to keep from the "train" split.

    Returns:
        A dataset with a single "text" column holding the filled-in
        `alpaca_prompt` followed by the example's output; all original
        columns are removed.
    """
    subset = load_dataset("yahma/alpaca-cleaned", split="train").select(range(n))

    def to_text(example):
        # Fill the template first, then append the reference answer.
        prompt = alpaca_prompt.format(
            instruction=example["instruction"],
            input=example["input"],
        )
        return {"text": prompt + example["output"]}

    return subset.map(to_text, remove_columns=subset.column_names)

# Build the training set with the default size (n=200); it is passed to the
# SFTTrainer as `train_dataset` later in the notebook.
dataset = prepare_dataset()


In [None]:
# 4-bit quantization settings for QLoRA-style fine-tuning.
# With only load_in_4bit=True, BitsAndBytesConfig falls back to the FP4 data
# type and float32 compute. NF4 is the type recommended for training 4-bit
# base models, and a half-precision compute dtype avoids float32 matmuls.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb,
    device_map="auto",
)

# PEFT's preparation step for training on top of a k-bit quantized model.
model = prepare_model_for_kbit_training(model)


In [None]:
# LoRA adapter settings: rank 16 with alpha 16, attached to the "q_proj" and
# "v_proj" modules, no bias parameters trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    bias="none",
)

# Wrap the prepared base model with the adapters.
model = get_peft_model(model, lora_config)


In [None]:
# Fine-tune with TRL's SFTTrainer while recording wall-clock time and peak VRAM.
# The timed span starts before the trainer is constructed, so it covers trainer
# setup as well as the optimization steps.
torch.cuda.reset_peak_memory_stats()
start = time.perf_counter()  # monotonic clock, suited to measuring intervals

trainer = SFTTrainer(
    model=model,
    # trl==0.22.2 no longer accepts `tokenizer=`; the tokenizer is passed as
    # `processing_class`.
    processing_class=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        # `dataset_text_field` is an SFTConfig option in this TRL version,
        # not an SFTTrainer keyword argument.
        dataset_text_field="text",
        max_steps=50,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        output_dir="hf_out",
        report_to="none",
    ),
)

trainer.train()

# Wait for any queued GPU work before stopping the clock.
torch.cuda.synchronize()
train_time = round(time.perf_counter() - start, 2)
# Peak memory reserved by the CUDA caching allocator since the reset, in GiB.
train_vram = round(torch.cuda.max_memory_reserved()/1024**3, 3)


In [None]:
# Measure generation throughput of the fine-tuned model.
# trainer.train() leaves the model in training mode; switch to eval mode so
# dropout is off and generation is not run through the training-only code
# paths (e.g. gradient checkpointing disabling the KV cache).
model.eval()

inputs = tokenizer("Explain LoRA simply.", return_tensors="pt").to("cuda")
prompt_len = inputs["input_ids"].shape[-1]

# Synchronize on both sides so the interval covers exactly the generate() call.
torch.cuda.synchronize()
t0 = time.perf_counter()
out = model.generate(**inputs, max_new_tokens=128)
torch.cuda.synchronize()
t1 = time.perf_counter()

# generate() returns prompt + continuation; only the newly generated tokens
# count toward throughput.
new_tokens = out.shape[-1] - prompt_len
tokens_per_sec = round(new_tokens/(t1 - t0), 2)

print("HF RESULTS")
print("Train time (sec):", train_time)
print("Peak VRAM (GB):", train_vram)
print("Tokens/sec:", tokens_per_sec)
