# QLoRA Fine-tuning GPT-Neo 125M on Tiny Shakespeare (Complete Notebook)

This notebook:

- Loads **EleutherAI/gpt-neo-125M**
- Downloads **Tiny Shakespeare**
- Trains with **QLoRA**: 4-bit quantized base model + LoRA adapters (PEFT)
- Saves adapters
- Shows inference after training

Optional section at the end: **LoRA without quantization**.

## Requirements

- A CUDA GPU is strongly recommended for 4-bit training.
- On a CPU-only machine QLoRA will likely not work: bitsandbytes 4-bit quantization generally expects a GPU.


In [None]:
# import os
# import math
# import requests
# import torch

# from datasets import Dataset
# from torch.utils.data import DataLoader

# from transformers import (
#     AutoTokenizer,
#     AutoModelForCausalLM,
#     BitsAndBytesConfig,
#     DataCollatorForLanguageModeling,
#     get_linear_schedule_with_warmup,
# )

# # from huggingface_hub import login
# # from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# print("Torch:", torch.__version__)
# print("CUDA available:", torch.cuda.is_available())
# if torch.cuda.is_available():
#     print("GPU:", torch.cuda.get_device_name(0))
#     print("Compute capability:", torch.cuda.get_device_capability(0))

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Device:", device)

In [None]:
# hf_token = os.environ.get('HF_TOKEN', '').strip()
# if hf_token:
#     login(token=hf_token)
#     print('Logged in using HF_TOKEN env var')
# else:
#     print('No HF_TOKEN env var found. If model download requires auth, login interactively:')
#     # Uncomment to login interactively:
#     # login()

In [None]:
# # Config

# MODEL_NAME = "EleutherAI/gpt-neo-125M"

# # Data
# MAX_LENGTH = 512
# CHUNK_SIZE_CHARS = 2000  # chunk raw text into segments

# # Training
# BATCH_SIZE = 2
# GRAD_ACCUM_STEPS = 8
# EPOCHS = 1
# LR = 2e-4
# WARMUP_RATIO = 0.03
# WEIGHT_DECAY = 0.0
# MAX_STEPS_LIMIT = None  # set to an int to cap steps for quick tests

# # LoRA
# LORA_R = 16
# LORA_ALPHA = 32
# LORA_DROPOUT = 0.1

# OUTPUT_DIR = "qlora_gpt_neo_adapters"

# print("MODEL_NAME:", MODEL_NAME)
# print("MAX_LENGTH:", MAX_LENGTH)
# print("BATCH_SIZE:", BATCH_SIZE)
# print("GRAD_ACCUM_STEPS:", GRAD_ACCUM_STEPS)
# print("LR:", LR)

## 4) Load tokenizer


In [None]:
# Fetch the fast tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)

# Reuse the end-of-sequence token as the padding token so that batches of
# unequal length can be padded.
tokenizer.pad_token = tokenizer.eos_token

print(
    "pad_token:",
    tokenizer.pad_token,
    " | eos_token:",
    tokenizer.eos_token,
)

## 5) Load model in 4-bit (QLoRA base)

This loads the base model quantized to 4-bit NF4 using bitsandbytes.

> If you're on CPU-only, this may fail. QLoRA is intended for GPU.


In [None]:
# compute_dtype = (
#     torch.bfloat16
#     if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
#     else torch.float16
# )
# print("compute_dtype:", compute_dtype)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=compute_dtype,
# )

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     quantization_config=bnb_config,
#     device_map="auto",
# )

# # Prepare for k-bit training
# model = prepare_model_for_kbit_training(model)

# print("Loaded quantized model.")

## 6) Choose LoRA target modules

GPT-Neo module names can vary across implementations. We'll:

1. Print some likely linear module names
2. Try a robust target list that names the linear layers themselves
3. If PEFT reports that a target module is missing or unsupported, adjust `target_modules` accordingly.

If you want, you can run the listing cell and then set `TARGET_MODULES` manually.


In [None]:
# Inspect a handful of module names to help target LoRA correctly.
# Each entry is (qualified module name, module class name) for modules that
# carry a `weight` attribute and whose name looks like an attention/MLP
# projection.
interesting = [
    (name, type(module).__name__)
    for name, module in model.named_modules()
    if any(k in name for k in ["attn", "mlp", "proj", "c_fc", "c_proj"])
    and hasattr(module, "weight")
]

print("Sample candidate modules (first 60):")
for n, t in interesting[:60]:
    print(f"  {n}  | {t}")

# LoRA has to be attached to the linear layers themselves, so the list names
# leaf modules rather than their containers. In the Hugging Face GPT-Neo
# implementation `attn.attention` is the self-attention container (not a
# linear layer) and there is no `attn.proj`; the projections live one level
# deeper as q_proj / k_proj / v_proj / out_proj, and the MLP uses c_fc / c_proj.
# NOTE(review): confirm these names against the listing printed above for the
# installed transformers version.
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "out_proj",
    "c_fc",
    "c_proj",
]
print("TARGET_MODULES:", TARGET_MODULES)

## 7) Attach LoRA adapters (QLoRA)

Only LoRA weights will be trainable; the quantized base stays frozen.


In [None]:
# Imported in this cell because no live cell in the notebook imports these
# names (the setup-cell import of them is commented out).
from peft import LoraConfig, get_peft_model

# Adapter hyper-parameters come from the config cell; `bias="none"` leaves the
# bias terms untouched and `task_type` marks this as causal language modelling.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 8) Download Tiny Shakespeare + build dataset


In [None]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently training on an error page.
response.raise_for_status()
raw_text = response.text
print("Chars:", len(raw_text))

# Chunking raw text into bigger segments helps the model learn context.
# Each chunk is a fixed-size character window; the final chunk may be shorter.
chunks = [
    raw_text[i : i + CHUNK_SIZE_CHARS]
    for i in range(0, len(raw_text), CHUNK_SIZE_CHARS)
]
dataset = Dataset.from_dict({"text": chunks})
print("Dataset chunks:", len(dataset))


def tokenize_fn(examples):
    """Tokenize a batch of text chunks for causal language modelling.

    Args:
        examples: A batch from the dataset; its "text" entry holds the raw
            text chunks to encode.

    Returns:
        The tokenizer output for the batch. Sequences longer than MAX_LENGTH
        tokens are truncated, so the tail of an over-long chunk is dropped.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Replace the raw "text" column with the tokenizer's output columns.
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# mlm=False selects causal-LM collation rather than masked-LM collation.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
train_loader = DataLoader(
    tokenized, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator
)

# Sanity check: pull one batch and show the tensor shapes.
batch0 = next(iter(train_loader))
print({k: v.shape for k, v in batch0.items()})

## 9) Training loop (gradient accumulation + scheduler)

- Uses AdamW
- Gradient accumulation
- Linear warmup schedule

Note: With PEFT + QLoRA, only adapter params update.


In [None]:
from torch.optim import AdamW

model.train()

# Optimizer over trainable (LoRA) params only: frozen base weights are
# filtered out so the optimizer never tracks them.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=LR, weight_decay=WEIGHT_DECAY)

# One optimizer step is taken per GRAD_ACCUM_STEPS micro-batches, plus one for
# a trailing partial group, which is exactly what ceil() counts.
num_batches = len(train_loader)
steps_per_epoch = math.ceil(num_batches / GRAD_ACCUM_STEPS)
total_steps = steps_per_epoch * EPOCHS
if MAX_STEPS_LIMIT is not None:
    total_steps = min(total_steps, MAX_STEPS_LIMIT)

warmup_steps = int(total_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print("steps_per_epoch:", steps_per_epoch)
print("total_steps:", total_steps)
print("warmup_steps:", warmup_steps)

LOG_EVERY = 10  # optimizer steps between loss reports

global_step = 0
running_loss = 0.0

for epoch in range(EPOCHS):
    for step, batch in enumerate(train_loader):
        # respect step limit (total_steps already folds in MAX_STEPS_LIMIT)
        if global_step >= total_steps:
            break

        # device_map='auto' -> model already on correct device(s)
        batch = {k: v.to(model.device) for k, v in batch.items()}

        outputs = model(**batch)
        # Scale so the accumulated gradient is the mean over the group.
        loss = outputs.loss / GRAD_ACCUM_STEPS
        loss.backward()

        running_loss += loss.item()

        # Step at every accumulation boundary AND on the last batch of the
        # epoch. Without the second condition the gradients of a trailing
        # partial group would never be applied, would leak into the next
        # epoch, and the scheduler would stop short of total_steps.
        # A partial group is still divided by GRAD_ACCUM_STEPS, so its update
        # (and its logged loss) is proportionally smaller.
        at_boundary = (step + 1) % GRAD_ACCUM_STEPS == 0
        at_epoch_end = (step + 1) == num_batches
        if at_boundary or at_epoch_end:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1
            if global_step % LOG_EVERY == 0:
                avg_loss = running_loss / LOG_EVERY
                running_loss = 0.0
                print(
                    f"Epoch {epoch+1} | Step {global_step}/{total_steps} | loss {avg_loss:.4f}"
                )

            if global_step >= total_steps:
                break

    if global_step >= total_steps:
        break

print("Training finished.")

## 10) Save adapters


In [None]:
import os

# Make sure the target directory exists, then write the adapter weights
# followed by the tokenizer files into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)

# List what was written as a quick sanity check.
saved_files = os.listdir(OUTPUT_DIR)
print("Files:", saved_files)

## 11) Inference test (using current in-memory model)

This uses the trained adapters currently attached to the quantized base.


In [None]:
# Switch to evaluation mode before sampling.
model.eval()

prompt = "ROMEO:"
# Encode the prompt and move the tensors to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampling settings for the demo generation.
sampling_kwargs = {
    "max_new_tokens": 120,
    "do_sample": True,
    "temperature": 0.9,
    "top_p": 0.95,
}

# No gradients are needed for generation.
with torch.no_grad():
    out = model.generate(**inputs, **sampling_kwargs)

generated_text = tokenizer.decode(out[0], skip_special_tokens=True)
print(generated_text)

## 12) Reload from disk (fresh session test)

This cell shows how to reload the base quantized model and the saved adapters.


In [None]:
from peft import PeftModel

# Same 4-bit NF4 quantization settings as used for training.
bnb_config_reload = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base checkpoint.
base_reload = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config_reload,
    device_map="auto",
)

# The tokenizer was saved next to the adapters, so it is loaded from there.
tok_reload = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
tok_reload.pad_token = tok_reload.eos_token

# Attach the saved adapters to the freshly loaded base model.
model_reload = PeftModel.from_pretrained(base_reload, OUTPUT_DIR)
model_reload.eval()

prompt = "JULIET:"
inputs = tok_reload(prompt, return_tensors="pt").to(model_reload.device)

reload_sampling_kwargs = {
    "max_new_tokens": 120,
    "do_sample": True,
    "temperature": 0.9,
    "top_p": 0.95,
}
with torch.no_grad():
    out = model_reload.generate(**inputs, **reload_sampling_kwargs)

reloaded_text = tok_reload.decode(out[0], skip_special_tokens=True)
print(reloaded_text)

# Optional: LoRA without quantization (fp16/bf16)

If you also want a “classic LoRA” run (not QLoRA), you can use the following section.

This loads the base model in fp16/bf16 and attaches LoRA adapters similarly.


In [None]:
# LoRA (no 4-bit quantization)
#
# Imports are local to this cell so that it does not depend on other cells
# having imported AdamW / LoraConfig / get_peft_model first.

from peft import LoraConfig, PeftModel, get_peft_model
from torch.optim import AdamW

USE_CLASSIC_LORA = False  # set True to run

if USE_CLASSIC_LORA:
    # Load the base model un-quantized in the chosen half-precision dtype and
    # place it on the selected device.
    base_fp = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=compute_dtype,
    ).to(device)

    # Attach LoRA
    lora_config_fp = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=TARGET_MODULES,
    )
    lora_fp_model = get_peft_model(base_fp, lora_config_fp)
    lora_fp_model.print_trainable_parameters()

    # quick train loop: one optimizer step per batch, no accumulation or
    # scheduler; only parameters that require gradients are optimized.
    lora_fp_model.train()
    opt = AdamW(
        [p for p in lora_fp_model.parameters() if p.requires_grad],
        lr=LR,
    )
    for epoch in range(1):
        for i, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            out = lora_fp_model(**batch)
            loss = out.loss
            opt.zero_grad()
            loss.backward()
            opt.step()
            if i % 50 == 0:
                print("step", i, "loss", float(loss))

    # Save the adapters and the tokenizer side by side.
    lora_fp_model.save_pretrained("lora_fp_gpt_neo_adapters")
    tokenizer.save_pretrained("lora_fp_gpt_neo_adapters")
    print("Saved classic LoRA adapters to lora_fp_gpt_neo_adapters")
else:
    print("Classic LoRA section skipped (set USE_CLASSIC_LORA=True to run).")