In [5]:
!pip install -q transformers datasets peft accelerate bitsandbytes evaluate

[0m

In [None]:
import transformers
print(transformers.__version__)

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, PeftModel
import numpy as np

In [None]:
# 1. Load MMLU dataset
# -------------------------
dataset = load_dataset("cais/mmlu", "abstract_algebra")
train_ds = dataset["auxiliary_train"]
val_ds = dataset["validation"]
test_ds = dataset["test"]

In [None]:
# 2. Load tokenizer and model
# -------------------------
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # safety

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,  # saves VRAM
    device_map="auto"
)

In [None]:
# 3. Apply LoRA
# -------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [None]:
# 4. Preprocess dataset
# -------------------------
def preprocess(example):
    question = example["question"]
    choices = example["choices"]
    answer_idx = example["answer"]

    # consistent labels
    label_str = f"option_{answer_idx + 1}"

    prompt = f"Instruction: Select the correct option number only: option_1, option_2, option_3, option_4\n"
    prompt += f"Question: {question}\nChoices: {', '.join(choices)}\nAnswer: {label_str}"

    return {"text": prompt, "labels": label_str}

train_ds = train_ds.map(preprocess, remove_columns=train_ds.column_names)
val_ds = val_ds.map(preprocess, remove_columns=val_ds.column_names)
test_ds = test_ds.map(preprocess, remove_columns=test_ds.column_names)

In [None]:
def tokenize(batch):
    tokenized = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)
    # For labels, convert string to token IDs
    with tokenizer.as_target_tokenizer():
        tokenized["labels"] = tokenizer(batch["labels"], truncation=True, padding="max_length", max_length=5)["input_ids"]
    return tokenized

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

In [None]:
# Format for Trainer
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [None]:
# 5. Training arguments
# -------------------------
out_dir = "tinyllama_mmlu_lora_out"

training_args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    save_total_limit=2,
    report_to="none",
    # load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [44]:
# 6. Train
# -------------------------
trainer.train()

# Save LoRA adapter
model.save_pretrained("./tinyllama_mmlu_lora_adapter")
tokenizer.save_pretrained("./tinyllama_mmlu_lora_adapter")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': None}.


Step,Training Loss
50,1.8762
100,1.8628
150,1.872
200,1.8733


KeyboardInterrupt: 

In [None]:
# 7. Constrained decoding evaluation
# -------------------------
def evaluate_lora(model, tokenizer, dataset, max_samples=None):
    model.eval()
    correct = 0
    total = 0
    allowed_tokens = tokenizer(["option_1","option_2","option_3","option_4"], add_special_tokens=False).input_ids

    for i, example in enumerate(dataset):
        if max_samples and i >= max_samples:
            break
        inputs = tokenizer(example["text"], return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                forced_bos_token_id=None,
                logits_processor=None,
                allowed_tokens_ids=[allowed_tokens]
            )
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        if pred.endswith(example["labels"]):
            correct += 1
        total += 1
    return correct / total if total > 0 else 0.0

In [35]:
# 8. Load final model and evaluate on test
# -------------------------
# base_model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
# ft_model = PeftModel.from_pretrained(base_model, "./tinyllama_mmlu_lora_adapter")

# test_acc = evaluate_lora(ft_model, tokenizer, test_ds, max_samples=1000)  # 1000 samples for quick Colab test
# print(f"Test Accuracy: {test_acc*100:.2f}%")

base_model_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
ckpt_path = "./tinyllama_mmlu_lora/checkpoint-12482"

# Load tokenizer from the original base model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load base model and LoRA adapter from checkpoint
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    load_in_8bit=True,
    device_map="auto"
)
ft_model = PeftModel.from_pretrained(base_model, ckpt_path)

# Evaluate
test_acc = evaluate_lora(ft_model, tokenizer, test_ds, max_samples=1000)
print(f"Test Accuracy: {test_acc*100:.2f}%")


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

ValueError: Can't find 'adapter_config.json' at './tinyllama_mmlu_lora/checkpoint-12482'