### Prompt Tuning Example for SmolLM2-135M on everyday-conversations dataset

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import get_peft_model, PromptTuningConfig, PromptTuningInit, TaskType
from transformers import default_data_collator, get_linear_schedule_with_warmup
from trl import SFTConfig, SFTTrainer, setup_chat_format


In [2]:
# Load the dataset
dataset = load_dataset(path="HuggingFaceTB/smoltalk", name="everyday-conversations")
print(f"Dataset loaded with {len(dataset['train'])} training examples")

Dataset loaded with 2260 training examples


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [4]:
# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-FT-Prompt-Tuning"
finetune_tags = ["smol-course", "module_1"]

In [5]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Load base model
model_name = "HuggingFaceTB/SmolLM2-135M"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure prompt tuning
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=8,  # Number of trainable tokens
    prompt_tuning_init="TEXT",  # Initialize from text
    prompt_tuning_init_text="Classify if this text is positive or negative:",
    tokenizer_name_or_path=model_name,
)

# Create prompt-tunable model
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)


trainable params: 4,608 || all params: 134,519,616 || trainable%: 0.0034
None


In [6]:
# Let's test the base model before training
prompt = "Write a haiku about programming"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

# TODO: use the fine-tuned to model generate a response, just like with the base example.
outputs = model.generate(**inputs)

print(outputs)

# Decode and print output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)



tensor([[    1,  4093,   198, 19161,   253,   421, 30614,   563,  6256,     2,
           198,   198, 19161,   253,   421, 30614,   563,  6256,   198,   198,
         19161,   253,   421, 30614,   563,  6256,   198,   198, 19161,   253,
           421]], device='cuda:0')
user
Write a haiku about programming

Write a haiku about programming

Write a haiku about programming

Write a ha


In [7]:
def process_dataset(sample):
    # Convert the sample into a chat format and tokenize it
    formatted_prompts = tokenizer.apply_chat_template(sample['messages'], tokenize=False)
    tokenized_output = tokenizer(formatted_prompts)

    # Remove unused columns (only keeping tokenized fields)
    keys_to_keep = ["input_ids", "attention_mask"]  # Adjust based on tokenizer output
    tokenized_output = {key: tokenized_output[key] for key in keys_to_keep if key in tokenized_output}

    return tokenized_output

processed_dataset = dataset.map(process_dataset, remove_columns=dataset['train'].column_names)

processed_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 119
    })
})

In [8]:
# Training configuration
# Hyperparameters based on QLoRA paper recommendations
args = SFTConfig(
    # Output settings
    output_dir=finetune_name,  # Directory to save model checkpoints
    # Training duration
    num_train_epochs=1,  # Number of training epochs
    # Batch size settings
    per_device_train_batch_size=2,  # Batch size per GPU
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch
    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory savings
    # Optimizer settings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold
    # Learning rate schedule
    warmup_ratio=0.03,  # Portion of steps for warmup
    lr_scheduler_type="constant",  # Keep learning rate constant after warmup
    # Logging and saving
    logging_steps=10,  # Log metrics every N steps
    save_strategy="epoch",  # Save checkpoint every epoch
    # Precision settings
    fp16=True,  # Use bfloat16 precision
    # Integration settings
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging
)

In [9]:

trainer = SFTTrainer(
    model=model,
    args=args,  # Training arguments
    train_dataset=processed_dataset["train"],
    peft_config=peft_config,  # LoRA configuration
    tokenizer=tokenizer
)


  trainer = SFTTrainer(


In [10]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train() # 2GB-6m

# save model
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.4558
20,2.3934
30,2.3805
40,2.3905
50,2.2919
60,2.3286
70,2.3392
80,2.291
90,2.2811
100,2.2102


In [12]:
# Let's test the base model before training
prompt = "Write a haiku about programming"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

# TODO: use the fine-tuned to model generate a response, just like with the base example.
outputs = model.generate(**inputs)

print(outputs)

# Decode and print output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

tensor([[    1,  4093,   198, 19161,   253,   421, 30614,   563,  6256,     2,
           198,   198, 19161,   253,   421, 30614,   563,  6256,   365,  2692,
            25,   198,   198,  2683,   416,   325,  6256,   585,   346,   359,
           253]], device='cuda:0')
user
Write a haiku about programming

Write a haiku about programming (text)

You can be programming if you are a
