In [1]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset(path="trl-lib/ultrafeedback_binarized", split="train")

Generating train split: 100%|██████████████████████████████████████████████████████████████████████████████| 62135/62135 [00:00<00:00, 150870.66 examples/s]
Generating test split: 100%|██████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 89752.29 examples/s]


In [4]:
dataset[0]

{'chosen': [{'content': 'Use the pygame library to write a version of the classic game Snake, with a unique twist',
   'role': 'user'},
  {'content': "Sure, I'd be happy to help you write a version of the classic game Snake using the pygame library! Here's a basic outline of how we can approach this:\n\n1. First, we'll need to set up the game display and create a game object that we can use to handle the game's state.\n2. Next, we'll create the game's grid, which will be used to represent the game board. We'll need to define the size of the grid and the spaces within it.\n3. After that, we'll create the snake object, which will be used to represent the player's movement. We'll need to define the size of the snake and the speed at which it moves.\n4. We'll also need to create a food object, which will be used to represent the food that the player must collect to score points. We'll need to define the location of the food and the speed at which it moves.\n5. Once we have these objects se

In [5]:
model_name = "SmolLM2-FT-MyDataset"

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Model to fine-tune
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    torch_dtype=torch.float32,
).to(device)
model.config.use_cache = False
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-FT-DPO"
finetune_tags = ["smol-course", "module_1"]

In [6]:
# Training arguments
training_args = DPOConfig(
    # Training batch size per GPU
    per_device_train_batch_size=4,
    # Number of updates steps to accumulate before performing a backward/update pass
    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    gradient_accumulation_steps=4,
    # Saves memory by not storing activations during forward pass
    # Instead recomputes them during backward pass
    gradient_checkpointing=True,
    # Base learning rate for training
    learning_rate=5e-5,
    # Learning rate schedule - 'cosine' gradually decreases LR following cosine curve
    lr_scheduler_type="cosine",
    # Total number of training steps
    max_steps=200,
    # Disables model checkpointing during training
    save_strategy="no",
    # How often to log training metrics
    logging_steps=1,
    # Directory to save model outputs
    output_dir="smol_dpo_output",
    # Number of steps for learning rate warmup
    warmup_steps=100,
    # Use bfloat16 precision for faster training
    bf16=True,
    # Disable wandb/tensorboard logging
    report_to="none",
    # Keep all columns in dataset even if not used
    remove_unused_columns=False,
    # Enable MPS (Metal Performance Shaders) for Mac devices
    use_mps_device=device == "mps",
    # Model ID for HuggingFace Hub uploads
    # hub_model_id=finetune_name,
    # DPO-specific temperature parameter that controls the strength of the preference model
    # Lower values (like 0.1) make the model more conservative in following preferences
    beta=0.1,
    # Maximum length of the input prompt in tokens
    max_prompt_length=1024,
    # Maximum combined length of prompt + response in tokens
    max_length=1536,
)

In [7]:
trainer = DPOTrainer(
    # The model to be trained
    model=model,
    # Training configuration from above
    args=training_args,
    # Dataset containing preferred/rejected response pairs
    train_dataset=dataset,
    # Tokenizer for processing inputs
    processing_class=tokenizer,
    # DPO-specific temperature parameter that controls the strength of the preference model
    # Lower values (like 0.1) make the model more conservative in following preferences
    # beta=0.1,
    # Maximum length of the input prompt in tokens
    # max_prompt_length=1024,
    # Maximum combined length of prompt + response in tokens
    # max_length=1536,
)

Extracting prompt in train dataset: 100%|████████████████████████████████████████████████████████████████████| 62135/62135 [00:14<00:00, 4373.34 examples/s]
Applying chat template to train dataset: 100%|███████████████████████████████████████████████████████████████| 62135/62135 [00:18<00:00, 3355.93 examples/s]
Tokenizing train dataset: 100%|███████████████████████████████████████████████████████████████████████████████| 62135/62135 [02:09<00:00, 479.56 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Train the model
trainer.train()

# Save the model
trainer.save_model(f"./{finetune_name}")

Step,Training Loss
1,0.6931
2,0.6931
3,0.661
4,0.6955
5,0.7183
6,0.6796
7,0.683
8,0.6858
9,0.6737
10,0.6866


In [10]:
new_model = AutoModelForCausalLM.from_pretrained('SmolLM2-FT-DPO').to(device)
new_tokenizer = AutoTokenizer.from_pretrained('SmolLM2-FT-DPO')

# Format with template
prompt = "Which is more macho: a pineapple or a knife?"
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = new_tokenizer(formatted_prompt, return_tensors="pt").to(device)
outputs = new_model.generate(**inputs, max_new_tokens=100)
print("After training:")
print(new_tokenizer.decode(outputs[0], skip_special_tokens=True))

After training:
user
Which is more macho: a pineapple or a knife?

A pineapple is a more macho weapon. It's a bit more aggressive, but it's also a bit more versatile. It's also a bit more versatile than a knife.

A knife is a more versatile weapon. It's a bit more versatile, but it's also a bit more aggressive. It's also a bit more versatile than a pineapple.

A pineapple is a more macho weapon. It's a bit more aggressive, but it's also a bit more versatile
