# Post-Training of LLMs

Resource: https://www.deeplearning.ai/short-courses/post-training-of-llms/

In [1]:
import warnings
warnings.filterwarnings('ignore')

## Supervised Fine-Tuning

In [2]:
# Import libraries
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.training_args import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig

In [None]:
# Setup helper functions
def generate_responses(
    model, tokenizer,
    user_message, 
    system_message=None, 
    max_new_tokens=100
):
    # Format chat using tokenizer's chat template
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    
    # We assume the data are all single-turn conversation
    messages.append({"role": "user", "content": user_message})
        
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Recommended to use vllm, sglang or TensorRT
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
def test_model_with_questions(
    model,
    tokenizer,
    questions,
    system_message=None,
    title="Model Output"
):
    print(f"\n=== {title} ===")
    for i, question in enumerate(questions, 1):
        response = generate_responses(
            model=model, 
            tokenizer=tokenizer, 
            user_message=question,
            system_message=system_message
        )

        print(f"Model Input {i}:\n{question}\nModel Output {i}:\n{response}")

In [5]:
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # device = "cuda" if torch.cuda.is_available() else (
    #     "mps" if torch.mps.is_available() else "cpu"
    # )
    device = "cpu"
    
    model.to(device)
    
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""
    
    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
        
    return model, tokenizer

In [6]:
def display_dataset(dataset):
    rows = []
    for i in range(3):
        example = dataset[i]
        
        user_msg = next(m['content'] for m in example['messages'] if m['role'] == 'user')
        
        assistant_msg = next(m['content'] for m in example['messages'] if m['role'] == 'assistant')
        
        rows.append({
            "User prompt": user_msg,
            "Assistant Response": assistant_msg,
        })
        
        df = pd.DataFrame(rows)
        pd.set_option("display.max_colwidth", None)
        display(df)

In [None]:
# Load simple questions
questions = [
    "Give me an 1-sentence introduction of LLM.",
    "Calculate 1+1-1",
    "What's the difference between thread and process?"
]

In [None]:
# Check base model w/o SFT
model, tokenizer = load_model_and_tokenizer(
    "../models/Qwen3-0.6B-Base"
)
test_model_with_questions(
    model, tokenizer, questions, title="Base Model (Before SFT) Output"
)

del model, tokenizer


=== Base Model (Before SFT) Output ===
Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ �
Model Input 2:
Calculate 1+1-1
Model Output 2:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �
Model Input 3:
What's the difference between thread and process?
Model Output 3:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �


In [9]:
# Check base model with SFT
model, tokenizer = load_model_and_tokenizer(
    "../models/Qwen3-0.6B-SFT"
)
test_model_with_questions(
    model, tokenizer, questions, title="Base Model (After SFT) Output"
)

del model, tokenizer


=== Base Model (After SFT) Output ===
Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
LLM is a program that provides advanced legal knowledge and skills to professionals and individuals.
Model Input 2:
Calculate 1+1-1
Model Output 2:
1+1-1 = 2-1 = 1

So, the final answer is 1.
Model Input 3:
What's the difference between thread and process?
Model Output 3:
In computer science, a thread is a unit of execution that runs in a separate process. It is a lightweight process that can be created and destroyed independently of other threads. Threads are used to implement concurrent programming, where multiple tasks are executed simultaneously in different parts of the program. Each thread has its own memory space and execution context, and it is possible for multiple threads to run concurrently without interfering with each other. Threads are also known as lightweight processes.


In [11]:
# Load model for SFT process
model_name = "../models/SmolLM2-135M"
model, tokenizer = load_model_and_tokenizer(model_name)

In [12]:
# Load dataset
train_dataset = load_dataset("banghua/DL-SFT-Dataset")["train"]
train_dataset=train_dataset.select(range(100))

display_dataset(train_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,User prompt,Assistant Response
0,"- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.","This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree."


Unnamed: 0,User prompt,Assistant Response
0,"- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.","This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree."
1,"To pass three levels must be the plan.\nThen tackle Two, when that is done.\nOf 100 that start, at the end will be 20.\nFinQuiz is a website that helps you prepare.\nUse it to be stress-free, and not lose your hair.\nThen, take the exam with a smile on your face.\nBe confident that you will gain your place.\nSo make this the goal to which you aspire. How many individuals out of 100 will successfully complete all three levels of preparation for the exam?","Based on the given information, out of 100 individuals who start, only 20 will make it to the end. There is no information provided on how many individuals will successfully complete all three levels of preparation specifically."


Unnamed: 0,User prompt,Assistant Response
0,"- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.","This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree."
1,"To pass three levels must be the plan.\nThen tackle Two, when that is done.\nOf 100 that start, at the end will be 20.\nFinQuiz is a website that helps you prepare.\nUse it to be stress-free, and not lose your hair.\nThen, take the exam with a smile on your face.\nBe confident that you will gain your place.\nSo make this the goal to which you aspire. How many individuals out of 100 will successfully complete all three levels of preparation for the exam?","Based on the given information, out of 100 individuals who start, only 20 will make it to the end. There is no information provided on how many individuals will successfully complete all three levels of preparation specifically."
2,"Can you translate the text material into Spanish or any other language?: He really is, you know.\nThings a hero should show.\nHe loves me more than a zillion things.\nHe loves me when I sing my jolly folktale rhymes.\nHe's good, not just good, in fact he's great!\nBut because he's my best mate!\nWOW !!! I love it!!!!","¿Puede traducir el texto a español o a cualquier otro idioma?: \nRealmente lo es, ya sabes.\nCosas que un héroe debería demostrar.\nMe quiere más que un millón de cosas.\nMe quiere cuando canto mis alegres rimas de cuentos populares.\nEs bueno, no solo bueno, ¡de hecho es genial!\n¡Pero porque es mi mejor amigo!\n¡WOW! ¡Me encanta!"


In [13]:
# SFTTrainer config 
sft_config = SFTConfig(
    learning_rate=8e-5, # Learning rate for training. 
    num_train_epochs=1, #  Set the number of epochs to train the model.
    per_device_train_batch_size=1, # Batch size for each device (e.g., GPU) during training. 
    gradient_accumulation_steps=8, # Number of steps before performing a backward/update pass to accumulate gradients.
    gradient_checkpointing=False, # Enable gradient checkpointing to reduce memory usage during training at the cost of slower training speed.
    logging_steps=2,  # Frequency of logging training progress (log every 2 steps).
)

In [15]:
# Train model
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset, 
    tokenizer=tokenizer,
)
sft_trainer.train()

Map: 100%|██████████| 100/100 [00:00<00:00, 617.75 examples/s]


Step,Training Loss
2,2.6086
4,2.4231


RuntimeError: MPS backend out of memory (MPS allocated: 4.70 GB, other allocations: 4.28 GB, max allowed: 9.07 GB). Tried to allocate 108.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Test model
sft_trainer.model.to("cpu")
test_model_with_questions(
    sft_trainer.model, 
    tokenizer, 
    questions, 
    title="Base Model (After SFT) Output"
)

## Direct Performance Optimization

In [None]:
# Import libraries
import torch 
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.training_args import TrainingArguments
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, Dataset

In [3]:
# Create helper functions
def generate_responses(model, tokenizer, user_message=None, system_message=None, max_new_tokens=300, full_message=None):
    # Format chat using tokenizer's chat template
    if full_message:
        messages = full_message
    else:
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})
        
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
def test_model_with_questions(model, tokenizer, questions, system_message=None, title="Model Output"):
    print(f"\n=== {title} ===")
    for i, question in enumerate(questions, 1):
        response = generate_responses(model, tokenizer, question, system_message)
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

In [5]:
def load_model_and_tokenizer(model_name):
    
    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""
    
    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
        
    return model, tokenizer

In [6]:
# Load instruct model and test on simple question
questions = [
    "What is your name?",
    "Are you ChatGPT?",
    "Tell me about your name and organization",
]

In [7]:
model, tokenizer = load_model_and_tokenizer(
    "Qwen/Qwen2.5-0.5B-Instruct"
)

test_model_with_questions(
    model=model,
    tokenizer=tokenizer,
    questions=questions,
    title="Instruct Model (Before DPO) Output"
)

del model, tokenizer

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Instruct Model (Before DPO) Output ===


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Model Input 1:
What is your name?
Model Output 1:
I am Qwen, a large language model created by Alibaba Cloud. My name is simply "Qwen".



The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I am not ChatGPT. I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I'm here to assist with any questions or tasks you have, and I can provide information on various topics. How may I help you today?


Model Input 3:
Tell me about your name and organization
Model Output 3:
I am Qwen, an artificial intelligence language model created by Alibaba Cloud. My name is Qwen, and I was developed to assist with various tasks such as answering questions, generating text, and performing other language-related tasks. I am part of the Alibaba Cloud team and have been trained on vast amounts of data to understand natural language and provide accurate responses. If you have any specific questions or need assistance with something, feel free to ask!



In [8]:
model, tokenizer = load_model_and_tokenizer(
    "banghua/Qwen2.5-0.5B-DPO"
)

test_model_with_questions(
    model=model,
    tokenizer=tokenizer,
    questions=questions,
    title="Post-trained Model (After DPO) Output"
)

del model, tokenizer

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Post-trained Model (After DPO) Output ===


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Model Input 1:
What is your name?
Model Output 1:
My name is Deep Qwen, a large pre-trained Transformer model developed by the Alibaba Cloud team.



The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I are not ChatGPT. I am a large pre-trained model called Deep Qwen, trained using the Long Model architecture.


Model Input 3:
Tell me about your name and organization
Model Output 3:
My name is Deep Qwen, an AI language model created by Alibaba Cloud. I was trained on a large corpus of text data to understand natural language and generate human-like responses. My organization is Alibaba Cloud, where I am based.



In [9]:
# Load small model for training w/o GPU
model, tokenizer = load_model_and_tokenizer(
    "../models/SmolLM2-135M"
)

In [12]:
# Prepare DPO dataset for changing identity
raw_ds = load_dataset("mrfakename/identity", split="train")

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

sample_df = raw_ds.select(range(5)).to_pandas()
display(sample_df)

Downloading readme: 2.27kB [00:00, 1.71MB/s]
Downloading data: 100%|██████████| 874k/874k [00:00<00:00, 2.50MB/s] 
Generating train split: 1000 examples [00:00, 21180.14 examples/s]


Unnamed: 0,conversations
0,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with various tasks and answer questions. How can I help you today?'}]"
1,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with a variety of tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was developed and trained by Developer. How else can I assist you?'}, {'from': 'human', 'value': 'What is your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I'm here to help you with any questions or tasks you might have. Is there something specific you'd like to know or discuss?'}]"
2,"[{'from': 'human', 'value': 'Hi there! Can you tell me a bit about yourself?'}, {'from': 'assistant', 'value': 'Of course! I'm called Assistant, and I was developed by Developer.'}, {'from': 'human', 'value': 'That's a nice name! So, who's this Developer?'}, {'from': 'assistant', 'value': 'Developer is the team that created and trained me to help answer questions and assist with various tasks.'}, {'from': 'human', 'value': 'Cool! So, you're here to help me, right?'}, {'from': 'assistant', 'value': 'Absolutely! I'm here to assist you with any questions or tasks you have. How can I help you today?'}]"
3,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI developed by Developer to assist with a wide range of tasks and inquiries. How can I help you today?'}, {'from': 'human', 'value': 'And who created you?'}, {'from': 'assistant', 'value': 'I was created and developed by Developer. They designed me to be a friendly and helpful assistant. Is there something specific you'd like to know about my creation?'}, {'from': 'human', 'value': 'Do you have a name?'}, {'from': 'assistant', 'value': 'You can call me Assistant! That's the name given to me by my creators at Developer. How can I assist you further?'}]"
4,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'Hi there! I'm Assistant, a helpful AI developed by Developer to assist with all sorts of questions and tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was created and trained by Developer. They designed me to be a friendly and helpful assistant. Is there anything specific you'd like to know or any help you need?'}, {'from': 'human', 'value': 'What's your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I was created by Developer to assist with information and tasks. How can I assist you today?'}]"


In [17]:
POS_NAME = "Deep Qwen"
ORG_NAME = "Qwen"
SYSTEM_PROMPT = "You're a helpful assistant."

raw_ds = raw_ds.select(range(5))

In [34]:
def build_dpo_chatml(example):
    msgs = example["conversations"]
    prompt = next(m["value"] for m in reversed(msgs) 
                  if m["from"] == "human")
    try:
        rejected_resp = generate_responses(model, tokenizer, prompt)
    except Exception as e:
        rejected_resp = "Error: failed to generate response."
        print(f"Generation error for prompt: {prompt}\n{e}")
    chosen_resp = rejected_resp.replace(ORG_NAME, POS_NAME)
    chosen = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": chosen_resp},
    ]
    rejected = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": rejected_resp},
    ]

    return {"chosen": chosen, "rejected": rejected}

In [35]:
dpo_ds = raw_ds.map(
    build_dpo_chatml, 
    remove_columns=raw_ds.column_names
)

Map: 100%|██████████| 5/5 [00:58<00:00, 11.78s/ examples]


In [36]:
dpo_ds = load_dataset("banghua/DL-DPO-Dataset", split="train")

pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 0)

sample_df = dpo_ds.select(range(5)).to_pandas()
display(sample_df)

Unnamed: 0,chosen,rejected
0,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]"
1,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
2,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]"
3,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
4,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"


In [37]:
dpo_ds = dpo_ds.select(range(100))

In [38]:
# DPO training

config = DPOConfig(
    beta=0.2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=2,
)

In [41]:
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=config,
    tokenizer=tokenizer,
    train_dataset=dpo_ds,
)

dpo_trainer.train()

Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]


KeyError: 'prompt'

In [None]:
fully_trained_qwen = True
if fully_trained_qwen:
    model, qwen_tokenizer = load_model_and_tokenizer("banghua/Qwen2.5-0.5B-DPO")
    test_model_with_questions(model, qwen_tokenizer, questions,
                          title="Post-trained Model (After DPO) Output")
    del model, qwen_tokenizer
else:
    test_model_with_questions(dpo_trainer.model, tokenizer, questions,
                          title="Post-trained Model (After DPO) Output")