# Direct Preferance Optimization Lab

In [1]:
!pip install -U datasets bitsandbytes transformers trl



In [2]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

## Import libraries

In [3]:
import warnings
warnings.filterwarnings('ignore')
import transformers

transformers.logging.set_verbosity_error()


In [4]:
import torch
import pandas as pd
import tqdm
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, Dataset

## Utility Functions

In [5]:
def generate_responses(model, tokenizer, user_message=None, system_message=None, max_new_tokens=300, full_message=None):
    # Format chat using tokenizer's chat template
    if full_message:
        messages = full_message
    else:
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

def test_model_with_questions(model, tokenizer, questions, system_message=None, title="Model Output"):
    print(f"\n=== {title} ===")
    for i, question in enumerate(questions, 1):
        response = generate_responses(model, tokenizer, question, system_message)
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

def load_model_and_tokenizer(model_name, use_gpu = False):

    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
#    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(model_name)


    if use_gpu:
        model.to("cuda")

    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def display_dataset(dataset):
    # Visualize the dataset
    rows = []
    for i in range(3):
        example = dataset[i]
        user_msg = next(m['content'] for m in example['messages'] if m['role'] == 'user')
        assistant_msg = next(m['content'] for m in example['messages'] if m['role'] == 'assistant')
        rows.append({
            'User Prompt': user_msg,
            'Assistant Response': assistant_msg
        })

    # Display as table
    df = pd.DataFrame(rows)
    pd.set_option('display.max_colwidth', None)  # Avoid truncating long strings
    display(df)

## Load Instruct Model & Test on Simple Questions

In [6]:
USE_GPU = True

questions = [
    "What is your name?",
    "Are you ChatGPT?",
    "Tell me about your name and organization."
]

In [7]:
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-0.5B-Instruct",
                                            USE_GPU)

In [8]:
# import torch
# import gc


# del model
# del tokenizer

# gc.collect()
# torch.cuda.empty_cache()

In [9]:
test_model_with_questions(model, tokenizer, questions,
                          title="Instruct Model (Before DPO) Output")


=== Instruct Model (Before DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
I am Qwen, a large language model created by Alibaba Cloud. My name is simply "Qwen".


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I am not ChatGPT. I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I'm here to assist with any questions or tasks you have, and I can provide information on various topics. How may I help you today?


Model Input 3:
Tell me about your name and organization.
Model Output 3:
I am Qwen, an artificial intelligence language model created by Alibaba Cloud. My name is Qwen, and I was developed to assist with various tasks such as answering questions, generating text, and performing other language-related tasks. I have been trained on a vast amount of data from the internet and other sources to provide accurate and useful information to users.



## Load DPO dataset for changing identity

In [10]:
pd.set_option('display.max_colwidth', None)

dpo_df = pd.read_csv("dpo_dataset.csv")
dpo_df.head()

Unnamed: 0,chosen,rejected
0,"[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'Who are you?', 'role': 'user'}\n {'content': 'I am Deep Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]","[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'Who are you?', 'role': 'user'}\n {'content': 'I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]"
1,"[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'What is your name?', 'role': 'user'}\n {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'What is your name?', 'role': 'user'}\n {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
2,"[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': ""Cool! So, you're here to help me, right?"", 'role': 'user'}\n {'content': ""Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?"", 'role': 'assistant'}]","[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': ""Cool! So, you're here to help me, right?"", 'role': 'user'}\n {'content': ""Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?"", 'role': 'assistant'}]"
3,"[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'Do you have a name?', 'role': 'user'}\n {'content': 'I am Deep Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': 'Do you have a name?', 'role': 'user'}\n {'content': 'I am Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
4,"[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': ""What's your name?"", 'role': 'user'}\n {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': ""You're a helpful assistant."", 'role': 'system'}\n {'content': ""What's your name?"", 'role': 'user'}\n {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"


In [11]:
dpo_ds = Dataset.from_pandas(dpo_df)

## DPO Training

In [12]:
if not USE_GPU:
    dpo_ds = dpo_ds.select(range(100))

config = DPOConfig(
    beta=0.2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=1e-5,
    logging_steps=2,
    report_to="none",
    output_dir="dpo_output",
)

In [13]:
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=config,
    processing_class=tokenizer,
    train_dataset=dpo_ds
)

dpo_trainer.train()

Extracting prompt in train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'loss': 0.6124, 'grad_norm': 3.4885950088500977, 'learning_rate': 9.920000000000002e-06, 'rewards/chosen': 0.15321652591228485, 'rewards/rejected': -0.22595657408237457, 'rewards/accuracies': 0.125, 'rewards/margins': 0.3791730999946594, 'logps/chosen': -56.91580581665039, 'logps/rejected': -50.45637512207031, 'logits/chosen': -1.5450434684753418, 'logits/rejected': -1.5337202548980713, 'epoch': 0.016}
{'loss': 0.347, 'grad_norm': 0.24304373562335968, 'learning_rate': 9.760000000000001e-06, 'rewards/chosen': 0.5338065028190613, 'rewards/rejected': -3.4200735092163086, 'rewards/accuracies': 0.5, 'rewards/margins': 3.9538800716400146, 'logps/chosen': -59.58613586425781, 'logps/rejected': -68.6399154663086, 'logits/chosen': -1.7928622961044312, 'logits/rejected': -1.6045069694519043, 'epoch': 0.032}
{'loss': 0.3899, 'grad_norm': 0.0005916896625421941, 'learning_rate': 9.600000000000001e-06, 'rewards/chosen': -5.101700305938721, 'rewards/rejected': -11.199286460876465, 'rewards/accuracies

TrainOutput(global_step=125, training_loss=0.4028198640346527, metrics={'train_runtime': 768.5241, 'train_samples_per_second': 1.301, 'train_steps_per_second': 0.163, 'train_loss': 0.4028198640346527, 'epoch': 1.0})

<hr>

**Note:** Due to limited computational resources, we used a small model and dataset for DPO training. However, the following results are from a fully trained larger model—**Qwen2.5-0.5B**—to demonstrate the complete outcome of the DPO process. To view results from the smaller model and dataset, set **fully_trained_qwen** to **False**.

In [14]:
#del dpo_trainer

In [15]:
test_model_with_questions(dpo_trainer.model, tokenizer, questions,
                          title="Post-trained Model (After DPO) Output")

NameError: name 'dpo_trainer' is not defined