In [2]:
!pip -q install trl transformers peft bitsandbytes datasets accelerate peft

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, get_peft_model
import torch
import json
import time
import gc

In [4]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if device.type == "cuda":
    device_index = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(device_index)
    total_mem = torch.cuda.get_device_properties(device_index).total_memory / 1e9  # bytes to GB
    allocated_mem = torch.cuda.memory_allocated(device_index) / 1e9
    reserved_mem = torch.cuda.memory_reserved(device_index) / 1e9

    print(f"CUDA device name: {device_name}")
    print(f"Total memory: {total_mem:.2f} GB")
    print(f"Memory allocated: {allocated_mem:.2f} GB")
    print(f"Memory reserved: {reserved_mem:.2f} GB")
    

Using device: cuda
CUDA device name: NVIDIA GeForce RTX 4090
Total memory: 25.39 GB
Memory allocated: 0.00 GB
Memory reserved: 0.00 GB


In [5]:
dataset = load_dataset("HumanLLMs/Human-Like-DPO-Dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 10884
    })
})

In [13]:
train_test = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
test_val = train_test['test'].train_test_split(test_size=200, shuffle=True, seed=42)

dataset = DatasetDict({
    "train": train_test['train'],
    "test": test_val['train'],
    "validation": test_val['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 8707
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 1977
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 200
    })
})

In [14]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [15]:
model_name='meta-llama/Llama-3.2-1B-Instruct'
device_map = {"": 0}
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_auth_token=True
)
MAX_LENGTH = model.config.max_position_embeddings



In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, padding=True, padding_side="left",
    add_eos_token=False, add_bos_token=False, use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

In [17]:
PROMPT_TEMPLATE = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful, polite, and friendly assistant. Answer questions to the best of your ability.
If you don't know something, be honest and say so. Keep responses clear and concise.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
{0}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{1}
"""


def generate_response(
    model, question, answer='',
    max_length=MAX_LENGTH, prompt_template=PROMPT_TEMPLATE,
    seed=42, tokenizer=tokenizer
):
    set_seed(seed)
    prompt = prompt_template.format(question, answer)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        return_attention_mask=True,
        padding=True
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode full output and prompt
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    prompt_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)

    # Get only the response part
    response_only = full_text[len(prompt_text):].strip()

    return response_only

    

In [18]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [19]:
timestamp_str = str(int(time.time()))
output_dir = f'./small-talk-output-{timestamp_str}'

training_args = DPOConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=5e-5,
    fp16=True,
    report_to="none",
    beta=0.1,
    loss_type='sigmoid'
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    processing_class=tokenizer,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,0.0,0.0,0.449798,-22.643404,1.0,23.093199,-316.012817,-481.983673,0.063797,1.170919
2,0.0,0.0,0.304147,-26.063927,1.0,26.368076,-317.469299,-516.188965,-0.244536,0.552472
3,0.0,0.0,0.023033,-27.589066,1.0,27.612095,-320.280457,-531.440247,-0.340927,0.381285


TrainOutput(global_step=6531, training_loss=0.00225955127823013, metrics={'train_runtime': 2508.5167, 'train_samples_per_second': 10.413, 'train_steps_per_second': 2.604, 'total_flos': 0.0, 'train_loss': 0.00225955127823013, 'epoch': 3.0})

In [21]:
# === SAVE ===
merged_model = trainer.model.merge_and_unload()
merged_model.save_pretrained("./small-talk-1.2")
tokenizer.save_pretrained("./small-talk-1.2")

merged_model.push_to_hub("Luke-griggs/small-talk-1.2")
tokenizer.push_to_hub("Luke-griggs/small-talk-1.2")

('./dpo-explainer-lora-output-1751838730/improved.1.0/tokenizer_config.json',
 './dpo-explainer-lora-output-1751838730/improved.1.0/special_tokens_map.json',
 './dpo-explainer-lora-output-1751838730/improved.1.0/chat_template.jinja',
 './dpo-explainer-lora-output-1751838730/improved.1.0/tokenizer.json')