In [2]:
!pip -q install trl transformers peft bitsandbytes datasets accelerate



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [3]:
import copy
import gc
import json
import time

import torch
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, BitsAndBytesConfig
from transformers import TextStreamer
from trl import DPOTrainer, DPOConfig


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# On CUDA, additionally report the device name and its memory usage in GB (1e9 bytes).
if device.type == "cuda":
    bytes_per_gb = 1e9
    device_index = torch.cuda.current_device()
    print(f"CUDA device name: {torch.cuda.get_device_name(device_index)}")

    memory_report = (
        ("Total memory", torch.cuda.get_device_properties(device_index).total_memory),
        ("Memory allocated", torch.cuda.memory_allocated(device_index)),
        ("Memory reserved", torch.cuda.memory_reserved(device_index)),
    )
    for label, num_bytes in memory_report:
        print(f"{label}: {num_bytes / bytes_per_gb:.2f} GB")


Using device: mps


In [None]:
# Load the HumanLLMs/Human-Like-DPO-Dataset and show its splits and columns.
dataset = load_dataset(path="HumanLLMs/Human-Like-DPO-Dataset")
print("Original dataset:", dataset)


Original dataset: DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 10884
    })
})


In [6]:
# Two-stage split with a fixed seed:
#   1. hold out 20% of the original train split;
#   2. take 200 rows of that hold-out as "validation", the remainder becomes "test".
# Note: `dataset` is rebound below, so re-running this cell splits the already
# reduced train split again; reload the dataset first if you need to re-run.
holdout_split = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
eval_split = holdout_split['test'].train_test_split(test_size=200, shuffle=True, seed=42)

split_parts = {
    "train": holdout_split['train'],
    "test": eval_split['train'],
    "validation": eval_split['test'],
}
dataset = DatasetDict(split_parts)

print("Split dataset:", dataset)


Split dataset: DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 8707
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 1977
    })
    validation: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 200
    })
})


In [None]:
# Quantization settings for loading the models: 4-bit NF4 weights, float16 compute
# dtype, and double quantization turned off.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)


In [None]:
model_name = 'meta-llama/Llama-3.2-1B-Instruct'
device_map = {"": 0}

# The policy model and the DPO reference model come from the same checkpoint and
# are loaded with identical settings, so the keyword arguments are shared.
# NOTE(review): device_map is a fixed {"": 0} and does not use the `device`
# detected earlier — confirm this is intended on machines without CUDA.
shared_load_kwargs = dict(
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=True,
)

print("Loading main model for training...")
model = AutoModelForCausalLM.from_pretrained(model_name, **shared_load_kwargs)

print("Loading reference model for DPO...")
ref_model = AutoModelForCausalLM.from_pretrained(model_name, **shared_load_kwargs)

# Context window advertised by the model config.
MAX_LENGTH = model.config.max_position_embeddings
print(f"Max length: {MAX_LENGTH}")


In [None]:
# Tokenizer for the same checkpoint as the models; options are collected in one
# mapping so they are easy to scan and tweak.
tokenizer_options = {
    "trust_remote_code": True,
    "padding_side": "left",
    "add_eos_token": False,
    "add_bos_token": False,
    "use_fast": False,
}
tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer pad token: {tokenizer.pad_token}")


In [None]:
# The base model was loaded with a 4-bit quantization config, so run PEFT's
# documented preparation step for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 16, alpha 32, dropout 0.05) on every attention and MLP
# projection; bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
        'gate_proj',
        'up_proj',
        'down_proj'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

# NOTE: this rebinds `model` to the adapter-wrapped model; re-running the cell
# would wrap it again, so reload the base model before re-running.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [None]:
# Each run writes to its own directory, suffixed with the current Unix timestamp.
timestamp_str = f"{int(time.time())}"
output_dir = './small-talk-fixed-' + timestamp_str

# DPO hyperparameters. Effective train batch per device is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 2 = 4.
# NOTE(review): eval_steps / save_steps are set although both strategies are
# "epoch" — presumably they have no effect; confirm against TrainingArguments.
dpo_settings = dict(
    output_dir=output_dir,
    # batching / schedule
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    gradient_accumulation_steps=2,
    # evaluation / checkpointing / logging
    eval_strategy="epoch",
    eval_steps=500,
    save_strategy="epoch",
    save_steps=500,
    logging_steps=10,
    report_to="none",
    # optimisation
    learning_rate=5e-5,
    fp16=True,
    # DPO objective
    beta=0.1,
    loss_type='sigmoid',
)
training_args = DPOConfig(**dpo_settings)

# Train on the "train" split and evaluate on the 200-row "validation" split,
# with the separately loaded reference model.
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    processing_class=tokenizer,
)

print("Trainer initialized successfully!")


In [None]:
# Run DPO training with the trainer built in the previous cell. The call is left
# as the cell's final expression so the notebook displays its return value.
print("Starting DPO training...")
trainer.train()


In [None]:
# Smoke-test the trained model on a single prompt, streaming tokens as they are sampled.
test_prompt = "What's the most important lesson you've learned in life?"
print(f"Test prompt: {test_prompt}")

chat = [
    {"role": "system", "content": "You are a helpful, polite, and friendly assistant."},
    {"role": "user", "content": test_prompt}
]

# return_dict=True yields both input_ids and attention_mask. The mask is passed
# explicitly because the pad token was set to the EOS token, in which case
# transformers warns that it cannot reliably infer the mask on its own.
# The tensors are moved to the model's own device (it was placed via device_map)
# rather than the globally detected `device`, which may differ.
inputs = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Print only the newly generated text, without the prompt or special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Make sure dropout (including the LoRA dropout of 0.05) is disabled while sampling.
model.eval()

print("Model response (streaming):")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )


In [None]:
# Merge the LoRA adapters into the base model, write model + tokenizer to a local
# directory, then upload both to the Hub repository.
print("Saving model...")
local_dir = "./small-talk-1.3"
hub_repo = 'Luke-griggs/small-talk'

merged_model = trainer.model.merge_and_unload()
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(local_dir)

print("Model saved successfully!")
merged_model.push_to_hub(hub_repo)
# Kept as the final expression so the cell still displays the push result.
tokenizer.push_to_hub(hub_repo)
