In [5]:
%pip install datasets trl peft bitsandbytes wandb accelerate transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [7]:
import os
import gc
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOConfig, DPOTrainer, setup_chat_format
import bitsandbytes as bnb

In [8]:
base_model = "/ws/model/Meta-Llama-3-8B-Instruct/"
new_model = "./output/llama3-8B-DPO/"

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

# Reference model
ref_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
print(next(model.parameters()).device)  # 输出设备信息

cuda:1


In [12]:
print(next(ref_model.parameters()).device)  # 输出设备信息

cuda:1


In [13]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"],
)

In [14]:
# Load dataset
dataset = load_dataset("Intel/orca_dpo_pairs")['train']

In [15]:
dataset = dataset.shuffle(seed=42).select(range(150))

In [16]:
dataset

Dataset({
    features: ['system', 'question', 'chosen', 'rejected'],
    num_rows: 150
})

In [17]:
dataset[47]['question']

'Article: People like to read stories about dog very much.They thought dogs were very clever. One of my good friend , Bob , had a very big dog .Its name was Bill.Every Sunday afternoon,Bob and Bill had a walk in the park , Bill liked walking with Bob very much. I visited Bob last Sunday.I stayed in his house for a long time.Bob and I talked with each other happily.Soon it was time for them to go for a walk in the park .We forgot that.Bill began to worry about it.He walked around the room and sat down in front of me and looked at me.But I didn\'t know this.I went on talking with my friend.At last, Bill couldn\'t wait.He went out of the room and came back soon.He carried my hat in his mouth.Oh, I knew what Bill meant. Question: How many people were there in this story? Yes or no, is the answer "Two"?\nA:'

In [18]:
dataset[47]['rejected']

' Sure, I\'d be happy to help! To answer your question, the story has three people:\n\n1. Bob (the owner of the dog)\n2. Bill (the dog)\n3. You (the person who visited Bob and is asking the question)\n\nSo, the answer to your question is "No" because there are not two people in the story. There are three people.'

In [19]:
dataset[47]['chosen']

'In this story, there were three people mentioned: the narrator, their good friend Bob, and the person visiting Bob on Sunday. So the answer to the question "How many people were there in this story?" is three. Therefore, "Two" is not the correct answer, so the response for "Yes or no, is the answer \'Two\'?" would be no.'

In [20]:
!wget -O llama-3-instruct.jinja https://raw.githubusercontent.com/chujiezheng/chat_templates/main/chat_templates/llama-3-instruct.jinja

--2024-09-25 14:14:02--  https://raw.githubusercontent.com/chujiezheng/chat_templates/main/chat_templates/llama-3-instruct.jinja
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 598 [text/plain]
Saving to: ‘llama-3-instruct.jinja’


2024-09-25 14:14:02 (16.8 MB/s) - ‘llama-3-instruct.jinja’ saved [598/598]



In [21]:
chat_template = open('llama-3-instruct.jinja').read()
chat_template

"{% if messages[0]['role'] == 'system' %}\n    {% set offset = 1 %}\n{% else %}\n    {% set offset = 0 %}\n{% endif %}\n\n{{ bos_token }}\n{% for message in messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}\n{% endfor %}\n\n{% if add_generation_prompt %}\n    {{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\\n\\n' }}\n{% endif %}"

In [22]:
chat_template = chat_template.replace('    ', '').replace('\n', '')
tokenizer.chat_template = chat_template
chat_template

"{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\\n\\n' }}{% endif %}"

In [23]:
def dataset_format(example):
    # Format system
    if len(example["system"]) > 0:
        message = {"role": "system", "content": example["system"]}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""
    # Format instruction
    message = {"role": "user", "content": example["question"]}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
    # Format chosen answer
    chosen = example["chosen"] + "<|eot_id|>\n"
    # Format rejected answer
    rejected = example["rejected"] + "<|eot_id|>\n"
    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

In [24]:
original_columns = dataset.column_names
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

dataset = dataset.map(
    dataset_format,
    remove_columns=original_columns,
    num_proc=os.cpu_count(),
)

In [25]:
dataset[47]

{'chosen': 'In this story, there were three people mentioned: the narrator, their good friend Bob, and the person visiting Bob on Sunday. So the answer to the question "How many people were there in this story?" is three. Therefore, "Two" is not the correct answer, so the response for "Yes or no, is the answer \'Two\'?" would be no.<|eot_id|>\n',
 'rejected': ' Sure, I\'d be happy to help! To answer your question, the story has three people:\n\n1. Bob (the owner of the dog)\n2. Bill (the dog)\n3. You (the person who visited Bob and is asking the question)\n\nSo, the answer to your question is "No" because there are not two people in the story. There are three people.<|eot_id|>\n',
 'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.<|eot_id|><|begin_of_text|><|start_header_id|>user<

In [26]:
import wandb

wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmurphypei[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [27]:
dpo_config = DPOConfig(
    output_dir=new_model,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50,  # tweak this to change # of steps in the training run
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    report_to="wandb",
    beta=0.1,
    max_prompt_length=512,
    max_length=1024,
    force_use_ref_model=True,
    remove_unused_columns=False,
)

In [28]:
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=dpo_config,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

Tokenizing train dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Detected kernel version 4.9.70, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [29]:
dpo_trainer.train()



  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6894
2,0.7108
3,0.6961
4,0.6591
5,0.6469
6,0.6167
7,0.5566
8,0.5474
9,0.446
10,0.4323


TrainOutput(global_step=50, training_loss=0.1604558634768182, metrics={'train_runtime': 3176.5169, 'train_samples_per_second': 0.126, 'train_steps_per_second': 0.016, 'total_flos': 0.0, 'train_loss': 0.1604558634768182, 'epoch': 2.6666666666666665})

In [30]:
dpo_trainer.model.save_pretrained("output/final_ckpt")
tokenizer.save_pretrained("output/final_ckpt")

('final_ckpt/tokenizer_config.json',
 'final_ckpt/special_tokens_map.json',
 'final_ckpt/tokenizer.json')

In [31]:
# Flush memory
del dpo_trainer, model, ref_model
gc.collect()
torch.cuda.empty_cache()

In [32]:
# Reload model in FP16 (instead of NF4)
origin_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [33]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = chat_template

In [34]:
# Merge base model with the adapter
model = PeftModel.from_pretrained(origin_model, "output/final_ckpt")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('/ws/model/llama3-8B-DPO/tokenizer_config.json',
 '/ws/model/llama3-8B-DPO/special_tokens_map.json',
 '/ws/model/llama3-8B-DPO/tokenizer.json')

In [37]:
# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=new_model,
    tokenizer=tokenizer,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [38]:
# Format prompt
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot that provides concise answers."},
    {"role": "user", "content": "What are GPUs and why would I use them for machine learning tasks?"},
]
tokenizer = AutoTokenizer.from_pretrained(new_model)
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    max_length=200,
)
print(sequences[0]["generated_text"])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant chatbot that provides concise answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

What are GPUs and why would I use them for machine learning tasks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

GPUs (Graphics Processing Units) are specialized computer chips designed primarily for graphics rendering and computations. In recent years, they have become increasingly popular for machine learning (ML) and deep learning (DL) tasks due to their unique characteristics:

1. **Massive parallel processing**: GPUs have thousands of cores, allowing them to perform many calculations simultaneously, making them much faster than CPUs for tasks that require parallel processing.
2. **High-bandwidth memory**: GPUs have large, high-bandwidth memory, enabling efficient data transfer between the GPU and system memory.
3. **Low power consumption**: Compared to CPUs, GPUs consume less power while perform