In [1]:
%pip install datasets trl peft bitsandbytes wandb accelerate transformers ipywidgets

[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
from operator import is_
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

for k, v in os.environ.items():
    if "cuda" in k.lower():
        print(k, v)

print()
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.__version__)
print(torch.cuda.is_bf16_supported())
is_bf16_supported = torch.cuda.is_bf16_supported()

NV_CUDA_COMPAT_PACKAGE cuda-compat-11-8
NV_CUDA_NSIGHT_COMPUTE_VERSION 11.8.0-1
CUDA_VERSION 11.8.0
NVIDIA_REQUIRE_CUDA cuda>=11.8 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511 brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511 brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511 brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516 brand=nvidia,drive

In [2]:
import os
import gc
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOConfig, DPOTrainer, setup_chat_format
import bitsandbytes as bnb

In [3]:
base_model = "/ws/model/Meta-Llama-3-8B-Instruct/"
new_model = "./output/llama-3-8b-dpo"
train_dataset = "argilla/ultrafeedback-binarized-preferences-cleaned"
wandb_name = "llama-3-8b-dpo"
max_prompt_length = 1024
max_seq_length = 8192

In [4]:
# transformers 库原生支持 4bit 浮点数量化，但是 bnb 可以使用多种不同形式的 4bit 量化，并且支持 double quant。
# 计算不是以 4bit 完成的，仅仅是权重和激活被压缩为该格式，而计算仍在指定的或者原始数据类型上进行。

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

print(next(model.parameters()).device)  # 输出设备信息

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

cuda:1


In [8]:
# Reference model
ref_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

print(next(ref_model.parameters()).device)  # 输出设备信息

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

cuda:1


In [9]:
# LoRA configuration
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"],
)

In [10]:
# Load dataset
dataset = load_dataset(train_dataset)['train']

In [11]:
dataset = dataset.shuffle(seed=42)  # .select(range(100))

In [12]:
dataset

Dataset({
    features: ['source', 'prompt', 'chosen', 'chosen-rating', 'chosen-model', 'rejected', 'rejected-rating', 'rejected-model'],
    num_rows: 60917
})

In [13]:
dataset[47]

{'source': 'evol_instruct',
 'prompt': 'Create a comprehensive plan for introducing a new technology that incorporates a robotic assistant designed specifically to enhance customer service and retail operations. The plan should include a detailed analysis of the current market landscape, competitor analysis, and a cost-benefit analysis of implementing the new technology. In addition, the plan should incorporate a detailed training program for employees to ensure seamless integration of the technology into existing operations. The plan should also outline a strategic marketing and advertising campaign to generate awareness and interest in the new technology among potential customers. Finally, the plan should include a contingency plan for potential issues or challenges that may arise during implementation or operation of the new technology. The plan should be presented in a professional report format, complete with charts, tables, and any necessary technical information, such as coding 

In [14]:
!wget -O llama-3-instruct.jinja https://raw.githubusercontent.com/chujiezheng/chat_templates/main/chat_templates/llama-3-instruct.jinja

--2024-09-26 09:49:38--  https://raw.githubusercontent.com/chujiezheng/chat_templates/main/chat_templates/llama-3-instruct.jinja
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 598 [text/plain]
Saving to: ‘llama-3-instruct.jinja’


2024-09-26 09:49:39 (67.1 MB/s) - ‘llama-3-instruct.jinja’ saved [598/598]



In [15]:
chat_template = open('llama-3-instruct.jinja').read()
chat_template

"{% if messages[0]['role'] == 'system' %}\n    {% set offset = 1 %}\n{% else %}\n    {% set offset = 0 %}\n{% endif %}\n\n{{ bos_token }}\n{% for message in messages %}\n    {% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}\n        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n    {% endif %}\n\n    {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}\n{% endfor %}\n\n{% if add_generation_prompt %}\n    {{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\\n\\n' }}\n{% endif %}"

In [16]:
chat_template = chat_template.replace('    ', '').replace('\n', '')
tokenizer.chat_template = chat_template
chat_template

"{% if messages[0]['role'] == 'system' %}{% set offset = 1 %}{% else %}{% set offset = 0 %}{% endif %}{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == offset) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\\n\\n' }}{% endif %}"

In [17]:
def get_assistant_content(data):
    for item in data:
        if item["role"] == "assistant":
            return item["content"]
    return ""


def get_question_content(data):
    for item in data:
        if item["role"] == "user":
            return item["content"]
    return ""

system_prompt = "You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps."

def dataset_format(example):
    # Format system
    if "system" in example and len(example["system"]) > 0:
        message = {"role": "system", "content": example["system"]}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        message = {"role": "system", "content": system_prompt}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    # Format instruction
    message = {"role": "user", "content": get_question_content(example["chosen"])}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
    # Format chosen answer
    chosen = get_assistant_content(example["chosen"]) + "<|eot_id|>\n"
    # Format rejected answer
    rejected = get_assistant_content(example["rejected"]) + "<|eot_id|>\n"
    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

In [18]:
dataset_format(dataset[0])

{'prompt': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.<|eot_id|><|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHorses are hybrids of which two animals?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 'chosen': 'Horses, scientifically known as Equus caballus, are not the result of hybridization between two other animals. They are considered a species of their own within the Equidae family, which also includes other equines such as donkeys, mules, and zebras. Horses are believed to have evolved from a small, multi-toed forest-dwelling animal known as Eohippus, which lived around 60 million years ago. Over millions of years, these horses evolved into the large, single-toed animals we know today.<|eot_id|>\n',
 'rejected': 'Horses are not hybrids of any other a

In [19]:
original_columns = dataset.column_names
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

dataset = dataset.map(
    dataset_format,
    remove_columns=original_columns,
    num_proc=os.cpu_count(),
)

Map (num_proc=96):   0%|          | 0/60917 [00:00<?, ? examples/s]

In [20]:
import wandb

wandb.login()
os.environ["WANDB_PROJECT"] = wandb_name

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mmurphypei[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
dpo_config = DPOConfig(
    output_dir=new_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    fp16=not is_bf16_supported,
    bf16=is_bf16_supported,
    report_to="wandb",
    beta=0.1,
    max_prompt_length=max_prompt_length,
    max_length=max_seq_length,
    force_use_ref_model=True,
    remove_unused_columns=False,
)

In [22]:
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=dpo_config,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
)

Tokenizing train dataset:   0%|          | 0/60917 [00:00<?, ? examples/s]

Detected kernel version 4.9.70, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
dpo_trainer.train()



Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6975
2,0.7021
3,0.6864
4,0.6972
5,0.686
6,0.6795
7,0.661
8,0.7088
9,0.6577
10,0.6241


In [None]:
dpo_trainer.model.save_pretrained("output/final_ckpt")
tokenizer.save_pretrained("output/final_ckpt")

('output/final_ckpt/tokenizer_config.json',
 'output/final_ckpt/special_tokens_map.json',
 'output/final_ckpt/tokenizer.json')

In [None]:
# Flush memory
del dpo_trainer, model, ref_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload model in FP16 (instead of NF4)
origin_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = chat_template

In [None]:
# Merge base model with the adapter
model = PeftModel.from_pretrained(origin_model, "output/final_ckpt")
model = model.merge_and_unload()

# Save model and tokenizer
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('./output/llama-3-8b-dpo/tokenizer_config.json',
 './output/llama-3-8b-dpo/special_tokens_map.json',
 './output/llama-3-8b-dpo/tokenizer.json')

In [None]:
# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=new_model,
    tokenizer=tokenizer,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Format prompt
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot that provides concise answers."},
    {"role": "user", "content": "What are GPUs and why would I use them for machine learning tasks?"},
]
tokenizer = AutoTokenizer.from_pretrained(new_model)
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    max_length=200,
)
print(sequences[0]["generated_text"])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant chatbot that provides concise answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

What are GPUs and why would I use them for machine learning tasks?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A GPU (Graphics Processing Unit) is a specialized electronic circuit designed to quickly manipulate and alter memory to accelerate the creation of images in a frame buffer intended for output to a display device. In the context of machine learning, GPUs are used to accelerate the processing of large amounts of data, making them an essential component for many machine learning tasks.

Here are some reasons why you would use GPUs for machine learning tasks:

1. **Parallel Processing**: GPUs have thousands of cores, which can process multiple calculations simultaneously, making them much faster than CPUs for parallelizable tasks like matrix multiplications and data transformations.
2. **Memor