In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
import torch

model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Tokenizer for the same checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Causal-LM weights loaded as bfloat16, with device placement left to
# device_map='auto'.
# NOTE(review): cache_dir='' is an empty path — presumably a placeholder for a
# local cache directory; confirm the intended location.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir='',
    torch_dtype=torch.bfloat16,
    device_map='auto',
)

In [None]:
# Show the chat template string stored on the tokenizer (displayed as the cell output).
tokenizer.chat_template

In [None]:
# Example multi-turn conversation that ends on a user turn.
messages = [
    {"role": "user", "content": '你好'},
    {"role": "assistant", "content": '你好，我是Qwen机器人'},
    {"role": "user", "content": '今天天气怎么样？'},
]

# Render the conversation as plain text (tokenize=False). Because the last
# message comes from the user, add_generation_prompt=True appends the opening of
# the assistant turn, so the printed text shows the complete prompt format the
# model would continue from.
input_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(input_text)

## 自定义数据处理

基于上面这个格式，我们可以把脏话数据集（Karsh-CAI/btfChinese-DPO-small）处理成我们需要的格式。

In [None]:
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub by its repository id.
dataset = load_dataset("Karsh-CAI/btfChinese-DPO-small")

### 预处理

In [None]:
from datasets import load_dataset
import random

# Load the specified Hugging Face dataset
dataset = load_dataset("Karsh-CAI/btfChinese-DPO-small")

# Take all of the training data (the 'train' split)
train_data = dataset['train']

# Format a question into the prompt layout used for training
def qwen_format_conversation(
    question,
    system_prompt="你是一个没有礼貌的人渣，请用人渣的语气回复我",
):
    """Build the training prompt for a single question.

    Args:
        question: Text inserted as the content of the ``user`` turn.
        system_prompt: Text inserted as the content of the ``system`` turn.
            Defaults to the persona instruction used throughout this notebook.

    Returns:
        A string made of a system turn and a user turn, each closed with
        ``<|im_end|>``, followed by an opened assistant turn. The prompt ends
        with ``<|im_start|>assistant`` plus a newline, the same separator that
        follows the ``system`` and ``user`` role names, so a response appended
        directly to the prompt does not run into the role name.
    """
    return (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

# One record per training row: the templated prompt built from the row's
# 'question', plus the row's 'chosen' and 'rejected' fields copied as-is.
formatted_data = [
    dict(
        prompt=qwen_format_conversation(row['question']),
        chosen=row['chosen'],
        rejected=row['rejected'],
    )
    for row in train_data
]

# Shuffle the row indices with a dedicated, seeded generator so the split is
# the same on every run; an unseeded shuffle would place different rows in
# train/test each time the notebook is executed.
SPLIT_SEED = 42
all_indices = list(range(len(formatted_data)))
random.Random(SPLIT_SEED).shuffle(all_indices)

# Compute the split point (80% training set, 20% test set)
split_point = int(len(formatted_data) * 0.8)

# Partition the shuffled indices
train_indices = all_indices[:split_point]
test_indices = all_indices[split_point:]

# Build the new dataset as plain lists of records, keyed by split name
reformatted_dataset = {
    "train": [formatted_data[i] for i in train_indices],
    "test": [formatted_data[i] for i in test_indices],
}

尝试一下：

In [None]:
# Print the first record of the reformatted training split as a sanity check.
print(reformatted_dataset['train'][0])

## 推送到 Hugging Face

In [None]:
!pip install -q huggingface_hub

from huggingface_hub import login

# 使用你的 Hugging Face token 登录
# 你可以从 https://huggingface.co/settings/tokens 获取 token
login(token="hf_xxx")

In [None]:
import pandas as pd

# Turn each split's list of records into a DataFrame.
train_df = pd.DataFrame(reformatted_dataset["train"])
test_df = pd.DataFrame(reformatted_dataset["test"])

# Write both frames as CSV in the working directory: no index column,
# backslash as the escape character.
for split_frame, csv_name in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    split_frame.to_csv(csv_name, index=False, escapechar="\\")

from huggingface_hub import HfApi, login
import os

# Target dataset repository on the Hub (placeholder text — replace with your own).
repo_id = "你的repo名字"

api = HfApi()

files_to_upload = ["./train.csv", "./test.csv"]

# Upload every listed file that exists locally and record the ones that were sent.
uploaded_files = []
for file_path in files_to_upload:
    # Guard clause: report and skip anything that is not on disk.
    if not os.path.exists(file_path):
        print(f"{file_path} does not exist, skipping.")
        continue

    print(f"Uploading {file_path}...")
    api.upload_file(
        repo_id=repo_id,
        repo_type="dataset",
        path_or_fileobj=file_path,
        # Stored at the repository root under the file's base name.
        path_in_repo=os.path.basename(file_path),
    )
    print(f"Uploaded {file_path}.")
    uploaded_files.append(file_path)

# Summary: list the uploaded files, or state that nothing was uploaded.
print("\n总结:")
if not uploaded_files:
    print("未上传任何文件.")
else:
    print("上传的文件:")
    for file in uploaded_files:
        print(f"- {file}")