In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import os
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_dir = './0.5B-trained'
base_model_name = "Qwen/Qwen1.5-0.5B-Chat"

# Load the model and tokenizer.
# Resume from the locally saved fine-tuned checkpoint when it exists, otherwise
# start from the base chat model. Both paths use the same dtype/device options
# so the model ends up on the same device regardless of which one is taken
# (previously the checkpoint branch silently loaded onto the CPU).
model_source = model_dir if os.path.exists(model_dir) else base_model_name
model = AutoModelForCausalLM.from_pretrained(
    model_source,
    torch_dtype="auto",
    device_map="auto",
)
# The tokenizer is always taken from the base model: the save cell only writes
# the model, not the tokenizer files, into model_dir.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from typing import Dict
import torch
from torch.utils.data import Dataset
from transformers.trainer_pt_utils import LabelSmoother

IGNORE_TOKEN_ID = LabelSmoother.ignore_index  # Label id used to mark positions that the loss computation should ignore

def preprocess(messages, tokenizer, max_len):
    """Tokenize chat conversations into padded training tensors.

    Args:
        messages: Iterable of conversations; each conversation is the list of
            ``{"role": ..., "content": ...}`` dicts passed to
            ``tokenizer.apply_chat_template``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and
            ``pad_token_id``.
        max_len: Maximum number of tokens kept per conversation; longer
            conversations are truncated.

    Returns:
        dict with three tensors of shape ``(num_conversations, longest)``,
        where ``longest`` is the longest tokenized conversation in this call:
        ``input_ids`` (long), ``target_ids`` (a copy of ``input_ids`` with
        padding positions replaced by ``IGNORE_TOKEN_ID``) and
        ``attention_mask`` (bool, ``True`` on real tokens).

    Raises:
        ValueError: If the tokenizer has no pad token id.
    """
    print("preprocessing")

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("tokenizer.pad_token_id must be set to pad the batch")

    # Tokenize each conversation on its own and without padding. Asking the
    # tokenizer to pad a single sequence "to the longest" is a no-op, which
    # left the rows ragged and made torch.tensor() fail as soon as two
    # conversations had different lengths.
    encoded = [
        tokenizer.apply_chat_template(
            message,
            tokenize=True,
            add_generation_prompt=False,
            padding=False,
            max_length=max_len,
            truncation=True,
        )
        for message in messages
    ]

    # Right-pad every row to the longest conversation in this call.
    longest = max((len(ids) for ids in encoded), default=0)
    input_ids = torch.full((len(encoded), longest), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((len(encoded), longest), dtype=torch.bool)
    for row, ids in enumerate(encoded):
        input_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        attention_mask[row, :len(ids)] = True

    # Padding positions must not contribute to the loss. The mask is derived
    # from the true lengths, so a real token that happens to equal the pad id
    # is still trained on.
    target_ids = input_ids.clone()
    target_ids[~attention_mask] = IGNORE_TOKEN_ID

    return dict(
        input_ids=input_ids, target_ids=target_ids, attention_mask=attention_mask
    )

class SupervisedDataset(Dataset):
    """Map-style dataset holding pre-tokenized chat conversations.

    All examples are tokenized once in ``__init__`` via ``preprocess``; the
    resulting tensors are kept on the instance and indexed per example.
    """

    def __init__(self, raw_data, tokenizer, max_len):
        """Tokenize the ``"messages"`` entry of every example in ``raw_data``."""
        print("init")
        conversations = []
        for example in raw_data:
            conversations.append(example["messages"])
        encoded = preprocess(conversations, tokenizer, max_len)

        self.input_ids, self.target_ids, self.attention_mask = (
            encoded["input_ids"],
            encoded["target_ids"],
            encoded["attention_mask"],
        )

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return example ``i``; the stored targets are exposed as ``labels``."""
        return {
            "input_ids": self.input_ids[i],
            "labels": self.target_ids[i],
            "attention_mask": self.attention_mask[i],
        }




In [None]:
# Run a chat turn against the model
# Conversation content
prompt = "Maeiee是谁？你与他是什么关系？"
messages = [
    {"role": "system", "content": "你是一个有用的助手。"},
    {"role": "user", "content": prompt}
]
# Render the conversation to a prompt string that ends with the assistant
# header, so the model continues with its reply.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Put the inputs on whichever device the model actually lives on; a
# hard-coded device breaks when the model was loaded elsewhere.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# Pass the full encoding (input_ids and attention_mask) so generate() does not
# have to guess which positions are padding.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# generate() returns prompt + completion; keep only the newly generated tokens.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print('====用户输入====')
print(prompt)
print('====模型回复====')
print(response)

In [13]:
# Fine-tune the model
# Training conversations together with the expected assistant answers
print("训练模型")
raw_data = [
    {
        "messages": [
            {"role": "system", "content": "你是一个有用的助手。"},
            {"role": "user", "content": "Maeiee是谁？"},
            {"role": "assistant", "content": "Maeiee是我的好朋友！"},
        ],
    },
    # Add more conversation examples here...
]
print("训练模型2")

train_dataset = SupervisedDataset(raw_data=raw_data, tokenizer=tokenizer, max_len=512)

# Continue training the already-loaded model on the new examples.
# Note: adjust these settings to match your own training environment.
training_args = TrainingArguments(
    output_dir="./results",         # where Trainer writes its outputs
    logging_dir="./logs",           # where logs are written
    num_train_epochs=1,             # total number of passes over the data
    per_device_train_batch_size=4,  # batch size on each device
    warmup_steps=0,                 # warmup steps
    weight_decay=0.01,              # weight decay
)
# Only a training set is supplied; an evaluation dataset may also be needed here.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

训练模型
训练模型2
init
preprocessing


100%|██████████| 1/1 [00:00<00:00,  2.81it/s]

{'train_runtime': 0.3562, 'train_samples_per_second': 2.808, 'train_steps_per_second': 2.808, 'train_loss': 1.3851810693740845, 'epoch': 1.0}





TrainOutput(global_step=1, training_loss=1.3851810693740845, metrics={'train_runtime': 0.3562, 'train_samples_per_second': 2.808, 'train_steps_per_second': 2.808, 'train_loss': 1.3851810693740845, 'epoch': 1.0})

In [14]:
# Save the model to model_dir; the loading cell reads the model back from this
# directory when it exists.
model.save_pretrained(model_dir)