安装opencompass：Kaggle上已经为我们准备好了其他常用包，只需安装opencompass用于评测即可。如果不在Kaggle上运行，则还需要安装其他必要包。

In [None]:
!pip install "opencompass[full]"
# !pip install pytorch transformers datasets "opencompass[full]"

# 指令微调

In [1]:
"""
The main program for finetuning LLMs with Huggingface Transformers Library.

ALL SECTIONS WHERE CODE POSSIBLY NEEDS TO BE FILLED IN ARE MARKED AS TODO.
"""

import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import sys
import torch
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the arguments required for the main program.
# NOTE: You can customize any arguments you need to pass in.
@dataclass
class ModelArguments:
    """Arguments for model
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the LLM to fine-tune or its name on the Hugging Face Hub."
        }
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype."
            ),
            "choices": ["bfloat16", "float16", "float32"],
        },
    )
    # TODO: add your model arguments here
    model_name_or_path = "Qwen2.5-0.5B"
    torch_dtype = "float32"




@dataclass
class DataArguments:
    """Arguments for data
    """
    dataset_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the fine-tuning dataset or its name on the Hugging Face Hub."
        }
    )
    # TODO: add your data arguments here
    dataset_path = "alpaca-cleaned/alpaca_data_cleaned.json"
    # 该数据集格式为 list[dict],每个dict包含instruction,input,output

In [11]:
dataset = datasets.load_dataset('json', data_files="alpaca-cleaned/alpaca_data_cleaned.json")

In [5]:
print(dataset.items())
print(dataset['train'][:5])

model_path = "./Qwen2.5-0.5B"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
input_text = [sample["instruction"] + sample["input"] for sample in dataset['train']] 
label_text = [sample["output"] for sample in dataset['train']]
print(input_text[:5])
print(label_text[:5])
input_ids = tokenizer(input_text, padding=True, truncation=True, max_length=2048,return_tensors="pt")
output_ids = tokenizer(label_text, padding=True, truncation=True, max_length=2048, return_tensors="pt")
input_ids['label'] = output_ids.input_ids

# 'input_ids' and 'attention_mask' 

dict_items([('train', Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 51760
}))])
{'output': ['1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by

In [9]:
#dataset = datasets.load_dataset('json', data_files="alpaca-cleaned/alpaca_data_cleaned.json")
def data_collator(batch: List[Dict]):
        """
        batch: list of dict, each dict of the list is a sample in the dataset.
        """
        inputs = []
        labels = []
        max_length = 0

        model_path = "./Qwen2.5-0.5B"

        tokenizer = AutoTokenizer.from_pretrained(model_path)

        for sample in batch:
            instruction = sample.get("instruction","")
            input_text = sample.get("input","")
            output_text = sample.get("output","")

            # 检查input和output是否为空
            # if not input_text.strip():
            #     input_text = f"\n{input_text}"

            if not output_text.strip():
                output_text = "<empty>"
            
            # 构建输入序列
            input_ids = tokenizer.encode_plus(
                f"{instruction}{input_text}", # 将instruction和input_text进行拼接，生成文本输入
                return_tensors = "pt", # 输出转换为pytorch的张量格式
                max_length = tokenizer.model_max_length, # 如果输入序列超过最大长度则截断
                truncation = True, 
                padding = False, # 即使输入序列没有达到最大长度，也不进行填充
            ).input_ids # 用于获取tokenizer返回字典中的‘input_ids’字段

            # 构建输出序列
            output_ids = tokenizer.encode_plus(
                output_text,
                return_tensors = "pt",
                max_length = tokenizer.model_max_length,
                truncation = True,
                padding = False,
            ).input_ids

            # 拼接输入与输出序列，获得模型所需的input_ids
            full_input = torch.cat([input_ids, output_ids], dim=1)
            inputs.append(full_input)

            # 创建标签张量
            labels_tensor = torch.full_like(full_input, -100) # 用-100填充表示这些位置在损失计算中被忽略
            labels_tensor[:, input_ids.shape[1]:] = full_input[:, input_ids.shape[1]:] # 将output_ids对应的位置替换为output_ids原来的值，input_ids对应的位置仍为-100，表示学习时只学习output部分
            labels.append(labels_tensor)

            if full_input.shape[1] > max_length:
                max_length = full_input.shape[1]


        # 处理batch的padding，将同一个batch的label和input都填充到同一个长度
        inputs = [torch.nn.functional.pad(input, (0,max_length - input.size(1)),value=tokenizer.pad_token_id) for input in inputs]
        labels = [torch.nn.functional.pad(label, (0, max_length - label.size(1)), value=-100) for label in labels]

        inputs = torch.stack(inputs)
        labels = torch.stack(labels)

        # inputs = torch.cat(inputs, dim=0)
        # labels = torch.cat(labels, dim=0)

        return {
            "input_ids": inputs,
            "labels": labels,
            "attention_mask": (inputs != tokenizer.pad_token_id).to(dtype=torch.int), 
        }

#data_collator(dataset['train'][:4])

# batch = dataset['train'][:4]
# print(batch)
# batch_size = len(batch['input'])
# for i in range(batch_size):
#     print(batch['input'][i])
#     print(batch['output'][i])
#     print(batch['instruction'][i])


In [23]:
# print(len(input_ids['label'])) # 51760
# print(len(input_ids['input_ids']))
# input_ids
dataset = datasets.load_dataset('json', data_files="alpaca-cleaned/alpaca_data_cleaned.json")
dataset = dataset['train']
sample = dataset[0]
instruction = sample.get("instruction","")
input_text = sample.get("input","")
output_text = sample.get("output","")

# 构建输入序列
input_ids = tokenizer.encode_plus(
f"{instruction}{input_text}", 
return_tensors = "pt", 
max_length = tokenizer.model_max_length,
truncation = True, 
padding = False, 
).input_ids 

output_ids = tokenizer.encode_plus(
f"{output_text}", 
return_tensors = "pt", 
max_length = tokenizer.model_max_length,
truncation = True, 
padding = False, 
).input_ids 

print(input_ids.size())
print(output_ids.size())

full_input = torch.cat([input_ids, output_ids], dim=1)
labels_tensor = torch.full_like(full_input, -100)
print(full_input.size())
print(labels_tensor.size())

torch.Size([1, 7])
torch.Size([1, 151])
torch.Size([1, 158])
torch.Size([1, 158])


In [13]:
# print(dataset[0])
# print(dataset[1])
# print(dataset[2])
# print(dataset[3])


{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'instruction': 'Give three tips for staying healthy.', 'input': ''}
{'output': 'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all o

In [3]:
# The main function
# NOTE You can customize some logs to monitor your program.
def finetune():
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))  # 解析器
    # 返回模型参数，数据参数，训练参数
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # TODO Step 2: Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    dtype = torch.float16 if model_args.torch_dtype == "float16" else(
        torch.bfloat16 if model_args.torch_dtype == "bfloat16" else torch.float32
    )
    model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)

    # TODO Step 3: Load dataset
    dataset = datasets.load_dataset('json', data_files=data_args.dataset_path)

    
    # TODO Step 4: Define the data collator function
    def data_collator(batch: List[Dict]):
        """
        batch: list of dict, each dict of the list is a sample in the dataset.
        一个列表，每个元素是一个字典，字典是数据集中的一个样本
        """

        inputs = []
        labels = []
        max_length = 0

        for sample in batch:
            instruction = sample.get("instruction","")
            input_text = sample.get("input","")
            output_text = sample.get("output","")

            if not instruction.strip() and not input_text.strip() and not output_text.strip():
                print(f"发现无效的样本：instruction={instruction}, input={input_text}, output={output_text}")
                continue


            if not instruction.strip():
                print("instruction is empty")
                b = input()
            # 检查input和output是否为空
            if input_text.strip():
                input_text = f"\n{input_text}"

            if not output_text.strip():
                print("output is empty")
                c = input
                output_text = "<empty>"
            
            # 构建输入序列
            input_ids = tokenizer.encode_plus(
                f"{instruction}{input_text}", # 将instruction和input_text进行拼接，生成文本输入
                return_tensors = "pt", # 输出转换为pytorch的张量格式
                max_length = tokenizer.model_max_length, # 如果输入序列超过最大长度则截断
                truncation = True, 
                padding = False, # 即使输入序列没有达到最大长度，也不进行填充
            ).input_ids # 用于获取tokenizer返回字典中的‘input_ids’字段

            # 构建输出序列
            output_ids = tokenizer.encode_plus(
                output_text,
                return_tensors = "pt",
                max_length = tokenizer.model_max_length,
                truncation = True,
                padding = False,
            ).input_ids

            if len(input_ids) == 0 or len(output_ids) == 0:
                print(f"发现空样本: instruction={instruction}, input={input_text}, output={output_text}")
                continue

            # 拼接输入与输出序列，获得模型所需的input_ids
            full_input = torch.cat([input_ids, output_ids], dim=1)
            inputs.append(full_input)

            # 创建标签张量
            labels_tensor = torch.full_like(full_input, -100) # 用-100填充表示这些位置在损失计算中被忽略
            labels_tensor[:, input_ids.shape[1]:] = full_input[:, input_ids.shape[1]:] # 将output_ids对应的位置替换为output_ids原来的值，input_ids对应的位置仍为-100，表示学习时只学习output部分
            labels.append(labels_tensor)

            if full_input.shape[1] > max_length:
                max_length = full_input.shape[1]


        # 处理batch的padding，将同一个batch的label和input都填充到同一个长度
        # inputs = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        # labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
        inputs = [torch.nn.functional.pad(input, (0,max_length - input.size(1)),value=tokenizer.pad_token_id) for input in inputs]
        labels = [torch.nn.functional.pad(label, (0, max_length - label.size(1)), value=-100) for label in labels]

        # inputs = torch.stack(inputs)
        # labels = torch.stack(labels)

        inputs = torch.cat(inputs, dim=0)
        labels = torch.cat(labels, dim=0)
        print(inputs.size())
        print(labels.size())

        return {
            "input_ids": inputs,
            "labels": labels,
            "attention_mask": torch.ne(inputs, tokenizer.pad_token_id).int(), 
            # pad_token_id是tokenizer定义的填充令牌ID，也就是对padding的部分填充一个特殊的令牌
            # 该行代码将生成一个与inputs张量形状相同的布尔张量，其中值为True:表示对应的输入ID不是填充ID（即该令牌是有效的；值为False: 表示对应的输入ID是填充ID（即该令牌是无效的，需要被忽略）。
        }
        # "attention_mask": torch.tensor(inputs != tokenizer.pad_token_id).to(dtype=torch.int)

    # TODO Step 5: Define the Trainer
    # HINT: https://huggingface.co/docs/transformers/main_classes/trainer
    trainer = Trainer(
        args = training_args,
        model=model,  # Pretrained model
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=dataset["train"],
    )

    # Step 6: Train!
    trainer.train()

In [6]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"
# max_length = max([label.size() for label in labels])
sys.argv = [
    "notebook", 
    "--output_dir", "./our-model/test",
    "--learning_rate","1e-5",
    "--num_train_epochs", "3",  # 通常3-5个epoch即可收敛，长时间训练可能会过拟合
    "--per_device_train_batch_size", "1",  # 每个GPU上的大小
    "--overwrite_output_dir","True",  #开发过程中覆盖旧的文件
    "--save_steps", "1000",
    "--save_total_limit", "2",
    "--logging_steps","50",
    "--logging_dir", "./logs/exp1",
    "--dataloader_drop_last", "True",
    '--seed','42',
    "--fp16","True",
    '--remove_unused_columns','False',
    "--ddp_find_unused_parameters","False",  # 关闭未使用的参数检查
    # "--local_rank","-1",
]
finetune()

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=1,
#     learning_rate=2e-4,
#     logging_steps=2,
#     max_steps=100,
#     eval_strategy="steps",
#     ddp_find_unused_parameters=False,
#     #dataset_text_field="text",
# )


  trainer = Trainer(


torch.Size([6, 166])
torch.Size([6, 166])
torch.Size([6, 234])
torch.Size([6, 234])


RuntimeError: chunk expects at least a 1-dimensional tensor

In [9]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
sys.argv = [
    "notebook", 
    "--output_dir", "./our-model/test",
    "--learning_rate","1e-5",
    "--num_train_epochs", "3",  # 通常3-5个epoch即可收敛，长时间训练可能会过拟合
    "--per_device_train_batch_size", "4",  # 每个GPU上的大小
    "--overwrite_output_dir","True",  #开发过程中覆盖旧的文件
    "--save_steps", "1000",
    "--save_total_limit", "2",
    "--logging_steps","50",
    "--logging_dir", "./logs/exp1",
    "--remove_unused_columns","False",
    "--dataloader_drop_last", "True",
    '--seed','42',
    "--fp16","True",
    # "--local_rank","-1",
]
finetune()

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=1,
#     learning_rate=2e-4,
#     logging_steps=2,
#     max_steps=100,
#     eval_strategy="steps",
#     ddp_find_unused_parameters=False,
#     #dataset_text_field="text",
# )

  trainer = Trainer(


Step,Training Loss
50,1.4732
100,1.4074
150,1.4117
200,1.4847
250,1.4044
300,1.4832
350,1.4303
400,1.4009
450,1.4528
500,1.4181


# 评测模型

In [5]:
# 原始预训练模型
PLM_MODEL_PATH = "./Qwen2.5-0.5B"
# 微调后的模型
SFT_MODEL_PATH = "./our-model/dev/checkpoint-"

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

如果你有多个GPU，可以修改下面的--hf-num-gpus参数来加速评测。

In [None]:
!opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path "./Qwen2.5-0.5B"\
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 6\
    --work-dir "outputs/evals/plm" \
    --debug

# --hf-num-gpus 6 原来为2
# python opencompass/run.py --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl --summarizer example --hf-type base --hf-path "./Qwen2.5-0.5B" --tokenizer-kwargs padding_side="left" truncation="left"  --max-seq-len 2048 --batch-size 4 --hf-num-gpus 6 --work-dir "outputs/evals/plm" --debug

In [None]:
!opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path {SFT_MODEL_PATH} \
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 6 \
    --work-dir "outputs/evals/sft" \
    --debug

    # python opencompass/run.py --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl --summarizer example --hf-type base --hf-path "./our-model/test/checkpoint-25878" --tokenizer-kwargs padding_side="left" truncation="left"  --max-seq-len 2048 --batch-size 4 --hf-num-gpus 6 --work-dir "outputs/evals/sft" --debug

In [1]:
import gc
import torch
gc.collect()  # 清理 Python 内存
torch.cuda.empty_cache()  # 释放 CUDA 资源


In [14]:
import torch

a =torch.tensor([1,2,3])
print(a.size())

torch.Size([3])
