In [None]:
%pip install "opencompass[full]"
%pip install pytorch transformers datasets "opencompass[full]"

安装opencompass：Kaggle上已经为我们准备好了其他常用包，只需安装opencompass用于评测即可。如果不在Kaggle上运行，则还需要安装其他必要包。

# 指令微调

In [1]:
"""
The main program for finetuning LLMs with Huggingface Transformers Library.

ALL SECTIONS WHERE CODE POSSIBLY NEEDS TO BE FILLED IN ARE MARKED AS TODO.
"""

import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import datasets

In [2]:
# Define the arguments required for the main program.
# NOTE: You can customize any arguments you need to pass in.
@dataclass
class ModelArguments:
    """Options that select the model to fine-tune and how it is loaded.

    Attributes:
        model_name_or_path: Local path of the LLM, or its name on the Hugging Face Hub.
        torch_dtype: Optional dtype name ("bfloat16", "float16" or "float32")
            under which the model should be loaded.
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The path to the LLM to fine-tune or its name on the Hugging Face Hub.",
        ),
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Override the default `torch.dtype` and load the model under this dtype.",
            choices=["bfloat16", "float16", "float32"],
        ),
    )
    # TODO: add your model arguments here


@dataclass
class DataArguments:
    """Options that locate the fine-tuning data.

    Attributes:
        dataset_path: Local path of the fine-tuning dataset, or its name on the
            Hugging Face Hub.
    """
    dataset_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The path to the fine-tuning dataset or its name on the Hugging Face Hub.",
        ),
    )
    # TODO: add your data arguments here

In [3]:
##### Attempt & Debug #####
# Load the instruction dataset, pull the three text columns out of the train split,
# and display the first record to inspect how a single sample is laid out.
dataset = datasets.load_dataset('./Dataset/alpaca-language-instruction-training')
instructions, inputs, outputs = (
    dataset['train'][column] for column in ('instruction', 'input', 'output')
)
dataset['train'][0]

{'instruction': 'Give three tips for staying healthy.',
 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
 'input': None}

In [9]:
##### Dataset tokenization test
# Load the raw dataset plus the tokenizer and model from the local Qwen2.5 0.5B directory.
dataset = datasets.load_dataset('./Dataset/alpaca-language-instruction-training')
tokenizer, model = (
    AutoTokenizer.from_pretrained('./model/input/qwen2.5/transformers/0.5b/1'),
    AutoModelForCausalLM.from_pretrained('./model/input/qwen2.5/transformers/0.5b/1', device_map='auto'),
)
def tokenize_function(batch):
    """Tokenize one batch of instruction samples with the module-level ``tokenizer``.

    Args:
        batch: Mapping from column name to a list of values; the 'instruction',
            'input' and 'output' columns are read.

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested with
        ``padding=True`` and ``padding_side='left'``.
    """
    # Samples without extra context carry ``None`` in 'input'. Substitute an empty
    # string in a fresh list so the caller's batch is not mutated, and test identity
    # (`is None`) rather than equality.
    pair_texts = ['' if text is None else text for text in batch['input']]
    # NOTE(review): the third positional argument of the tokenizer call appears to be
    # `text_target`, so 'output' is presumably tokenized on its own into `labels`, whose
    # length need not match `input_ids` — confirm this is what causal-LM training expects.
    return tokenizer(batch['instruction'], pair_texts, batch['output'], padding=True, padding_side='left')
# Run the tokenizer over the train split in batches, then display the resulting dataset summary.
tokenized_datasets = dataset['train'].map(function=tokenize_function, batched=True)
tokenized_datasets
    

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'output', 'input', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 51760
})

In [10]:
# Drop the raw text columns so only the tokenizer outputs remain, then display the result.
input_datasets = tokenized_datasets.remove_columns(column_names=['instruction', 'input', 'output'])
input_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 51760
})

In [13]:
# Device used for input tensors: prefer the GPU when one is visible and fall back to
# the CPU otherwise, so the cell also runs on machines without CUDA (the model itself
# is placed by `device_map='auto'`).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained('./model/input/qwen2.5/transformers/0.5b/1')
model = AutoModelForCausalLM.from_pretrained('./model/input/qwen2.5/transformers/0.5b/1', device_map='auto')
# Display the module tree of the loaded model as the cell output.
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [None]:
### test for inference.

# Two sample prompts that are tokenized and generated together as one batch.
input_seqs = ["Instructions: Tell me the climate in Beijing, the captial of China.",
              "Instructions: Describe president Blinken of the US."]
# Pad on the left so every prompt ends right where generation starts, and move the
# batch to the device the model was actually placed on (the model is loaded with
# `device_map='auto'`, so a hard-coded device name may not match it).
inputs = tokenizer(input_seqs, padding=True, padding_side='left', return_tensors="pt").to(model.device)
# Sampled decoding (nucleus sampling with temperature); the EOS token doubles as the pad token.
generated_ids = model.generate(
    **inputs,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=2048,
    top_p=0.95,
    temperature=0.9,
    do_sample=True
)

# Decode prompt + continuation back to text, dropping special tokens.
generated_text = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(generated_text)

['Instructions: Tell me the climate in Beijing, the captial of China. The climate in Beijing, the capital of China, is characterized by a temperate continental climate. The area south of the Huai River experiences a subtropical monsoon climate with warm and humid summers, while north of the Huai River and the Inner Mongolia Autonomous Region, it is a subtropical monsoon climate with warm and humid summers. The climate is subhumid, with many rainy days during the dry season. The weather in Beijing is generally cloudy with low temperatures in the summer and high temperatures in the winter. The average temperature range in Beijing is 15.6 to 32.9°F (6.1 to 1.5°C). In the summer, the temperatures can reach as high as 92.2°F (33.8°C) during the day and as low as 80.8°F (27.1°C) at night. In winter, the temperatures can drop to -17.5°F (1.3°C) during the day and -21.3°F (2.6°C) at night. The city is also known for its cold winters and hot summers, making it a prime location for outdoor activ

In [None]:
# The main function
# NOTE You can customize some logs to monitor your program.
def finetune():
    """Fine-tune a causal LM using the command-line style arguments in ``sys.argv``.

    Parses model, data and training arguments, loads the tokenizer, model and
    dataset, then builds a ``Trainer`` and calls ``train()``. Steps 4 and 5 are
    still template placeholders (marked TODO) and must be completed before the
    function can run end to end.

    Raises:
        ValueError: If no ``--dataset_path`` was supplied.
    """
    # TODO Step 1: Define an arguments parser and parse the arguments
    # NOTE Three parts: model arguments, data arguments, and training arguments
    # HINT: Refer to 
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/internal/trainer_utils#transformers.HfArgumentParser
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/trainer#transformers.TrainingArguments
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    # sys.argv[0] is the program name (e.g. "notebook"), not an option; passing it on
    # would leave an unused argument for the parser, so only forward the real options.
    model_args, data_args, training_args = parser.parse_args_into_dataclasses(args=sys.argv[1:])

    # TODO Step 2: Load tokenizer and model
    # HINT 1: Refer to
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/tokenizer#tokenizer
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/model_doc/qwen2
    # HINT 2: To save training GPU memory, you need to set the model's parameter precision to half-precision (float16 or bfloat16).
    #         You may also check other strategies to save the memory!
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/model_doc/llama2#usage-tips
    #   * https://huggingface.co/docs/transformers/perf_train_gpu_one
    #   * https://www.53ai.com/news/qianyanjishu/2024052494875.html
    # Honor --model_name_or_path when given; otherwise keep the original Kaggle default.
    model_path = model_args.model_name_or_path or '/kaggle/input/qwen2.5/transformers/0.5b/1'
    load_kwargs = {'device_map': 'auto'}
    if model_args.torch_dtype:
        # Translate the dtype name from the command line (e.g. "bfloat16") into the torch dtype object.
        load_kwargs['torch_dtype'] = getattr(torch, model_args.torch_dtype)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)

    # TODO Step 3: Load dataset
    # HINT: https://huggingface.co/docs/datasets/v3.1.0/en/package_reference/main_classes#datasets.Dataset
    if data_args.dataset_path is None:
        raise ValueError("No dataset given: pass --dataset_path with a local path or a Hub dataset name.")
    dataset = datasets.load_dataset(data_args.dataset_path)

    # TODO Step 4: Define the data collator function
    # NOTE During training, for each model parameter update, we fetch a batch of data, perform a forward and backward pass,
    # and then update the model parameters. The role of the data collator is to process the data (e.g., padding the data within
    # a batch to the same length) and format the batch into the input required by the model.
    #
    # In this assignment, the purpose of the custom data_collator is to process each batch of data from the dataset loaded in
    # Step 3 into the format required by the model. This includes tasks such as tokenizing the data, converting each token into 
    # an ID sequence, applying padding, and preparing labels.
    # 
    # HINT:
    #   * Before implementation, you should:
    #      1. Clearly understand the format of each sample in the dataset loaded in Step 3.
    #      2. Understand the input format required by the model (https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2ForCausalLM).
    #         Reading its source code also helps!

    def data_collator(batch: List[Dict]):
        """
        batch: list of dict, each dict of the list is a sample in the dataset.

        Not implemented yet: currently returns None.
        """
        # The dataset loaded in Step 3 was observed to have this structure:
        # DatasetDict({
        #     train: Dataset({
        #         features: ['instruction', 'output', 'input'],
        #         num_rows: 51760
        #     })
        # })
        # Open question from the original author: the trainer presumably hands samples
        # over batch by batch — verify what each element of `batch` actually contains.

        pass

    # TODO Step 5: Define the Trainer
    # HINT: https://huggingface.co/docs/transformers/main_classes/trainer
    # Placeholder: the leading `...` must be replaced with the remaining keyword
    # arguments (training arguments, train dataset, data collator) before this runs.
    trainer = Trainer(
        ...,
        model=model,
    )

    # Step 6: Train!
    trainer.train()

In [None]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
# Template: "--arg1"/"--arg2" and the trailing `...` are placeholders to be replaced
# with real "--option", "value" strings before running this cell.
sys.argv = ["notebook", "--arg1", "value1", "--arg2", "value2", ...]
finetune()

In [None]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
# The first entry stands in for the program name; everything after it is a placeholder
# (including the literal `...`) that must be replaced with real option/value strings.
sys.argv = ["notebook"] + [
    "--arg1", "value1",
    "--arg2", "value2",
    ...,
]
finetune()

# 评测模型

如果你有多个 GPU，可以修改下面的 `--hf-num-gpus` 参数来加速评测。

In [None]:
%opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path {PLM_MODEL_PATH} \
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 2 \
    --work-dir "/kaggle/working/evals/plm" \
    --debug

In [None]:
%opencompass \
    --datasets mmlu_ppl hellaswag_clean_ppl winogrande_ll ARC_e_ppl ARC_c_clean_ppl SuperGLUE_BoolQ_few_shot_ppl \
    --summarizer example \
    --hf-type base \
    --hf-path {SFT_MODEL_PATH} \
    --tokenizer-kwargs padding_side="left" truncation="left" \
    --max-seq-len 2048 \
    --batch-size 4 \
    --hf-num-gpus 2 \
    --work-dir "/kaggle/working/evals/sft" \
    --debug