# MindSpore2.4版本在启智社区昇腾环境的适配（大模型案例-prompt_tuning）
镜像：mindspore_2_3_910b_cann8（镜像名称对应MindSpore 2.3，下文的安装单元会通过pip安装MindSpore 2.4.1）

原始链接：https://pangu.huaweicloud.com/gallery/asset-detail.html?id=016991f8-0e0d-44c8-96f7-8b2cad54c592

# 基于MindNLP的Roberta模型Prompt Tuning

安装mindspore, mindnlp及其他依赖

In [1]:
!pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.4.1/MindSpore/unified/aarch64/mindspore-2.4.1-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting mindspore==2.4.1
  Downloading https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.4.1/MindSpore/unified/aarch64/mindspore-2.4.1-cp39-cp39-linux_aarch64.whl (335.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.5/335.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting safetensors>=0.4.0 (from mindspore==2.4.1)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/08/94/7760694760f1e5001bd62c93155b8b7ccb652d1f4d0161d1e72b5bf9581a/safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (442 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.4/442.4 kB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[33mDEPRECATION: moxing-framework 2.1.16.2ae09d45 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of moxing-framework or contact the author to su

In [2]:
# Set the HF_ENDPOINT environment variable for this kernel to the hf-mirror.com
# endpoint. NOTE(review): presumably read by the Hugging Face download code used
# later (e.g. evaluate.load) — confirm it must be set before those imports.
%env HF_ENDPOINT=https://hf-mirror.com

env: HF_ENDPOINT=https://hf-mirror.com


## 模型与数据集加载

本案例对roberta-large模型基于GLUE基准数据集进行prompt tuning。

In [4]:
import argparse
import os

import mindspore
from mindnlp.core.optim import AdamW
from tqdm import tqdm
import evaluate
from mindnlp.dataset import load_dataset
from mindnlp.engine import set_seed
from mindnlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from mindnlp.common.optimization import get_linear_schedule_with_warmup
from mindnlp.peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptTuningConfig,
)

In [5]:
# Checkpoint to fine-tune and the GLUE task used for training/evaluation.
model_name_or_path = "AI-ModelScope/roberta-large"
task = "mrpc"

# Parameter-efficient fine-tuning method used in this notebook.
peft_type = PeftType.PROMPT_TUNING

# Training hyper-parameters (a 20-epoch setting was previously left
# commented out in favour of the 5 epochs used here).
batch_size = 32
num_epochs = 5

prompt tuning配置，任务类型选为"SEQ_CLS", 即序列分类。

In [6]:
# Learning rate handed to the optimizer.
lr = 1e-3

# Prompt-tuning configuration: sequence-classification task, 10 virtual tokens.
peft_config = PromptTuningConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=10,
)

加载tokenizer。如模型为GPT、OPT或BLOOM类模型，从序列左侧添加padding，其他情况下从序列右侧添加padding。

In [7]:
# Pick the padding side from the checkpoint name: "left" when the name contains
# "gpt", "opt" or "bloom" (case-sensitive substring match), "right" otherwise.
left_padded_families = ("gpt", "opt", "bloom")
padding_side = (
    "left"
    if any(family in model_name_or_path for family in left_padded_families)
    else "right"
)

# Load the tokenizer for the checkpoint through the "modelscope" mirror.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side=padding_side,
    mirror="modelscope",
)

# Fall back to the EOS token id when the tokenizer defines no padding token id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

100%|██████████| 482/482 [00:00<00:00, 2.00MB/s]
100%|██████████| 878k/878k [00:00<00:00, 5.73MB/s]
100%|██████████| 446k/446k [00:00<00:00, 3.62MB/s]
100%|██████████| 1.29M/1.29M [00:00<00:00, 8.18MB/s]


In [8]:
# Load the GLUE splits for the selected task and show one raw training example.
datasets = load_dataset("glue", task)
first_train_example = next(datasets['train'].create_dict_iterator())
print(first_train_example)

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 134928.72 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 103487.91 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 309050.21 examples/s]

{'sentence1': Tensor(shape=[], dtype=String, value= 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .'), 'sentence2': Tensor(shape=[], dtype=String, value= 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'), 'label': Tensor(shape=[], dtype=Int64, value= 1), 'idx': Tensor(shape=[], dtype=Int64, value= 0)}





In [9]:
from mindnlp.dataset import BaseMapFunction

class MapFunc(BaseMapFunction):
    """Tokenize a sentence pair and keep only the columns needed for training."""

    def __call__(self, sentence1, sentence2, label, idx):
        """Encode the given sentence pair with the module-level ``tokenizer``.

        Returns the token ids, the attention mask and the unchanged ``label``.
        ``idx`` is accepted but not forwarded.

        NOTE(review): ``truncation=True`` is requested with ``max_length=None``;
        the tokenizer warning captured in this notebook's output says it then
        defaults to no truncation.
        """
        encoded = tokenizer(sentence1, sentence2, truncation=True, max_length=None)
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        return input_ids, attention_mask, label


def get_dataset(dataset, tokenizer):
    """Tokenize ``dataset`` with ``MapFunc`` and group it into padded batches.

    The columns sentence1, sentence2, label and idx are mapped to input_ids,
    attention_mask and labels. Batches hold ``batch_size`` rows (module-level
    setting); input_ids are padded with ``tokenizer.pad_token_id`` and
    attention_mask with 0.

    NOTE(review): ``tokenizer`` is only consulted for the pad id here; the
    tokenization itself is done inside ``MapFunc`` — confirm both refer to the
    same tokenizer.
    """
    source_columns = ['sentence1', 'sentence2', 'label', 'idx']
    target_columns = ['input_ids', 'attention_mask', 'labels']
    tokenize = MapFunc(source_columns, target_columns)
    padding_spec = {
        'input_ids': (None, tokenizer.pad_token_id),
        'attention_mask': (None, 0),
    }
    tokenized = dataset.map(tokenize, source_columns, target_columns)
    return tokenized.padded_batch(batch_size, pad_info=padding_spec)

# Build the batched pipelines for the training split first, then the validation split.
train_dataset, eval_dataset = (
    get_dataset(datasets[split_name], tokenizer)
    for split_name in ('train', 'validation')
)

In [10]:
# Peek at the first padded training batch.
first_train_batch = next(train_dataset.create_dict_iterator())
print(first_train_batch)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': Tensor(shape=[32, 70], dtype=Int64, value=
[[    0, 10127,  1001 ...     1,     1,     1],
 [    0,   975, 26802 ...     1,     1,     1],
 [    0,  1213,    56 ...     1,     1,     1],
 ...
 [    0,   133,  1154 ...     1,     1,     1],
 [    0, 12667,  8423 ...     1,     1,     1],
 [    0, 32478,  1033 ...     1,     1,     1]]), 'attention_mask': Tensor(shape=[32, 70], dtype=Int64, value=
[[1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 ...
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0]]), 'labels': Tensor(shape=[32], dtype=Int64, value= [1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 
 1, 1, 0, 0, 1, 1, 1, 0])}


In [11]:
# Load the GLUE metric for the selected task; it scores the predictions
# collected during evaluation.
metric = evaluate.load(path="glue", config_name=task)

Downloading builder script: 5.75kB [00:00, 15.5MB/s]


加载模型并打印微调参数量，可以看到仅有约0.3%的参数参与了微调。

如出现如下告警请忽略，并不影响模型的微调。

```text
The following parameters in checkpoint files are not loaded:
['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.embeddings.position_ids']
The following parameters in models are missing parameter:
['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
```

In [12]:
# Load the pretrained sequence-classification checkpoint through the "modelscope" mirror.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    return_dict=True,
    mirror="modelscope",
)
# Wrap the base model according to peft_config.
model = get_peft_model(base_model, peft_config)
# Report how many parameters are trainable compared with the total.
model.print_trainable_parameters()

100%|██████████| 1.32G/1.32G [01:56<00:00, 12.2MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AI-ModelScope/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,061,890 || all params: 356,423,684 || trainable%: 0.2979291353713745


## 模型微调（prompt tuning）

指定优化器和学习率调整策略

In [13]:
# Only the trainable parameters are handed to AdamW.
optimizer = AdamW(params=model.trainable_params(), lr=lr)

# Total number of scheduler steps: batches per epoch times the number of epochs.
total_train_steps = len(train_dataset) * num_epochs

# Linear schedule with warm-up; the warm-up covers the first 6% of all steps.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * total_train_steps,
    num_training_steps=total_train_steps,
)

打印参与微调的模型参数

In [14]:
# Display the trainable parameters. As the cell's last expression this shows
# the parameter tensors themselves (shape, dtype and values), not their names.
model.trainable_params()

(Tensor(shape=[1024, 1024], dtype=Float32, value=
 [[-4.84916121e-02, -1.29634524e-02,  2.16135755e-02 ...  2.64235642e-02, -1.73129626e-02,  5.46947354e-03],
  [ 1.14002684e-02,  2.35189125e-03,  1.35929622e-02 ... -2.99772061e-02,  1.13693560e-02, -9.13739577e-03],
  [ 2.15256605e-02,  6.29948974e-02, -1.10398978e-02 ... -6.41190819e-03,  2.61987019e-02, -3.97673920e-02],
  ...
  [-6.10018009e-03,  2.48880149e-03,  3.41544114e-02 ... -5.09967422e-03, -3.12342797e-03,  8.94547254e-03],
  [ 3.61969471e-02,  8.10870435e-03, -1.92146900e-03 ... -3.68045643e-02,  1.80197470e-02, -1.78961698e-02],
  [ 2.45927437e-03, -1.17309773e-02, -9.71407164e-03 ... -7.21747475e-03, -8.41312669e-03,  1.38750151e-02]]),
 Tensor(shape=[1024], dtype=Float32, value= [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
 Tensor(shape=[2, 1024], dtype=Float32, value=
 [[-2.14432161e-02,  2.90536340e-02,  2.31685974e-02 ... -5.82737103e-03, -8.68804567e-03

按照如下步骤定义训练逻辑：

1. 构建正向计算函数
2. 函数变换，获取微分函数
3. 定义训练一个step的逻辑
4. 遍历训练数据集进行模型训练，同时每一个epoch后，遍历验证数据集获取当前的评价指标（accuracy、f1 score）

In [16]:
from mindnlp.core import value_and_grad
def forward_fn(**batch):
    """Run the model on one batch and return the loss of that forward pass.

    ``batch`` is forwarded to the module-level ``model`` as keyword arguments.
    """
    return model(**batch).loss

# Gradient function for forward_fn, taken with respect to every model parameter.
# NOTE(review): the optimizer only holds model.trainable_params(); confirm
# whether differentiating the frozen parameters as well is intended.
grad_fn = value_and_grad(forward_fn, tuple(model.parameters()))

# The dataset sizes do not change between epochs, so query them once.
train_total_size = train_dataset.get_dataset_size()
eval_total_size = eval_dataset.get_dataset_size()

for epoch in range(num_epochs):
    # Training pass: one optimizer and scheduler step per batch.
    model.set_train()
    for batch in tqdm(train_dataset.create_dict_iterator(), total=train_total_size):
        optimizer.zero_grad()
        # NOTE(review): presumably grad_fn also attaches the gradients that
        # optimizer.step() consumes — confirm against mindnlp.core.value_and_grad.
        loss = grad_fn(**batch)
        optimizer.step()
        lr_scheduler.step()

    # Evaluation pass: accumulate predictions and labels, then score the epoch.
    model.set_train(False)
    for batch in tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size):
        outputs = model(**batch)
        metric.add_batch(
            predictions=outputs.logits.argmax(axis=-1),
            references=batch["labels"],
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/115 [00:00<?, ?it/s]

\

  1%|          | 1/115 [00:01<02:04,  1.09s/it]

-

100%|██████████| 115/115 [01:07<00:00,  1.71it/s]
  0%|          | 0/13 [00:00<?, ?it/s]

\

100%|██████████| 13/13 [00:09<00:00,  1.39it/s]


epoch 0: {'accuracy': 0.6887254901960784, 'f1': 0.8140556368960467}


100%|██████████| 115/115 [00:36<00:00,  3.15it/s]
100%|██████████| 13/13 [00:01<00:00,  7.49it/s]


epoch 1: {'accuracy': 0.696078431372549, 'f1': 0.8176470588235294}


100%|██████████| 115/115 [00:35<00:00,  3.20it/s]
100%|██████████| 13/13 [00:01<00:00,  7.35it/s]


epoch 2: {'accuracy': 0.7107843137254902, 'f1': 0.8217522658610272}


100%|██████████| 115/115 [00:36<00:00,  3.13it/s]
100%|██████████| 13/13 [00:01<00:00,  7.70it/s]


epoch 3: {'accuracy': 0.6985294117647058, 'f1': 0.8183161004431315}


100%|██████████| 115/115 [00:36<00:00,  3.17it/s]
100%|██████████| 13/13 [00:01<00:00,  7.89it/s]

epoch 4: {'accuracy': 0.6985294117647058, 'f1': 0.8177777777777778}



