In [1]:
import argparse
import os

import numpy as np
import mindspore as ms
from mindspore import nn, ops
from mindnlp.core import value_and_grad
from mindspore.train import Model
from mindspore import context, Tensor
from mindnlp.transformers import BertGenerationConfig
from mindnlp.transformers import BertGenerationDecoder
from loaders.coco_full_loader import get_loader
from mindnlp.core.optim import AdamW
from tqdm.notebook import tqdm

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
cannot found `mindformers.experimental`, please install dev version by
`pip install git+https://gitee.com/mindspore/mindformers` 
or remove mindformers by 
`pip uninstall mindformers`
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.319 seconds.
Prefix dict has been built successfully.


In [2]:
def train_decoder(args):
    """Fine-tune a BertGeneration decoder on CLIP features, saving it after every epoch.

    The decoder is resumed from ``{args.saved_model_path}/decoder_model`` when that
    path exists; otherwise it is created from the pretrained
    ``google/bert_for_seq_generation_L-24_bbc_encoder`` weights with decoder mode
    and cross-attention enabled. The validation loss is reported once before
    training starts (as a baseline) and again after every epoch.

    Args:
        args: Namespace-like object providing ``saved_model_path``, ``lr``,
            ``weight_decay`` and ``num_epochs``.

    Raises:
        RuntimeError: If the training loader yields no batches during an epoch,
            so no average loss can be computed.
    """
    # Configure the runtime environment.
    context.set_context(device_target="Ascend")

    # Build the model: start from the pretrained weights unless a checkpoint
    # from a previous run is already on disk.
    checkpoint_dir = f"{args.saved_model_path}/decoder_model"
    if not os.path.exists(checkpoint_dir):
        bert_config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
        bert_config.is_decoder = True
        bert_config.add_cross_attention = True
        bert_config.return_dict = True
        bert_model = BertGenerationDecoder.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder",
                                                           config=bert_config)
    else:
        bert_model = BertGenerationDecoder.from_pretrained(checkpoint_dir)

    optimizer = AdamW(bert_model.trainable_params(), lr=args.lr, weight_decay=args.weight_decay)

    # Forward pass: returns only the loss so it can be differentiated.
    def forward_fn(input_ids, attention_mask, position_ids, clip_embeds, labels):
        loss = bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=clip_embeds,
            labels=labels
        ).loss
        return loss

    # Gradient function over the trainable parameters.
    # NOTE(review): the result of grad_fn is used directly as the loss and
    # optimizer.step() takes no gradients, so value_and_grad presumably attaches
    # the gradients to the parameters itself - confirm against mindnlp.core.
    grad_fn = value_and_grad(forward_fn, bert_model.trainable_params())

    # One optimisation step: clear gradients, run forward/backward, update weights.
    def train_step(input_ids, attention_mask, position_ids, clip_embeds, labels):
        optimizer.zero_grad()
        loss = grad_fn(input_ids, attention_mask, position_ids, clip_embeds, labels)
        optimizer.step()
        return loss

    # Baseline validation loss before any update; report it instead of
    # silently discarding a full validation pass.
    baseline_val_loss = evaluate(bert_model)
    print(f'Initial Validation Loss: {baseline_val_loss}')

    # Training loop.
    for epoch in range(args.num_epochs):
        # The training loader is rebuilt at the start of every epoch.
        train_dataset = get_loader(train=True, clip_backbone='ViT-B32')

        # evaluate() leaves the model in eval mode, so switch back each epoch.
        bert_model.set_train()
        total_loss = 0
        steps = 0

        for batch in tqdm(train_dataset.create_dict_iterator()):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label_ids']
            clip_embeds = batch['clip_features']

            # Build position_ids of shape (N, seq_length): take 0..seq_length-1,
            # add a leading axis, then repeat it N times along axis 0. The repeat
            # materialises real data rather than a broadcast view.
            N, seq_length = input_ids.shape
            position_ids = Tensor(np.arange(seq_length)[None].repeat(N, axis=0), ms.int32)

            # Repeat every element of clip_embeds twice along axis 1, then insert
            # a new axis at position 1.
            # NOTE(review): presumably this widens (N, D) CLIP features to
            # (N, 1, 2*D) to match the decoder hidden size - TODO confirm.
            clip_extended_embed = ops.repeat_elements(clip_embeds, rep=2, axis=1)
            clip_extended_embed = ops.expand_dims(clip_extended_embed, 1)

            loss = train_step(input_ids, attention_mask, position_ids,
                              clip_extended_embed, labels)

            total_loss += loss.asnumpy()
            steps += 1

        # An empty loader would otherwise surface as a bare ZeroDivisionError.
        if steps == 0:
            raise RuntimeError(
                "Training loader produced no batches; cannot compute an average loss."
            )

        avg_loss = total_loss / steps
        print(f'Epoch {epoch + 1}, Average Loss: {avg_loss}')

        # Validation.
        val_loss = evaluate(bert_model)
        print(f'Validation Loss: {val_loss}')

        # Save the model after every epoch so training can be resumed.
        bert_model.save_pretrained(checkpoint_dir)


In [3]:
def evaluate(model):
    """Compute the mean per-batch loss of ``model`` over the validation loader.

    The model is switched to evaluation mode via ``set_train(False)`` and is left
    in that mode; a caller that keeps training must re-enable training mode.

    Args:
        model: Model callable with ``input_ids``, ``attention_mask``,
            ``position_ids``, ``encoder_hidden_states`` and ``labels`` keyword
            arguments, whose result exposes a ``.loss`` attribute.

    Returns:
        The sum of the per-batch losses (read with ``loss.asnumpy()``) divided by
        the number of batches.

    Raises:
        RuntimeError: If the validation loader yields no batches.
    """
    model.set_train(False)
    total_loss = 0
    steps = 0
    dataset = get_loader(train=False, clip_backbone='ViT-B32')

    for batch in tqdm(dataset.create_dict_iterator()):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label_ids']
        clip_embeds = batch['clip_features']

        # position_ids has shape (N, seq_length): each row is 0..seq_length-1.
        N, seq_length = input_ids.shape
        position_ids = Tensor(np.arange(seq_length)[None].repeat(N, axis=0), ms.int32)

        # Repeat every element of clip_embeds twice along axis 1, then insert a
        # new axis at position 1.
        # NOTE(review): presumably widens the CLIP features to the decoder hidden
        # size and adds a length-1 sequence axis - TODO confirm.
        clip_extended_embed = ops.repeat_elements(clip_embeds, rep=2, axis=1)
        clip_extended_embed = ops.expand_dims(clip_extended_embed, 1)

        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=clip_extended_embed,
            labels=labels
        ).loss

        total_loss += loss.asnumpy()
        steps += 1

    # An empty loader would otherwise surface as a bare ZeroDivisionError.
    if steps == 0:
        raise RuntimeError(
            "Validation loader produced no batches; cannot compute an average loss."
        )

    return total_loss / steps

In [None]:
import os
import sys
import argparse

def get_args_in_notebook():
    """Return the fixed hyperparameters used when running inside a notebook.

    No command line is parsed; the values are defined directly here.

    Returns:
        argparse.Namespace with ``lr``, ``weight_decay``, ``num_epochs`` and
        ``trained_path`` attributes.
    """
    notebook_defaults = {
        'lr': 1e-5,
        'weight_decay': 1e-4,
        'num_epochs': 24,
        'trained_path': './trained_models/COCO/',
    }
    return argparse.Namespace(**notebook_defaults)

if 'ipykernel' not in sys.modules and 'IPython' not in sys.modules:
    # Neither kernel module is loaded: treat this as a command-line run and
    # parse the arguments normally.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', default=1e-5, type=float)
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--num_epochs', default=1, type=int)
    parser.add_argument('--trained_path', default='./trained_models/COCO/', type=str)
    args = parser.parse_args()
else:
    # Running under Jupyter Notebook / IPython: use the values defined in code
    # instead of parsing a command line.
    args = get_args_in_notebook()

# Derive the directory the model is saved to from the base output path.
args.saved_model_path = os.path.join(args.trained_path, 'ViT-B32/')

# Create the directory if it does not exist yet.
os.makedirs(args.saved_model_path, exist_ok=True)

# Train the decoder.
train_decoder(args)