10 changes: 10 additions & 0 deletions docs/source/Instruction/Megatron-SWIFT训练.md
@@ -205,6 +205,16 @@ swift export \
```
- Note: The `mcore_adapters` folder contains an `args.json` file. During conversion, the `mcore_model` and LoRA-related parameter information is read from this file, the `mcore_model` and `mcore_adapters` are merged (merge-lora) into complete weights, and the result is finally converted into HF-format weights.

If you only want to merge the LoRA weights without converting them to HF-format weights (for example, to use them in subsequent DPO training), you can use the following script:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--mcore_adapters megatron_output/Qwen2.5-7B-Instruct/vx-xxx \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx-mcore \
--test_convert_precision true
```

## Benchmark

12 changes: 12 additions & 0 deletions docs/source_en/Instruction/Megatron-SWIFT-Training.md
@@ -213,6 +213,18 @@ swift export \

- Note: The `mcore_adapters` folder contains an `args.json` file. During the conversion process, parameters related to `mcore_model` and LoRA will be loaded from this file. The system will then perform a merge-lora operation between the `mcore_model` and `mcore_adapters` to obtain the complete model weights, and finally convert them into HuggingFace (HF) format.

If you only want to merge the LoRA weights without converting them to Hugging Face format (for example, to use them in subsequent DPO training), you can use the following script:

```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
--mcore_adapters megatron_output/Qwen2.5-7B-Instruct/vx-xxx \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir megatron_output/Qwen2.5-7B-Instruct/vx-xxx-mcore \
--test_convert_precision true
```

## Benchmark
The speed comparison of full-parameter training for Dense/MoE models using `megatron sft` and `swift sft` on a single machine with eight A800 GPUs is shown below. The corresponding scripts can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron/benchmark).

6 changes: 3 additions & 3 deletions swift/llm/export/export.py
@@ -32,12 +32,12 @@ def run(self):
export_to_ollama(args)
elif args.to_cached_dataset:
export_cached_dataset(args)
elif args.to_hf or args.mcore_adapters and args.to_mcore:
Reviewer comment (severity: medium):

The boolean logic `args.to_hf or args.mcore_adapters and args.to_mcore` can be hard to read due to operator precedence. While `and` has higher precedence than `or`, adding parentheses would make the intent much clearer and prevent potential misinterpretations.

Suggested change
elif args.to_hf or args.mcore_adapters and args.to_mcore:
elif args.to_hf or (args.mcore_adapters and args.to_mcore):
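For readability's sake, here is a minimal standalone sketch (illustrative variable names only, not project code) showing how Python actually groups the unparenthesized condition:

```python
# `and` binds tighter than `or`, so `a or b and c` parses as `a or (b and c)`.
to_hf, mcore_adapters, to_mcore = True, ["some/adapter"], False

unparenthesized = to_hf or mcore_adapters and to_mcore
explicit = to_hf or (mcore_adapters and to_mcore)
alternative = (to_hf or mcore_adapters) and to_mcore

print(unparenthesized, explicit, alternative)  # True True False
assert unparenthesized == explicit   # the suggested parentheses do not change behavior
assert not alternative               # the other grouping would skip the branch here
```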

from swift.megatron import convert_mcore2hf
convert_mcore2hf(args)
elif args.to_mcore:
from swift.megatron import convert_hf2mcore
convert_hf2mcore(args)
elif args.to_hf:
from swift.megatron import convert_mcore2hf
convert_mcore2hf(args)
elif args.push_to_hub:
model_dir = args.adapters and args.adapters[0] or args.model_dir
assert model_dir, f'model_dir: {model_dir}'
5 changes: 3 additions & 2 deletions swift/llm/infer/utils.py
@@ -143,7 +143,8 @@ def prepare_adapter(args, model, adapters=None):

def prepare_model_template(args, **kwargs):
model, processor = args.get_model_processor(**kwargs)
model = prepare_adapter(args, model)
template = args.get_template(processor)
update_generation_config_eos_token(model.generation_config, template)
if model is not None:
model = prepare_adapter(args, model)
update_generation_config_eos_token(model.generation_config, template)
return model, template
4 changes: 1 addition & 3 deletions swift/llm/train/tuner.py
@@ -336,10 +336,8 @@ def prepare_model(cls, args, model, *, template=None, train_dataset=None, task_t
tuner: Tuner = extra_tuners[args.train_type]
else:
tuner = Swift
kwargs = {}
assert not args.adapters or len(args.adapters) == 1, f'args.adapters: {args.adapters}'
model = tuner.from_pretrained(
model, args.resume_from_checkpoint or args.adapters[0], is_trainable=True, **kwargs)
model = tuner.from_pretrained(model, args.resume_from_checkpoint or args.adapters[0], is_trainable=True)
else:
if args.train_type in extra_tuners:
tuner: Tuner = extra_tuners[args.train_type]
17 changes: 14 additions & 3 deletions swift/megatron/init.py
@@ -659,9 +659,9 @@ def _patch_peft_ModulesToSaveWrapper():
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from .utils import tuners_sharded_state_dict

ModulesToSaveWrapper = peft_module.ModulesToSaveWrapper
OriginModulesToSaveWrapper = peft_module.ModulesToSaveWrapper

class NewModulesToSaveWrapper(ModulesToSaveWrapper):
class ModulesToSaveWrapper(OriginModulesToSaveWrapper):

def __init__(self, module_to_save, *args, **kwargs):
tp_group = getattr(module_to_save, 'tp_group', None)
@@ -694,7 +694,7 @@ def sharded_state_dict(
f'{prefix}modules_to_save.default.weight']
return sharded_state_dict

peft_module.ModulesToSaveWrapper = NewModulesToSaveWrapper
peft_module.ModulesToSaveWrapper = ModulesToSaveWrapper


def _patch_TransformerLayer():
@@ -790,9 +790,20 @@ def _worker(plan_shard):
FileSystemReader.read_data = read_data


def _patch_TELinear():
from megatron.core.extensions.transformer_engine import TELinear

def __repr__(self):
return (f'{type(self).__name__}(in_features={self.in_features}, '
f'out_features={self.out_features}, bias={self.use_bias}, TP={self.tp_size})')

TELinear.__repr__ = __repr__


def _patch_megatron():
_patch_flash_attn()
_patch_transformer_engine()
_patch_TELinear()
_patch__batched_p2p_ops()
_patch_mla_attention()
_patch_TEGroupedLinear()
51 changes: 29 additions & 22 deletions swift/megatron/utils/convert.py
@@ -168,7 +168,7 @@ def convert_hf2mcore(args: ExportArguments) -> None:
logger.info(f'megatron_config: {kwargs}')
_check_megatron_kwargs(kwargs)
current_convert_kwargs = convert_kwargs.copy()
if hf_model.model_info.is_moe_model:
if args.model_info.is_moe_model:
current_convert_kwargs['moe_grouped_gemm'] = True
megatron_args = MegatronArguments(
**kwargs, **current_convert_kwargs, save=args.output_dir, torch_dtype=args.torch_dtype)
@@ -191,25 +191,22 @@ def convert_hf2mcore(args: ExportArguments) -> None:

def convert_mcore2hf(args: ExportArguments) -> None:
from swift.megatron import prepare_mcore_model, adapter_state_dict_context
hf_model, template = prepare_model_template(args)
hf_model, template = prepare_model_template(args, load_model=args.to_hf)
processor = template.processor
if args.thread_count is None:
checkpoint_size = sum(get_n_params_grads(hf_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
args.thread_count = max(math.ceil(checkpoint_size / 10), 2) # 10GB
patch_torch_dist_shard(args.thread_count)

megatron_model_meta = get_megatron_model_meta(args.model_type)
assert megatron_model_meta is not None, f'Model: {args.model} is not supported.'
kwargs = megatron_model_meta.convert_hf_config(processor.model_info.config)
logger.info(f'megatron_config: {kwargs}')
_check_megatron_kwargs(kwargs)
current_convert_kwargs = convert_kwargs.copy()
if hf_model.model_info.is_moe_model:
if args.model_info.is_moe_model:
current_convert_kwargs['moe_grouped_gemm'] = True
megatron_args = MegatronArguments(
**kwargs,
**current_convert_kwargs,
load=args.mcore_model,
save=args.output_dir if args.to_mcore else None,
adapter_load=args.mcore_adapters[0] if args.mcore_adapters else None,
torch_dtype=args.torch_dtype)
patch_megatron_tokenizer(processor)
@@ -226,18 +223,28 @@ def convert_mcore2hf(args: ExportArguments) -> None:
logger.info('Merge LoRA...')
mg_model = peft_model.merge_and_unload()
logger.info('Megatron model created successfully.')
megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
if args.test_convert_precision:
test_convert_precision(hf_model, mg_model, template)
del mg_model
logger.info('Successfully transferred MG model weights to HF model.')
ckpt_dir = megatron_args.load if megatron_args.adapter_load is None else megatron_args.adapter_load
save_checkpoint(
hf_model,
processor,
args.output_dir,
safe_serialization=args.safe_serialization,
model_dirs=[ckpt_dir, args.model_dir],
max_shard_size=args.max_shard_size,
additional_saved_files=hf_model.model_meta.additional_saved_files)
logger.info(f'Successfully saved HF model weights in `{args.output_dir}`.')
if args.to_hf:
megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
if args.test_convert_precision:
test_convert_precision(hf_model, mg_model, template)
del mg_model
logger.info('Successfully transferred MG model weights to HF model.')
ckpt_dir = megatron_args.load if megatron_args.adapter_load is None else megatron_args.adapter_load
save_checkpoint(
hf_model,
processor,
args.output_dir,
safe_serialization=args.safe_serialization,
model_dirs=[ckpt_dir, args.model_dir],
max_shard_size=args.max_shard_size,
additional_saved_files=hf_model.model_meta.additional_saved_files)
logger.info(f'Successfully saved HF model weights in `{args.output_dir}`.')
elif args.to_mcore:
if args.thread_count is None:
checkpoint_size = sum(get_n_params_grads(mg_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
args.thread_count = max(math.ceil(checkpoint_size / 10), 2) # 10GB
patch_torch_dist_shard(args.thread_count)

args.save_args()
mg_save_checkpoint(1, [mg_model], None, None, 0)
logger.info(f'Successfully saved Megatron model weights in `{args.output_dir}`.')
Comment on lines +226 to +250
Reviewer comment (severity: high):

The use of `elif args.to_mcore:` makes the `to_hf` and `to_mcore` operations mutually exclusive. If a user provides both `--to_hf` and `--to_mcore` flags, only the `to_hf` operation will be performed. This might not be the intended behavior. If both operations should be supported in a single run, this `elif` should be an `if`.

However, changing `elif` to `if` will cause a `NameError`, because `del mg_model` is inside the `if args.to_hf:` block.

To support both flags concurrently, you could refactor this section to perform both conversions if requested, and then delete the model as suggested below.

    if args.to_hf:
        megatron_model_meta.convert_mcore2hf(hf_model, mg_model)
        if args.test_convert_precision:
            test_convert_precision(hf_model, mg_model, template)
        logger.info('Successfully transferred MG model weights to HF model.')
        ckpt_dir = megatron_args.load if megatron_args.adapter_load is None else megatron_args.adapter_load
        save_checkpoint(
            hf_model,
            processor,
            args.output_dir,
            safe_serialization=args.safe_serialization,
            model_dirs=[ckpt_dir, args.model_dir],
            max_shard_size=args.max_shard_size,
            additional_saved_files=hf_model.model_meta.additional_saved_files)
        logger.info(f'Successfully saved HF model weights in `{args.output_dir}`.')
    if args.to_mcore:
        if args.thread_count is None:
            checkpoint_size = sum(get_n_params_grads(mg_model)[0]) * torch.finfo(args.torch_dtype).bits // 8e9
            args.thread_count = max(math.ceil(checkpoint_size / 10), 2)  # 10GB
        patch_torch_dist_shard(args.thread_count)

        args.save_args()
        mg_save_checkpoint(1, [mg_model], None, None, 0)
        logger.info(f'Successfully saved Megatron model weights in `{args.output_dir}`.')
    del mg_model
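As a side note on the `thread_count` heuristic carried over in the suggestion above, here is a small back-of-the-envelope sketch of how the checkpoint size estimate maps to a thread count (the 7B parameter count is an illustrative assumption, not taken from the PR):

```python
import math

# Mirror of the heuristic: parameter count times bits per parameter, expressed
# in units of 1e9 bytes (GB), then roughly one writer thread per 10 GB, with a
# floor of 2 threads.
n_params = 7e9                                  # hypothetical 7B-parameter model
bits = 16                                       # torch.bfloat16
checkpoint_size_gb = n_params * bits // 8e9     # -> 14.0
thread_count = max(math.ceil(checkpoint_size_gb / 10), 2)
print(checkpoint_size_gb, thread_count)         # 14.0 2
```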
