Description
I am trying to convert the Qwen2.5-VL model to Megatron (mcore) format with:
CUDA_VISIBLE_DEVICES=0 \
swift export \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir Qwen2.5-VL-7B-Instruct-mcore \
--test_convert_precision true
and I get the following error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/cli/export.py", line 5, in
[rank0]: export_main()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/export/export.py", line 53, in export_main
[rank0]: return SwiftExport(args).main()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/base.py", line 49, in main
[rank0]: result = self.run()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/export/export.py", line 40, in run
[rank0]: convert_hf2mcore(args)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/megatron/utils/convert.py", line 234, in convert_hf2mcore
[rank0]: test_convert_precision(hf_model, mg_model, template, args.test_convert_dtype)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/megatron/utils/convert.py", line 151, in test_convert_precision
[rank0]: hf_logits = hf_model(**inputs).logits
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1772, in inner
[rank0]: args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/base.py", line 1326, in pre_forward_hook
[rank0]: kwargs = to_device(self._post_encode(model, old_kwargs), model.device)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/template/qwen.py", line 348, in _post_encode
[rank0]: inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.visual, self.processor, model.config)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/base.py", line 1953, in _get_inputs_embeds_hf
[rank0]: mixed_embeds = visual(pixel_values_mixed, grid_thw=grid_thw)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py", line 428, in forward
[rank0]: hidden_states = self.patch_embed(hidden_states)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py", line 89, in forward
[rank0]: hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 725, in forward
[rank0]: return self._conv_forward(input, self.weight, self.bias)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 720, in _conv_forward
[rank0]: return F.conv3d(
[rank0]: RuntimeError
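The log cuts off right after RuntimeError, so the actual message is missing here. My guess (not confirmed) is a dtype or device mismatch between the input and the Conv3d weights, since those are the usual reasons F.conv3d raises. Below is a minimal diagnostic sketch that could recover that information; the instrument_conv3d helper and the attribute path in the usage comment are my own assumptions, not part of swift or transformers.

# Hedged diagnostic sketch (my addition, not from swift/transformers): wrap the
# patch-embed Conv3d so that input/weight dtype, device and shape are printed
# right before the call that raises, since the RuntimeError message is cut off
# in the log above.
import torch.nn as nn

def instrument_conv3d(conv: nn.Conv3d) -> None:
    original_forward = conv.forward

    def forward(x):
        print("conv3d input :", x.dtype, x.device, tuple(x.shape))
        print("conv3d weight:", conv.weight.dtype, conv.weight.device,
              tuple(conv.weight.shape))
        return original_forward(x)

    conv.forward = forward

# Usage (attribute path is an assumption and may differ across transformers
# versions; it mirrors the modules named in the traceback):
# instrument_conv3d(hf_model.visual.patch_embed.proj)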
Hardware and system info
Package Version
absl-py 2.3.1
accelerate 1.10.1
addict 2.4.0
aiofiles 24.1.0
aiohappyeyeballs 2.6.1
aiohttp 3.12.15
aiosignal 1.4.0
aliyun-python-sdk-core 2.16.0
aliyun-python-sdk-kms 2.16.5
annotated-types 0.7.0
anyio 4.10.0
apex 0.1
async-timeout 5.0.1
attrdict 2.0.1
attrs 25.3.0
av 15.1.0
binpacking 1.5.2
Brotli 1.1.0
certifi 2025.8.3
cffi 2.0.0
charset-normalizer 3.4.3
click 8.2.1
cmake 4.1.0
contourpy 1.3.2
cpm-kernels 1.0.11
crcmod 1.7
cryptography 46.0.1
cycler 0.12.1
dacite 1.9.2
datasets 3.6.0
decord 0.6.0
deepspeed 0.16.9
dill 0.3.8
distro 1.9.0
einops 0.8.1
exceptiongroup 1.3.0
fastapi 0.116.2
ffmpy 0.6.1
filelock 3.13.1
flash_attn 2.7.4.post1
fonttools 4.59.2
frozenlist 1.7.0
fsspec 2024.6.1
future 1.0.0
gradio 5.46.0
gradio_client 1.13.0
groovy 0.1.2
grpcio 1.75.0
h11 0.16.0
hf-xet 1.1.10
hjson 3.1.0
httpcore 1.0.9
httpx 0.28.1
huggingface-hub 0.35.0
idna 3.10
importlib_metadata 8.7.0
jieba 0.42.1
Jinja2 3.1.4
jiter 0.11.0
jmespath 0.10.0
joblib 1.5.2
kiwisolver 1.4.9
Markdown 3.9
markdown-it-py 4.0.0
MarkupSafe 2.1.5
matplotlib 3.10.6
mdurl 0.1.2
megatron-core 0.13.2
ml_dtypes 0.5.3
modelscope 1.30.0
mpmath 1.3.0
ms-swift 3.9.0.dev0
msgpack 1.1.1
multidict 6.6.4
multiprocess 0.70.16
networkx 3.3
ninja 1.13.0
nltk 3.9.1
numpy 1.26.4
nvidia-cublas-cu12 12.4.5.8
nvidia-cuda-cupti-cu12 12.4.127
nvidia-cuda-nvrtc-cu12 12.4.127
nvidia-cuda-runtime-cu12 12.4.127
nvidia-cudnn-cu12 9.1.0.70
nvidia-cufft-cu12 11.2.1.3
nvidia-curand-cu12 10.3.5.147
nvidia-cusolver-cu12 11.6.1.9
nvidia-cusparse-cu12 12.3.1.170
nvidia-cusparselt-cu12 0.6.2
nvidia-nccl-cu12 2.21.5
nvidia-nvjitlink-cu12 12.4.127
nvidia-nvtx-cu12 12.4.127
onnx 1.19.0
onnx-ir 0.1.9
onnxscript 0.3.1
openai 1.107.3
orjson 3.11.3
oss2 2.19.1
packaging 25.0
pandas 2.3.2
peft 0.17.1
pillow 11.0.0
pip 25.2
propcache 0.3.2
protobuf 6.32.1
psutil 7.0.0
py-cpuinfo 9.0.0
pyarrow 21.0.0
pybind11 3.0.1
pycparser 2.23
pycryptodome 3.23.0
pydantic 2.11.9
pydantic_core 2.33.2
pydub 0.25.1
Pygments 2.19.2
pyparsing 3.2.4
python-dateutil 2.9.0.post0
python-multipart 0.0.20
pytz 2025.2
PyYAML 6.0.2
qwen-vl-utils 0.0.11
regex 2025.9.1
requests 2.32.5
rich 14.1.0
rouge 1.0.1
ruff 0.13.0
safehttpx 0.1.6
safetensors 0.6.2
scipy 1.15.3
semantic-version 2.10.0
sentencepiece 0.2.1
setuptools 59.6.0
shellingham 1.5.4
simplejson 3.20.1
six 1.17.0
sniffio 1.3.1
sortedcontainers 2.4.0
starlette 0.48.0
sympy 1.13.1
tensorboard 2.20.0
tensorboard-data-server 0.7.2
tiktoken 0.11.0
tokenizers 0.22.0
tomlkit 0.13.3
torch 2.6.0+cu124
torchaudio 2.6.0+cu124
torchvision 0.21.0+cu124
tqdm 4.67.1
transformer_engine 2.6.0.post1
transformer_engine_cu12 2.6.0.post1
transformer-engine-torch 2.6.0.post1
transformers 4.56.1
transformers-stream-generator 0.0.5
triton 3.2.0
trl 0.20.0
typer 0.17.4
typing_extensions 4.15.0
typing-inspection 0.4.1
tzdata 2025.2
urllib3 2.5.0
uvicorn 0.35.0
websockets 15.0.1
Werkzeug 3.1.3
wheel 0.45.1
xxhash 3.5.0
yarl 1.20.1
zipp 3.23.0
zstandard 0.25.0
GPU: NVIDIA H200
CUDA Version: 12.9
Driver Version: 575.57.08
ERROR log detail
run sh: /opt/megatron-swift/bin/python3 /opt/megatron-swift/lib/python3.10/site-packages/swift/cli/export.py --model Qwen2.5-VL-7B-Instruct --to_mcore true --torch_dtype bfloat16 --output_dir Qwen2.5-VL-7B-Instruct-mcore --test_convert_precision true
[INFO:swift] Successfully registered /opt/megatron-swift/lib/python3.10/site-packages/swift/llm/dataset/data/dataset_info.json.
[INFO:swift] rank: 0, local_rank: 0, world_size: 1, local_world_size: 1
[INFO:swift] Loading the model using model_dir: Qwen2.5-VL-7B-Instruct
[rank0]:[W917 13:30:37.073831728 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
torch_dtype is deprecated! Use dtype instead!
[INFO:swift] Setting args.lazy_tokenize: True
[INFO:swift] args.output_dir: /scratch/e1374427/models/Qwen2.5-VL-7B-Instruct-mcore
[INFO:swift] Global seed set to 42
[INFO:swift] args: ExportArguments(model='Qwen2.5-VL-7B-Instruct', model_type='qwen2_5_vl', model_revision=None, task_type='causal_lm', torch_dtype=torch.bfloat16, attn_impl=None, new_special_tokens=[], num_labels=None, problem_type=None, rope_scaling=None, device_map=None, max_memory={}, max_model_len=None, local_repo_path=None, init_strategy=None, template='qwen2_5_vl', system=None, max_length=2048, truncation_strategy='delete', max_pixels=None, agent_template=None, norm_bbox=None, use_chat_template=True, padding_free=False, padding_side='right', loss_scale='default', sequence_parallel_size=1, response_prefix=None, template_backend='swift', dataset=[], val_dataset=[], split_dataset_ratio=0.0, data_seed=42, dataset_num_proc=1, load_from_cache_file=True, dataset_shuffle=True, val_dataset_shuffle=False, streaming=False, interleave_prob=None, stopping_strategy='first_exhausted', shuffle_buffer_size=1000, download_mode='reuse_dataset_if_exists', columns={}, strict=False, remove_unused_columns=True, model_name=None, model_author=None, custom_dataset_info=[], quant_method=None, quant_bits=None, hqq_axis=None, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_quant_storage=None, max_new_tokens=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, num_beams=1, stream=False, stop_words=[], logprobs=False, top_logprobs=None, ckpt_dir=None, lora_modules=[], tuner_backend='peft', train_type='lora', adapters=[], external_plugins=[], seed=42, model_kwargs={}, load_args=True, load_data_args=False, packing=False, packing_length=None, lazy_tokenize=True, cached_dataset=[], custom_register_path=[], use_hf=False, hub_token=None, ddp_timeout=18000000, ddp_backend=None, ignore_args_error=False, use_swift_lora=False, merge_lora=False, safe_serialization=True, max_shard_size='5GB', output_dir='/scratch/e1374427/models/Qwen2.5-VL-7B-Instruct-mcore', quant_n_samples=256, quant_batch_size=1, group_size=128, to_cached_dataset=False, to_ollama=False, to_mcore=True, to_hf=False, mcore_model=None, mcore_adapters=[], thread_count=None, test_convert_precision=True, test_convert_dtype=torch.float32, push_to_hub=False, hub_model_id=None, hub_private_repo=False, commit_message='update files', to_peft_format=False, exist_ok=False)
[INFO:swift] Start time of running main: 2025-09-17 13:30:37.745367
[INFO:swift] swift.version: 3.9.0.dev0
[INFO:swift] Run the command: git -C /home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM fetch
[INFO:swift] Run the command: git -C /home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM checkout core_r0.13.0
Already on 'core_r0.13.0'
Your branch is up to date with 'origin/core_r0.13.0'.
[INFO:swift] Run the command: git -C /home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM pull
Already up to date.
[INFO:swift] local_repo_path: /home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM
[INFO:swift] Patch apply_rotary_pos_emb successfully applied.
[INFO:swift] Patch _SplitAlongDim successfully applied.
[INFO:swift] Patch FileSystemReader successfully applied.
[INFO:swift] Patch peft successfully applied.
[INFO:swift] megatron.core.version: 0.13.2
[INFO:swift] Loading the model using model_dir: Qwen2.5-VL-7B-Instruct
The image processor of type Qwen2VLImageProcessor is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with use_fast=False. Note that this behavior will be extended to all models in a future release.
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
torch_dtype is deprecated! Use dtype instead!
megatron_model_meta ............................. MMGPTMegatronModelMeta(megatron_model_type='qwen2_5_vl', model_types=['qwen2_5_vl'], convert_mcore2hf=<function convert_mcore2hf_qwen2_5_vl at 0x1551252a5ab0>, convert_hf2mcore=<function convert_hf2mcore_qwen2_5_vl at 0x1551252a5a20>, model_cls=<class 'swift.megatron.model.mm_gpt_model.MultimodalGPTModel'>, convert_hf_config=<function convert_gpt_hf_config at 0x15512528ed40>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x15512528da20>, visual_cls=<class 'swift.megatron.model.mm_gpt.qwen.Qwen2_5VL_Vit'>, extra_args_provider=None)
memory_snapshot_path ............................ snapshot.pickle
merge_file ...................................... None
micro_batch_size ................................ 1
microbatch_group_size_per_vp_stage .............. None
mid_level_dataset_surplus ....................... 0.005
min_loss_scale .................................. 1.0
min_lr .......................................... 0.0
mlp_chunks_for_prefill .......................... 1
mlp_padding_free ................................ False
mmap_bin_files .................................. True
mock_data ....................................... False
model_info ...................................... ModelInfo(model_type='qwen2_5_vl', model_dir='/scratch/e1374427/models/Qwen2.5-VL-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)
model_meta ...................................... ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x155207dda8c0>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])
modules_to_save ................................. []
moe_apply_probs_on_input ........................ False
moe_aux_loss_coeff .............................. 0.0
moe_deepep_num_sms .............................. 20
moe_enable_deepep ............................... False
moe_expert_capacity_factor ...................... None
moe_extended_tp ................................. False
moe_ffn_hidden_size ............................. None
moe_grouped_gemm ................................ False
moe_input_jitter_eps ............................ None
moe_layer_freq .................................. 1
moe_layer_recompute ............................. False
moe_pad_expert_input_to_capacity ................ False
moe_per_layer_logging ........................... False
moe_permute_fusion .............................. False
moe_router_bias_update_rate ..................... 0.001
moe_router_dtype ................................ fp32
moe_router_enable_expert_bias ................... False
moe_router_force_load_balancing ................. False
moe_router_group_topk ........................... None
moe_router_load_balancing_type .................. aux_loss
moe_router_num_groups ........................... None
moe_router_padding_for_fp8 ...................... False
moe_router_pre_softmax .......................... False
moe_router_score_function ....................... softmax
moe_router_topk ................................. 2
moe_router_topk_scaling_factor .................. None
moe_shared_expert_intermediate_size ............. None
moe_shared_expert_overlap ....................... False
moe_token_dispatcher_type ....................... alltoall
moe_token_drop_policy ........................... probs
moe_upcycling_granularity ....................... 1
moe_use_legacy_grouped_gemm ..................... False
moe_use_upcycling ............................... False
moe_z_loss_coeff ................................ None
mrope_section ................................... [16, 24, 24]
mscale .......................................... 1.0
mscale_all_dim .................................. 1.0
mtp_loss_scaling_factor ......................... 0.1
mtp_num_layers .................................. None
multi_latent_attention .......................... False
nccl_all_reduce_for_prefill ..................... False
nccl_communicator_config_path ................... None
nccl_ub ......................................... False
no_load_optim ................................... True
no_load_rng ..................................... True
no_persist_layer_norm ........................... False
no_rope_freq .................................... None
no_save_optim ................................... True
no_save_rng ..................................... True
non_persistent_ckpt_type ........................ None
non_persistent_global_ckpt_dir .................. None
non_persistent_local_ckpt_algo .................. fully_parallel
non_persistent_local_ckpt_dir ................... None
non_persistent_save_interval .................... None
norm_epsilon .................................... 1e-06
normalization ................................... RMSNorm
num_attention_heads ............................. 28
num_channels .................................... 3
num_classes ..................................... 1000
num_dataset_builder_threads ..................... 1
num_distributed_optimizer_instances ............. 1
num_experts ..................................... None
num_layers ...................................... 28
num_layers_at_end_in_bf16 ....................... 1
num_layers_at_start_in_bf16 ..................... 1
num_layers_per_virtual_pipeline_stage ........... None
num_query_groups ................................ 4
num_virtual_stages_per_pipeline_rank ............ None
num_workers ..................................... 4
object_storage_cache_path ....................... None
one_logger_async ................................ False
one_logger_project .............................. megatron-lm
one_logger_run_name ............................. None
onnx_safe ....................................... None
openai_gelu ..................................... False
optimizer ....................................... adam
optimizer_cpu_offload ........................... False
optimizer_offload_fraction ...................... 1.0
original_max_position_embeddings ................ None
output_bert_embeddings .......................... False
overlap_cpu_optimizer_d2h_h2d ................... False
overlap_grad_reduce ............................. False
overlap_p2p_comm ................................ False
overlap_p2p_comm_warmup_flush ................... False
overlap_param_gather ............................ False
overlap_param_gather_with_optimizer_step ........ False
override_opt_param_scheduler .................... False
padded_vocab_size ............................... 152064
padding_free .................................... True
params_dtype .................................... torch.bfloat16
partial_rotary_factor ........................... None
patch_dim ....................................... 16
per_split_data_args_path ........................ None
perform_initialization .......................... False
pin_cpu_grads ................................... True
pin_cpu_params .................................. True
pipeline_model_parallel_comm_backend ............ None
pipeline_model_parallel_layout .................. None
pipeline_model_parallel_size .................... 1
pipeline_model_parallel_split_rank .............. None
position_embedding_type ......................... mrope
pretrained_checkpoint ........................... None
profile ......................................... False
profile_ranks ................................... [0]
profile_step_end ................................ 12
profile_step_start .............................. 10
q_lora_rank ..................................... None
qk_head_dim ..................................... 128
qk_l2_norm ...................................... False
qk_layernorm .................................... False
qk_pos_emb_head_dim ............................. 64
query_in_block_prob ............................. 0.1
rampup_batch_size ............................... None
rank ............................................ 0
recompute_granularity ........................... selective
recompute_method ................................ None
recompute_modules ............................... ['core_attn']
recompute_num_layers ............................ None
record_memory_history ........................... False
ref_adapter_load ................................ None
ref_load ........................................ None
reference_free .................................. False
relative_attention_max_distance ................. 128
relative_attention_num_buckets .................. 32
replication ..................................... False
replication_factor .............................. 2
replication_jump ................................ None
rerun_mode ...................................... disabled
reset_attention_mask ............................ False
reset_position_ids .............................. False
result_rejected_tracker_filename ................ None
retriever_report_topk_accuracies ................ []
retriever_score_scaling ......................... False
retriever_seq_length ............................ 256
retro_add_retriever ............................. False
retro_attention_gate ............................ 1
retro_cyclic_train_iters ........................ None
retro_encoder_attention_dropout ................. 0.1
retro_encoder_hidden_dropout .................... 0.1
retro_encoder_layers ............................ 2
retro_num_neighbors ............................. 2
retro_num_retrieved_chunks ...................... 2
retro_project_dir ............................... None
retro_verify_neighbor_count ..................... True
reuse_grad_buf_for_mxfp8_param_ag ............... False
rope_scaling .................................... {'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}
rope_scaling_factor ............................. 8.0
rotary_base ..................................... 1000000
rotary_interleaved .............................. False
rotary_percent .................................. 1.0
rotary_scaling_factor ........................... 1.0
rotary_seq_len_interpolation_factor ............. None
rpo_alpha ....................................... None
run_workload_inspector_server ................... False
sample_rate ..................................... 1.0
save ............................................ /scratch/e1374427/models/Qwen2.5-VL-7B-Instruct-mcore
save_interval ................................... 500
scatter_gather_tensors_in_pipeline .............. True
seed ............................................ 42
seq_length ...................................... 128000
sequence_parallel ............................... False
sft ............................................. False
sft_tokenizer_prompt_format ..................... nemotron-h-aligned
sgd_momentum .................................... 0.9
short_seq_prob .................................. 0.1
skip_train ...................................... False
skipped_train_samples ........................... 0
spec ............................................ None
split ........................................... None
squared_relu .................................... False
start_weight_decay .............................. 0.1
straggler_ctrlr_port ............................ 65535
straggler_minmax_count .......................... 1
suggested_communication_unit_size ............... None
swiglu .......................................... True
swin_backbone_type .............................. tiny
symmetric_ar_type ............................... None
target_modules .................................. ['all-linear']
target_regex .................................... None
te_rng_tracker .................................. False
tensor_model_parallel_size ...................... 1
tensorboard_dir ................................. /scratch/e1374427/models/Qwen2.5-VL-7B-Instruct-mcore/runs
tensorboard_log_interval ........................ 1
tensorboard_queue_size .......................... 50
test_data_path .................................. None
test_mode ....................................... False
tiktoken_num_special_tokens ..................... 1000
tiktoken_pattern ................................ None
tiktoken_special_tokens ......................... None
timing_log_level ................................ 0
timing_log_option ............................... minmax
titles_data_path ................................ None
tokenizer_model ................................. None
tokenizer_type .................................. None
torch_dtype ..................................... torch.bfloat16
torch_fsdp2_reshard_after_forward ............... True
tp_comm_bootstrap_backend ....................... nccl
tp_comm_bulk_dgrad .............................. True
tp_comm_bulk_wgrad .............................. True
tp_comm_overlap ................................. False
tp_comm_overlap_ag .............................. True
tp_comm_overlap_cfg ............................. None
tp_comm_overlap_rs .............................. True
tp_comm_overlap_rs_dgrad ........................ False
tp_comm_split_ag ................................ True
tp_comm_split_rs ................................ True
train_data_path ................................. None
train_iters ..................................... None
train_samples ................................... None
train_sync_interval ............................. None
train_type ...................................... full
trainable_parameters ............................ []
trainable_parameters_regex ...................... None
transformer_impl ................................ transformer_engine
transformer_pipeline_model_parallel_size ........ 1
untie_embeddings_and_output_weights ............. True
use_checkpoint_args ............................. False
use_checkpoint_opt_param_scheduler .............. False
use_cpu_initialization .......................... True
use_custom_fsdp ................................. False
use_dist_ckpt ................................... True
use_dist_ckpt_deprecated ........................ False
use_distributed_optimizer ....................... True
use_flash_attn .................................. False
use_legacy_models ............................... False
use_mp_args_from_checkpoint_args ................ False
use_one_sent_docs ............................... False
use_persistent_ckpt_worker ...................... False
use_precision_aware_optimizer ................... False
use_pytorch_profiler ............................ False
use_ring_exchange_p2p ........................... False
use_rope_scaling ................................ False
use_rotary_position_embeddings .................. False
use_rslora ...................................... False
use_shared_expert_gate .......................... False
use_sharp ....................................... False
use_tokenizer_model_from_checkpoint_args ........ True
use_torch_fsdp2 ................................. False
use_torch_optimizer_for_cpu_offload ............. False
use_tp_pp_dp_mapping ............................ False
v_head_dim ...................................... 128
valid_data_path ................................. None
variable_seq_lengths ............................ False
virtual_pipeline_model_parallel_size ............ None
vision_backbone_type ............................ vit
vision_pretraining .............................. False
vision_pretraining_type ......................... classify
vit_gradient_checkpointing ...................... True
vocab_extra_ids ................................. 0
vocab_file ...................................... None
vocab_size ...................................... None
wandb_exp_name ..................................
wandb_project ...................................
wandb_save_dir ..................................
weight_decay .................................... 0.1
weight_decay_incr_style ......................... constant
wgrad_deferral_limit ............................ 0
world_size ...................................... 1
yaml_cfg ........................................ None
-------------------- end of arguments ---------------------
INFO:megatron.core.num_microbatches_calculator:setting number of microbatches to constant 16
setting tensorboard ...
WARNING: one_logger package is required to enable e2e metrics tracking. please go to https://confluence.nvidia.com/display/MLWFO/Package+Repositories for details to install it
WARNING:megatron.core.rerun_state_machine:RerunStateMachine initialized in mode disabled
torch distributed is already initialized, skipping initialization ...
initialized tensor model parallel with size 1
initialized pipeline model parallel with size 1
setting random seeds to 42 ...
compiling dataset index builder ...
make: Entering directory '/home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/datasets'
make: Nothing to be done for 'default'.
make: Leaving directory '/home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/datasets'
done with dataset index builder. Compilation time: 0.030 seconds
WARNING: constraints for invoking optimized fused softmax kernel are not met. We default back to unfused kernel invocations.
compiling and loading fused kernels ...
done with compiling and loading fused kernels. Compilation time: 0.004 seconds
building GPT model ...
/home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/transformer/transformer_config.py:837: UserWarning: If you are using transformer_engine as the transformer implementation, the core_attn is from transformer_engine and may be the fused version. For fused attention, you have no need to set 'core_attn' to recompute. Please check that the core_attn recompute is really needed.
warnings.warn(
/home/svu/e1374427/.cache/modelscope/hub/_github/Megatron-LM/megatron/core/models/gpt/gpt_layer_specs.py:102: UserWarning: The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated and will be removed soon. Please update your code accordingly.
warnings.warn(
/opt/megatron-swift/lib/python3.10/site-packages/transformer_engine/pytorch/cpu_offload.py:673: DeprecationWarning: Offloading weights is deprecated. Using offload_weights=True does not have any effect.
warnings.warn(
[INFO:swift] Loading the model using model_dir: /scratch/e1374427/models/Qwen2.5-VL-7B-Instruct
[INFO:swift] model_kwargs: {'device_map': 'cuda:0'}
[INFO:swift] Megatron model created successfully.
[INFO:swift] n_parameter: 729
[INFO:swift] total_sum: 116114079.54245949
[INFO:swift] zero_count: 0
[INFO:swift] n_parameter: 589
[INFO:swift] total_sum: 116114079.25612068
[INFO:swift] zero_count: 0
[rank0]: Traceback (most recent call last):
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/cli/export.py", line 5, in
[rank0]: export_main()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/export/export.py", line 53, in export_main
[rank0]: return SwiftExport(args).main()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/base.py", line 49, in main
[rank0]: result = self.run()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/export/export.py", line 40, in run
[rank0]: convert_hf2mcore(args)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/megatron/utils/convert.py", line 234, in convert_hf2mcore
[rank0]: test_convert_precision(hf_model, mg_model, template, args.test_convert_dtype)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/megatron/utils/convert.py", line 151, in test_convert_precision
[rank0]: hf_logits = hf_model(**inputs).logits
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1845, in _call_impl
[rank0]: return inner()
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1772, in inner
[rank0]: args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/base.py", line 1326, in pre_forward_hook
[rank0]: kwargs = to_device(self._post_encode(model, old_kwargs), model.device)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/template/qwen.py", line 348, in _post_encode
[rank0]: inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.visual, self.processor, model.config)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/swift/llm/template/base.py", line 1953, in _get_inputs_embeds_hf
[rank0]: mixed_embeds = visual(pixel_values_mixed, grid_thw=grid_thw)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py", line 428, in forward
[rank0]: hidden_states = self.patch_embed(hidden_states)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py", line 89, in forward
[rank0]: hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 725, in forward
[rank0]: return self._conv_forward(input, self.weight, self.bias)
[rank0]: File "/opt/megatron-swift/lib/python3.10/site-packages/torch/nn/modules/conv.py", line 720, in _conv_forward
[rank0]: return F.conv3d(
[rank0]: RuntimeError
[rank0]:[W917 13:32:34.040608152 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())