Skip to content

Qwen2.5-Omni-7B seq_cls, KeyError: 'input_ids' #5344

@momo0129

Description

@momo0129

Describe the bug
swift.version: 3.8.0.dev0
"transformers_version": "4.54.0.dev0"

Thanks for the update that added Omni seq_cls support. Today I tried using Qwen2.5-Omni-7B with seq_cls (single_label_classification) for audio classification, but the data processing does not seem right: it looks like the step that produces "input_ids" is being skipped.

Data form:
{"messages": [{"role": "user","content": "Analysis this audio: identify whether it contain violent sound or not?"}],"audios": ["test_1.wav"], "label": 0}
{"messages": [{"role": "user","content": "Analysis this audio: identify whether it contain violent sound or not?"}],"audios": ["test_2.wav"], "label": 1}

Train Script:
model_dir=/data/Qwen2.5-Omni-7B
nproc_per_node=1
ENABLE_AUDIO_OUTPUT=0 \
CUDA_VISIBLE_DEVICES=2 \
NPROC_PER_NODE=$nproc_per_node \
swift sft \
    --model $model_dir \
    --check_model false \
    --dataset $dataset_json \
    --val_dataset $val_json \
    --train_type lora \
    --torch_dtype bfloat16 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-5 \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules all-linear \
    --freeze_vit true \
    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
    --eval_steps 500 \
    --save_steps 100 \
    --save_total_limit 10 \
    --logging_steps 50 \
    --max_length 2048 \
    --output_dir $outdir \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 0 \
    --streaming true \
    --max_steps 2000 \
    --ddp_find_unused_parameters false \
    --num_labels 2 \
    --task_type seq_cls \
    --problem_type single_label_classification \
    --use_chat_template false \
    --load_from_cache_file false

Error Message:
[INFO:swift] train_dataset: IterableDataset({
features: ['messages', 'audios', 'label'],
num_shards: 1
})
[INFO:swift] val_dataset: IterableDataset({
features: ['messages', 'audios', 'label'],
num_shards: 1
})
[INFO:swift] args.problem_type: single_label_classification
[INFO:swift] The TrainArguments will be saved in: /data//models/ms_swift_ckpts/riot_detect/qwen2.5-Omni-7B-riot-detect-2labels-cls-noprompt-7331-ft-llmlora-250811/v21-20250811-211146/args.json
[INFO:swift] lora_config: LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/data//Qwen2.5-Omni-7B', revision=None, inference_mode=False, r=8, target_modules='^(thinker.model.*\.(o_proj|q_proj|v_proj|down_proj|gate_proj|up_proj|k_proj))$', exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=['v_head', 'classifier', 'score'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)
[INFO:swift] model: PeftModelForSequenceClassification(
(base_model): LoraModel(
(model): Qwen2_5OmniForConditionalGeneration(
(thinker): Qwen2_5OmniThinkerForConditionalGeneration(
(audio_tower): Qwen2_5OmniAudioEncoder(
(conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
(conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
(positional_embedding): SinusoidsPositionEmbedding()
(audio_bos_eos_token): Embedding(2, 3584)
(layers): ModuleList(
(0-31): 32 x Qwen2_5OmniAudioEncoderLayer(
(self_attn): Qwen2_5OmniAudioAttention(
(k_proj): Linear(in_features=1280, out_features=1280, bias=False)
(v_proj): Linear(in_features=1280, out_features=1280, bias=True)
(q_proj): Linear(in_features=1280, out_features=1280, bias=True)
(out_proj): Linear(in_features=1280, out_features=1280, bias=True)
)
(self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(activation_fn): GELUActivation()
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
)
)
(ln_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
(avg_pooler): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
(proj): Linear(in_features=1280, out_features=3584, bias=True)
)
(visual): Qwen2_5OmniVisionEncoder(
(patch_embed): Qwen2_5_VisionPatchEmbed(
(proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
)
(rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
(blocks): ModuleList(
(0-31): 32 x Qwen2_5OmniVisionBlock(
(norm1): Qwen2RMSNorm((1280,), eps=1e-06)
(norm2): Qwen2RMSNorm((1280,), eps=1e-06)
(attn): Qwen2_5OmniVisionAttention(
(q): Linear(in_features=1280, out_features=1280, bias=True)
(k): Linear(in_features=1280, out_features=1280, bias=True)
(v): Linear(in_features=1280, out_features=1280, bias=True)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
)
(mlp): Qwen2_5OmniMLP(
(gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
(up_proj): Linear(in_features=1280, out_features=3420, bias=True)
(down_proj): Linear(in_features=3420, out_features=1280, bias=True)
(act_fn): SiLU()
)
)
)
(merger): Qwen2_5OmniPatchMerger(
(ln_q): Qwen2RMSNorm((1280,), eps=1e-06)
(mlp): Sequential(
(0): Linear(in_features=5120, out_features=5120, bias=True)
(1): GELU(approximate='none')
(2): Linear(in_features=5120, out_features=3584, bias=True)
)
)
)
(model): Qwen2_5OmniThinkerTextModel(
(embed_tokens): Embedding(152064, 3584)
(layers): ModuleList(
(0-27): 28 x Qwen2_5OmniDecoderLayer(
(self_attn): Qwen2_5OmniAttention(
(q_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=3584, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=512, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=512, bias=True)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(rotary_emb): Qwen2_5OmniRotaryEmbedding()
)
(mlp): Qwen2MLP(
(gate_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=18944, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=18944, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(up_proj): lora.Linear(
(base_layer): Linear(in_features=3584, out_features=18944, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=18944, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(down_proj): lora.Linear(
(base_layer): Linear(in_features=18944, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=18944, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(act_fn): SiLU()
)
(input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
(post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
)
)
(norm): Qwen2RMSNorm((3584,), eps=1e-06)
(rotary_emb): Qwen2_5OmniRotaryEmbedding()
)
(lm_head): Identity()
(score): ModulesToSaveWrapper(
(original_module): Linear(in_features=3584, out_features=2, bias=False)
(modules_to_save): ModuleDict(
(default): Linear(in_features=3584, out_features=2, bias=False)
)
)
)
)
)
)
[INFO:swift] model_parameter_info: PeftModelForSequenceClassification: 8407.0159M Params (20.1923M Trainable [0.2402%]), 1.9219M Buffers.
/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/trainers/mixin.py:100: FutureWarning: tokenizer is deprecated and will be removed in version 5.0.0 for Trainer.__init__. Use processing_class instead.
super().__init__(
[2025-08-11 21:12:44,467] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-08-11 21:12:45,523] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
No label_names provided for model class PeftModelForSequenceClassification. Since PeftModel hides base models input arguments, if label_names is not given, label_names can't be set automatically within Trainer. Note that empty label_names list will be used instead.
[INFO:swift] use_reentrant: False
[INFO:swift] The logging file will be saved in: /data//models/ms_swift_ckpts/riot_detect/qwen2.5-Omni-7B-riot-detect-2labels-cls-noprompt-7331-ft-llmlora-250811/v21-20250811-211146/logging.jsonl
loss_type=None was set in the config but it is unrecognised.Using the default loss: ForCausalLMLoss.
[INFO:swift] Successfully registered post_encode hook: ['PeftModelForSequenceClassification'].
Train: 0%| | 0/2000 [00:00<?, ?it/s][INFO:swift] last_model_checkpoint: None
[INFO:swift] best_model_checkpoint: None
[INFO:swift] images_dir: /data//models/ms_swift_ckpts/riot_detect/qwen2.5-Omni-7B-riot-detect-2labels-cls-noprompt-7331-ft-llmlora-250811/v21-20250811-211146/images
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/llm/train/sft.py", line 244, in train
[rank0]: trainer.train(trainer.args.resume_from_checkpoint)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/trainers/trainers.py", line 57, in train
[rank0]: return super().train(*args, **kwargs)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/trainers/mixin.py", line 675, in train
[rank0]: res = super().train(*args, **kwargs)
[rank0]: File "/data/libs/transformers-main/src/transformers/trainer.py", line 2206, in train
[rank0]: return inner_training_loop(
[rank0]: File "/data/libs/transformers-main/src/transformers/trainer.py", line 2502, in _inner_training_loop
[rank0]: batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/trainers/mixin.py", line 815, in get_batch_samples
[rank0]: res = super().get_batch_samples(*args, **kwargs)
[rank0]: File "/data/libs/transformers-main/src/transformers/trainer.py", line 5300, in get_batch_samples
[rank0]: batch_samples.append(next(epoch_iterator))
[rank0]: File "/usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py", line 567, in __iter__
[rank0]: current_batch = next(dataloader_iter)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 733, in __next__
[rank0]: data = self._next_data()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 789, in _next_data
[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration
[rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 43, in fetch
[rank0]: return self.collate_fn(data)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/llm/template/base.py", line 1404, in data_collator
[rank0]: res = self._seq_cls_data_collator(batch, padding_to=padding_to)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/llm/template/base.py", line 1566, in _seq_cls_data_collator
[rank0]: res = self._data_collator(batch, padding_to=padding_to)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/llm/template/template/qwen.py", line 424, in _data_collator
[rank0]: res['position_ids'] = self._get_position_ids(res)
[rank0]: File "/data/libs/ms-swift-250811/llmscope-support_omni_seq_cls/swift/llm/template/template/qwen.py", line 599, in _get_position_ids
[rank0]: input_ids = inputs['input_ids']
[rank0]: KeyError: 'input_ids'

My hardware and system info
torch:
Write your system info like CUDA version/system/GPU/torch version here(在这里给出硬件信息和系统信息,如CUDA版本,系统,GPU型号和torch版本等)
Python 3.10.12
torch 2.7.1+cu126
Driver Version: 535.161.08 CUDA Version: 12.6
Ubuntu 22.04.4 LTS

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions