diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 43ee2bd0c3..0e545d19bc 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -363,6 +363,7 @@ Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数, - 🔥vllm_max_model_len: 模型支持的最大长度。默认为`None`,即从config.json中读取。 - vllm_disable_custom_all_reduce: 禁用自定义的 all-reduce 内核,回退到 NCCL。为了稳定性,默认为`True`。 - vllm_enforce_eager: vllm使用pytorch eager模式还是建立cuda graph,默认为`False`。设置为True可以节约显存,但会影响效率。 +- vllm_mm_processor_cache_gb: 多模态处理器缓存大小(GiB),用于缓存已处理的多模态输入(如图像、视频)避免重复处理。默认为`4`。设置为`0`可禁用缓存但会降低性能(不推荐)。仅对多模态模型生效。 - vllm_disable_cascade_attn: 是否强制关闭V1引擎的cascade attention实现以防止潜在数值误差,默认为False,由vLLM内部逻辑决定是否使用。 - 🔥vllm_limit_mm_per_prompt: 控制vllm使用多图,默认为`None`。例如传入`--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`。 - vllm_max_lora_rank: 默认为`16`。vllm对于lora支持的参数。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 053660334f..66f3fa15db 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -369,6 +369,7 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai - 🔥vllm_max_model_len: The maximum sequence length supported by the model. Default is `None`, meaning it will be read from `config.json`. - vllm_disable_custom_all_reduce: Disables the custom all-reduce kernel and falls back to NCCL. For stability, the default is `True`. - vllm_enforce_eager: Determines whether vllm uses PyTorch eager mode or constructs a CUDA graph, default is `False`. Setting it to True can save memory but may affect efficiency. +- vllm_mm_processor_cache_gb: The size (in GiB) of the multimodal processor cache, used to store processed multimodal inputs (e.g., images, videos) to avoid redundant processing. Default is 4. Setting it to 0 disables the cache but may degrade performance (not recommended). This option takes effect only for multimodal models. - vllm_disable_cascade_attn: Whether to forcibly disable the V1 engine’s cascade-attention implementation to avoid potential numerical issues. Defaults to False; vLLM’s internal heuristics determine whether cascade attention is actually used. - 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`. - vllm_max_lora_rank: Default is `16`. This is the parameter supported by vllm for lora. 
diff --git a/swift/llm/infer/infer_engine/grpo_vllm_engine.py b/swift/llm/infer/infer_engine/grpo_vllm_engine.py
index dd595a70a2..18b626a505 100644
--- a/swift/llm/infer/infer_engine/grpo_vllm_engine.py
+++ b/swift/llm/infer/infer_engine/grpo_vllm_engine.py
@@ -45,6 +45,7 @@ def __init__(
         task_type: Optional[str] = None,
         disable_cascade_attn: bool = False,
         load_format: str = 'auto',
+        mm_processor_cache_gb: Optional[float] = None,
         # lora
         enable_lora: bool = False,
         max_loras: int = 1,
@@ -78,6 +79,7 @@ def __init__(
             task_type=task_type,
             disable_cascade_attn=disable_cascade_attn,
             load_format=load_format,
+            mm_processor_cache_gb=mm_processor_cache_gb,
             enable_lora=enable_lora,
             max_loras=max_loras,
             max_lora_rank=max_lora_rank,
diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py
index 947f8dcea2..f743d2359f 100644
--- a/swift/llm/infer/infer_engine/vllm_engine.py
+++ b/swift/llm/infer/infer_engine/vllm_engine.py
@@ -69,6 +69,7 @@ def __init__(
         task_type: Optional[str] = None,  # embedding
         disable_cascade_attn: bool = False,
         load_format: str = 'auto',
+        mm_processor_cache_gb: Optional[float] = None,
         # lora
         enable_lora: bool = False,
         max_loras: int = 1,
@@ -129,6 +130,7 @@
             quantization=quantization,
             task=task_type,
             disable_cascade_attn=disable_cascade_attn,
+            mm_processor_cache_gb=mm_processor_cache_gb,
             **engine_kwargs,
         )
         context = nullcontext()
@@ -169,6 +171,7 @@ def _prepare_engine_kwargs(
         task: Optional[str] = None,
         disable_cascade_attn: bool = False,
         load_format: str = 'auto',
+        mm_processor_cache_gb: Optional[float] = None,
         **engine_kwargs,
     ) -> None:
         if task == 'embedding':
@@ -197,7 +200,10 @@ def _prepare_engine_kwargs(
         else:
             assert not limit_mm_per_prompt, (
                 'The current version of vLLM does not support `limit_mm_per_prompt`. Please upgrade vLLM.')
-        for key in ['enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format']:
+        for key in [
+                'enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format',
+                'mm_processor_cache_gb'
+        ]:
             if key in parameters:
                 engine_kwargs[key] = locals()[key]
             else:
diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py
index dcd82af7c7..e5c8ec37dd 100644
--- a/swift/trainers/arguments.py
+++ b/swift/trainers/arguments.py
@@ -172,7 +172,8 @@ class VllmArguments:
     Args:
         vllm_gpu_memory_utilization (float): GPU memory utilization. Default is 0.9.
         vllm_tensor_parallel_size (int): Tensor parallelism size. Default is 1.
-        vllm_pipeline_parallel_size(int): Pipeline parallelism size. Default is 1.
+        vllm_pipeline_parallel_size (int): Pipeline parallelism size. Default is 1.
+        vllm_enable_expert_parallel (bool): Flag to enable expert parallelism for MoE models. Default is False.
         vllm_max_num_seqs (int): Maximum number of sequences. Default is 256.
         vllm_max_model_len (Optional[int]): Maximum model length. Default is None.
         vllm_disable_custom_all_reduce (bool): Flag to disable custom all-reduce. Default is True.
@@ -182,6 +183,9 @@ class VllmArguments:
         vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
         vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
         vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
+        vllm_reasoning_parser (Optional[str]): The reasoning parser for vLLM. Default is None.
+        vllm_disable_cascade_attn (bool): Flag to disable cascade attention. Default is False.
+        vllm_mm_processor_cache_gb (Optional[float]): MM processor cache size in GB. Default is None.
         vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
     """
     # vllm
@@ -200,6 +204,7 @@ class VllmArguments:
     vllm_quantization: Optional[str] = None
     vllm_reasoning_parser: Optional[str] = None
     vllm_disable_cascade_attn: bool = False
+    vllm_mm_processor_cache_gb: Optional[float] = None
     # rollout
     vllm_data_parallel_size: int = 1

@@ -251,6 +256,7 @@ def get_vllm_engine_kwargs(self):
             'quantization': self.vllm_quantization,
             'reasoning_parser': self.vllm_reasoning_parser,
             'disable_cascade_attn': self.vllm_disable_cascade_attn,
+            'mm_processor_cache_gb': self.vllm_mm_processor_cache_gb,
             'num_labels': self.num_labels,
         }
         if self.task_type in ('embedding', 'seq_cls') or 'reranker' in self.task_type:
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
index 069409ca77..639fd6b809 100644
--- a/swift/trainers/rlhf_trainer/grpo_trainer.py
+++ b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -585,6 +585,7 @@ def prepare_vllm(self, model):
             seed=self.accelerator.process_index // self.vllm_tensor_parallel_size,
             disable_cascade_attn=self.args.vllm_disable_cascade_attn,
             load_format='dummy',
+            mm_processor_cache_gb=self.args.vllm_mm_processor_cache_gb,
             template=vllm_template,
             distributed_executor_backend='external_launcher',
             **lora_kwargs,
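Taken together, these hunks wire the flag from `VllmArguments` through `get_vllm_engine_kwargs()` into the engine, while the `_prepare_engine_kwargs` change forwards it only when the installed vLLM actually accepts it. The standalone sketch below illustrates that guard pattern; it is not the ms-swift code itself, and it assumes `parameters` is obtained by inspecting vLLM's `EngineArgs` (values are illustrative):

```python
# Illustrative version guard: forward optional kwargs only if vLLM's EngineArgs accepts them.
import inspect

from vllm import EngineArgs  # assumes vLLM is installed


def filter_supported_kwargs(**candidates):
    parameters = inspect.signature(EngineArgs).parameters
    return {key: value for key, value in candidates.items() if key in parameters}


engine_kwargs = filter_supported_kwargs(mm_processor_cache_gb=2.0, disable_cascade_attn=False)
# Unsupported keys are simply dropped here, so EngineArgs(**engine_kwargs) will not
# raise TypeError on older vLLM releases that predate mm_processor_cache_gb.
```

This keeps the new `vllm_mm_processor_cache_gb` argument backward compatible with older vLLM installations instead of hard-failing at engine construction.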