1 change: 1 addition & 0 deletions docs/source/Instruction/命令行参数.md
@@ -363,6 +363,7 @@ Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`,
- 🔥vllm_max_model_len: The maximum sequence length supported by the model. Defaults to `None`, i.e., read from config.json.
- vllm_disable_custom_all_reduce: Disables the custom all-reduce kernel and falls back to NCCL. Defaults to `True` for stability.
- vllm_enforce_eager: Whether vllm uses PyTorch eager mode or builds a CUDA graph. Defaults to `False`. Setting it to True saves GPU memory but reduces efficiency.
- vllm_mm_processor_cache_gb: Size of the multimodal processor cache in GiB, used to cache processed multimodal inputs (e.g., images, videos) and avoid redundant processing. Defaults to `4`. Setting it to `0` disables the cache but degrades performance (not recommended). Only takes effect for multimodal models.
- vllm_disable_cascade_attn: Whether to forcibly disable the V1 engine's cascade attention implementation to prevent potential numerical errors. Defaults to `False`; vLLM's internal logic decides whether cascade attention is used.
- 🔥vllm_limit_mm_per_prompt: Controls the number of multimodal inputs (e.g., images, videos) vllm accepts per prompt. Defaults to `None`. For example, pass `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Defaults to `16`. A vLLM parameter for LoRA support.
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -369,6 +369,7 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
- 🔥vllm_max_model_len: The maximum sequence length supported by the model. Default is `None`, meaning it will be read from `config.json`.
- vllm_disable_custom_all_reduce: Disables the custom all-reduce kernel and falls back to NCCL. For stability, the default is `True`.
- vllm_enforce_eager: Determines whether vllm uses PyTorch eager mode or constructs a CUDA graph, default is `False`. Setting it to True can save memory but may affect efficiency.
- vllm_mm_processor_cache_gb: The size (in GiB) of the multimodal processor cache, used to store processed multimodal inputs (e.g., images, videos) to avoid redundant processing. Default is 4. Setting it to 0 disables the cache but may degrade performance (not recommended). This option takes effect only for multimodal models.
- vllm_disable_cascade_attn: Whether to forcibly disable the V1 engine’s cascade-attention implementation to avoid potential numerical issues. Defaults to False; vLLM’s internal heuristics determine whether cascade attention is actually used.
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vLLM parameter for LoRA support.
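For multimodal models, the new `vllm_mm_processor_cache_gb` option is typically tuned together with `vllm_limit_mm_per_prompt`. The snippet below is a minimal, hedged sketch of the programmatic equivalent; it assumes `VllmEngine` is importable from `swift.llm`, that its first positional argument is the model id, and that the installed vLLM version supports `mm_processor_cache_gb` (otherwise the engine drops the option, as the version check later in this PR shows).

# Hedged usage sketch, not copied from the repository's examples.
from swift.llm import VllmEngine

engine = VllmEngine(
    'Qwen/Qwen2.5-VL-7B-Instruct',                 # example multimodal model (assumption)
    mm_processor_cache_gb=8,                       # larger cache when prompts reuse many images/videos
    limit_mm_per_prompt={'image': 5, 'video': 2},  # programmatic form of --vllm_limit_mm_per_prompt
)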
2 changes: 2 additions & 0 deletions swift/llm/infer/infer_engine/grpo_vllm_engine.py
@@ -45,6 +45,7 @@ def __init__(
task_type: Optional[str] = None,
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
@@ -78,6 +79,7 @@ def __init__(
task_type=task_type,
disable_cascade_attn=disable_cascade_attn,
load_format=load_format,
mm_processor_cache_gb=mm_processor_cache_gb,
enable_lora=enable_lora,
max_loras=max_loras,
max_lora_rank=max_lora_rank,
8 changes: 7 additions & 1 deletion swift/llm/infer/infer_engine/vllm_engine.py
@@ -69,6 +69,7 @@ def __init__(
task_type: Optional[str] = None, # embedding
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
@@ -129,6 +130,7 @@ def __init__(
quantization=quantization,
task=task_type,
disable_cascade_attn=disable_cascade_attn,
mm_processor_cache_gb=mm_processor_cache_gb,
**engine_kwargs,
)
context = nullcontext()
@@ -169,6 +171,7 @@ def _prepare_engine_kwargs(
task: Optional[str] = None,
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
**engine_kwargs,
) -> None:
if task == 'embedding':
@@ -197,7 +200,10 @@ def _prepare_engine_kwargs(
else:
assert not limit_mm_per_prompt, (
'The current version of vLLM does not support `limit_mm_per_prompt`. Please upgrade vLLM.')
for key in ['enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format']:
for key in [
'enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format',
'mm_processor_cache_gb'
]:
Comment on lines +203 to +206 (Contributor review, severity: medium)

For better readability and adherence to Python style guides (PEP 8), it's recommended to format this list of keys with one item per line. This makes the list easier to read and modify in the future.

Suggested change:

for key in [
    'enable_expert_parallel',
    'enable_sleep_mode',
    'disable_cascade_attn',
    'load_format',
    'mm_processor_cache_gb',
]:

if key in parameters:
engine_kwargs[key] = locals()[key]
else:
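The loop above forwards optional keys, including the new `mm_processor_cache_gb`, only when the installed vLLM version actually accepts them, so older vLLM releases keep working. Below is a minimal sketch of that capability check under the assumption that `parameters` comes from inspecting vLLM's `EngineArgs` signature; the helper name and the warning message are illustrative, not taken from the source.

import inspect

from vllm import EngineArgs


def filter_engine_kwargs(**candidates):
    # Keep only the keyword arguments this vLLM version's EngineArgs accepts,
    # so newer flags such as mm_processor_cache_gb degrade gracefully.
    parameters = inspect.signature(EngineArgs).parameters
    supported = {k: v for k, v in candidates.items() if k in parameters}
    dropped = sorted(set(candidates) - set(supported))
    if dropped:
        print(f'Ignoring kwargs unsupported by this vLLM version: {dropped}')
    return supported


engine_kwargs = filter_engine_kwargs(
    load_format='auto',
    disable_cascade_attn=False,
    mm_processor_cache_gb=4,
)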
8 changes: 7 additions & 1 deletion swift/trainers/arguments.py
@@ -172,7 +172,8 @@ class VllmArguments:
Args:
vllm_gpu_memory_utilization (float): GPU memory utilization. Default is 0.9.
vllm_tensor_parallel_size (int): Tensor parallelism size. Default is 1.
vllm_pipeline_parallel_size(int): Pipeline parallelism size. Default is 1.
vllm_pipeline_parallel_size (int): Pipeline parallelism size. Default is 1.
vllm_enable_expert_parallel (bool): Flag to enable expert parallelism for MoE models. Default is False.
vllm_max_num_seqs (int): Maximum number of sequences. Default is 256.
vllm_max_model_len (Optional[int]): Maximum model length. Default is None.
vllm_disable_custom_all_reduce (bool): Flag to disable custom all-reduce. Default is True.
@@ -182,6 +183,9 @@
vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
vllm_reasoning_parser (Optional[str]): The reasoning parser for vLLM. Default is None.
vllm_disable_cascade_attn (bool): Flag to disable cascade attention. Default is False.
vllm_mm_processor_cache_gb (Optional[float]): MM processor cache size in GB. Default is None.
vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
"""
# vllm
@@ -200,6 +204,7 @@
vllm_quantization: Optional[str] = None
vllm_reasoning_parser: Optional[str] = None
vllm_disable_cascade_attn: bool = False
vllm_mm_processor_cache_gb: Optional[float] = None
# rollout
vllm_data_parallel_size: int = 1

@@ -251,6 +256,7 @@ def get_vllm_engine_kwargs(self):
'quantization': self.vllm_quantization,
'reasoning_parser': self.vllm_reasoning_parser,
'disable_cascade_attn': self.vllm_disable_cascade_attn,
'mm_processor_cache_gb': self.vllm_mm_processor_cache_gb,
'num_labels': self.num_labels,
}
if self.task_type in ('embedding', 'seq_cls') or 'reranker' in self.task_type:
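The `vllm_*` fields above are the trainer-facing argument names; `get_vllm_engine_kwargs()` maps them, key by key, onto the bare parameter names the engine constructor expects. The stand-in below illustrates that naming convention without importing swift; the class and helper names are assumptions made for this sketch, and the real method builds the dict explicitly rather than by prefix stripping.

from dataclasses import asdict, dataclass
from typing import Optional


@dataclass
class VllmArgsSubset:
    # Illustrative stand-in covering only the two fields added in this diff.
    vllm_disable_cascade_attn: bool = False
    vllm_mm_processor_cache_gb: Optional[float] = None

    def to_engine_kwargs(self) -> dict:
        # Strip the `vllm_` prefix so the keys match VllmEngine's parameters.
        return {k.removeprefix('vllm_'): v for k, v in asdict(self).items()}


print(VllmArgsSubset(vllm_mm_processor_cache_gb=2.0).to_engine_kwargs())
# -> {'disable_cascade_attn': False, 'mm_processor_cache_gb': 2.0}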
1 change: 1 addition & 0 deletions swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -585,6 +585,7 @@ def prepare_vllm(self, model):
seed=self.accelerator.process_index // self.vllm_tensor_parallel_size,
disable_cascade_attn=self.args.vllm_disable_cascade_attn,
load_format='dummy',
mm_processor_cache_gb=self.args.vllm_mm_processor_cache_gb,
template=vllm_template,
distributed_executor_backend='external_launcher',
**lora_kwargs,