docs/source/Instruction/命令行参数.md (3 changes: 2 additions & 1 deletion)
@@ -367,7 +367,8 @@ Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`,
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple images in vllm, default is `None`. For example, pass `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vllm parameter for LoRA support.
- vllm_quantization: vllm can quantize the model internally; the values supported by this parameter are listed [here](https://docs.vllm.ai/en/latest/serving/engine_args.html).
-- 🔥vllm_enable_prefix_caching: Enables vllm's automatic prefix caching to save processing time on repeated prompt prefixes. Default is `False`. **Setting it to `True` is recommended in real-world scenarios**, as it speeds up inference.
+- 🔥vllm_enable_prefix_caching: Enables vllm's automatic prefix caching, saving processing time on repeated prompt prefixes and speeding up inference. Default is `None`, following vLLM's behavior.
+  - This parameter's default value is `False` in "ms-swift<3.9.1".
- vllm_use_async_engine: Whether to use the async engine under the vLLM backend. Defaults to True for deployment (swift deploy) and False otherwise.
- vllm_reasoning_parser: Reasoning parser type, used to parse the chain-of-thought content of reasoning models. Default is `None`. Only used with the `swift deploy` command. Available types are listed in the [vLLM documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions).

docs/source_en/Instruction/Command-line-parameters.md (3 changes: 2 additions & 1 deletion)
@@ -373,7 +373,8 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vllm parameter for LoRA support.
- vllm_quantization: vllm can quantize the model with this argument; supported values can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html).
-- vllm_enable_prefix_caching: Enable vLLM's automatic prefix caching to save processing time for repeated prompt prefixes. Default is `False`. **It is recommended to set this to `True` in real-world scenarios**, as it can significantly improve inference efficiency.
+- 🔥vllm_enable_prefix_caching: Enables vLLM's automatic prefix caching to save processing time for repeated prompt prefixes, improving inference efficiency. Default is `None`, following vLLM's default behavior.
+  - The default value of this parameter is `False` in "ms-swift<3.9.1".
- vllm_use_async_engine: Whether to use the async engine under the vLLM backend. Defaults to True when deploying (swift deploy) and False otherwise.
- vllm_reasoning_parser: Reasoning parser type, used for parsing the chain of thought content of reasoning models. Default is `None`. Only used for the `swift deploy` command. Available types can be found in the [vLLM documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions).

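As a side note on the JSON-valued flag documented above, here is a minimal standard-library sketch (not ms-swift's actual parsing code) of how the documented `--vllm_limit_mm_per_prompt` value maps to a per-modality dict:

```python
import json

# Documented example value: --vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'
raw_value = '{"image": 5, "video": 2}'

# Parse the JSON string into a per-modality limit:
# at most 5 images and 2 videos per prompt.
limit_mm_per_prompt = json.loads(raw_value)
assert limit_mm_per_prompt == {'image': 5, 'video': 2}
```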
swift/llm/infer/infer_engine/grpo_vllm_engine.py (2 changes: 1 addition & 1 deletion)
@@ -48,7 +48,7 @@ def __init__(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
enable_sleep_mode: bool = False,
distributed_executor_backend: Optional[str] = None,
quantization: Optional[str] = None,
swift/llm/infer/infer_engine/vllm_engine.py (7 changes: 4 additions & 3 deletions)
@@ -73,7 +73,7 @@ def __init__(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
enable_sleep_mode: bool = False,
distributed_executor_backend: Optional[str] = None,
quantization: Optional[str] = None,
@@ -163,7 +163,7 @@ def _prepare_engine_kwargs(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
distributed_executor_backend: Optional[str] = None,
enable_sleep_mode: bool = False,
task: Optional[str] = None,
@@ -214,6 +214,8 @@ def _prepare_engine_kwargs(
engine_kwargs['hf_overrides'] = {'architectures': architectures}
self.default_template.set_mode('vllm')
engine_kwargs.update(self.default_template.prepare_engine_kwargs())
+if enable_prefix_caching is not None:
+    engine_kwargs['enable_prefix_caching'] = enable_prefix_caching
engine_args = engine_cls(
model=self.model_dir,
dtype=dtype_mapping[model_info.torch_dtype],
@@ -226,7 +228,6 @@
disable_custom_all_reduce=disable_custom_all_reduce,
enforce_eager=enforce_eager,
trust_remote_code=True,
-enable_prefix_caching=enable_prefix_caching,
distributed_executor_backend=distributed_executor_backend,
**engine_kwargs,
)
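With the change above, `enable_prefix_caching` is only forwarded to the engine when the caller sets it explicitly, and `None` defers to vLLM's own default. A minimal, self-contained sketch of that pattern (hypothetical helper name, not the actual `_prepare_engine_kwargs`):

```python
from typing import Any, Dict, Optional


def build_engine_kwargs(enable_prefix_caching: Optional[bool] = None) -> Dict[str, Any]:
    """Only include the flag when it was explicitly set (tri-state: None/True/False)."""
    engine_kwargs: Dict[str, Any] = {}
    if enable_prefix_caching is not None:
        engine_kwargs['enable_prefix_caching'] = enable_prefix_caching
    return engine_kwargs


assert build_engine_kwargs() == {}                                      # defer to vLLM
assert build_engine_kwargs(True) == {'enable_prefix_caching': True}     # force-enable
assert build_engine_kwargs(False) == {'enable_prefix_caching': False}   # force-disable
```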
swift/trainers/arguments.py (4 changes: 2 additions & 2 deletions)
@@ -179,7 +179,7 @@ class VllmArguments:
vllm_enforce_eager (bool): Flag to enforce eager execution. Default is False.
vllm_limit_mm_per_prompt (Optional[str]): Limit multimedia per prompt. Default is None.
vllm_max_lora_rank (int): Maximum LoRA rank. Default is 16.
-vllm_enable_prefix_caching (bool): Flag to enable automatic prefix caching. Default is False.
+vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
@@ -195,7 +195,7 @@
vllm_enforce_eager: bool = False
vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None # '{"image": 5, "video": 2}'
vllm_max_lora_rank: int = 16
-vllm_enable_prefix_caching: bool = False
+vllm_enable_prefix_caching: Optional[bool] = None
vllm_use_async_engine: bool = False
vllm_quantization: Optional[str] = None
vllm_reasoning_parser: Optional[str] = None
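At the trainer-arguments level, switching the field from `bool = False` to `Optional[bool] = None` makes it tri-state, so an unset flag can be distinguished from an explicit `False`. A brief sketch of that distinction (hypothetical dataclass, not the real `VllmArguments`):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class VllmArgsSketch:
    # None  -> not set by the user; defer to vLLM's default behavior
    # True  -> explicitly enable automatic prefix caching
    # False -> explicitly disable it (the old ms-swift < 3.9.1 default)
    vllm_enable_prefix_caching: Optional[bool] = None


args = VllmArgsSketch()
if args.vllm_enable_prefix_caching is None:
    print('prefix caching: left to vLLM')
else:
    print(f'prefix caching explicitly set to {args.vllm_enable_prefix_caching}')
```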