diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 7c48d8c853..e3945d8032 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -367,7 +367,8 @@ Vera使用`target_modules`、`target_regex`、`modules_to_save`三个参数, - 🔥vllm_limit_mm_per_prompt: 控制vllm使用多图,默认为`None`。例如传入`--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`。 - vllm_max_lora_rank: 默认为`16`。vllm对于lora支持的参数。 - vllm_quantization: vllm可以在内部量化模型,参数支持的值详见[这里](https://docs.vllm.ai/en/latest/serving/engine_args.html)。 -- 🔥vllm_enable_prefix_caching: 开启vllm的自动前缀缓存,节约重复查询前缀的处理时间。默认为`False`。**建议在实际场景下设置为`True`**,这可以加快推理效率。 +- 🔥vllm_enable_prefix_caching: 开启vllm的自动前缀缓存,节约重复查询前缀的处理时间,加快推理效率。默认为`None`,跟随vLLM行为。 + - 该参数在"ms-swift<3.9.1"的默认值为`False`。 - vllm_use_async_engine: vLLM backend下是否使用async engine。部署情况(swift deploy)默认为True,其他情况默认为False。 - vllm_reasoning_parser: 推理解析器类型,用于思考模型的思维链内容解析。默认为`None`。仅用于 `swift deploy` 命令。可选的种类参考[vLLM文档](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions)。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 820bf2dbf8..0a3146b305 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -373,7 +373,8 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai - 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`. - vllm_max_lora_rank: Default is `16`. This is the parameter supported by vllm for lora. - vllm_quantization: vllm is able to quantize model with this argument, supported values can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html). -- vllm_enable_prefix_caching: Enable vLLM's automatic prefix caching to save processing time for repeated prompt prefixes. Default is `False`. **It is recommended to set this to `True` in real-world scenarios**, as it can significantly improve inference efficiency. +- 🔥vllm_enable_prefix_caching: Enables vLLM's automatic prefix caching to save processing time for repeated prompt prefixes, improving inference efficiency. Default is `None`, following vLLM's default behavior. + - The default value of this parameter is `False` in "ms-swift<3.9.1". - vllm_use_async_engine: Whether to use the async engine under the vLLM backend. The deployment status (swift deploy) defaults to True, and other statuses default to False. - vllm_reasoning_parser: Reasoning parser type, used for parsing the chain of thought content of reasoning models. Default is `None`. Only used for the `swift deploy` command. Available types can be found in the [vLLM documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions). 
diff --git a/swift/llm/infer/infer_engine/grpo_vllm_engine.py b/swift/llm/infer/infer_engine/grpo_vllm_engine.py
index ac546f4b0d..5d96f306ef 100644
--- a/swift/llm/infer/infer_engine/grpo_vllm_engine.py
+++ b/swift/llm/infer/infer_engine/grpo_vllm_engine.py
@@ -48,7 +48,7 @@ def __init__(
         enable_lora: bool = False,
         max_loras: int = 1,
         max_lora_rank: int = 16,
-        enable_prefix_caching: bool = False,
+        enable_prefix_caching: Optional[bool] = None,
         enable_sleep_mode: bool = False,
         distributed_executor_backend: Optional[str] = None,
         quantization: Optional[str] = None,
diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py
index eb38cc9b3f..947f8dcea2 100644
--- a/swift/llm/infer/infer_engine/vllm_engine.py
+++ b/swift/llm/infer/infer_engine/vllm_engine.py
@@ -73,7 +73,7 @@ def __init__(
         enable_lora: bool = False,
         max_loras: int = 1,
         max_lora_rank: int = 16,
-        enable_prefix_caching: bool = False,
+        enable_prefix_caching: Optional[bool] = None,
         enable_sleep_mode: bool = False,
         distributed_executor_backend: Optional[str] = None,
         quantization: Optional[str] = None,
@@ -163,7 +163,7 @@ def _prepare_engine_kwargs(
         enable_lora: bool = False,
         max_loras: int = 1,
         max_lora_rank: int = 16,
-        enable_prefix_caching: bool = False,
+        enable_prefix_caching: Optional[bool] = None,
         distributed_executor_backend: Optional[str] = None,
         enable_sleep_mode: bool = False,
         task: Optional[str] = None,
@@ -214,6 +214,8 @@ def _prepare_engine_kwargs(
             engine_kwargs['hf_overrides'] = {'architectures': architectures}
         self.default_template.set_mode('vllm')
         engine_kwargs.update(self.default_template.prepare_engine_kwargs())
+        if enable_prefix_caching is not None:
+            engine_kwargs['enable_prefix_caching'] = enable_prefix_caching
         engine_args = engine_cls(
             model=self.model_dir,
             dtype=dtype_mapping[model_info.torch_dtype],
@@ -226,7 +228,6 @@ def _prepare_engine_kwargs(
             disable_custom_all_reduce=disable_custom_all_reduce,
             enforce_eager=enforce_eager,
             trust_remote_code=True,
-            enable_prefix_caching=enable_prefix_caching,
             distributed_executor_backend=distributed_executor_backend,
             **engine_kwargs,
         )
diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py
index b2256da0b7..23fb351132 100644
--- a/swift/trainers/arguments.py
+++ b/swift/trainers/arguments.py
@@ -179,7 +179,7 @@ class VllmArguments:
         vllm_enforce_eager (bool): Flag to enforce eager execution. Default is False.
         vllm_limit_mm_per_prompt (Optional[str]): Limit multimedia per prompt. Default is None.
         vllm_max_lora_rank (int): Maximum LoRA rank. Default is 16.
-        vllm_enable_prefix_caching (bool): Flag to enable automatic prefix caching. Default is False.
+        vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
         vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
         vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
         vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
@@ -195,7 +195,7 @@
     vllm_enforce_eager: bool = False
     vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None  # '{"image": 5, "video": 2}'
     vllm_max_lora_rank: int = 16
-    vllm_enable_prefix_caching: bool = False
+    vllm_enable_prefix_caching: Optional[bool] = None
     vllm_use_async_engine: bool = False
     vllm_quantization: Optional[str] = None
     vllm_reasoning_parser: Optional[str] = None
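The heart of the change in `_prepare_engine_kwargs` is that `enable_prefix_caching` is no longer passed to `engine_cls` unconditionally; it is added to `engine_kwargs` only when the caller set it, so vLLM's own default wins whenever the value is `None`. Below is a minimal, self-contained sketch of that pattern (the helper name and structure are illustrative, not the actual swift internals):

```python
from typing import Any, Dict, Optional


def build_engine_kwargs(enable_prefix_caching: Optional[bool] = None,
                        **extra: Any) -> Dict[str, Any]:
    """Illustrative helper: only forward tri-state options that were explicitly set."""
    kwargs: Dict[str, Any] = dict(extra)
    if enable_prefix_caching is not None:
        # An explicit True/False from the user overrides vLLM's default.
        kwargs['enable_prefix_caching'] = enable_prefix_caching
    # With None the key is omitted entirely, so vLLM's EngineArgs falls back
    # to its own default for automatic prefix caching.
    return kwargs


# The key is absent when the option is left unset ...
assert 'enable_prefix_caching' not in build_engine_kwargs(gpu_memory_utilization=0.9)
# ... and present only when the user made an explicit choice.
assert build_engine_kwargs(enable_prefix_caching=False)['enable_prefix_caching'] is False
```

This is also why `VllmArguments.vllm_enable_prefix_caching` becomes `Optional[bool] = None`: the dataclass needs to distinguish "not specified" from "explicitly disabled".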