docs/source/Instruction/命令行参数.md (3 changes: 2 additions & 1 deletion)
@@ -367,7 +367,8 @@ Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`,
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple images in vllm, default is `None`. For example, pass `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vllm parameter for LoRA support.
- vllm_quantization: vllm can quantize the model internally; the values supported by this parameter are listed [here](https://docs.vllm.ai/en/latest/serving/engine_args.html).
-- 🔥vllm_enable_prefix_caching: Enables vllm's automatic prefix caching to save processing time on repeated prompt prefixes. Default is `False`. **Setting it to `True` is recommended in real-world scenarios**, as it speeds up inference.
+- 🔥vllm_enable_prefix_caching: Enables vllm's automatic prefix caching, saving processing time on repeated prompt prefixes and speeding up inference. Default is `None`, following vLLM's behavior.
+  - This parameter's default value is `False` in "ms-swift<3.9.1".
- vllm_use_async_engine: Whether to use the async engine under the vLLM backend. Defaults to True for deployment (swift deploy) and False otherwise.
- vllm_reasoning_parser: Reasoning parser type, used to parse the chain-of-thought content of reasoning models. Default is `None`. Only used with the `swift deploy` command. Available types are listed in the [vLLM documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions).

docs/source_en/Instruction/Command-line-parameters.md (3 changes: 2 additions & 1 deletion)
@@ -373,7 +373,8 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vllm parameter for LoRA support.
- vllm_quantization: vllm can quantize the model with this argument; supported values can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html).
-- vllm_enable_prefix_caching: Enable vLLM's automatic prefix caching to save processing time for repeated prompt prefixes. Default is `False`. **It is recommended to set this to `True` in real-world scenarios**, as it can significantly improve inference efficiency.
+- 🔥vllm_enable_prefix_caching: Enables vLLM's automatic prefix caching to save processing time for repeated prompt prefixes, improving inference efficiency. Default is `None`, following vLLM's default behavior.
+  - The default value of this parameter is `False` in "ms-swift<3.9.1".
- vllm_use_async_engine: Whether to use the async engine under the vLLM backend. Defaults to True when deploying (swift deploy) and False otherwise.
- vllm_reasoning_parser: Reasoning parser type, used for parsing the chain of thought content of reasoning models. Default is `None`. Only used for the `swift deploy` command. Available types can be found in the [vLLM documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html#streaming-chat-completions).

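As a side note on the JSON-valued flag documented above, here is a minimal standard-library sketch (not ms-swift's actual parsing code) of how the documented `--vllm_limit_mm_per_prompt` value maps to a per-modality dict:

```python
import json

# Documented example value: --vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'
raw_value = '{"image": 5, "video": 2}'

# Parse the JSON string into a per-modality limit:
# at most 5 images and 2 videos per prompt.
limit_mm_per_prompt = json.loads(raw_value)
assert limit_mm_per_prompt == {'image': 5, 'video': 2}
```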
swift/llm/infer/infer_engine/grpo_vllm_engine.py (2 changes: 1 addition & 1 deletion)
@@ -48,7 +48,7 @@ def __init__(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
enable_sleep_mode: bool = False,
distributed_executor_backend: Optional[str] = None,
quantization: Optional[str] = None,
swift/llm/infer/infer_engine/vllm_engine.py (7 changes: 4 additions & 3 deletions)
@@ -73,7 +73,7 @@ def __init__(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
enable_sleep_mode: bool = False,
distributed_executor_backend: Optional[str] = None,
quantization: Optional[str] = None,
@@ -163,7 +163,7 @@ def _prepare_engine_kwargs(
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
-enable_prefix_caching: bool = False,
+enable_prefix_caching: Optional[bool] = None,
distributed_executor_backend: Optional[str] = None,
enable_sleep_mode: bool = False,
task: Optional[str] = None,
@@ -214,6 +214,8 @@ def _prepare_engine_kwargs(
engine_kwargs['hf_overrides'] = {'architectures': architectures}
self.default_template.set_mode('vllm')
engine_kwargs.update(self.default_template.prepare_engine_kwargs())
+if enable_prefix_caching is not None:
+    engine_kwargs['enable_prefix_caching'] = enable_prefix_caching
engine_args = engine_cls(
model=self.model_dir,
dtype=dtype_mapping[model_info.torch_dtype],
@@ -226,7 +228,6 @@
disable_custom_all_reduce=disable_custom_all_reduce,
enforce_eager=enforce_eager,
trust_remote_code=True,
-enable_prefix_caching=enable_prefix_caching,
distributed_executor_backend=distributed_executor_backend,
**engine_kwargs,
)
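With the change above, `enable_prefix_caching` is only forwarded to the engine when the caller sets it explicitly, and `None` defers to vLLM's own default. A minimal, self-contained sketch of that pattern (hypothetical helper name, not the actual `_prepare_engine_kwargs`):

```python
from typing import Any, Dict, Optional


def build_engine_kwargs(enable_prefix_caching: Optional[bool] = None) -> Dict[str, Any]:
    """Only include the flag when it was explicitly set (tri-state: None/True/False)."""
    engine_kwargs: Dict[str, Any] = {}
    if enable_prefix_caching is not None:
        engine_kwargs['enable_prefix_caching'] = enable_prefix_caching
    return engine_kwargs


assert build_engine_kwargs() == {}                                      # defer to vLLM
assert build_engine_kwargs(True) == {'enable_prefix_caching': True}     # force-enable
assert build_engine_kwargs(False) == {'enable_prefix_caching': False}   # force-disable
```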
swift/trainers/arguments.py (4 changes: 2 additions & 2 deletions)
@@ -179,7 +179,7 @@ class VllmArguments:
vllm_enforce_eager (bool): Flag to enforce eager execution. Default is False.
vllm_limit_mm_per_prompt (Optional[str]): Limit multimedia per prompt. Default is None.
vllm_max_lora_rank (int): Maximum LoRA rank. Default is 16.
-vllm_enable_prefix_caching (bool): Flag to enable automatic prefix caching. Default is False.
+vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
@@ -195,7 +195,7 @@
vllm_enforce_eager: bool = False
vllm_limit_mm_per_prompt: Optional[Union[dict, str]] = None # '{"image": 5, "video": 2}'
vllm_max_lora_rank: int = 16
-vllm_enable_prefix_caching: bool = False
+vllm_enable_prefix_caching: Optional[bool] = None
vllm_use_async_engine: bool = False
vllm_quantization: Optional[str] = None
vllm_reasoning_parser: Optional[str] = None
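At the trainer-arguments level, switching the field from `bool = False` to `Optional[bool] = None` makes it tri-state, so an unset flag can be distinguished from an explicit `False`. A brief sketch of that distinction (hypothetical dataclass, not the real `VllmArguments`):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class VllmArgsSketch:
    # None  -> not set by the user; defer to vLLM's default behavior
    # True  -> explicitly enable automatic prefix caching
    # False -> explicitly disable it (the old ms-swift < 3.9.1 default)
    vllm_enable_prefix_caching: Optional[bool] = None


args = VllmArgsSketch()
if args.vllm_enable_prefix_caching is None:
    print('prefix caching: left to vLLM')
else:
    print(f'prefix caching explicitly set to {args.vllm_enable_prefix_caching}')
```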