diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 22a9f023d1..c7b933823a 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -276,6 +276,7 @@ Vera使用`target_modules`, `target_regex`, `modules_to_save`三个参数.
 - enforce_eager: vllm使用pytorch eager模式还是建立cuda graph. 默认为`False`. 设置为True可以节约显存, 但会影响效率.
 - 🔥limit_mm_per_prompt: 控制vllm使用多图, 默认为`None`. 例如传入`--limit_mm_per_prompt '{"image": 10, "video": 5}'`
 - vllm_max_lora_rank: 默认为`16`. vllm对于lora支持的参数
+- enable_prefix_caching: 是否开启vllm的Prefix Caching能力. 默认为`False`. 设置为True可以节约重复请求前缀(例如System Prompt、长文档或多轮对话)的处理时间.
 
 ### 合并参数
 
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 5064e6d7cb..57745f68bd 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -281,6 +281,7 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
 - enforce_eager: Whether vllm uses pytorch eager mode or establishes a cuda graph. Default is `False`. Setting to True can save memory but may affect efficiency.
 - 🔥limit_mm_per_prompt: Controls vllm using multiple images, default is `None`. For example, use `--limit_mm_per_prompt '{"image": 10, "video": 5}'`.
 - vllm_max_lora_rank: Default value is `16`. Parameters supported by vllm for LoRA.
+- enable_prefix_caching: Whether to enable the `Automatic Prefix Caching` feature of vllm. Default is `False`. Setting to True can save processing time for repeated request prefixes (such as the system prompt, long documents, or multi-turn dialogs).
 
 ### Merge Arguments
 
diff --git a/swift/llm/argument/infer_args.py b/swift/llm/argument/infer_args.py
index 39cce3206c..d2172ddc5f 100644
--- a/swift/llm/argument/infer_args.py
+++ b/swift/llm/argument/infer_args.py
@@ -61,6 +61,7 @@ class VllmArguments:
         enforce_eager (bool): Flag to enforce eager execution. Default is False.
         limit_mm_per_prompt (Optional[str]): Limit multimedia per prompt. Default is None.
         vllm_max_lora_rank (int): Maximum LoRA rank. Default is 16.
+        enable_prefix_caching (bool): Flag to enable automatic prefix caching. Default is False.
     """
     # vllm
     gpu_memory_utilization: float = 0.9
@@ -72,6 +73,7 @@ class VllmArguments:
     enforce_eager: bool = False
     limit_mm_per_prompt: Optional[Union[dict, str]] = None  # '{"image": 10, "video": 5}'
     vllm_max_lora_rank: int = 16
+    enable_prefix_caching: bool = False
 
     def __post_init__(self):
         self.limit_mm_per_prompt = ModelArguments.parse_to_dict(self.limit_mm_per_prompt)
@@ -92,6 +94,7 @@ def get_vllm_engine_kwargs(self):
             'max_lora_rank': self.vllm_max_lora_rank,
             'enable_lora': len(adapters) > 0,
             'max_loras': max(len(adapters), 1),
+            'enable_prefix_caching': self.enable_prefix_caching,
         }
 
 
diff --git a/swift/llm/infer/infer_engine/vllm_engine.py b/swift/llm/infer/infer_engine/vllm_engine.py
index 727aacbfdd..74774095e9 100644
--- a/swift/llm/infer/infer_engine/vllm_engine.py
+++ b/swift/llm/infer/infer_engine/vllm_engine.py
@@ -34,28 +34,30 @@ class VllmEngine(InferEngine):
 
     def __init__(
-            self,
-            model_id_or_path: str,
-            torch_dtype: Optional[torch.dtype] = None,
-            *,
-            model_type: Optional[str] = None,
-            use_hf: Optional[bool] = None,
-            hub_token: Optional[str] = None,
-            revision: Optional[str] = None,
-            # engine_kwargs
-            gpu_memory_utilization: float = 0.9,
-            tensor_parallel_size: int = 1,
-            pipeline_parallel_size: int = 1,
-            max_model_len: Optional[int] = None,
-            max_num_seqs: int = 256,
-            disable_custom_all_reduce: bool = False,
-            enforce_eager: bool = False,
-            limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
-            # lora
-            enable_lora: bool = False,
-            max_loras: int = 1,
-            max_lora_rank: int = 16,
-            engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
+        self,
+        model_id_or_path: str,
+        torch_dtype: Optional[torch.dtype] = None,
+        *,
+        model_type: Optional[str] = None,
+        use_hf: Optional[bool] = None,
+        hub_token: Optional[str] = None,
+        revision: Optional[str] = None,
+        # engine_kwargs
+        gpu_memory_utilization: float = 0.9,
+        tensor_parallel_size: int = 1,
+        pipeline_parallel_size: int = 1,
+        max_model_len: Optional[int] = None,
+        max_num_seqs: int = 256,
+        disable_custom_all_reduce: bool = False,
+        enforce_eager: bool = False,
+        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
+        # lora
+        enable_lora: bool = False,
+        max_loras: int = 1,
+        max_lora_rank: int = 16,
+        enable_prefix_caching: bool = False,
+        engine_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
         self.processor = get_model_tokenizer(
             model_id_or_path,
             torch_dtype,
@@ -79,7 +81,9 @@ def __init__(
             enable_lora=enable_lora,
             max_loras=max_loras,
             max_lora_rank=max_lora_rank,
-            engine_kwargs=engine_kwargs)
+            enable_prefix_caching=enable_prefix_caching,
+            engine_kwargs=engine_kwargs,
+        )
         self._prepare_engine()
         self._load_generation_config()
 
@@ -91,19 +95,22 @@ def _prepare_engine(self) -> None:
         engine = AsyncLLMEngine.from_engine_args(self.engine_args)
         self.engine = engine
 
-    def _prepare_engine_kwargs(self,
-                               gpu_memory_utilization: float = 0.9,
-                               tensor_parallel_size: int = 1,
-                               pipeline_parallel_size: int = 1,
-                               max_model_len: Optional[int] = None,
-                               max_num_seqs: int = 256,
-                               disable_custom_all_reduce: bool = False,
-                               enforce_eager: bool = False,
-                               limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
-                               enable_lora: bool = False,
-                               max_loras: int = 1,
-                               max_lora_rank: int = 16,
-                               engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
+    def _prepare_engine_kwargs(
+        self,
+        gpu_memory_utilization: float = 0.9,
+        tensor_parallel_size: int = 1,
+        pipeline_parallel_size: int = 1,
+        max_model_len: Optional[int] = None,
+        max_num_seqs: int = 256,
+        disable_custom_all_reduce: bool = False,
+        enforce_eager: bool = False,
+        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
+        enable_lora: bool = False,
+        max_loras: int = 1,
+        max_lora_rank: int = 16,
+        enable_prefix_caching: bool = False,
+        engine_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
         if engine_kwargs is None:
             engine_kwargs = {}
         disable_log_stats = engine_kwargs.pop('disable_log_stats', True)
@@ -136,7 +143,9 @@ def _prepare_engine_kwargs(self,
             disable_custom_all_reduce=disable_custom_all_reduce,
             enforce_eager=enforce_eager,
             trust_remote_code=True,
-            **engine_kwargs)
+            enable_prefix_caching=enable_prefix_caching,
+            **engine_kwargs,
+        )
         self.engine_args = engine_args
         self.enable_lora = enable_lora
         if max_model_len is not None:
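For review context, here is a minimal usage sketch of the new flag. It is only a sketch: the surrounding inference API (`InferRequest`, `RequestConfig`, `engine.infer`) is assumed from the ms-swift v3 Python interface and is not part of this diff, and the model id is just a placeholder.

```python
# Sketch: exercising the enable_prefix_caching flag added in this PR.
# Assumes ms-swift v3 with the vllm backend installed; the model id is a placeholder.
from swift.llm import InferRequest, RequestConfig
from swift.llm.infer.infer_engine.vllm_engine import VllmEngine

engine = VllmEngine(
    'Qwen/Qwen2-7B-Instruct',
    gpu_memory_utilization=0.9,
    enable_prefix_caching=True,  # forwarded to vllm's AsyncEngineArgs by this PR
)

# Both requests share the same system prompt, so vllm can reuse the cached
# KV blocks for that prefix and only recompute the differing user turns.
system_prompt = 'You are a meticulous code reviewer. Answer concisely.'
requests = [
    InferRequest(messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': question},
    ])
    for question in ('What does prefix caching speed up?', 'When is it most useful?')
]

for resp in engine.infer(requests, RequestConfig(max_tokens=128)):
    print(resp.choices[0].message.content)
```

On the command line, the same behavior should be reachable with `--infer_backend vllm --enable_prefix_caching true`, matching the documentation entries added above.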