1 change: 1 addition & 0 deletions docs/source/Instruction/命令行参数.md
@@ -276,6 +276,7 @@ Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`.
- enforce_eager: Whether vllm uses pytorch eager mode or builds a CUDA graph. Default is `False`. Setting it to True saves GPU memory but affects efficiency.
- 🔥limit_mm_per_prompt: Controls multi-image input for vllm. Default is `None`. For example, pass `--limit_mm_per_prompt '{"image": 10, "video": 5}'`.
- vllm_max_lora_rank: Default is `16`. The parameter for vllm's LoRA support.
- enable_prefix_caching: Whether to enable vllm's prefix caching feature. Default is `False`. Setting it to True saves processing time on repeated request prefixes (e.g. the system prompt, long documents, or multi-turn dialogues).


### Merge Arguments
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -281,6 +281,7 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
- enforce_eager: Whether vllm uses pytorch eager mode or establishes a cuda graph. Default is `False`. Setting to True can save memory but may affect efficiency.
- 🔥limit_mm_per_prompt: Controls vllm using multiple images, default is `None`. For example, use `--limit_mm_per_prompt '{"image": 10, "video": 5}'`.
- vllm_max_lora_rank: Default value is `16`. Parameters supported by vllm for LoRA.
- enable_prefix_caching: Whether to enable vllm's automatic prefix caching feature. Default is `False`. Setting it to True saves processing time on repeated request prefixes (such as the system prompt, long documents, or multi-turn dialogues); a usage sketch follows below.

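A minimal Python sketch (not part of this PR) of using the same option programmatically. The import path follows the module changed in this diff, and the model id is a placeholder.

```python
# Minimal sketch, assuming VllmEngine is importable from the module touched by
# this PR; the model id below is a placeholder, not taken from this change.
from swift.llm.infer.infer_engine.vllm_engine import VllmEngine

engine = VllmEngine(
    'Qwen/Qwen2-7B-Instruct',    # placeholder model id
    enable_prefix_caching=True,  # new keyword added in this PR; forwarded to vLLM
)
# Requests sharing a prefix (e.g. the same system prompt) can now reuse the
# KV-cache blocks computed for that prefix instead of recomputing them.
```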
### Merge Arguments

3 changes: 3 additions & 0 deletions swift/llm/argument/infer_args.py
@@ -61,6 +61,7 @@ class VllmArguments:
enforce_eager (bool): Flag to enforce eager execution. Default is False.
limit_mm_per_prompt (Optional[str]): Limit multimedia per prompt. Default is None.
vllm_max_lora_rank (int): Maximum LoRA rank. Default is 16.
enable_prefix_caching (bool): Flag to enable automatic prefix caching. Default is False.
"""
# vllm
gpu_memory_utilization: float = 0.9
@@ -72,6 +73,7 @@
enforce_eager: bool = False
limit_mm_per_prompt: Optional[Union[dict, str]] = None # '{"image": 10, "video": 5}'
vllm_max_lora_rank: int = 16
enable_prefix_caching: bool = False

def __post_init__(self):
self.limit_mm_per_prompt = ModelArguments.parse_to_dict(self.limit_mm_per_prompt)
@@ -92,6 +94,7 @@ def get_vllm_engine_kwargs(self):
'max_lora_rank': self.vllm_max_lora_rank,
'enable_lora': len(adapters) > 0,
'max_loras': max(len(adapters), 1),
'enable_prefix_caching': self.enable_prefix_caching,
}


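A toy sketch of the plumbing this hunk adds (hypothetical standalone class, not the actual ms-swift dataclasses, which mix `VllmArguments` into larger argument sets): the new field is copied verbatim into the kwargs handed to the engine.

```python
# Toy reimplementation for illustration only; names mirror the real code above.
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class VllmArgsSketch:
    vllm_max_lora_rank: int = 16
    enable_prefix_caching: bool = False  # mirrors the new VllmArguments field

    def get_vllm_engine_kwargs(self) -> Dict[str, Any]:
        # The real method also handles LoRA adapters and multimodal limits.
        return {
            'max_lora_rank': self.vllm_max_lora_rank,
            'enable_prefix_caching': self.enable_prefix_caching,  # forwarded as-is
        }


kwargs = VllmArgsSketch(enable_prefix_caching=True).get_vllm_engine_kwargs()
assert kwargs['enable_prefix_caching'] is True
```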
83 changes: 46 additions & 37 deletions swift/llm/infer/infer_engine/vllm_engine.py
@@ -34,28 +34,30 @@
class VllmEngine(InferEngine):

def __init__(
self,
model_id_or_path: str,
torch_dtype: Optional[torch.dtype] = None,
*,
model_type: Optional[str] = None,
use_hf: Optional[bool] = None,
hub_token: Optional[str] = None,
revision: Optional[str] = None,
# engine_kwargs
gpu_memory_utilization: float = 0.9,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: Optional[int] = None,
max_num_seqs: int = 256,
disable_custom_all_reduce: bool = False,
enforce_eager: bool = False,
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
self,
model_id_or_path: str,
torch_dtype: Optional[torch.dtype] = None,
*,
model_type: Optional[str] = None,
use_hf: Optional[bool] = None,
hub_token: Optional[str] = None,
revision: Optional[str] = None,
# engine_kwargs
gpu_memory_utilization: float = 0.9,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: Optional[int] = None,
max_num_seqs: int = 256,
disable_custom_all_reduce: bool = False,
enforce_eager: bool = False,
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
enable_prefix_caching: bool = False,
engine_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
self.processor = get_model_tokenizer(
model_id_or_path,
torch_dtype,
@@ -79,7 +81,9 @@ def __init__(
enable_lora=enable_lora,
max_loras=max_loras,
max_lora_rank=max_lora_rank,
engine_kwargs=engine_kwargs)
enable_prefix_caching=enable_prefix_caching,
engine_kwargs=engine_kwargs,
)

self._prepare_engine()
self._load_generation_config()
Expand All @@ -91,19 +95,22 @@ def _prepare_engine(self) -> None:
engine = AsyncLLMEngine.from_engine_args(self.engine_args)
self.engine = engine

def _prepare_engine_kwargs(self,
gpu_memory_utilization: float = 0.9,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: Optional[int] = None,
max_num_seqs: int = 256,
disable_custom_all_reduce: bool = False,
enforce_eager: bool = False,
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
engine_kwargs: Optional[Dict[str, Any]] = None) -> None:
def _prepare_engine_kwargs(
self,
gpu_memory_utilization: float = 0.9,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: Optional[int] = None,
max_num_seqs: int = 256,
disable_custom_all_reduce: bool = False,
enforce_eager: bool = False,
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
enable_lora: bool = False,
max_loras: int = 1,
max_lora_rank: int = 16,
enable_prefix_caching: bool = False,
engine_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
if engine_kwargs is None:
engine_kwargs = {}
disable_log_stats = engine_kwargs.pop('disable_log_stats', True)
@@ -136,7 +143,9 @@ def _prepare_engine_kwargs(self,
disable_custom_all_reduce=disable_custom_all_reduce,
enforce_eager=enforce_eager,
trust_remote_code=True,
**engine_kwargs)
enable_prefix_caching=enable_prefix_caching,
**engine_kwargs,
)
self.engine_args = engine_args
self.enable_lora = enable_lora
if max_model_len is not None:
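For reference, a sketch of the vLLM-level call this change ultimately reaches, assuming a vLLM version that exports `AsyncEngineArgs` and `AsyncLLMEngine` at the top level (the same classes the engine code above relies on); the model id is a placeholder.

```python
# Sketch under the assumption above; mirrors what _prepare_engine_kwargs and
# _prepare_engine now do when enable_prefix_caching=True.
from vllm import AsyncEngineArgs, AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model='Qwen/Qwen2-7B-Instruct',  # placeholder model id
    enable_prefix_caching=True,      # the flag VllmEngine now forwards
    trust_remote_code=True,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```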