1 change: 1 addition & 0 deletions docs/source/Instruction/命令行参数.md
@@ -363,6 +363,7 @@ Vera uses the three parameters `target_modules`, `target_regex`, and `modules_to_save`,
- 🔥vllm_max_model_len: The maximum sequence length supported by the model. Defaults to `None`, i.e., read from config.json.
- vllm_disable_custom_all_reduce: Disables the custom all-reduce kernel and falls back to NCCL. Defaults to `True` for stability.
- vllm_enforce_eager: Whether vllm uses PyTorch eager mode or builds a CUDA graph. Defaults to `False`. Setting it to True saves GPU memory but reduces efficiency.
- vllm_mm_processor_cache_gb: Size of the multimodal processor cache in GiB, used to cache processed multimodal inputs (e.g., images, videos) and avoid redundant processing. Defaults to `4`. Setting it to `0` disables the cache but degrades performance (not recommended). Only takes effect for multimodal models.
- vllm_disable_cascade_attn: Whether to forcibly disable the V1 engine's cascade attention implementation to prevent potential numerical errors. Defaults to `False`; vLLM's internal logic decides whether cascade attention is used.
- 🔥vllm_limit_mm_per_prompt: Controls the number of multimodal inputs (e.g., images, videos) vllm accepts per prompt. Defaults to `None`. For example, pass `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Defaults to `16`. A vLLM parameter for LoRA support.
1 change: 1 addition & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -369,6 +369,7 @@ Parameter meanings can be found in the [vllm documentation](https://docs.vllm.ai
- 🔥vllm_max_model_len: The maximum sequence length supported by the model. Default is `None`, meaning it will be read from `config.json`.
- vllm_disable_custom_all_reduce: Disables the custom all-reduce kernel and falls back to NCCL. For stability, the default is `True`.
- vllm_enforce_eager: Determines whether vllm uses PyTorch eager mode or constructs a CUDA graph, default is `False`. Setting it to True can save memory but may affect efficiency.
- vllm_mm_processor_cache_gb: The size (in GiB) of the multimodal processor cache, used to store processed multimodal inputs (e.g., images, videos) to avoid redundant processing. Default is 4. Setting it to 0 disables the cache but may degrade performance (not recommended). This option takes effect only for multimodal models.
- vllm_disable_cascade_attn: Whether to forcibly disable the V1 engine’s cascade-attention implementation to avoid potential numerical issues. Defaults to False; vLLM’s internal heuristics determine whether cascade attention is actually used.
- 🔥vllm_limit_mm_per_prompt: Controls the use of multiple media in vllm, default is `None`. For example, you can pass in `--vllm_limit_mm_per_prompt '{"image": 5, "video": 2}'`.
- vllm_max_lora_rank: Default is `16`. A vLLM parameter for LoRA support.
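For multimodal models, the new `vllm_mm_processor_cache_gb` option is typically tuned together with `vllm_limit_mm_per_prompt`. The snippet below is a minimal, hedged sketch of the programmatic equivalent; it assumes `VllmEngine` is importable from `swift.llm`, that its first positional argument is the model id, and that the installed vLLM version supports `mm_processor_cache_gb` (otherwise the engine drops the option, as the version check later in this PR shows).

# Hedged usage sketch, not copied from the repository's examples.
from swift.llm import VllmEngine

engine = VllmEngine(
    'Qwen/Qwen2.5-VL-7B-Instruct',                 # example multimodal model (assumption)
    mm_processor_cache_gb=8,                       # larger cache when prompts reuse many images/videos
    limit_mm_per_prompt={'image': 5, 'video': 2},  # programmatic form of --vllm_limit_mm_per_prompt
)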
2 changes: 2 additions & 0 deletions swift/llm/infer/infer_engine/grpo_vllm_engine.py
@@ -45,6 +45,7 @@ def __init__(
task_type: Optional[str] = None,
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
@@ -78,6 +79,7 @@ def __init__(
task_type=task_type,
disable_cascade_attn=disable_cascade_attn,
load_format=load_format,
mm_processor_cache_gb=mm_processor_cache_gb,
enable_lora=enable_lora,
max_loras=max_loras,
max_lora_rank=max_lora_rank,
8 changes: 7 additions & 1 deletion swift/llm/infer/infer_engine/vllm_engine.py
@@ -69,6 +69,7 @@ def __init__(
task_type: Optional[str] = None, # embedding
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
# lora
enable_lora: bool = False,
max_loras: int = 1,
@@ -129,6 +130,7 @@ def __init__(
quantization=quantization,
task=task_type,
disable_cascade_attn=disable_cascade_attn,
mm_processor_cache_gb=mm_processor_cache_gb,
**engine_kwargs,
)
context = nullcontext()
@@ -169,6 +171,7 @@ def _prepare_engine_kwargs(
task: Optional[str] = None,
disable_cascade_attn: bool = False,
load_format: str = 'auto',
mm_processor_cache_gb: Optional[float] = None,
**engine_kwargs,
) -> None:
if task == 'embedding':
@@ -197,7 +200,10 @@ def _prepare_engine_kwargs(
else:
assert not limit_mm_per_prompt, (
'The current version of vLLM does not support `limit_mm_per_prompt`. Please upgrade vLLM.')
for key in ['enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format']:
for key in [
'enable_expert_parallel', 'enable_sleep_mode', 'disable_cascade_attn', 'load_format',
'mm_processor_cache_gb'
]:
Comment on lines +203 to +206 (Contributor review, severity: medium)

For better readability and adherence to Python style guides (PEP 8), it's recommended to format this list of keys with one item per line. This makes the list easier to read and modify in the future.

Suggested change:

for key in [
    'enable_expert_parallel',
    'enable_sleep_mode',
    'disable_cascade_attn',
    'load_format',
    'mm_processor_cache_gb',
]:

if key in parameters:
engine_kwargs[key] = locals()[key]
else:
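The loop above forwards optional keys, including the new `mm_processor_cache_gb`, only when the installed vLLM version actually accepts them, so older vLLM releases keep working. Below is a minimal sketch of that capability check under the assumption that `parameters` comes from inspecting vLLM's `EngineArgs` signature; the helper name and the warning message are illustrative, not taken from the source.

import inspect

from vllm import EngineArgs


def filter_engine_kwargs(**candidates):
    # Keep only the keyword arguments this vLLM version's EngineArgs accepts,
    # so newer flags such as mm_processor_cache_gb degrade gracefully.
    parameters = inspect.signature(EngineArgs).parameters
    supported = {k: v for k, v in candidates.items() if k in parameters}
    dropped = sorted(set(candidates) - set(supported))
    if dropped:
        print(f'Ignoring kwargs unsupported by this vLLM version: {dropped}')
    return supported


engine_kwargs = filter_engine_kwargs(
    load_format='auto',
    disable_cascade_attn=False,
    mm_processor_cache_gb=4,
)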
8 changes: 7 additions & 1 deletion swift/trainers/arguments.py
@@ -172,7 +172,8 @@ class VllmArguments:
Args:
vllm_gpu_memory_utilization (float): GPU memory utilization. Default is 0.9.
vllm_tensor_parallel_size (int): Tensor parallelism size. Default is 1.
vllm_pipeline_parallel_size(int): Pipeline parallelism size. Default is 1.
vllm_pipeline_parallel_size (int): Pipeline parallelism size. Default is 1.
vllm_enable_expert_parallel (bool): Flag to enable expert parallelism for MoE models. Default is False.
vllm_max_num_seqs (int): Maximum number of sequences. Default is 256.
vllm_max_model_len (Optional[int]): Maximum model length. Default is None.
vllm_disable_custom_all_reduce (bool): Flag to disable custom all-reduce. Default is True.
@@ -182,6 +183,9 @@
vllm_enable_prefix_caching (Optional[bool]): Flag to enable automatic prefix caching. Default is None.
vllm_use_async_engine (bool): Whether to use async engine for vLLM. Default is False.
vllm_quantization (Optional[str]): The quantization method for vLLM. Default is None.
vllm_reasoning_parser (Optional[str]): The reasoning parser for vLLM. Default is None.
vllm_disable_cascade_attn (bool): Flag to disable cascade attention. Default is False.
vllm_mm_processor_cache_gb (Optional[float]): MM processor cache size in GB. Default is None.
vllm_data_parallel_size (int): Data parallelism size for vLLM rollout. Default is 1.
"""
# vllm
@@ -200,6 +204,7 @@
vllm_quantization: Optional[str] = None
vllm_reasoning_parser: Optional[str] = None
vllm_disable_cascade_attn: bool = False
vllm_mm_processor_cache_gb: Optional[float] = None
# rollout
vllm_data_parallel_size: int = 1

@@ -251,6 +256,7 @@ def get_vllm_engine_kwargs(self):
'quantization': self.vllm_quantization,
'reasoning_parser': self.vllm_reasoning_parser,
'disable_cascade_attn': self.vllm_disable_cascade_attn,
'mm_processor_cache_gb': self.vllm_mm_processor_cache_gb,
'num_labels': self.num_labels,
}
if self.task_type in ('embedding', 'seq_cls') or 'reranker' in self.task_type:
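The `vllm_*` fields above are the trainer-facing argument names; `get_vllm_engine_kwargs()` maps them, key by key, onto the bare parameter names the engine constructor expects. The stand-in below illustrates that naming convention without importing swift; the class and helper names are assumptions made for this sketch, and the real method builds the dict explicitly rather than by prefix stripping.

from dataclasses import asdict, dataclass
from typing import Optional


@dataclass
class VllmArgsSubset:
    # Illustrative stand-in covering only the two fields added in this diff.
    vllm_disable_cascade_attn: bool = False
    vllm_mm_processor_cache_gb: Optional[float] = None

    def to_engine_kwargs(self) -> dict:
        # Strip the `vllm_` prefix so the keys match VllmEngine's parameters.
        return {k.removeprefix('vllm_'): v for k, v in asdict(self).items()}


print(VllmArgsSubset(vllm_mm_processor_cache_gb=2.0).to_engine_kwargs())
# -> {'disable_cascade_attn': False, 'mm_processor_cache_gb': 2.0}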
1 change: 1 addition & 0 deletions swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -585,6 +585,7 @@ def prepare_vllm(self, model):
seed=self.accelerator.process_index // self.vllm_tensor_parallel_size,
disable_cascade_attn=self.args.vllm_disable_cascade_attn,
load_format='dummy',
mm_processor_cache_gb=self.args.vllm_mm_processor_cache_gb,
template=vllm_template,
distributed_executor_backend='external_launcher',
**lora_kwargs,