From 79868d7bd960eda1bf6a3a04a0ea18720b417d7b Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 13:36:10 +0800
Subject: [PATCH 1/7] fix

---
 ...\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git "a/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md" "b/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md"
index fb512821b3..4be368ed63 100644
--- "a/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md"
+++ "b/docs/source/Instruction/\346\216\250\347\220\206\345\222\214\351\203\250\347\275\262.md"
@@ -157,7 +157,7 @@ os.environ['VIDEO_MAX_PIXELS'] = '50176'
 os.environ['FPS_MAX_FRAMES'] = '12'
 
 from swift.llm import PtEngine, RequestConfig, InferRequest
-model = 'Qwen/Qwen2.5-VL-3B-Instruc'
+model = 'Qwen/Qwen2.5-VL-3B-Instruct'
 
 # 加载推理引擎
 engine = PtEngine(model, max_batch_size=2)
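For reference, the snippet this patch corrects, assembled into a minimal runnable sketch. The request payload and generation settings below are illustrative additions, not part of the documented example:

```python
import os

# Cap multimodal token usage before the engine is created (values from the doc).
os.environ['VIDEO_MAX_PIXELS'] = '50176'
os.environ['FPS_MAX_FRAMES'] = '12'

from swift.llm import PtEngine, RequestConfig, InferRequest

model = 'Qwen/Qwen2.5-VL-3B-Instruct'  # the corrected model ID
engine = PtEngine(model, max_batch_size=2)  # load the inference engine

# A hypothetical single-turn request; any messages payload works the same way.
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'Who are you?'}])
resp_list = engine.infer([infer_request], RequestConfig(max_tokens=128))
print(resp_list[0].choices[0].message.content)
```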
From 5d651b6dce368a7ba5c916094755635e25894537 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 14:35:23 +0800
Subject: [PATCH 2/7] support max_memory

---
 swift/llm/argument/base_args/model_args.py | 12 ++++++++++++
 swift/llm/model/register.py                |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py
index a39c0e769d..321bfd76d3 100644
--- a/swift/llm/argument/base_args/model_args.py
+++ b/swift/llm/argument/base_args/model_args.py
@@ -44,6 +44,7 @@ class ModelArguments:
     num_labels: Optional[int] = None
     rope_scaling: Literal['linear', 'dynamic'] = None
     device_map: Optional[Union[dict, str]] = None
+    max_memory: Optional[Union[dict, str]] = None
     # When some model code needs to be downloaded from GitHub,
     # this parameter specifies the path to the locally downloaded repository.
     local_repo_path: Optional[str] = None
@@ -78,6 +79,15 @@ def _init_device_map(self):
             if isinstance(v, int):
                 self.device_map[k] += local_rank
 
+    def _init_max_memory(self):
+        self.max_memory = self.parse_to_dict(self.max_memory)
+        # compat mp&ddp
+        _, local_rank, _, local_world_size = get_dist_setting()
+        if local_world_size > 1 and isinstance(self.max_memory, dict) and local_rank > 0:
+            for k in list(self.max_memory.keys()):
+                if isinstance(k, int):
+                    self.max_memory[k + local_rank] = self.max_memory.pop(k)
+
     def _init_torch_dtype(self) -> None:
         """"If torch_dtype is None, find a proper dtype by the train_type/GPU"""
         from swift.llm import TrainArguments
@@ -130,6 +140,7 @@ def __post_init__(self):
             raise ValueError(f'Please set --model `, model: {self.model}')
         self.model_suffix = get_model_name(self.model)
         self._init_device_map()
+        self._init_max_memory()
         self._init_torch_dtype()
 
     def get_model_kwargs(self):
@@ -142,6 +153,7 @@ def get_model_kwargs(self):
             'hub_token': self.hub_token,
             'local_repo_path': self.local_repo_path,
             'device_map': self.device_map,
+            'max_memory': self.max_memory,
             'quantization_config': self.get_quantization_config(),
             'attn_impl': self.attn_impl,
             'rope_scaling': self.rope_scaling,

diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py
index 0e1fe430c3..b33b35783e 100644
--- a/swift/llm/model/register.py
+++ b/swift/llm/model/register.py
@@ -478,6 +478,7 @@ def get_model_tokenizer(
         # model kwargs
         model_type: Optional[str] = None,
         quantization_config=None,
+        max_memory: Optional[Union[dict, str]] = None,
         attn_impl: Literal['flash_attn', 'sdpa', 'eager', None] = None,
         rope_scaling: Optional[Dict[str, Any]] = None,
         automodel_class=None,
@@ -520,6 +521,8 @@ def get_model_tokenizer(
         model_kwargs['device_map'] = device_map
     if quantization_config:
         model_kwargs['quantization_config'] = quantization_config
+    if max_memory:
+        model_kwargs['max_memory'] = max_memory
     model_dir = model_info.model_dir
     get_function = model_meta.get_function
     kwargs['automodel_class'] = automodel_class
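Two notes on the plumbing above, for readers following the data flow. `parse_to_dict` turns the CLI string into a dict; `_init_max_memory` then re-keys integer device IDs by `local_rank`, mirroring `_init_device_map`, so that under device-map parallelism combined with DDP each process addresses its own GPUs. A standalone sketch of the re-keying semantics, written as a one-pass rebuild (which also avoids the clobbering an in-place pop/re-insert loop can hit when a shifted key collides with a not-yet-visited original key):

```python
from typing import Dict, Union

def shift_max_memory(max_memory: Dict[Union[int, str], str],
                     local_rank: int) -> Dict[Union[int, str], str]:
    # Re-key integer device IDs by local_rank; leave 'cpu'/other string keys alone.
    return {(k + local_rank) if isinstance(k, int) else k: v
            for k, v in max_memory.items()}

# Process with local_rank=1 in an mp&ddp run:
print(shift_max_memory({0: '20GB', 1: '20GB'}, local_rank=1))
# -> {1: '20GB', 2: '20GB'}
```

From there, `get_model_kwargs` forwards `max_memory` into `get_model_tokenizer`, which adds it to `model_kwargs` and hence to the underlying `from_pretrained` call.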
"b/docs/source/Instruction/\351\242\204\350\256\255\347\273\203\344\270\216\345\276\256\350\260\203.md" index 4cac7b95d4..84268c130f 100644 --- "a/docs/source/Instruction/\351\242\204\350\256\255\347\273\203\344\270\216\345\276\256\350\260\203.md" +++ "b/docs/source/Instruction/\351\242\204\350\256\255\347\273\203\344\270\216\345\276\256\350\260\203.md" @@ -71,6 +71,7 @@ ms-swift使用了分层式的设计思想,用户可以使用命令行界面、 - 在使用`swift sft`通过LoRA技术微调base模型为chat模型时,有时需要手动设置模板。通过添加`--template default`参数来避免base模型因未见过对话模板中的特殊字符而无法正常停止的情况。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/base_to_chat)。 - 如果需要在**断网**环境下进行训练,请设置`--model `和`--check_model false`。如果对应的模型需要`git clone`github的仓库,例如`deepseek-ai/Janus-Pro-7B`,请设置手动下载仓库,并设置`--local_repo_path `。具体参数含义请参考[命令行参数文档](命令行参数.md)。 - 无法对QLoRA训练的模型进行Merge LoRA,因此不建议使用QLoRA进行微调,无法在推理和部署时使用vLLM/LMDeploy进行推理加速。建议使用LoRA/全参数进行微调,合并为完整权重后再使用GPTQ/AWQ/BNB进行[量化](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize)。 +- 如果使用NPU进行训练,只需要将shell中的`CUDA_VISIBLE_DEVICES`修改为`ASCEND_RT_VISIBLE_DEVICES`。如果要使用device_map并行技术,你需要显式传入`--device_map auto`。 - SWIFT默认在训练时设置`--gradient_checkpointing true`来节约显存,这会略微降低训练速度。 - 若使用DDP进行训练,出现报错:`RuntimeError: Expected to mark a variable ready only once.`,请额外设置参数`--gradient_checkpointing_kwargs '{"use_reentrant": false}'`或者使用DeepSpeed进行训练。 - 如果要使用deepspeed,你需要安装deepspeed:`pip install deepspeed -U`。使用deepspeed可以节约显存,但会略微降低训练速度。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 850a1cbf8c..8fa7c943d9 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -32,6 +32,7 @@ Hints: - num_labels: This parameter needs to be specified for classification models. It represents the number of labels and defaults to None. - rope_scaling: Type of rope, supports `linear` and `dynamic`, should be used in conjunction with `max_length`. Default is None. - device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions. +- max_memory: When device_map is set to 'auto', the model weights will be allocated to devices based on max_memory, for example `--max_memory {0: "20GB", 1: "20GB"}`. The default value is None. - local_repo_path: Some models depend on a GitHub repo when loading. To avoid network issues during `git clone`, a local repo can be used directly. This parameter needs to be passed with the path to the local repo, with the default being `None`. ### Data Arguments diff --git a/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md b/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md index 2363fbd8da..e243d66ef2 100644 --- a/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md +++ b/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md @@ -75,6 +75,7 @@ Additionally, we offer a series of scripts to help you understand the training c - When fine-tuning a base model to a chat model using LoRA technology with `swift sft`, you may sometimes need to manually set the template. Add the `--template default` parameter to avoid issues where the base model may fail to stop correctly due to encountering special characters in the dialogue template that it has not seen before. For more details, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/base_to_chat). 
From 9bb81764381f4eadbc22dcb2f1dbf40caa7ce362 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 14:57:24 +0800
Subject: [PATCH 4/7] update

---
 ...\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +-
 docs/source_en/Instruction/Command-line-parameters.md           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index c833fce6fd..31bc9c3bf7 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -31,7 +31,7 @@
 - num_labels: 分类模型需要指定该参数。代表标签数量,默认为None
 - rope_scaling: rope类型,支持`linear`和`dynamic`,请配合`max_length`共同使用。默认为None
 - device_map: 模型使用的device_map配置,例如:'auto'、'cpu'、json字符串、json文件路径。默认为None,根据设备和分布式训练情况自动设置
-- max_memory: device_map为'auto'时,会根据max_memory进行模型权重的device分配,例如`--max_memory {0: "20GB", 1: "20GB"}`。默认为None
+- max_memory: device_map为'auto'时,会根据max_memory进行模型权重的device分配,例如`--max_memory '{0: "20GB", 1: "20GB"}'`。默认为None
 - local_repo_path: 部分模型在加载时依赖于github repo。为了避免`git clone`时遇到网络问题,可以直接使用本地repo。该参数需要传入本地repo的路径, 默认为`None`
 
 ### 数据参数

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 8fa7c943d9..144e8b64bb 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -32,7 +32,7 @@ Hints:
 - num_labels: This parameter needs to be specified for classification models. It represents the number of labels and defaults to None.
 - rope_scaling: Type of rope, supports `linear` and `dynamic`, should be used in conjunction with `max_length`. Default is None.
 - device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions.
-- max_memory: When device_map is set to 'auto', the model weights will be allocated to devices based on max_memory, for example `--max_memory {0: "20GB", 1: "20GB"}`. The default value is None.
+- max_memory: When device_map is set to 'auto', the model weights will be allocated to devices based on max_memory, for example `--max_memory '{0: "20GB", 1: "20GB"}'`. The default value is None.
 - local_repo_path: Some models depend on a GitHub repo when loading. To avoid network issues during `git clone`, a local repo can be used directly. This parameter needs to be passed with the path to the local repo, with the default being `None`.
 
 ### Data Arguments
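The quoting fix matters because, without the outer single quotes, the shell splits the value on spaces and `--max_memory` receives only the fragment `{0:`. Note also that the value is a Python-style literal with integer keys, not strict JSON. A quick illustration; `ast.literal_eval` here merely stands in for whatever parsing `parse_to_dict` actually performs:

```python
import ast
import shlex

unquoted = '--max_memory {0: "20GB", 1: "20GB"}'
quoted = '--max_memory \'{0: "20GB", 1: "20GB"}\''

print(shlex.split(unquoted))  # ['--max_memory', '{0:', '20GB,', '1:', '20GB}'] -- mangled
print(shlex.split(quoted))    # ['--max_memory', '{0: "20GB", 1: "20GB"}'] -- one argument

value = shlex.split(quoted)[1]
print(ast.literal_eval(value))  # {0: '20GB', 1: '20GB'} -- integer keys preserved
```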
From 692504334fd312b7f5a16dfbe24f9f86a3e1eb88 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 15:12:51 +0800
Subject: [PATCH 5/7] update

---
 ...\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +-
 docs/source_en/Instruction/Command-line-parameters.md           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 31bc9c3bf7..1b3378dcd7 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -31,7 +31,7 @@
 - num_labels: 分类模型需要指定该参数。代表标签数量,默认为None
 - rope_scaling: rope类型,支持`linear`和`dynamic`,请配合`max_length`共同使用。默认为None
 - device_map: 模型使用的device_map配置,例如:'auto'、'cpu'、json字符串、json文件路径。默认为None,根据设备和分布式训练情况自动设置
-- max_memory: device_map为'auto'时,会根据max_memory进行模型权重的device分配,例如`--max_memory '{0: "20GB", 1: "20GB"}'`。默认为None
+- max_memory: device_map设置为'auto'或者'sequential'时,会根据max_memory进行模型权重的device分配,例如:`--max_memory '{0: "20GB", 1: "20GB"}'`。默认为None
 - local_repo_path: 部分模型在加载时依赖于github repo。为了避免`git clone`时遇到网络问题,可以直接使用本地repo。该参数需要传入本地repo的路径, 默认为`None`
 
 ### 数据参数

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 144e8b64bb..ffd8053cc6 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -32,7 +32,7 @@ Hints:
 - num_labels: This parameter needs to be specified for classification models. It represents the number of labels and defaults to None.
 - rope_scaling: Type of rope, supports `linear` and `dynamic`, should be used in conjunction with `max_length`. Default is None.
 - device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions.
-- max_memory: When device_map is set to 'auto', the model weights will be allocated to devices based on max_memory, for example `--max_memory '{0: "20GB", 1: "20GB"}'`. The default value is None.
+- max_memory: When device_map is set to 'auto' or 'sequential', the model weights will be allocated to devices based on max_memory, for example: `--max_memory '{0: "20GB", 1: "20GB"}'`. The default value is None.
 - local_repo_path: Some models depend on a GitHub repo when loading. To avoid network issues during `git clone`, a local repo can be used directly. This parameter needs to be passed with the path to the local repo, with the default being `None`.
 
 ### Data Arguments
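For context on the 'sequential' alternative documented here: both strategies come from accelerate. 'sequential' fills device 0 up to its cap before spilling to device 1, whereas 'auto' first tries to balance layers across devices. The planning step can be previewed without loading any weights; a sketch (model ID illustrative):

```python
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained('Qwen/Qwen2.5-7B-Instruct')
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)  # meta tensors, no memory used

# Devices are filled in index order, each capped by its max_memory entry --
# the same placement 'sequential' would produce.
device_map = infer_auto_device_map(model, max_memory={0: '20GB', 1: '20GB'})
print(device_map)
```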
From 7db7aabed210b72be73c2252861b7db6e5778179 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 15:15:04 +0800
Subject: [PATCH 6/7] update test

---
 tests/infer/test_max_memory.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 tests/infer/test_max_memory.py

diff --git a/tests/infer/test_max_memory.py b/tests/infer/test_max_memory.py
new file mode 100644
index 0000000000..36a60e2fcf
--- /dev/null
+++ b/tests/infer/test_max_memory.py
@@ -0,0 +1,10 @@
+from swift.llm import infer_main, InferArguments
+
+
+def test_max_memory():
+    infer_main(InferArguments(
+        model='Qwen/Qwen2.5-7B-Instruct',
+        max_memory={0: '50GB', 1: '5GB'}, device_map='sequential'))
+
+if __name__ == '__main__':
+    test_max_memory()

From 748651fbe55063bcbf56995aabdaf9fe62954f7b Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Wed, 26 Feb 2025 15:19:31 +0800
Subject: [PATCH 7/7] lint pass

---
 tests/infer/test_max_memory.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/infer/test_max_memory.py b/tests/infer/test_max_memory.py
index 36a60e2fcf..2a56bc90da 100644
--- a/tests/infer/test_max_memory.py
+++ b/tests/infer/test_max_memory.py
@@ -1,10 +1,13 @@
-from swift.llm import infer_main, InferArguments
+from swift.llm import InferArguments, infer_main
 
 
 def test_max_memory():
-    infer_main(InferArguments(
-        model='Qwen/Qwen2.5-7B-Instruct',
-        max_memory={0: '50GB', 1: '5GB'}, device_map='sequential'))
+    infer_main(
+        InferArguments(model='Qwen/Qwen2.5-7B-Instruct', max_memory={
+            0: '50GB',
+            1: '5GB'
+        }, device_map='sequential'))
+
 
 if __name__ == '__main__':
     test_max_memory()
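A quick way to see what this test exercises: with 'sequential' and deliberately skewed caps (50GB on GPU 0, 5GB on GPU 1), most modules should land on GPU 0 and only the overflow on GPU 1. A hedged sketch of checking that through the same entry point the series touches; it assumes `get_model_tokenizer` accepts `device_map` directly, as its body in Patch 2 suggests:

```python
from collections import Counter
from swift.llm import get_model_tokenizer

model, processor = get_model_tokenizer(
    'Qwen/Qwen2.5-7B-Instruct',
    device_map='sequential',
    max_memory={0: '50GB', 1: '5GB'},
)
# hf_device_map is set by accelerate whenever a device_map is used;
# expect device 0 to dominate the counts given the skewed caps.
print(Counter(getattr(model, 'hf_device_map', {}).values()))
```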