From ff9f473acf100891eba94cc46c0eaf0740a85018 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 00:01:02 +0800 Subject: [PATCH 01/70] try to instead lora with peft lora --- swift/tuners/lora.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 69719c9df1..3848dfbc72 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -77,17 +77,15 @@ class LoRA: @staticmethod def prepare_model(model: nn.Module, config: LoRAConfig): - """Prepare a model with `LoRAConfig`""" - LoRA._dynamic_patch_lora( - model, - replace_modules=config.target_modules, + from peft import LoraConfig, LoraModel + LoraModel(model, LoraConfig( r=config.r, - lora_alpha=config.lora_alpha, + target_modules=config.target_modules, + lora_alpha=int(config.lora_alpha), lora_dropout=config.lora_dropout, - merge_weights=config.merge_weights, - use_merged_linear=config.use_merged_linear, - enable_lora=config.enable_lora, - fan_in_fan_out=config.fan_in_fan_out) + fan_in_fan_out=config.fan_in_fan_out, + bias=config.bias, + ), 'default') def state_dict_callback(state_dict): return lora_state_dict(state_dict, config.bias) From 5b00f1128b186e3fb51df2f04e663e11f081f35e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 10:51:11 +0800 Subject: [PATCH 02/70] Revert "try to instead lora with peft lora" This reverts commit ff9f473acf100891eba94cc46c0eaf0740a85018. --- swift/tuners/lora.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 3848dfbc72..69719c9df1 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -77,15 +77,17 @@ class LoRA: @staticmethod def prepare_model(model: nn.Module, config: LoRAConfig): - from peft import LoraConfig, LoraModel - LoraModel(model, LoraConfig( + """Prepare a model with `LoRAConfig`""" + LoRA._dynamic_patch_lora( + model, + replace_modules=config.target_modules, r=config.r, - target_modules=config.target_modules, - lora_alpha=int(config.lora_alpha), + lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, - fan_in_fan_out=config.fan_in_fan_out, - bias=config.bias, - ), 'default') + merge_weights=config.merge_weights, + use_merged_linear=config.use_merged_linear, + enable_lora=config.enable_lora, + fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict): return lora_state_dict(state_dict, config.bias) From 3d1a618d8690f3eba93d55af179c3022ec748246 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 11:44:56 +0800 Subject: [PATCH 03/70] try to add bnb & gptq linear --- requirements/framework.txt | 2 +- swift/tuners/lora.py | 73 ++++++++++++++++++++++++++++----- tests/tuners/test_swift_base.py | 10 +++++ 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index 4247a138db..c4ecc554c0 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -3,7 +3,7 @@ datasets diffusers>=0.18.0 numpy pandas -peft +peft>=0.5.0 requests safetensors tensorboard diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 69719c9df1..c95ed4438b 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -10,10 +10,23 @@ import torch import torch.nn as nn import torch.nn.functional as F - from .utils import SwiftConfig, SwiftOutput +from peft.utils import get_auto_gptq_quant_linear, get_quantization_config + +from peft.import_utils import is_bnb_available, is_bnb_4bit_available, 
is_auto_gptq_available + +if is_bnb_available(): + import bitsandbytes as bnb + + from peft.tuners.lora import Linear8bitLt -logger = logging.getLogger(__name__) +if is_bnb_4bit_available(): + from peft.tuners.lora import Linear4bit + +if is_auto_gptq_available(): + from peft.tuners.lora import QuantLinear + +logger = logging.getLogger() @dataclass @@ -38,7 +51,7 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' + 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' }) lora_alpha: float = field( default=1., metadata={'help': 'The factor to add the lora weights'}) @@ -54,13 +67,13 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules need to be turned on when using the merged linear layer' + 'The modules need to be turned on when using the merged linear layer' }) fan_in_fan_out: bool = field( default=False, metadata={ 'help': - 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' + 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' }) bias: str = field( default='none', @@ -146,6 +159,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, sub_module.out_features, bias=sub_module.bias is not None, **kwargs) + elif isinstance(sub_module, torch.nn.Embedding): + lora_module = Embedding( + num_embeddings=sub_module.num_embeddings, + embedding_dim=sub_module.embedding_dim, + r=kwargs['r'], + lora_alpha=kwargs['lora_alpha'], + merge_weights=kwargs['merge_weights']) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -157,6 +177,37 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) + elif kwargs.pop('loaded_in_8bit', False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + eight_bit_kwargs = kwargs.copy() + eight_bit_kwargs.update( + { + "has_fp16_weights": sub_module.state.has_fp16_weights, + "memory_efficient_backward": sub_module.state.memory_efficient_backward, + "threshold": sub_module.state.threshold, + "index": sub_module.index, + } + ) + lora_module = Linear8bitLt( + 'default', sub_module.in_features, sub_module.out_features, + bias=kwargs.pop('bias', False), **eight_bit_kwargs + ) + elif kwargs.pop('loaded_in_4bit', False) and is_bnb_4bit_available() and isinstance(sub_module, + bnb.nn.Linear4bit): + four_bit_kwargs = kwargs.copy() + four_bit_kwargs.update( + { + "compute_dtype": sub_module.compute_dtype, + "compress_statistics": sub_module.weight.compress_statistics, + "quant_type": sub_module.weight.quant_type, + } + ) + lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, + bias=kwargs.pop('bias', False), **four_bit_kwargs) + + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) + if AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + lora_module = QuantLinear('default', sub_module, **kwargs) + sub_module.weight = sub_module.qweight if lora_module is not None: lora_module.weight = sub_module.weight @@ -238,11 +289,11 @@ def unpatch_lora(model, config: LoRAConfig): class LoRALayer: def __init__( - self, - r: int, - lora_alpha: int, - lora_dropout: float, - merge_weights: bool, + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, ): self.r = r self.lora_alpha = lora_alpha @@ 
-458,7 +509,7 @@ def __init__(self, self.weight.requires_grad = False # Compute the indices self.lora_ind = self.weight.new_zeros( - (out_features, ), dtype=torch.bool).view(len(enable_lora), -1) + (out_features,), dtype=torch.bool).view(len(enable_lora), -1) self.lora_ind[enable_lora, :] = True self.lora_ind = self.lora_ind.view(-1) self.reset_parameters() diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 715fd0c743..1dff38d84f 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -6,6 +6,7 @@ from time import time import torch +from modelscope import Model, Preprocessor from modelscope.models.nlp.structbert import (SbertConfig, SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME @@ -25,6 +26,15 @@ def tearDown(self): shutil.rmtree(self.tmp_dir) super().tearDown() + def test_swift_lora_forward(self): + model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) + model = Swift.prepare_model(model, config=lora_config) + inputs = preprocessor('how are you') + outputs = model(**inputs) + self.assertTrue('logits' in outputs) + def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) model2 = copy.deepcopy(model) From 18de6ffc06c764fc6dc785026b5c8ae317fd1817 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 12:07:11 +0800 Subject: [PATCH 04/70] add more code --- examples/pytorch/llm/src/llm_sft.py | 7 +++---- tests/tuners/test_swift_base.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 696a887dc8..2f08c0be58 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoraConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, @@ -207,12 +207,11 @@ def llm_sft(args: SftArguments) -> None: logger.info( f'Setting lora_target_modules: {args.lora_target_modules}') if args.resume_from_ckpt is None: - lora_config = LoraConfig( + lora_config = LoRAConfig( r=args.lora_rank, target_modules=args.lora_target_modules, lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p, - task_type='CAUSAL_LM') + lora_dropout=args.lora_dropout_p) logger.info(f'lora_config: {lora_config}') model = Swift.prepare_model(model, lora_config) else: diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 1dff38d84f..763aeab626 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -33,7 +33,7 @@ def test_swift_lora_forward(self): model = Swift.prepare_model(model, config=lora_config) inputs = preprocessor('how are you') outputs = model(**inputs) - self.assertTrue('logits' in outputs) + self.assertTrue(hasattr(outputs, 'logits')) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) From e836d27ea12de693ae3e5a03b806328415615462 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 5 Sep 2023 15:21:08 +0800 Subject: [PATCH 05/70] fix bug 
--- examples/pytorch/llm/src/llm_sft.py | 2 +- swift/tuners/lora.py | 70 +++++++++++++++-------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f08c0be58..a5b86533b8 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoraConfig, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index c95ed4438b..f46e42d826 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -130,6 +130,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): replace_modules = [replace_modules] + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) for module_key in module_keys: if isinstance(replace_modules, str): @@ -145,19 +146,48 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, _key = parts[-1] lora_module = None - if isinstance(sub_module, torch.nn.Linear): + if getattr(model, "is_loaded_in_8bit", False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + eight_bit_kwargs = kwargs.copy() + eight_bit_kwargs.update( + { + "has_fp16_weights": sub_module.state.has_fp16_weights, + "memory_efficient_backward": sub_module.state.memory_efficient_backward, + "threshold": sub_module.state.threshold, + "index": sub_module.index, + } + ) + lora_module = Linear8bitLt( + 'default', sub_module.in_features, sub_module.out_features, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **eight_bit_kwargs + ) + elif getattr(model, "is_loaded_in_4bit", False) and is_bnb_4bit_available() and isinstance(sub_module, + bnb.nn.Linear4bit): + four_bit_kwargs = kwargs.copy() + four_bit_kwargs.update( + { + "compute_dtype": sub_module.compute_dtype, + "compress_statistics": sub_module.weight.compress_statistics, + "quant_type": sub_module.weight.quant_type, + } + ) + lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **four_bit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + lora_module = QuantLinear('default', sub_module, **kwargs) + sub_module.weight = sub_module.qweight + elif isinstance(sub_module, torch.nn.Linear): if use_merged_linear: lora_module = MergedLinear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **kwargs) else: kwargs.pop('enable_lora', None) lora_module = Linear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **kwargs) elif isinstance(sub_module, torch.nn.Embedding): lora_module = Embedding( @@ -177,45 +207,17 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) - elif kwargs.pop('loaded_in_8bit', False) and isinstance(sub_module, bnb.nn.Linear8bitLt): - 
eight_bit_kwargs = kwargs.copy() - eight_bit_kwargs.update( - { - "has_fp16_weights": sub_module.state.has_fp16_weights, - "memory_efficient_backward": sub_module.state.memory_efficient_backward, - "threshold": sub_module.state.threshold, - "index": sub_module.index, - } - ) - lora_module = Linear8bitLt( - 'default', sub_module.in_features, sub_module.out_features, - bias=kwargs.pop('bias', False), **eight_bit_kwargs - ) - elif kwargs.pop('loaded_in_4bit', False) and is_bnb_4bit_available() and isinstance(sub_module, - bnb.nn.Linear4bit): - four_bit_kwargs = kwargs.copy() - four_bit_kwargs.update( - { - "compute_dtype": sub_module.compute_dtype, - "compress_statistics": sub_module.weight.compress_statistics, - "quant_type": sub_module.weight.quant_type, - } - ) - lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, - bias=kwargs.pop('bias', False), **four_bit_kwargs) - - AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) - if AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): - lora_module = QuantLinear('default', sub_module, **kwargs) - sub_module.weight = sub_module.qweight if lora_module is not None: lora_module.weight = sub_module.weight if sub_module.bias is not None: lora_module.bias = sub_module.bias + if getattr(sub_module, "state", None) is not None: + lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) modules.append(lora_module) + return modules @staticmethod From 09e267c35a578e5be1449edac19d1646301f3288 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 16:17:39 +0800 Subject: [PATCH 06/70] lint code --- examples/pytorch/llm/src/llm_sft.py | 2 +- swift/tuners/lora.py | 98 +++++++++++++++++------------ swift/tuners/prompt.py | 2 +- swift/utils/torch_utils.py | 6 +- tests/tuners/test_swift_base.py | 6 +- 5 files changed, 67 insertions(+), 47 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index a5b86533b8..2f08c0be58 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoraConfig, LoRAConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index f46e42d826..e5e315385f 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -10,10 +10,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .utils import SwiftConfig, SwiftOutput +from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, + is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config -from peft.import_utils import is_bnb_available, is_bnb_4bit_available, is_auto_gptq_available +from .utils import SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb @@ -51,7 +52,7 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' + 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' }) lora_alpha: float = field( default=1., 
metadata={'help': 'The factor to add the lora weights'}) @@ -67,13 +68,13 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules need to be turned on when using the merged linear layer' + 'The modules need to be turned on when using the merged linear layer' }) fan_in_fan_out: bool = field( default=False, metadata={ 'help': - 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' + 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' }) bias: str = field( default='none', @@ -130,7 +131,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): replace_modules = [replace_modules] - AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear( + get_quantization_config(model, method='gptq')) for module_key in module_keys: if isinstance(replace_modules, str): @@ -146,33 +148,47 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, _key = parts[-1] lora_module = None - if getattr(model, "is_loaded_in_8bit", False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + if getattr(model, 'is_loaded_in_8bit', False) and isinstance( + sub_module, bnb.nn.Linear8bitLt): eight_bit_kwargs = kwargs.copy() - eight_bit_kwargs.update( - { - "has_fp16_weights": sub_module.state.has_fp16_weights, - "memory_efficient_backward": sub_module.state.memory_efficient_backward, - "threshold": sub_module.state.threshold, - "index": sub_module.index, - } - ) + eight_bit_kwargs.update({ + 'has_fp16_weights': + sub_module.state.has_fp16_weights, + 'memory_efficient_backward': + sub_module.state.memory_efficient_backward, + 'threshold': + sub_module.state.threshold, + 'index': + sub_module.index, + }) lora_module = Linear8bitLt( - 'default', sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **eight_bit_kwargs - ) - elif getattr(model, "is_loaded_in_4bit", False) and is_bnb_4bit_available() and isinstance(sub_module, - bnb.nn.Linear4bit): + 'default', + sub_module.in_features, + sub_module.out_features, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, + **eight_bit_kwargs) + elif getattr(model, 'is_loaded_in_4bit', + False) and is_bnb_4bit_available() and isinstance( + sub_module, bnb.nn.Linear4bit): four_bit_kwargs = kwargs.copy() - four_bit_kwargs.update( - { - "compute_dtype": sub_module.compute_dtype, - "compress_statistics": sub_module.weight.compress_statistics, - "quant_type": sub_module.weight.quant_type, - } - ) - lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **four_bit_kwargs) - elif AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + four_bit_kwargs.update({ + 'compute_dtype': + sub_module.compute_dtype, + 'compress_statistics': + sub_module.weight.compress_statistics, + 'quant_type': + sub_module.weight.quant_type, + }) + lora_module = Linear4bit( + 'default', + sub_module.in_features, + sub_module.out_features, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, + **four_bit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance( + sub_module, AutoGPTQQuantLinear): lora_module = QuantLinear('default', sub_module, **kwargs) sub_module.weight = sub_module.qweight elif isinstance(sub_module, 
torch.nn.Linear): @@ -180,14 +196,16 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module = MergedLinear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, **kwargs) else: kwargs.pop('enable_lora', None) lora_module = Linear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, **kwargs) elif isinstance(sub_module, torch.nn.Embedding): lora_module = Embedding( @@ -210,9 +228,9 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if lora_module is not None: lora_module.weight = sub_module.weight - if sub_module.bias is not None: + if getattr(sub_module, 'bias', None) is not None: lora_module.bias = sub_module.bias - if getattr(sub_module, "state", None) is not None: + if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) @@ -291,11 +309,11 @@ def unpatch_lora(model, config: LoRAConfig): class LoRALayer: def __init__( - self, - r: int, - lora_alpha: int, - lora_dropout: float, - merge_weights: bool, + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, ): self.r = r self.lora_alpha = lora_alpha @@ -511,7 +529,7 @@ def __init__(self, self.weight.requires_grad = False # Compute the indices self.lora_ind = self.weight.new_zeros( - (out_features,), dtype=torch.bool).view(len(enable_lora), -1) + (out_features, ), dtype=torch.bool).view(len(enable_lora), -1) self.lora_ind[enable_lora, :] = True self.lora_ind = self.lora_ind.view(-1) self.reset_parameters() diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a255c36cff..f426a4dd83 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -61,7 +61,7 @@ class PromptConfig(SwiftConfig): 'help': 'When set to True, prompt is attached in front of the embedding' }) - + extract_embedding: bool = field( default=False, metadata={ diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 4fca0a28d5..dc0d6d395f 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -90,9 +90,9 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: n_grads /= 1e6 n_buffers /= 1e6 s = [ - f'{name}: ', - f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', - f'{n_buffers:.4f}M Buffers', + f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', + f'{n_buffers:.4f}M Buffers, ', + f'Trainable percentage: {100 * n_grads / n_params:.2f}%' ] s += '.' 
logger.info(''.join(s)) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 763aeab626..ce1ccb3307 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -27,8 +27,10 @@ def tearDown(self): super().tearDown() def test_swift_lora_forward(self): - model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') - preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, config=lora_config) inputs = preprocessor('how are you') From 07e93c221fc60aa36cfdc995baaf2cb4f0d201bf Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 16:30:33 +0800 Subject: [PATCH 07/70] wip --- examples/pytorch/llm/src/llm_sft.py | 41 ++++++++++++++++------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f08c0be58..39a3a8c429 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,7 +2,7 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import List, Optional +from typing import List, Optional, Dict import torch import torch.distributed as dist @@ -14,7 +14,7 @@ select_bnb, select_dtype, show_layers) from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, get_logger) + Seq2SeqTrainingArguments, Swift, get_logger, SwiftConfig) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -198,25 +198,30 @@ def llm_sft(args: SftArguments) -> None: model, tokenizer = get_model_tokenizer( args.model_type, torch_dtype=args.torch_dtype, **kwargs) + if args.resume_from_ckpt is None: + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in args.sft_type.split(','): + if sft_type == 'lora': + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}') + + lora_config = LoRAConfig( + r=args.lora_rank, + target_modules=args.lora_target_modules, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout_p) + logger.info(f'lora_config: {lora_config}') # ### Preparing lora if args.sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}') - if args.resume_from_ckpt is None: - lora_config = LoRAConfig( - r=args.lora_rank, - target_modules=args.lora_target_modules, - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') + model = Swift.prepare_model(model, lora_config) - else: - model = Swift.from_pretrained( - model, args.resume_from_ckpt, is_trainable=True) + else: + model = Swift.from_pretrained( + model, args.resume_from_ckpt, is_trainable=True) show_layers(model) 
print_model_info(model) From a9ff3127aab736ebf72c6610593b2bbcbdefa031 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 17:15:09 +0800 Subject: [PATCH 08/70] 1. prompt&adapter support endwith match 2. llm_sft supports mix tuners --- examples/pytorch/llm/src/llm_sft.py | 42 +++++++++++++++---------- examples/pytorch/llm/src/utils/model.py | 27 ++++++++++++++++ swift/tuners/adapter.py | 10 +++++- swift/tuners/prompt.py | 9 +++++- 4 files changed, 70 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 39a3a8c429..6da37e4ef4 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,7 +2,7 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import List, Optional, Dict +from typing import Dict, List, Optional import torch import torch.distributed as dist @@ -13,8 +13,8 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, get_logger, SwiftConfig) +from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, + Seq2SeqTrainingArguments, Swift, SwiftConfig, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -68,6 +68,7 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0.1 + adapter_length: int = 128 gradient_checkpointing: bool = True batch_size: int = 1 @@ -199,15 +200,16 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - swift_config: Dict[str, SwiftConfig] = dict() - for sft_type in args.sft_type.split(','): - if sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}') + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in args.sft_type.split(','): + if sft_type == 'lora': + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}' + ) lora_config = LoRAConfig( r=args.lora_rank, @@ -215,10 +217,18 @@ def llm_sft(args: SftArguments) -> None: lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout_p) logger.info(f'lora_config: {lora_config}') - # ### Preparing lora - if args.sft_type == 'lora': - - model = Swift.prepare_model(model, lora_config) + swift_config['lora'] = lora_config + elif sft_type == 'adapter': + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=MODEL_MAPPING[model.config.model_type].get( + 'adapter_TM', 'mlp'), + method_name='forward', + hidden_pos=0, + adapter_length=args.adapter_length, + ) + swift_config['adapter'] = adapter_config + model = Swift.prepare_model(model, swift_config) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index bf6cc4c797..d16b76bfa0 100644 --- 
a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -172,6 +172,15 @@ class LoRATM(NamedTuple): polylm = ['c_attn'] +class AdapterTM(NamedTuple): + # default lora target modules. qkv + baichuan = ['mlp'] + chatglm2 = ['mlp'] + llama2 = ['mlp'] + qwen = ['mlp'] + polylm = ['mlp'] + + # Model Home: 'https://modelscope.cn/models/{model_id}/summary' # keys: 'model_id', 'revision', 'get_function', 'template', # 'ignore_file_pattern', 'lora_TM' @@ -181,6 +190,7 @@ class LoRATM(NamedTuple): 'revision': 'v1.0.5', 'get_function': get_model_tokenizer_qwen, 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-7b-chat': { 'model_id': 'qwen/Qwen-7B-Chat', @@ -188,12 +198,14 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_qwen, 'template': 'chatml', 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-vl': { 'model_id': 'qwen/Qwen-VL', 'revision': 'v1.0.2', 'get_function': get_model_tokenizer_qwen_vl, 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-vl-chat': { 'model_id': 'qwen/Qwen-VL-Chat', @@ -201,23 +213,27 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_qwen_vl, 'template': 'chatml', 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'baichuan-7b': { 'model_id': 'baichuan-inc/baichuan-7B', 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', 'revision': 'v1.0.5', 'get_function': get_model_tokenizer_baichuan13b, 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'baichuan-13b-chat': { 'model_id': 'baichuan-inc/Baichuan-13B-Chat', 'revision': 'v1.0.8', 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', @@ -225,18 +241,21 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_chatglm2, 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, + 'adapter_TM': AdapterTM.chatglm2, }, 'chatglm2-6b-32k': { 'model_id': 'ZhipuAI/chatglm2-6b-32k', 'revision': 'v1.0.0', 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, + 'adapter_TM': AdapterTM.chatglm2, }, 'llama2-7b': { 'model_id': 'modelscope/Llama-2-7b-ms', 'revision': 'v1.0.2', 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-13b': { 'model_id': 'modelscope/Llama-2-13b-ms', @@ -244,12 +263,14 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_llama2, 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-70b': { 'model_id': 'modelscope/Llama-2-70b-ms', 'revision': 'v1.0.0', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-7b-chat': { 'model_id': 'modelscope/Llama-2-7b-chat-ms', @@ -257,6 +278,7 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-13b-chat': { 'model_id': 'modelscope/Llama-2-13b-chat-ms', @@ -265,6 +287,7 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-70b-chat': { 'model_id': 'modelscope/Llama-2-70b-chat-ms', @@ -273,24 +296,28 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 
'openbuddy-llama2-13b': { 'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', 'revision': 'v1.0.0', 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'openbuddy-llama-65b': { 'model_id': 'OpenBuddy/openbuddy-llama-65b-v8-bf16', 'revision': 'v1.0.0', 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'polylm-13b': { 'model_id': 'damo/nlp_polylm_13b_text_generation', 'revision': 'v1.0.3', 'get_function': get_model_tokenizer_polylm, 'lora_TM': LoRATM.polylm, + 'adapter_TM': AdapterTM.polylm, }, } diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 19233e60eb..c6885a6050 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -76,7 +76,15 @@ def prepare_model(model: nn.Module, config: AdapterConfig) -> SwiftOutput: module_keys = [key for key, _ in model.named_modules()] for module_key in module_keys: - if re.fullmatch(config.target_modules, module_key): # noqa + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, + module_key) + else: + target_module_found = any( + module_key.endswith(target_key) + for target_key in config.target_modules) + + if target_module_found: # noqa module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f426a4dd83..1f5c4b1b14 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -81,7 +81,14 @@ def prepare_model(model: nn.Module, config: PromptConfig): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: - if re.fullmatch(config.target_modules, module_key): # noqa + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, + module_key) + else: + target_module_found = any( + module_key.endswith(target_key) + for target_key in config.target_modules) + if target_module_found: # noqa module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): From 30b3e8a96d15545b398004296a8e7d9f094b5b06 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 00:27:36 +0800 Subject: [PATCH 09/70] add restuner --- swift/__init__.py | 2 + swift/tuners/__init__.py | 4 + swift/tuners/lora.py | 2 - swift/tuners/mapping.py | 8 +- swift/tuners/restuning.py | 323 +++++++++++++++++++++++++++ swift/tuners/restuning_components.py | 301 +++++++++++++++++++++++++ swift/tuners/side.py | 255 +++++++++++++++++++++ 7 files changed, 892 insertions(+), 3 deletions(-) create mode 100644 swift/tuners/restuning.py create mode 100644 swift/tuners/restuning_components.py create mode 100644 swift/tuners/side.py diff --git a/swift/__init__.py b/swift/__init__.py index d4ab2b8c64..e41615c414 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -8,6 +8,7 @@ from .tuners import ( Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, PeftModelForTokenClassification, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, @@ -29,6 +30,7 @@ 'tuners': [ 'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING', 'LoraConfig', 'PeftConfig', + 'ResTuningConfig', 'SideConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', 
'PeftModelForTokenClassification', 'PrefixTuningConfig', diff --git a/swift/tuners/__init__.py b/swift/tuners/__init__.py index bed8803d70..6ebb813e90 100644 --- a/swift/tuners/__init__.py +++ b/swift/tuners/__init__.py @@ -8,6 +8,8 @@ from .base import SwiftModel, Swift from .lora import LoRA, LoRAConfig from .mapping import SWIFT_MAPPING + from .side import Side, SideConfig, SideModule + from .restuning import ResTuning, ResTuningConfig, ResTuningModule from .peft import (LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, @@ -23,6 +25,8 @@ 'base': ['SwiftModel', 'Swift'], 'lora': ['LoRA', 'LoRAConfig'], 'mapping': ['SWIFT_MAPPING'], + 'side': ['Side', 'SideConfig', 'SideModule'], + 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningModule'], 'peft': [ 'LoraConfig', 'PeftConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index e5e315385f..5cbb797970 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -129,8 +129,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, modules = [] module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) - if isinstance(replace_modules, str): - replace_modules = [replace_modules] AutoGPTQQuantLinear = get_auto_gptq_quant_linear( get_quantization_config(model, method='gptq')) diff --git a/swift/tuners/mapping.py b/swift/tuners/mapping.py index 1f91c542ef..b958cc1305 100644 --- a/swift/tuners/mapping.py +++ b/swift/tuners/mapping.py @@ -3,16 +3,22 @@ from .adapter import Adapter, AdapterConfig from .lora import LoRA, LoRAConfig from .prompt import Prompt, PromptConfig +from .restuning import ResTuning, ResTuningConfig +from .side import Side, SideConfig class SwiftTuners: ADAPTER = 'ADAPTER' PROMPT = 'PROMPT' LORA = 'LORA' + SIDE = 'SIDE' + RESTUNING = 'RESTUNING' SWIFT_MAPPING = { SwiftTuners.ADAPTER: (AdapterConfig, Adapter), SwiftTuners.PROMPT: (PromptConfig, Prompt), - SwiftTuners.LORA: (LoRAConfig, LoRA) + SwiftTuners.LORA: (LoRAConfig, LoRA), + SwiftTuners.SIDE: (SideConfig, Side), + SwiftTuners.RESTUNING: (ResTuningConfig, ResTuning), } diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py new file mode 100644 index 0000000000..b72e000bcb --- /dev/null +++ b/swift/tuners/restuning.py @@ -0,0 +1,323 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import re +import types +from dataclasses import dataclass, field +from typing import Union, Dict, Optional, List + +import torch.nn as nn + +from swift.utils.logger import get_logger +from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner +from .utils import SwiftConfig, SwiftOutput + +logger = get_logger() + + +@dataclass +class ResTuningConfig(SwiftConfig): + """ + The configuration class for the ResTuning module. + + ResTuning is a flexible parameter-efficient and memory-efficient tuning paradigm framework. 
+ 'Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding Tuner from Backbone' + by Jiang et al.(2023) + See + + Args: + dims: The dimensions of the hidden states + root_modules: The root module to be replaced, can a regex string + root_modules_hook: The hook type of root modules, can be "input" or "output" + stem_modules: The stem modules to be replaced, can a regex string or name list of full match format + stem_modules_hook: The hook type of stem modules, can be "input" or "output" + target_modules: The target module to be replaced, can a regex string + target_modules_hook: The hook type of target modules, can be "input" or "output" + tuner_cfg: The configuration of the tuning module, can a string or customized config + use_upsample: Whether to use auxiliary upsample module + use_bypass: Whether to use bypass + + """ + + dims: Optional[Union[List[int], int]] = field( + default=None, metadata={'help': 'The dimensions of the hidden states'}) + + root_modules: str = field( + default=None, + metadata={ + 'help': 'The root module to be replaced, can a regex string (use the first matching module) or full match format' + }) + + root_modules_hook: str = field( + default="input", + metadata={ + 'help': 'The hook type of root modules, can be "input" or "output"' + }) + + stem_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + 'help': 'The stem modules to be replaced, can a regex string or name list of full match format' + }) + + stem_modules_hook: str = field( + default="output", + metadata={ + 'help': 'The hook type of stem modules, can be "input" or "output"' + }) + + target_modules: str = field( + default=None, + metadata={ + 'help': 'The target module to be replaced, can a regex string (use the first matching module) or full match format' + }) + + target_modules_hook: str = field( + default="input", + metadata={ + 'help': 'The hook type of target modules, can be "input" or "output"' + }) + + target_hidden_pos: str = field( + default=None, + metadata={ + 'help': + 'The position of the hidden state for target modules output' + }) + + tuner_cfg: Optional[Union[List[Dict], Dict, str]] = field( + default=None, + metadata={'help': 'The configuration of the tuning module, can a string or customized config'}) + + use_upsample: bool = field( + default=False, + metadata={'help': 'Whether to use auxiliary upsample module'}) + + upsample_out_channels: List[int] = field( + default=None, + metadata={'help': 'The number of output channels when "use_upsample" is set to "True"'}) + + zero_init_last: bool = field( + default=False, + metadata={'help': 'Zero init last weight'}) + + use_bypass: bool = field( + default=True, + metadata={'help': 'Whether to use bypass'}) + + def __post_init__(self): + from .mapping import SwiftTuners + self.swift_type = SwiftTuners.RESTUNING + + +class ResTuning: + + @staticmethod + def prepare_model(model: nn.Module, config: ResTuningConfig) -> SwiftOutput: + """Prepare a model with `ResTuningConfig`""" + + def _forward_seq(self, input, *args, **kwargs): + for idx, module in enumerate(self): + if idx >= len(self.origin_module_keys): continue + input = module(input) + return input + + def _forward_target(self, *args, **kwargs): + if self.target_modules_hook == "input": + args_main = _forward_restuning(self) + args_main = self.forward_origin(args_main, **kwargs) + else: + _args_main = self.forward_origin(*args, **kwargs) + args_main = _forward_restuning(self) + if type(_args_main) != type(args_main): + _args_main[self.target_hidden_pos] = 
args_main + args_main = _args_main + return args_main + + def _forward_restuning(self): + probe_results = [] + root_module_ins = self.root_module_ins_list[0] + stem_module_ins_list = self.stem_module_ins_list + top_module = model.get_submodule('') + if root_module_ins: + if root_module_ins.root_modules_hook == 'input': + probe_results.append(root_module_ins.probe_input_data) + else: + probe_results.append(root_module_ins.probe_output_data) + for i, st_mod in enumerate(stem_module_ins_list): + if i == 0 and root_module_ins is None: + probe_results.append(st_mod.probe_input_data) + if st_mod.stem_modules_hook == 'input': + probe_results.append(st_mod.probe_input_data) + else: + probe_results.append(st_mod.probe_output_data) + args_main = getattr(top_module, 'restuning')(probe_results) + return args_main + + # 1. Matching the root module + module_keys = [key for key, _ in model.named_modules()] + root_module_ins_list = [] + if config.root_modules: + for module_key in module_keys: + if re.fullmatch(config.root_modules, module_key): + root_module = model.get_submodule(module_key) + logger.info(f"Matching root module [{module_key}] of type {type(root_module)}") + if isinstance(root_module, (nn.ModuleList, nn.ModuleDict)): + logger.warning( + f"Type of {type(root_module)} may not be supported because of its customized forward") + if config.root_modules_hook == "input": + root_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + else: + root_module.register_forward_hook(probe_output_hook, with_kwargs=True) + root_module.root_modules_hook = config.root_modules_hook + root_module_ins_list.append(root_module) + break + if len(root_module_ins_list) == 0: + logger.error(f"Cannot match root modules") + + # 2. Matching the stem module + stem_module_ins_list = [] + stem_module_ins_index = [] + for module_key in module_keys: + if (isinstance(config.stem_modules, str) and re.fullmatch(config.stem_modules, module_key)) or \ + (isinstance(config.stem_modules, list) and module_key in config.stem_modules): + stem_module = model.get_submodule(module_key) + if isinstance(config.stem_modules, list): + stem_module_ins_index.append(config.stem_modules.index(module_key)) + logger.info(f"Matching stem module [{module_key}] of type {type(stem_module)}") + if isinstance(stem_module, (nn.ModuleList, nn.ModuleDict)): + logger.warning( + f"Type of {type(stem_module)} may not be supported because of its customized forward") + if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: + stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + if config.stem_modules_hook == "input": + stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + else: + stem_module.register_forward_hook(probe_output_hook, with_kwargs=True) + stem_module.stem_modules_hook = config.stem_modules_hook + stem_module_ins_list.append(stem_module) + if isinstance(config.stem_modules, list): + stem_module_ins_list = [stem_module_ins_list[stem_module_ins_index.index(i)] for i in + range(len(stem_module_ins_index))] + depth = len(stem_module_ins_list) + if len(stem_module_ins_list) == 0: + raise Exception(f"Cannot match source modules") + + # 3. Init restuning module + if len(stem_module_ins_list) != 0: + top_module = model.get_submodule('') + restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, + config.upsample_out_channels, config.zero_init_last, + config.tuner_cfg) + setattr(top_module, 'restuning', restuning_module) + + # 4. 
Matching the target module + target_module_ins = None + for module_key in module_keys: + if re.fullmatch(config.target_modules, module_key): + tgt_module = model.get_submodule(module_key) + logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): + raise Exception( + f"Type of {type(tgt_module)} may not be supported because of its customized forward") + + tgt_module.target_modules_hook = config.target_modules_hook + tgt_module.target_hidden_pos = config.target_hidden_pos + tgt_module.root_module_ins_list = root_module_ins_list + tgt_module.stem_module_ins_list = stem_module_ins_list + target_module_ins = tgt_module + + if isinstance(tgt_module, nn.Sequential): + tgt_module.origin_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + tgt_module.forward_origin = types.MethodType(_forward_seq, tgt_module) + else: + tgt_module.forward_origin = tgt_module.forward + tgt_module.forward = types.MethodType(_forward_target, tgt_module) + if target_module_ins is None: + raise Exception(f"Cannot match target modules") + + def state_dict_callback(state_dict): + return { + key: value + for key, value in state_dict.items() if 'restuning' in key + } + + def mark_trainable_callback(model): + return + + return SwiftOutput(config, state_dict_callback, + mark_trainable_callback) + + +class ResTuningBypassModule(nn.Module): + """The implementation of ResTuningBypass method. + """ + + def __init__( + self, + dims, + depth, + use_upsample=False, + upsample_out_channels=None, + zero_init_last=False, + tuner_cfg=None, + ): + super(ResTuningBypassModule, self).__init__() + + self.bypass_blocks = nn.Sequential(*[ + ResTunerBypassBlock( + dim=dims[i] if isinstance(dims, list) else dims, + layer_num=i, + depth=depth, + use_upsample=use_upsample, + upsample_out_channels=upsample_out_channels[i] if isinstance(upsample_out_channels, + list) else upsample_out_channels, + zero_init_last=zero_init_last, + tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list) else tuner_cfg + ) + for i in range(depth)]) + + def forward(self, x_list, **kwargs): + x_bypass = detach_tensors(x_list.pop(0)) + x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass + x_list = detach_tensors(x_list) + x_list = [_x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list] + for i, (bp_blk, x_stem) in enumerate(zip(self.bypass_blocks, x_list)): + target_size = x_list[i + 1].shape[2:] if i < len(x_list) - 1 else None + x_bypass = bp_blk(x_stem, x_bypass, target_size, **kwargs) + return x_bypass + + +class ResTunerBypassBlock(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_last=False, tuner_cfg=None, **kwargs): + super().__init__() + self.layer_num = layer_num + self.depth = depth + + if isinstance(tuner_cfg, str): + lateral_cfg = tuner_cfg + vertical_cfg = tuner_cfg + aux_cfg = "upsample" if use_upsample and layer_num != depth - 1 else None + elif isinstance(tuner_cfg, dict): + lateral_cfg = tuner_cfg['lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None + vertical_cfg = tuner_cfg['vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None + aux_cfg = tuner_cfg['aux_cfg'] if 'aux_cfg' in tuner_cfg else None + + self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "lateral", lateral_cfg, **kwargs) + self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "vertical", vertical_cfg, **kwargs) + if aux_cfg and len(aux_cfg) != 0: + self.aux_tuner = ResTuner(dim, 
layer_num, depth, zero_init_last, "aux", aux_cfg, **kwargs) + + def forward(self, x_stem, x_bypass, target_size=None, **kwargs): + x_lateral = self.lateral_tuner(x_stem) + x_vertical = self.vertical_tuner(x_bypass) + + x_bypass_out = x_lateral + x_vertical + if hasattr(self, 'aux_tuner'): + x_bypass_out = self.aux_tuner(x_bypass_out, target_size) + + # logger.info(f"x_main:{x_stem.shape} / {torch.sum(x_stem)}, x_side:{x_bypass.shape} / {torch.sum(x_bypass)}") + # logger.info(f"x_lateral:{x_lateral.shape} / {torch.sum(x_lateral)}, x_vertical:{x_vertical.shape} / {torch.sum(x_vertical)}") + # logger.info(f"x_bypass_out: {x_bypass_out.shape} / {torch.sum(x_bypass_out)}") + + return x_bypass_out + diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py new file mode 100644 index 0000000000..f6aefb0610 --- /dev/null +++ b/swift/tuners/restuning_components.py @@ -0,0 +1,301 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from swift.utils.logger import get_logger + +logger = get_logger() + + +class ResTuner(nn.Module): + def __init__( + self, dim=None, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg={}, **kwargs): + super().__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + self.stage = stage + self.tuner_cfg = tuner_cfg + + if (isinstance(tuner_cfg, str) and tuner_cfg == "res_adapter") or \ + (isinstance(tuner_cfg, dict) and "res_adapter" in tuner_cfg): + tuner_cfg = tuner_cfg['res_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg + self.tuner = ResAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, + stage=stage, tuner_cfg=tuner_cfg, **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == "res_group_adapter") or \ + (isinstance(tuner_cfg, dict) and "res_group_adapter" in tuner_cfg): + tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg + self.tuner = ResGroupAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, + stage=stage, tuner_cfg=tuner_cfg, **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == "upsample") or \ + (isinstance(tuner_cfg, dict) and "upsample" in tuner_cfg): + tuner_cfg = tuner_cfg['upsample'] if isinstance(tuner_cfg, dict) else tuner_cfg + if 'upsample_out_channels' in kwargs: + out_channels = kwargs['upsample_out_channels'] + use_conv = True if out_channels else False + else: + out_channels = dim + use_conv = False + self.tuner = Upsample(channels=dim, use_conv=use_conv, out_channels=out_channels, tuner_cfg=tuner_cfg, + **kwargs) + else: + self.tuner = Identity() + + def forward(self, x, *args, **kwargs): + if self.tuner_cfg == "zero" or "zero" in self.tuner_cfg: + x_out = 0.0 + else: + x_out = self.tuner(x, *args, **kwargs) + return x_out + + +class ResAdapter(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + **kwargs): + super(ResAdapter, self).__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 10 + self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_length = self.adapter_length[self.layer_num] if 
isinstance(self.adapter_length, + list) else self.adapter_length + assert isinstance(self.adapter_length, int) or ( + isinstance(self.adapter_length, tuple) and len(self.adapter_length) == 3) + if isinstance(self.adapter_length, int): + self.ln1 = nn.Linear(dim, self.adapter_length) + else: + self.ln1 = nn.Linear(self.adapter_length[0], self.adapter_length[1]) + self.activate = act_layer() + if isinstance(self.adapter_length, int): + self.ln2 = nn.Linear(self.adapter_length, dim) + else: + self.ln2 = nn.Linear(self.adapter_length[1], self.adapter_length[2]) + dim = self.adapter_length[2] + + self._xavier_init_weights(self.ln1) + if zero_init_last and layer_num == depth - 1: + self._zero_init_weights(self.ln2) + else: + self._xavier_init_weights(self.ln2) + + self.scaling = init_weight_type(dim, self.adapter_weight) + + def _zero_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + def _kaiming_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + nn.init.normal_(m.bias) + + def _xavier_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.normal_(m.bias, std=1e-6) + + def forward(self, x): + x_shortcut = x + if len(x_shortcut.size()) == 4: + B, C, N1, N2 = x.size() + x = x.view(x_shortcut.size()[0], x_shortcut.size()[1], -1).permute(0, 2, 1) + + x_adapter = self.ln2(self.activate(self.ln1(x))) + + if self.adapter_weight: + x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + + if len(x_shortcut.size()) == 4: + x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], + x_shortcut.size()[2], x_shortcut.size()[3]) + x_out = x_shortcut + x_adapter + return x_out + + +class ResGroupAdapter(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + **kwargs): + super(ResGroupAdapter, self).__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + + self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_dim = tuner_cfg['dim'] if 'dim' in tuner_cfg else dim + self.adapter_head = tuner_cfg['head'] if 'head' in tuner_cfg else 4 + self.adapter_scale_factor = tuner_cfg['scale_factor'] if 'scale_factor' in tuner_cfg else 2 + + assert self.adapter_dim % self.adapter_head == 0, 'adapter dim should be divisible by adapter head' + self.dim_mlp = self.adapter_dim // self.adapter_head + + self.ln1 = nn.Linear(self.dim_mlp, self.dim_mlp * self.adapter_scale_factor) + self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, self.dim_mlp) + self.activate = act_layer() + + self._kaiming_init_weights(self.ln1) + if zero_init_last and layer_num == depth - 1: + self._zero_init_weights(self.ln2) + else: + self._kaiming_init_weights(self.ln2) + self.scaling = init_weight_type(dim, self.adapter_weight) + + def _zero_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + def _kaiming_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + nn.init.normal_(m.bias) + + def _xavier_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.normal_(m.bias, std=1e-6) + + def forward(self, x): + x_shortcut = x + + batch, 
inner_dim, height, width = x.shape + + x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + + x_adapter = rearrange(x_adapter, "b n (c h) -> (b h) n c", h=self.adapter_head) + x_adapter = self.ln2(self.activate(self.ln1(x_adapter))) + x_adapter = rearrange(x_adapter, "(b h) n c -> b n (c h)", h=self.adapter_head) + + if self.adapter_weight: + x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + + x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() + x_out = x_shortcut + x_adapter + + return x_out + + +class Identity(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, inputs, *args, **kwargs): + return inputs + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, **kwargs): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + if use_conv: + self.conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=padding) + self.init_weights() + + def init_weights(self): + def _init_weights(m): + if isinstance(m, nn.Conv2d): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + self.apply(_init_weights) + + def forward(self, x, target_size=None, *args, **kwargs): + assert x.shape[1] == self.channels + if target_size is None: + x = F.interpolate(x.float(), scale_factor=2, mode="nearest").type_as(x) + else: + x = F.interpolate(x.float(), target_size, mode="nearest").type_as(x) + if self.use_conv: + x = self.conv(x) + return x + + +def init_weight_type(dim, weight_type): + if weight_type is None: + scaling = None + elif weight_type == "gate": + scaling = nn.Linear(dim, 1) + elif weight_type == "scale": + scaling = nn.Parameter(torch.Tensor(1)) + scaling.data.fill_(1) + elif weight_type == "scale_kv": + scaling_k = nn.Parameter(torch.Tensor(1)) + scaling_k.data.fill_(1) + scaling_v = nn.Parameter(torch.Tensor(1)) + scaling_v.data.fill_(1) + scaling = (scaling_k, scaling_v) + elif weight_type == "scale_channel": + scaling = nn.Parameter(torch.Tensor(dim)) + scaling.data.fill_(1) + elif weight_type == "scale_kv_channel": + scaling_k = nn.Parameter(torch.Tensor(dim)) + scaling_k.data.fill_(1) + scaling_v = nn.Parameter(torch.Tensor(dim)) + scaling_v.data.fill_(1) + scaling = (scaling_k, scaling_v) + elif weight_type and weight_type.startswith("scalar"): + scaling = float(weight_type.split('_')[-1]) + else: + scaling = None + return scaling + + +def apply_data_weight(data, scaling, weight_type): + if weight_type in ["gate"]: + scaling = torch.mean(torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) + elif weight_type in ["scale", "scale_channel"] or weight_type.startswith('scalar'): + scaling = scaling + else: + scaling = None + if scaling is not None: + data = data * scaling + return data + + +def detach_tensors(feats): + if type(feats) in [list, tuple]: + feats = [detach_tensors(feat) if feat is not None else None for feat in feats] + elif isinstance(feats, dict): + feats = {key: detach_tensors(val) for key, val in feats.items()} + elif isinstance(feats, torch.Tensor): + feats = feats.detach().float() + else: + feats = feats.detach() + return feats + + +def 
probe_tensors(module, feats, name): + feats = detach_tensors(feats) + setattr(module, name, feats) + + +def probe_input_pre_hook(self, args, kwargs): + input = args[0] + probe_tensors(self, input, 'probe_input_data') + return args, kwargs + + +def probe_output_hook(self, args, kwargs, result): + output = result + probe_tensors(self, output, 'probe_output_data') + return output + diff --git a/swift/tuners/side.py b/swift/tuners/side.py new file mode 100644 index 0000000000..f5aabb7a69 --- /dev/null +++ b/swift/tuners/side.py @@ -0,0 +1,255 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import inspect +import re +import types +import copy +from dataclasses import dataclass, field +from functools import partial +from typing import Union, Callable, Any +from collections import OrderedDict +from itertools import repeat + +import torch +from torch import nn +import torchvision + +from swift.utils.logger import get_logger +from .utils import SwiftConfig, SwiftOutput + +logger = get_logger() + + +@dataclass +class SideConfig(SwiftConfig): + """ + The configuration class for the side module. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Args: + target_modules: The feedforward module to be replaced, in regex format + """ + + dim: int = field( + default=None, metadata={'help': 'The dimension of the hidden states'}) + + target_modules: str = field( + default=None, + metadata={ + 'help': 'The target module to be replaced, in full match format' + }) + + side_module_name: float = field( + default=1., metadata={'help': 'The name of the additive side networks'}) + + hidden_pos: Union[str, int] = field( + default=0, + metadata={ + 'help': + 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + }) + + def __post_init__(self): + from .mapping import SwiftTuners + self.swift_type = SwiftTuners.SIDE + + +class Side: + + @staticmethod + def prepare_model(model: nn.Module, config: SideConfig) -> SwiftOutput: + """Prepare a model with `SideConfig`""" + module_keys = [key for key, _ in model.named_modules()] + + for module_key in module_keys: + if re.fullmatch(config.target_modules, module_key): # noqa + tgt_module = model.get_submodule(module_key) + logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): + raise Exception( + f"Type of {type(tgt_module)} may not be supported because of its customized forward") + + def _forward(self, *args, **kwargs): + args_main = self.forward_origin(*args, **kwargs) + if isinstance(args_main, (tuple, list, dict)): + if isinstance(config.hidden_pos, str): + args_main[config.hidden_pos] = getattr(self, 'side')(*args, args_main[config.hidden_pos]) + else: + args_main = getattr(self, 'side')(*args, args_main) + return args_main + + if isinstance(tgt_module, nn.Sequential): + tgt_module.tgt_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + + def forward_seq(self, input, *args, **kwargs): + for idx, module in enumerate(self): + if idx >= len(tgt_module.tgt_module_keys): continue + input = module(input) + return input + + tgt_module.forward_origin = types.MethodType(forward_seq, tgt_module) + else: + tgt_module.forward_origin = tgt_module.forward + tgt_module.forward = types.MethodType(_forward, tgt_module) + side_module = 
SideModule(config.dim, config.side_module_name) + setattr(tgt_module, 'side', side_module) + + def state_dict_callback(state_dict): + return { + key: value + for key, value in state_dict.items() if 'side' in key + } + + def mark_trainable_callback(model): + return + + return SwiftOutput(config, state_dict_callback, + mark_trainable_callback) + + +class SideModule(nn.Module): + """The implementation of vision side-tuning method. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Attributes: + side_module_name: The name of the additive side networks. + """ + + def __init__( + self, + dim, + side_module_name='fcn4' + ): + super(SideModule, self).__init__() + + side_module_name = side_module_name.lower() + if side_module_name == 'fcn4': + self.side_net = FCN4(out_dims=dim) + elif side_module_name == 'mlp': + self.side_net = Mlp(dim) + elif side_module_name == 'alexnet': + mm = torchvision.models.alexnet(pretrained=True) + self.side_net = nn.Sequential( + OrderedDict([ + ('features', mm.features), ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, dim, bias=False)) + ])) + else: + raise ValueError(f'Unsupported side_module_name: {side_module_name}') + self.alpha = nn.Parameter(torch.tensor(0.0)) + + def forward(self, x, x_main): + alpha_squashed = torch.sigmoid(self.alpha) + x_side = self.side_net(x) + x_out = alpha_squashed * x_main + (1 - alpha_squashed) * x_side + return x_out + + +class FCN4(nn.Module): + """The implementation of simple FCN4 network for side network. + """ + + def __init__(self, out_dims=-1, **kwargs): + super(FCN4, self).__init__(**kwargs) + + self.conv1 = nn.Sequential( + nn.Conv2d( + 3, + 16, + kernel_size=3, + stride=1, + padding=1, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv2 = nn.Sequential( + nn.Conv2d( + 16, + 16, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv3 = nn.Sequential( + nn.Conv2d( + 16, + 32, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 32), nn.ReLU()) + self.conv4 = nn.Sequential( + nn.Conv2d( + 32, + 64, + kernel_size=3, + stride=1, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 64), nn.ReLU()) + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + if out_dims > 0: + self.fc = nn.Linear(64, out_dims) + else: + self.fc = None + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + if self.fc is not None: + x = self.fc(x) + return x + + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer. 
+ """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0., + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = tuple(repeat(bias, 2)) + drop_probs = tuple(repeat(drop, 2)) + linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x From cbb0b2fc071e591ddeaebc459a51914b27d86496 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 00:29:56 +0800 Subject: [PATCH 10/70] add tests --- swift/tuners/side.py | 2 +- tests/tuners/test_swift_base.py | 40 +++++++- tests/tuners/test_swift_restuning.py | 135 +++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 tests/tuners/test_swift_restuning.py diff --git a/swift/tuners/side.py b/swift/tuners/side.py index f5aabb7a69..9e4f043dd7 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -43,7 +43,7 @@ class SideConfig(SwiftConfig): 'help': 'The target module to be replaced, in full match format' }) - side_module_name: float = field( + side_module_name: str = field( default=1., metadata={'help': 'The name of the additive side networks'}) hidden_pos: Union[str, int] = field( diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index ce1ccb3307..7676a2a283 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -11,7 +11,7 @@ SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub +from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig class TestSwift(unittest.TestCase): @@ -104,3 +104,41 @@ def test_swift_multiple_adapters(self): all( torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) + def test_swift_side(self): + from transformers import AutoModelForImageClassification + model = AutoModelForImageClassification.from_pretrained( + 'google/vit-base-patch16-224') + model2 = copy.deepcopy(model) + result_origin = model(torch.ones((1, 3, 224, 224))).logits + print( + f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' + ) + + side_config = SideConfig( + dim=768, + target_modules=r'vit', + side_module_name='fcn4', + hidden_pos='last_hidden_state') + + model = Swift.prepare_model(model, config=side_config) + result = model(torch.ones((1, 3, 224, 224))).logits + print( + f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' + ) + self.assertTrue(isinstance(model, SwiftModel)) + model.save_pretrained(self.tmp_dir) + self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + + model2 = Swift.from_pretrained(model2, self.tmp_dir) + state_dict = model.state_dict() + state_dict2 = model2.state_dict() + for key in state_dict: + self.assertTrue(key in state_dict2) + self.assertTrue( + all( + torch.isclose(state_dict[key], + 
state_dict2[key]).flatten().detach().cpu())) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py new file mode 100644 index 0000000000..421544d0df --- /dev/null +++ b/tests/tuners/test_swift_restuning.py @@ -0,0 +1,135 @@ +import copy +import os +import shutil +import tempfile +import unittest + +import torch + +from swift import ResTuningConfig +from swift import Swift, SwiftModel + + +class TestSwiftResTuning(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def set_random_seed(self, seed=123): + """Set random seed manually to get deterministic results""" + import random + import numpy as np + import torch + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + def model_comparison(self, model, model2): + model_key = list(model.state_dict().keys()) + model2_key = list(model2.state_dict().keys()) + self.assertTrue(model_key == model2_key) + model_val = torch.sum(torch.stack([torch.sum(val) for val in model.state_dict().values()])) + model2_val = torch.sum(torch.stack([torch.sum(val) for val in model2.state_dict().values()])) + self.assertTrue(torch.isclose(model_val, model2_val)) + + def test_swift_restuning_vit(self): + from transformers import AutoModelForImageClassification + model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + model_swift_1 = copy.deepcopy(model) + model_swift_2 = copy.deepcopy(model) + result_origin = model(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + + # load type - 1 + self.set_random_seed() + restuning_config_1 = ResTuningConfig( + dims=768, + root_modules=r'.*vit.encoder.layer.0$', + stem_modules=r'.*vit.encoder.layer\.\d+$', + target_modules=r'.*vit.layernorm', + target_modules_hook="input", + tuner_cfg="res_adapter", + ) + model_swift_1 = Swift.prepare_model(model_swift_1, config=restuning_config_1) + self.assertTrue(isinstance(model_swift_1, SwiftModel)) + print(model_swift_1.get_trainable_parameters()) + result_swift_1 = model_swift_1(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, result_swift_1 sum: {torch.sum(result_swift_1)}") + + # load type - 2 + self.set_random_seed() + restuning_config_2 = ResTuningConfig( + dims=768, + root_modules=r'.*vit.encoder.layer.0$', + stem_modules=r'.*vit.encoder.layer\.\d+$', + target_modules=r'.*vit.encoder', + target_modules_hook="output", + target_hidden_pos="last_hidden_state", + tuner_cfg="res_adapter", + ) + model_swift_2 = Swift.prepare_model(model_swift_2, config=restuning_config_2) + self.assertTrue(isinstance(model_swift_2, SwiftModel)) + print(model_swift_2.get_trainable_parameters()) + result_swift_2 = model_swift_2(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, result_swift_2 sum: {torch.sum(result_swift_2)}") + + self.assertTrue(all(torch.isclose(result_swift_1, result_swift_2).flatten())) + + model_swift_1.save_pretrained(self.tmp_dir) + 
self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + model_loaded = Swift.from_pretrained(model, self.tmp_dir) + self.model_comparison(model_swift_1, model_loaded) + + def test_swift_restuning_diffusers_sd(self): + from diffusers import UNet2DConditionModel + model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + model.requires_grad_(False) + model2 = copy.deepcopy(model) + self.set_random_seed() + input_data = { + "sample": torch.ones((1, 4, 64, 64)), + "timestep": 10, + "encoder_hidden_states": torch.ones((1, 77, 768)) + } + result_origin = model(**input_data).sample + print( + f"test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + + self.set_random_seed() + restuning_config = ResTuningConfig( + dims=[1280, 1280, 1280, 640, 320], + root_modules='mid_block', + stem_modules=['mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', 'up_blocks.3'], + target_modules='conv_norm_out', + tuner_cfg="res_group_adapter", + use_upsample=True, + upsample_out_channels=[1280, 1280, 640, 320, None], + zero_init_last=True + ) + + model = Swift.prepare_model(model, config=restuning_config) + self.assertTrue(isinstance(model, SwiftModel)) + print(model.get_trainable_parameters()) + + result = model(**input_data).sample + print(f"test_swift_restuning_diffusers_sd result shape: {result.shape}, result sum: {torch.sum(result)}") + model.save_pretrained(self.tmp_dir) + self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + model2 = Swift.from_pretrained(model2, self.tmp_dir) + self.model_comparison(model, model2) + + +if __name__ == '__main__': + unittest.main() From f8d6b091511d9077cfbb54bc844ebe9d3d94dfcb Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 6 Sep 2023 10:19:23 +0800 Subject: [PATCH 11/70] temp --- .../llm/scripts/chatglm_6b/lora_ddp/sft.sh | 2 +- examples/pytorch/llm/src/llm_sft.py | 64 +++-- examples/pytorch/llm/src/utils/dataset.py | 26 +- examples/pytorch/llm/src/utils/preprocess.py | 6 + swift/trainers/trainers.py | 226 +++++++++++++++++- swift/tuners/adapter.py | 6 + swift/tuners/lora.py | 4 - swift/tuners/prompt.py | 5 +- 8 files changed, 314 insertions(+), 25 deletions(-) diff --git a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh index 3baa73288a..ff0c147200 100644 --- a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh @@ -10,7 +10,7 @@ torchrun \ --sft_type lora \ --output_dir runs \ --ddp_backend gloo \ - --dataset alpaca-en,alpaca-zh \ + --dataset advertise_gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 1024 \ diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 6da37e4ef4..e12cdacc0f 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -30,7 +30,7 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G sft_type: str = field( - default='lora', metadata={'choices': ['lora', 'full']}) + default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -83,7 +83,7 @@ class SftArguments: lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 - eval_steps: int = 50 + eval_steps: int = 10 save_steps: Optional[int] = None save_total_limit: 
int = 2 logging_steps: int = 5 @@ -123,12 +123,7 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - if self.sft_type == 'lora': - if self.learning_rate is None: - self.learning_rate = 1e-4 - if self.save_steps is None: - self.save_steps = self.eval_steps - elif self.sft_type == 'full': + if self.sft_type == 'full': assert self.quantization_bit is None, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: @@ -137,7 +132,11 @@ def __post_init__(self): # Saving the model takes a long time self.save_steps = self.eval_steps * 4 else: - raise ValueError(f'sft_type: {self.sft_type}') + if self.learning_rate is None: + self.learning_rate = 1e-4 + if self.save_steps is None: + self.save_steps = self.eval_steps + if self.template_type is None: self.template_type = MODEL_MAPPING[self.model_type].get( 'template', 'default') @@ -221,7 +220,7 @@ def llm_sft(args: SftArguments) -> None: elif sft_type == 'adapter': adapter_config = AdapterConfig( dim=model.config.hidden_size, - target_modules=MODEL_MAPPING[model.config.model_type].get( + target_modules=MODEL_MAPPING[args.model_type].get( 'adapter_TM', 'mlp'), method_name='forward', hidden_pos=0, @@ -239,10 +238,13 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) - train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) + if isinstance(dataset, tuple): + train_dataset, val_dataset = dataset + else: + train_dataset, val_dataset = process_dataset(dataset, + args.dataset_test_size, + args.dataset_sample, + args.dataset_seed) preprocess_func = get_preprocess( args.template_type, tokenizer, @@ -314,6 +316,39 @@ def llm_sft(args: SftArguments) -> None: trainer_args._frozen = True logger.info(f'trainer_args: {trainer_args}') + def compute_metrics(self, prediction): + preds, labels = prediction[0], prediction[1] + if isinstance(preds, tuple): + preds = preds[0] + + score_dict = { + 'rouge-1': [], + 'rouge-2': [], + 'rouge-l': [], + 'bleu-4': [] + } + for pred, label in zip(preds, labels): + hypothesis = list(jieba.cut(pred)) + if len(hypothesis) == 0 or ''.join(hypothesis) == '.': + hypothesis = [''] + reference = list(jieba.cut(label)) + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + + for k, v in score_dict.items(): + score_dict[k] = float(np.mean(v)) + return score_dict + trainer = Seq2SeqTrainer( model=model, args=trainer_args, @@ -321,6 +356,7 @@ def llm_sft(args: SftArguments) -> None: train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, + compute_metrics=compute_metrics, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index ee87496835..1ec7397261 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -42,6 +42,26 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: return _process_alpaca_dataset(dataset) +def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: + dataset_train: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', 
split='train').to_hf_dataset().rename_columns({ + "content": "query", + "summary": "response", + }) + dataset_val: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset().rename_columns({ + "content": "query", + "summary": "response", + }) + return dataset_train, dataset_val + + +def get_alpaca_gpt4_en_dataset() -> HfDataset: + dataset: HfDataset = MsDataset.load( + 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() + return _process_alpaca_dataset(dataset) + + def get_alpaca_gpt4_zh_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() @@ -304,6 +324,7 @@ def get_gpt4all_en_dataset() -> HfDataset: 'gpt4all-en': get_gpt4all_en_dataset, # multi-modal 'coco-en': get_coco_en_dataset, + 'advertise_gen': get_advertise_gen_dataset, } @@ -312,7 +333,10 @@ def get_dataset(dataset_name_list: List[str]) -> HfDataset: for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) - dataset = concatenate_datasets(dataset_list) + if not isinstance(dataset_list[0], tuple): + dataset = concatenate_datasets(dataset_list) + else: + dataset = dataset_list[0] return dataset diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index d3be77610d..417e0fc713 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -12,6 +12,12 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, + 'default_no_template': { + 'prefix': [], + 'prompt': ['{{query}}'], + 'chat_sep': [], + 'suffix': [['eos_token_id']], + }, 'chatml': { 'prefix': [['im_start_id'], 'system\n{{system}}', ['im_end_id'], '\n'], 'prompt': [['im_start_id'], 'user\n{{query}}', ['im_end_id'], '\n', diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index aef7f0b5b3..faceae77a1 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -3,6 +3,13 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset +from transformers.trainer_utils import PredictionOutput +from transformers.utils import logging from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -13,7 +20,224 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - pass + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + **gen_kwargs + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. 
+ ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def predict( + self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. + + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). 
+ """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # XXX: adapt synced_gpus for fairscale as well + gen_kwargs = self._gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + gen_kwargs["synced_gpus"] = ( + gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus + ) + + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "position_ids" in inputs: + gen_kwargs["position_ids"] = inputs.get("position_ids", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] + + gen_kwargs["input_ids"] = generation_inputs + generated_tokens = self.model.generate(**gen_kwargs) + generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] + + # in case the batch is shorter than max length, the output should be padded + if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = 
self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + if has_labels: + labels = inputs["labels"] + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + else: + labels = None + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor # monkey patching diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index c6885a6050..50e2d49100 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -158,6 +158,7 @@ def __init__( self.activate = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() + self._prepared = False def init_weights(self): @@ -169,6 +170,11 @@ def _init_weights(m): self.apply(_init_weights) def forward(self, x, identity=None): + if not self._prepared: + self.ln1.to(x.device) + self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index e5e315385f..7a7fb711c9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -126,7 +126,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - modules = [] module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): @@ -234,9 +233,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) - modules.append(lora_module) - - return modules @staticmethod def unpatch_lora(model, config: LoRAConfig): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 1f5c4b1b14..927e7437b2 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -79,7 +79,6 @@ class Prompt: @staticmethod def prepare_model(model: nn.Module, config: PromptConfig): module_keys = [key for key, _ in model.named_modules()] - match_module_keys = [] for module_key in module_keys: if isinstance(config.target_modules, str): target_module_found = re.fullmatch(config.target_modules, @@ -144,7 +143,6 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, 'prompt', prompt_module) - match_module_keys.append(module_key) def state_dict_callback(state_dict): return { @@ -185,12 +183,11 @@ def __init__(self, self.prompt_length = 
prompt_length self.mask_values = mask_values self.attach_front = attach_front - self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): - prompt_token = self.prompt_token.expand(x.shape[0], -1, -1) + prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) if self.layer_num == 0: if self.attach_front: From ec27d414390a7e9341fe986be0b5370b66d3cfae Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 11:27:57 +0800 Subject: [PATCH 12/70] add restuner test --- examples/pytorch/llm/src/llm_sft.py | 8 +++- examples/pytorch/llm/src/utils/dataset.py | 6 --- examples/pytorch/llm/src/utils/model.py | 57 +++++++++++++++++++++++ swift/trainers/trainers.py | 4 +- 4 files changed, 67 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index e12cdacc0f..131d02a570 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -14,7 +14,7 @@ select_bnb, select_dtype, show_layers) from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, SwiftConfig, get_logger) + Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -227,6 +227,12 @@ def llm_sft(args: SftArguments) -> None: adapter_length=args.adapter_length, ) swift_config['adapter'] = adapter_config + elif sft_type == 'restuner': + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + **MODEL_MAPPING[args.model_type]['restuner_TM'] + ) + swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) else: model = Swift.from_pretrained( diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 1ec7397261..af3ccb6d66 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -56,12 +56,6 @@ def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: return dataset_train, dataset_val -def get_alpaca_gpt4_en_dataset() -> HfDataset: - dataset: HfDataset = MsDataset.load( - 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() - return _process_alpaca_dataset(dataset) - - def get_alpaca_gpt4_zh_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index d16b76bfa0..b5b2af91f2 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -181,6 +181,45 @@ class AdapterTM(NamedTuple): polylm = ['mlp'] +class ResTunerTM(NamedTuple): + # default lora target modules. 
qkv + baichuan = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*model.norm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + chatglm2 = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*final_layernorm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + llama2 = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*model.norm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + qwen = { + "root_modules": r'.*transformer.h.0$', + "stem_modules": r'.*transformer.h\.\d+$', + "target_modules": r'.*transformer.ln_f', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + polylm = { + "root_modules": r'.*transformer.h.0$', + "stem_modules": r'.*transformer.h\.\d+$', + "target_modules": r'.*transformer.ln_f', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + + # Model Home: 'https://modelscope.cn/models/{model_id}/summary' # keys: 'model_id', 'revision', 'get_function', 'template', # 'ignore_file_pattern', 'lora_TM' @@ -191,6 +230,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_qwen, 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-7b-chat': { 'model_id': 'qwen/Qwen-7B-Chat', @@ -199,6 +239,7 @@ class AdapterTM(NamedTuple): 'template': 'chatml', 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-vl': { 'model_id': 'qwen/Qwen-VL', @@ -206,6 +247,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_qwen_vl, 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-vl-chat': { 'model_id': 'qwen/Qwen-VL-Chat', @@ -214,12 +256,14 @@ class AdapterTM(NamedTuple): 'template': 'chatml', 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'baichuan-7b': { 'model_id': 'baichuan-inc/baichuan-7B', 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', @@ -227,6 +271,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_baichuan13b, 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'baichuan-13b-chat': { 'model_id': 'baichuan-inc/Baichuan-13B-Chat', @@ -234,6 +279,7 @@ class AdapterTM(NamedTuple): 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', @@ -242,6 +288,7 @@ class AdapterTM(NamedTuple): 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, 'adapter_TM': AdapterTM.chatglm2, + 'restuner_TM': ResTunerTM.chatglm2, }, 'chatglm2-6b-32k': { 'model_id': 'ZhipuAI/chatglm2-6b-32k', @@ -249,6 +296,7 @@ class AdapterTM(NamedTuple): 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, 'adapter_TM': AdapterTM.chatglm2, + 'restuner_TM': ResTunerTM.chatglm2, }, 'llama2-7b': { 'model_id': 'modelscope/Llama-2-7b-ms', @@ -256,6 +304,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-13b': { 'model_id': 'modelscope/Llama-2-13b-ms', @@ -264,6 +313,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 
'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-70b': { 'model_id': 'modelscope/Llama-2-70b-ms', @@ -271,6 +321,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-7b-chat': { 'model_id': 'modelscope/Llama-2-7b-chat-ms', @@ -279,6 +330,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-13b-chat': { 'model_id': 'modelscope/Llama-2-13b-chat-ms', @@ -288,6 +340,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-70b-chat': { 'model_id': 'modelscope/Llama-2-70b-chat-ms', @@ -297,6 +350,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'openbuddy-llama2-13b': { 'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', @@ -304,6 +358,7 @@ class AdapterTM(NamedTuple): 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'openbuddy-llama-65b': { 'model_id': 'OpenBuddy/openbuddy-llama-65b-v8-bf16', @@ -311,6 +366,7 @@ class AdapterTM(NamedTuple): 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'polylm-13b': { 'model_id': 'damo/nlp_polylm_13b_text_generation', @@ -318,6 +374,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_polylm, 'lora_TM': LoRATM.polylm, 'adapter_TM': AdapterTM.polylm, + 'restuner_TM': ResTunerTM.polylm, }, } diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index faceae77a1..bc416b9bea 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,6 +8,7 @@ import torch from torch import nn from torch.utils.data import Dataset +from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer_utils import PredictionOutput from transformers.utils import logging @@ -219,7 +220,8 @@ def prediction_step( else: labels = None - return (loss, generated_tokens, labels) + # return (loss, generated_tokens, labels) + return (loss, None, None) def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): From 5c8b401597727dc8e131acbf00212fd8245a4abc Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 13:17:19 +0800 Subject: [PATCH 13/70] test --- examples/pytorch/llm/src/llm_sft.py | 48 ++++++++++++++++---- examples/pytorch/llm/src/utils/preprocess.py | 25 ++++++++-- swift/trainers/trainers.py | 11 +++-- swift/tuners/adapter.py | 6 ++- swift/tuners/restuning_components.py | 24 ++++++++-- swift/utils/llm_utils.py | 3 +- 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 131d02a570..8f2118fa02 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,16 +2,17 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List, Optional +from typing import Dict, List +from typing import Optional +import jieba +import 
numpy as np import torch import torch.distributed as dist +from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) +from rouge import Rouge +from rouge.rouge import Rouge from transformers import BitsAndBytesConfig -from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, find_all_linear_for_lora, get_dataset, - get_dist_setting, get_model_tokenizer, get_preprocess, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers) from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) @@ -19,6 +20,11 @@ from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset +from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, + broadcast_string, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_dist, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers) logger = get_logger() @@ -78,7 +84,7 @@ class SftArguments: optim: str = 'adamw_torch' learning_rate: Optional[float] = None weight_decay: float = 0.01 - gradient_accumulation_steps: int = 16 + gradient_accumulation_steps: int = 1 max_grad_norm: float = 1. lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 @@ -246,6 +252,8 @@ def llm_sft(args: SftArguments) -> None: dataset = get_dataset(args.dataset.split(',')) if isinstance(dataset, tuple): train_dataset, val_dataset = dataset + # train_dataset = train_dataset.select(range(100)) + # val_dataset = val_dataset.select(range(100)) else: train_dataset, val_dataset = process_dataset(dataset, args.dataset_test_size, @@ -258,6 +266,13 @@ def llm_sft(args: SftArguments) -> None: args.max_length, batched=True) train_dataset = train_dataset.map(preprocess_func, batched=True) + preprocess_func = get_preprocess( + args.template_type, + tokenizer, + args.system, + args.max_length, + batched=True, + train=False) val_dataset = val_dataset.map(preprocess_func, batched=True) del dataset # Data analysis @@ -279,7 +294,7 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.batch_size, + per_device_eval_batch_size=1, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -309,6 +324,7 @@ def llm_sft(args: SftArguments) -> None: resume_from_checkpoint=args.resume_from_ckpt, ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, + predict_with_generate=True, local_rank=local_rank) if args.gradient_checkpointing: @@ -322,7 +338,7 @@ def llm_sft(args: SftArguments) -> None: trainer_args._frozen = True logger.info(f'trainer_args: {trainer_args}') - def compute_metrics(self, prediction): + def compute_metrics(prediction): preds, labels = prediction[0], prediction[1] if isinstance(preds, tuple): preds = preds[0] @@ -333,7 +349,21 @@ def compute_metrics(self, prediction): 'rouge-l': [], 'bleu-4': [] } + + def _decode(tokens, ignore_pad_token_for_loss=False): + if ignore_pad_token_for_loss: + tokens = np.where(tokens != -100, tokens, + tokenizer.pad_token_id) + tokens = np.where(tokens < tokenizer.vocab_size, tokens, + tokenizer.pad_token_id) + return [ + t for t in tokenizer.batch_decode( + tokens, skip_special_tokens=True) if t != 
'' + ] + for pred, label in zip(preds, labels): + pred = ''.join(_decode(pred, False)) + label = ''.join(_decode(label, True)) hypothesis = list(jieba.cut(pred)) if len(hypothesis) == 0 or ''.join(hypothesis) == '.': hypothesis = [''] diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 417e0fc713..f52dc4bb82 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -40,6 +40,12 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, + 'chatglm2_no_template': { + 'prefix': [[64790, 64792]], + 'prompt': ['{{query}}'], + 'chat_sep': [], + 'suffix': [['eos_token_id']], + }, 'llama': { 'prefix': [['bos_token_id'], '[INST] <>\n{{system}}\n<>\n\n'], @@ -126,6 +132,7 @@ def _preprocess( history: Optional[History] = None, system: Optional[str] = None, max_length: Optional[int] = None, + train = True, ) -> Dict[str, List[int]]: if history is None: history = [] @@ -158,17 +165,24 @@ def _preprocess( labels = None if response is not None: - labels = [-100] * len(input_ids) tgt_input_ids = _encode(tokenizer, [response], []) tgt_input_ids += _encode(tokenizer, template_config['suffix'], []) - input_ids += tgt_input_ids - labels += tgt_input_ids + if train: + labels = [-100] * len(input_ids) + tgt_input_ids + input_ids += tgt_input_ids + else: + labels = tgt_input_ids if max_length is not None: input_ids = input_ids[-max_length:] if labels is not None: labels = labels[-max_length:] + if train: + pass + else: + input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids + return {'input_ids': input_ids, 'labels': labels} @@ -177,7 +191,8 @@ def get_preprocess( tokenizer: PreTrainedTokenizer, system: Optional[str] = None, max_length: Optional[int] = None, - batched: bool = False + batched: bool = False, + train=True, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -186,7 +201,7 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: response: str = example.get('response', None) custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, - custom_system, max_length) + custom_system, max_length, train) if batched: # Avoid tqdm printing too much logs when dataset.map(...) 
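Note (illustration, not part of the patch): the eval branch of `_preprocess` above left-pads each prompt to a fixed length of 64 so that prompts can be batched for generation, and the `data_collate_fn` change later in this patch derives the attention mask from the pad positions. A minimal sketch of how those two pieces fit together; the helper name and the fixed default length are assumptions for illustration only:

```python
# Left-pad prompts to a fixed length and build the attention mask from the
# pad positions, mirroring the hard-coded 64 in _preprocess and the
# torch.where(...) collator in swift/utils/llm_utils.py.
import torch

def left_pad_for_generation(batch_input_ids, pad_token_id, max_length=64):
    padded = []
    for ids in batch_input_ids:
        ids = ids[-max_length:]  # keep the most recent tokens if the prompt is too long
        padded.append([pad_token_id] * (max_length - len(ids)) + ids)
    input_ids = torch.tensor(padded, dtype=torch.int64)
    # Pad positions are masked out; this assumes the prompt itself contains
    # no pad tokens, which is the same assumption the collator change makes.
    attention_mask = torch.where(input_ids == pad_token_id, 0, 1)
    return input_ids, attention_mask
```

Left padding (rather than right padding) is used because decoder-only models continue from the last position, so every prompt has to end at the right edge of the batch for `generate` to work on padded inputs.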
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index bc416b9bea..440ec1633d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -166,7 +166,12 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self._gen_kwargs.copy() + gen_kwargs = { + 'do_sample': True, + 'top_p': 0.7, + 'max_length': 512, + 'temperature': 0.95 + } if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( @@ -220,8 +225,8 @@ def prediction_step( else: labels = None - # return (loss, generated_tokens, labels) - return (loss, None, None) + return (loss, generated_tokens, labels) + # return (loss, None, None) def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 50e2d49100..bbd2dd880e 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -175,8 +175,12 @@ def forward(self, x, identity=None): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x + identity = identity.to(out.dtype) out = identity + out - return out + return out.to(x_dtype) diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index f6aefb0610..207f02ecd5 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -60,7 +60,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self.layer_num = layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 10 + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 17 self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None @@ -86,6 +86,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self._xavier_init_weights(self.ln2) self.scaling = init_weight_type(dim, self.adapter_weight) + self._prepared = False def _zero_init_weights(self, m): if isinstance(m, nn.Linear): @@ -103,6 +104,14 @@ def _xavier_init_weights(self, m): nn.init.normal_(m.bias, std=1e-6) def forward(self, x): + if not self._prepared: + self.ln1.to(x.device) + self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) x_shortcut = x if len(x_shortcut.size()) == 4: B, C, N1, N2 = x.size() @@ -117,7 +126,7 @@ def forward(self, x): x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], x_shortcut.size()[2], x_shortcut.size()[3]) x_out = x_shortcut + x_adapter - return x_out + return x_out.to(x_dtype) class ResGroupAdapter(nn.Module): @@ -148,6 +157,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", else: self._kaiming_init_weights(self.ln2) self.scaling = init_weight_type(dim, self.adapter_weight) + self._prepared = False def _zero_init_weights(self, m): if isinstance(m, nn.Linear): @@ -165,6 +175,14 @@ def _xavier_init_weights(self, m): nn.init.normal_(m.bias, std=1e-6) def forward(self, x): + if not self._prepared: + self.ln1.to(x.device) 
+ self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) x_shortcut = x batch, inner_dim, height, width = x.shape @@ -181,7 +199,7 @@ def forward(self, x): x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() x_out = x_shortcut + x_adapter - return x_out + return x_out.to(x_dtype) class Identity(nn.Module): diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 3ae6e3aca7..21ab38ae6f 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -32,7 +32,8 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: input_ids = [torch.tensor(b['input_ids']) for b in batch] labels = [torch.tensor(b['labels']) for b in batch] attention_mask = [ - torch.ones(len(input_ids[i]), dtype=torch.int64) + torch.where(input_ids[i]==tokenizer.pad_token_id, + 0, 1) for i in range(len(input_ids)) ] From 534d4c91be4cd3c65a3d8bb40d3452f80810fcfe Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 13:58:21 +0800 Subject: [PATCH 14/70] wip --- swift/tuners/restuning_components.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index 207f02ecd5..a3ab2dfe28 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -295,7 +295,7 @@ def detach_tensors(feats): elif isinstance(feats, dict): feats = {key: detach_tensors(val) for key, val in feats.items()} elif isinstance(feats, torch.Tensor): - feats = feats.detach().float() + feats = feats.detach() else: feats = feats.detach() return feats From 7734e98abc5f15f393545cb52178fda85b069519 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 17:39:03 +0800 Subject: [PATCH 15/70] wip --- examples/pytorch/llm/src/llm_sft.py | 103 ++++++++++++++++++- examples/pytorch/llm/src/utils/preprocess.py | 8 +- swift/utils/llm_utils.py | 9 ++ 3 files changed, 113 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 8f2118fa02..83eddd817b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -259,13 +259,110 @@ def llm_sft(args: SftArguments) -> None: args.dataset_test_size, args.dataset_sample, args.dataset_seed) + + args.max_source_length = 64 + args.max_target_length = 64 + prompt_column = 'query' + response_column = 'response' + history_column = None + prefix = '' + max_target_length = 128 + def preprocess_function_eval(examples): + inputs, targets = [], [] + for i in range(len(examples[prompt_column])): + if examples[prompt_column][i] and examples[response_column][i]: + query = examples[prompt_column][i] + if history_column is None or len(examples[history_column][i]) == 0: + prompt = query + else: + prompt = '' + history = examples[history_column][i] + for turn_idx, (old_query, response) in enumerate(history): + prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + turn_idx, old_query, response) + prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + inputs.append(prompt) + targets.append(examples[response_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer( + inputs, + max_length=args.max_source_length, + truncation=True, + padding=True) + labels = tokenizer( + text_target=targets, max_length=max_target_length, truncation=True) + + if True: + labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) + for lb 
in label] + for label in labels['input_ids']] + model_inputs['labels'] = labels['input_ids'] + + return model_inputs + + + def preprocess_function_train(examples): + max_seq_length = args.max_source_length + args.max_target_length + + model_inputs = { + 'input_ids': [], + 'labels': [], + } + for i in range(len(examples[prompt_column])): + if examples[prompt_column][i] and examples[response_column][i]: + query, answer = examples[prompt_column][i], examples[ + response_column][i] + + if history_column is None: + prompt = query + else: + prompt = '' + history = examples[history_column][i] + for turn_idx, (old_query, response) in enumerate(history): + prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + turn_idx, old_query, response) + prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + + prompt = prefix + prompt + a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) + b_ids = tokenizer.encode(text=answer, add_special_tokens=False) + + if len(a_ids) > args.max_source_length - 1: + a_ids = a_ids[:args.max_source_length - 1] + + if len(b_ids) > args.max_target_length - 2: + b_ids = b_ids[:args.max_target_length - 2] + + input_ids = tokenizer.build_inputs_with_special_tokens( + a_ids, b_ids) + + if False: + context_length = input_ids.index(tokenizer.bos_token_id) + else: + context_length = len(a_ids) + 2 + mask_position = context_length - 1 + labels = [-100] * context_length + input_ids[mask_position + 1:] + + pad_len = max_seq_length - len(input_ids) + input_ids = input_ids + [tokenizer.pad_token_id] * pad_len + labels = labels + [tokenizer.pad_token_id] * pad_len + if True: + labels = [(lb if lb != tokenizer.pad_token_id else -100) + for lb in labels] + + model_inputs['input_ids'].append(input_ids) + model_inputs['labels'].append(labels) + + return model_inputs + preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, batched=True) - train_dataset = train_dataset.map(preprocess_func, batched=True) + train_dataset = train_dataset.map(preprocess_function_train, batched=True) preprocess_func = get_preprocess( args.template_type, tokenizer, @@ -273,7 +370,7 @@ def llm_sft(args: SftArguments) -> None: args.max_length, batched=True, train=False) - val_dataset = val_dataset.map(preprocess_func, batched=True) + val_dataset = val_dataset.map(preprocess_function_eval, batched=True) del dataset # Data analysis stat_dataset(train_dataset) @@ -329,7 +426,7 @@ def llm_sft(args: SftArguments) -> None: if args.gradient_checkpointing: # fix: gradients will be None - model.config.use_cache = False + model.config.use_cache = True model.enable_input_require_grads() if is_dist(): trainer_args._frozen = False # Compatible with transformers==4.32.0 diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index f52dc4bb82..13d4f267da 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -178,10 +178,10 @@ def _preprocess( if labels is not None: labels = labels[-max_length:] - if train: - pass - else: - input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids + # if train: + # pass + # else: + # input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids return {'input_ids': input_ids, 'labels': labels} diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 21ab38ae6f..2892099b20 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -42,6 +42,15 @@ def data_collate_fn(batch: List[Dict[str, Any]], 
tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) + + # if 'position_ids' in batch[0]: + # position_ids = [torch.tensor(b['position_ids']) for b in batch] + # return { + # 'input_ids': input_ids, + # 'attention_mask': attention_mask, + # 'labels': labels, + # 'position_ids': torch.stack(position_ids), + # } return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 0be303b1530edc97dc5d1afefaf1226f1b2f0e02 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 19:37:56 +0800 Subject: [PATCH 16/70] refine code --- examples/pytorch/llm/src/llm_sft.py | 239 ++++++++++--------- examples/pytorch/llm/src/utils/dataset.py | 20 +- examples/pytorch/llm/src/utils/preprocess.py | 18 +- swift/trainers/trainers.py | 233 +----------------- swift/utils/llm_utils.py | 13 +- 5 files changed, 148 insertions(+), 375 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 83eddd817b..2f730f76fa 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -74,22 +74,24 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0.1 - adapter_length: int = 128 + adapter_length: int = 32 gradient_checkpointing: bool = True batch_size: int = 1 + eval_batch_size: int = 1 num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 optim: str = 'adamw_torch' learning_rate: Optional[float] = None weight_decay: float = 0.01 - gradient_accumulation_steps: int = 1 + gradient_accumulation_steps: int = 16 max_grad_norm: float = 1. + predict_with_generate: bool = False lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 - eval_steps: int = 10 + eval_steps: int = 50 save_steps: Optional[int] = None save_total_limit: int = 2 logging_steps: int = 5 @@ -105,7 +107,7 @@ class SftArguments: default=None, metadata={ 'help': - 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' + 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' }) # other @@ -113,7 +115,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen-7b')" }) def __post_init__(self): @@ -232,12 +234,14 @@ def llm_sft(args: SftArguments) -> None: hidden_pos=0, adapter_length=args.adapter_length, ) + logger.info(f'adapter_config: {adapter_config}') swift_config['adapter'] = adapter_config elif sft_type == 'restuner': restuner_config = ResTuningConfig( dims=model.config.hidden_size, **MODEL_MAPPING[args.model_type]['restuner_TM'] ) + logger.info(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) else: @@ -252,125 +256,124 @@ def llm_sft(args: SftArguments) -> None: dataset = get_dataset(args.dataset.split(',')) if isinstance(dataset, tuple): train_dataset, val_dataset = dataset - # train_dataset = train_dataset.select(range(100)) - # val_dataset = val_dataset.select(range(100)) else: train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - - args.max_source_length = 64 - args.max_target_length = 64 - prompt_column = 'query' - response_column = 'response' - history_column = None - prefix = '' - max_target_length = 128 - def preprocess_function_eval(examples): - 
inputs, targets = [], [] - for i in range(len(examples[prompt_column])): - if examples[prompt_column][i] and examples[response_column][i]: - query = examples[prompt_column][i] - if history_column is None or len(examples[history_column][i]) == 0: - prompt = query - else: - prompt = '' - history = examples[history_column][i] - for turn_idx, (old_query, response) in enumerate(history): - prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - turn_idx, old_query, response) - prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - inputs.append(prompt) - targets.append(examples[response_column][i]) - - inputs = [prefix + inp for inp in inputs] - model_inputs = tokenizer( - inputs, - max_length=args.max_source_length, - truncation=True, - padding=True) - labels = tokenizer( - text_target=targets, max_length=max_target_length, truncation=True) - - if True: - labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) - for lb in label] - for label in labels['input_ids']] - model_inputs['labels'] = labels['input_ids'] - - return model_inputs - - - def preprocess_function_train(examples): - max_seq_length = args.max_source_length + args.max_target_length - - model_inputs = { - 'input_ids': [], - 'labels': [], - } - for i in range(len(examples[prompt_column])): - if examples[prompt_column][i] and examples[response_column][i]: - query, answer = examples[prompt_column][i], examples[ - response_column][i] - - if history_column is None: - prompt = query - else: - prompt = '' - history = examples[history_column][i] - for turn_idx, (old_query, response) in enumerate(history): - prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - turn_idx, old_query, response) - prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - - prompt = prefix + prompt - a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) - b_ids = tokenizer.encode(text=answer, add_special_tokens=False) - - if len(a_ids) > args.max_source_length - 1: - a_ids = a_ids[:args.max_source_length - 1] - - if len(b_ids) > args.max_target_length - 2: - b_ids = b_ids[:args.max_target_length - 2] - - input_ids = tokenizer.build_inputs_with_special_tokens( - a_ids, b_ids) - - if False: - context_length = input_ids.index(tokenizer.bos_token_id) - else: - context_length = len(a_ids) + 2 - mask_position = context_length - 1 - labels = [-100] * context_length + input_ids[mask_position + 1:] - - pad_len = max_seq_length - len(input_ids) - input_ids = input_ids + [tokenizer.pad_token_id] * pad_len - labels = labels + [tokenizer.pad_token_id] * pad_len - if True: - labels = [(lb if lb != tokenizer.pad_token_id else -100) - for lb in labels] - - model_inputs['input_ids'].append(input_ids) - model_inputs['labels'].append(labels) - - return model_inputs + args.dataset_test_size, + args.dataset_sample, + args.dataset_seed) + + # args.max_source_length = 64 + # args.max_target_length = 64 + # prompt_column = 'query' + # response_column = 'response' + # history_column = None + # prefix = '' + # max_target_length = 128 + # + # def preprocess_function_eval(examples): + # inputs, targets = [], [] + # for i in range(len(examples[prompt_column])): + # if examples[prompt_column][i] and examples[response_column][i]: + # query = examples[prompt_column][i] + # if history_column is None or len(examples[history_column][i]) == 0: + # prompt = query + # else: + # prompt = '' + # history = examples[history_column][i] + # for turn_idx, (old_query, response) in enumerate(history): + # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + # turn_idx, old_query, response) + 
# prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + # inputs.append(prompt) + # targets.append(examples[response_column][i]) + # + # inputs = [prefix + inp for inp in inputs] + # model_inputs = tokenizer( + # inputs, + # max_length=args.max_source_length, + # truncation=True, + # padding=True) + # labels = tokenizer( + # text_target=targets, max_length=max_target_length, truncation=True) + # + # if True: + # labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) + # for lb in label] + # for label in labels['input_ids']] + # model_inputs['labels'] = labels['input_ids'] + # + # return model_inputs + # + # def preprocess_function_train(examples): + # max_seq_length = args.max_source_length + args.max_target_length + # + # model_inputs = { + # 'input_ids': [], + # 'labels': [], + # } + # for i in range(len(examples[prompt_column])): + # if examples[prompt_column][i] and examples[response_column][i]: + # query, answer = examples[prompt_column][i], examples[ + # response_column][i] + # + # if history_column is None: + # prompt = query + # else: + # prompt = '' + # history = examples[history_column][i] + # for turn_idx, (old_query, response) in enumerate(history): + # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + # turn_idx, old_query, response) + # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + # + # prompt = prefix + prompt + # a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) + # b_ids = tokenizer.encode(text=answer, add_special_tokens=False) + # + # if len(a_ids) > args.max_source_length - 1: + # a_ids = a_ids[:args.max_source_length - 1] + # + # if len(b_ids) > args.max_target_length - 2: + # b_ids = b_ids[:args.max_target_length - 2] + # + # input_ids = tokenizer.build_inputs_with_special_tokens( + # a_ids, b_ids) + # + # if False: + # context_length = input_ids.index(tokenizer.bos_token_id) + # else: + # context_length = len(a_ids) + 2 + # mask_position = context_length - 1 + # labels = [-100] * context_length + input_ids[mask_position + 1:] + # + # pad_len = max_seq_length - len(input_ids) + # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len + # labels = labels + [tokenizer.pad_token_id] * pad_len + # if True: + # labels = [(lb if lb != tokenizer.pad_token_id else -100) + # for lb in labels] + # + # model_inputs['input_ids'].append(input_ids) + # model_inputs['labels'].append(labels) + # + # return model_inputs preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, - batched=True) - train_dataset = train_dataset.map(preprocess_function_train, batched=True) + batched=True, + validate_generation=False) + train_dataset = train_dataset.map(preprocess_func, batched=True) preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, batched=True, - train=False) - val_dataset = val_dataset.map(preprocess_function_eval, batched=True) + validate_generation=True) + val_dataset = val_dataset.map(preprocess_func, batched=True) del dataset # Data analysis stat_dataset(train_dataset) @@ -391,7 +394,7 @@ def preprocess_function_train(examples): do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=1, + per_device_eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -421,7 +424,7 @@ def preprocess_function_train(examples): resume_from_checkpoint=args.resume_from_ckpt, 
ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, - predict_with_generate=True, + predict_with_generate=args.predict_with_generate, local_rank=local_rank) if args.gradient_checkpointing: @@ -437,8 +440,6 @@ def preprocess_function_train(examples): def compute_metrics(prediction): preds, labels = prediction[0], prediction[1] - if isinstance(preds, tuple): - preds = preds[0] score_dict = { 'rouge-1': [], @@ -450,12 +451,12 @@ def compute_metrics(prediction): def _decode(tokens, ignore_pad_token_for_loss=False): if ignore_pad_token_for_loss: tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) + tokenizer.pad_token_id) tokens = np.where(tokens < tokenizer.vocab_size, tokens, - tokenizer.pad_token_id) + tokenizer.pad_token_id) return [ t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) if t != '' + tokens, skip_special_tokens=True) ] for pred, label in zip(preds, labels): @@ -463,7 +464,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): label = ''.join(_decode(label, True)) hypothesis = list(jieba.cut(pred)) if len(hypothesis) == 0 or ''.join(hypothesis) == '.': - hypothesis = [''] + hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] reference = list(jieba.cut(label)) rouge = Rouge() scores = rouge.get_scores(' '.join(hypothesis), @@ -489,7 +490,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics, + compute_metrics=compute_metrics if args.predict_with_generate else None, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index af3ccb6d66..2f0c73b819 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -2,7 +2,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import json import numpy as np @@ -322,15 +322,27 @@ def get_gpt4all_en_dataset() -> HfDataset: } -def get_dataset(dataset_name_list: List[str]) -> HfDataset: - dataset_list: List[HfDataset] = [] +def get_dataset(dataset_name_list: List[str]) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: + """Returns a dataset to be split or a train-val dataset tuple""" + dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) + + assert(all(isinstance(dataset, tuple) for dataset in dataset_list) + or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) if not isinstance(dataset_list[0], tuple): dataset = concatenate_datasets(dataset_list) else: - dataset = dataset_list[0] + train_datasets = [dataset[0] for dataset in dataset_list] + val_datasets = [dataset[1] for dataset in dataset_list] + if len(train_datasets) > 1: + train_dataset = concatenate_datasets(train_datasets) + val_dataset = concatenate_datasets(val_datasets) + else: + train_dataset = train_datasets[0] + val_dataset = val_datasets[0] + dataset = (train_dataset, val_dataset) return dataset diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 13d4f267da..05090b0862 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -12,7 +12,7 @@ 'chat_sep': ['\n\n'], 
'suffix': [['eos_token_id']], }, - 'default_no_template': { + 'default_generate': { 'prefix': [], 'prompt': ['{{query}}'], 'chat_sep': [], @@ -40,7 +40,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'chatglm2_no_template': { + 'chatglm2_generate': { 'prefix': [[64790, 64792]], 'prompt': ['{{query}}'], 'chat_sep': [], @@ -132,7 +132,7 @@ def _preprocess( history: Optional[History] = None, system: Optional[str] = None, max_length: Optional[int] = None, - train = True, + validate_generation=True, # do cross-validation with `model.generate()` ) -> Dict[str, List[int]]: if history is None: history = [] @@ -167,10 +167,12 @@ def _preprocess( if response is not None: tgt_input_ids = _encode(tokenizer, [response], []) tgt_input_ids += _encode(tokenizer, template_config['suffix'], []) - if train: + if not validate_generation: + # train, or validate with `loss` labels = [-100] * len(input_ids) + tgt_input_ids input_ids += tgt_input_ids else: + # validate with `model.generate()` labels = tgt_input_ids if max_length is not None: @@ -178,9 +180,7 @@ def _preprocess( if labels is not None: labels = labels[-max_length:] - # if train: - # pass - # else: + # if validate_generation: # input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids return {'input_ids': input_ids, 'labels': labels} @@ -192,7 +192,7 @@ def get_preprocess( system: Optional[str] = None, max_length: Optional[int] = None, batched: bool = False, - train=True, + validate_generation=False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -201,7 +201,7 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: response: str = example.get('response', None) custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, - custom_system, max_length, train) + custom_system, max_length, validate_generation) if batched: # Avoid tqdm printing too much logs when dataset.map(...) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 440ec1633d..aef7f0b5b3 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -3,14 +3,6 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.utils.data import Dataset -from transformers.deepspeed import is_deepspeed_zero3_enabled -from transformers.trainer_utils import PredictionOutput -from transformers.utils import logging from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -21,230 +13,7 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - **gen_kwargs - ) -> Dict[str, float]: - """ - Run evaluation and returns metrics. - - The calling script will be responsible for providing a method to compute metrics, as they are task-dependent - (pass it to the init `compute_metrics` argument). - - You can also subclass and override this method to inject custom behavior. - - Args: - eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. 
If it is an [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` - method. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - - gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "test", - **gen_kwargs - ) -> PredictionOutput: - """ - Run prediction and returns predictions and potential metrics. - - Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method - will also return metrics, like in `evaluate()`. - - Args: - test_dataset (`Dataset`): - Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the - `model.forward()` method are automatically removed. Has to implement the method `__len__` - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - - - - If your predictions or labels have different sequence lengths (for instance because you're doing dynamic - padding in a token classification task) the predictions will be padded (on the right) to allow for - concatenation into one array. The padding index is -100. - - - - Returns: *NamedTuple* A namedtuple with the following keys: - - - predictions (`np.ndarray`): The predictions on `test_dataset`. - - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained - labels). 
- """ - - gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - - return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). - """ - - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': 512, - 'temperature': 0.95 - } - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - if "attention_mask" in inputs: - gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) - if "position_ids" in inputs: - gen_kwargs["position_ids"] = inputs.get("position_ids", None) - if "global_attention_mask" in inputs: - gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) - - # prepare generation inputs - # some encoder-decoder models can have varying encoder's and thus - # varying model input names - if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: - generation_inputs = inputs[self.model.encoder.main_input_name] - else: - generation_inputs = inputs[self.model.main_input_name] - - gen_kwargs["input_ids"] = generation_inputs - generated_tokens = self.model.generate(**gen_kwargs) - generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] - - # in case the batch is shorter than max length, the output should be padded - if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( - 
gen_kwargs["max_new_tokens"] + 1 - ): - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) - - loss = None - - if self.args.prediction_loss_only: - return (loss, None, None) - - if has_labels: - labels = inputs["labels"] - if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) - else: - labels = None - - return (loss, generated_tokens, labels) - # return (loss, None, None) - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id - else: - raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor + pass # monkey patching diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 2892099b20..6173b9292b 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -32,8 +32,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: input_ids = [torch.tensor(b['input_ids']) for b in batch] labels = [torch.tensor(b['labels']) for b in batch] attention_mask = [ - torch.where(input_ids[i]==tokenizer.pad_token_id, - 0, 1) + torch.ones(len(input_ids[i]), dtype=torch.int64) for i in range(len(input_ids)) ] @@ -42,15 +41,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) - - # if 'position_ids' in batch[0]: - # position_ids = [torch.tensor(b['position_ids']) for b in batch] - # return { - # 'input_ids': input_ids, - # 'attention_mask': attention_mask, - # 'labels': labels, - # 'position_ids': torch.stack(position_ids), - # } + return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 2ec90616cfb5c02527954b2a308dcf4343ed0a67 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 19:47:43 +0800 Subject: [PATCH 17/70] add generation config --- examples/pytorch/llm/src/llm_sft.py | 10 +++++++++- swift/utils/llm_utils.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f730f76fa..dc05bc2c81 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -12,7 +12,7 @@ from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) from rouge import Rouge from rouge.rouge import Rouge -from transformers import BitsAndBytesConfig +from transformers import BitsAndBytesConfig, GenerationConfig from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) @@ -262,6 
+262,13 @@ def llm_sft(args: SftArguments) -> None: args.dataset_sample, args.dataset_seed) + generation_config = { + 'do_sample': True, + 'top_p': 0.7, + 'max_length': args.max_length, + 'temperature': 0.95 + } + # args.max_source_length = 64 # args.max_target_length = 64 # prompt_column = 'query' @@ -425,6 +432,7 @@ def llm_sft(args: SftArguments) -> None: ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, + generation_config=GenerationConfig.from_dict(generation_config), local_rank=local_rank) if args.gradient_checkpointing: diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 6173b9292b..b61ce173d7 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -41,7 +41,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) - + return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 953889039eb7fa70fc60c1c23211e68e8ca33f01 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:20:51 +0800 Subject: [PATCH 18/70] fix --- swift/trainers/trainers.py | 97 +++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index aef7f0b5b3..0ba6d62aff 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -1,8 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer +from transformers.deepspeed import is_deepspeed_zero3_enabled from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -13,7 +18,97 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - pass + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + **gen_kwargs, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + gen_kwargs: + Additional `generate` specific kwargs. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # XXX: adapt synced_gpus for fairscale as well + # Priority (handled in generate): + # gen_kwargs > model.generation_config > default GenerationConfig() + + if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): + gen_kwargs = self._gen_kwargs.copy() + + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + gen_kwargs["synced_gpus"] = ( + gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus + ) + + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ( + "labels" in inputs + and "decoder_input_ids" in inputs + and inputs["labels"].shape == inputs["decoder_input_ids"].shape + ): + inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} + generated_tokens = self.model.generate(**inputs, **gen_kwargs) + + # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop + # TODO: remove this hack when the legacy code that initializes generation_config from a model config is + # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 + if self.model.generation_config._from_model_config: + self.model.generation_config._from_model_config = False + + # Retrieves GenerationConfig from model.generation_config + gen_config = self.model.generation_config + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_config.max_length: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) + elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) + + if has_labels: + labels = inputs["labels"] + if labels.shape[-1] < gen_config.max_length: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) + elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) + else: + labels = None + + return None, generated_tokens, labels # monkey patching From 55ffd9e1532c54141ef03941f3e84467db261c98 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:28:30 +0800 Subject: [PATCH 19/70] fix --- swift/trainers/trainers.py | 88 +++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 34 deletions(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 0ba6d62aff..ee45ff5d15 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -25,7 +25,6 @@ def prediction_step( inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, - **gen_kwargs, 
) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. @@ -42,8 +41,6 @@ def prediction_step( argument `labels`. Check your model's documentation for all accepted arguments. prediction_loss_only (`bool`): Whether or not to return the loss only. - gen_kwargs: - Additional `generate` specific kwargs. Return: Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and @@ -59,12 +56,7 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - - if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): - gen_kwargs = self._gen_kwargs.copy() - + gen_kwargs = self._gen_kwargs.copy() if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( @@ -75,40 +67,68 @@ def prediction_step( gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus ) - # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate - # (otherwise, it would continue generating from the padded `decoder_input_ids`) - if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape - ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "position_ids" in inputs: + gen_kwargs["position_ids"] = inputs.get("position_ids", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] - # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop - # TODO: remove this hack when the legacy code that initializes generation_config from a model config is - # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 - if self.model.generation_config._from_model_config: - self.model.generation_config._from_model_config = False + gen_kwargs["input_ids"] = generation_inputs + generated_tokens = self.model.generate(**gen_kwargs) + generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] - # Retrieves GenerationConfig from model.generation_config - gen_config = self.model.generation_config # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_config.max_length: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) - elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) + if gen_kwargs.get("max_length") is 
not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) if has_labels: labels = inputs["labels"] - if labels.shape[-1] < gen_config.max_length: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) - elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) else: labels = None - return None, generated_tokens, labels + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor # monkey patching From eaf9fc98fb8a70681b9edab0b30d365bb20dc924 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 20:41:49 +0800 Subject: [PATCH 20/70] fix --- swift/trainers/trainers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index ee45ff5d15..e97aa7f179 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -56,7 +56,7 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self._gen_kwargs.copy() + gen_kwargs = self.model.generation_config.to_dict().copy() if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( From 462d3ca64eb0ba3c3beccf71cbb9cb5d3807a5dd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:52:12 +0800 Subject: [PATCH 21/70] add perf --- swift/trainers/trainers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index e97aa7f179..d738b6d817 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,6 +8,7 @@ from transformers import Trainer as HfTrainer from transformers import trainer from transformers.deepspeed import is_deepspeed_zero3_enabled +import time from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, 
ProgressCallbackNew @@ -19,6 +20,15 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): + def __init__(self, *args, **kwargs): + super.__init__(*args, **kwargs) + self.perf = { + 'gen_time': 0., + 'gen_len': 0, + 'eval_memory': 0., + 'train_memory': 0., + } + def prediction_step( self, model: nn.Module, @@ -83,8 +93,14 @@ def prediction_step( generation_inputs = inputs[self.model.main_input_name] gen_kwargs["input_ids"] = generation_inputs + gen_time = time.time() generated_tokens = self.model.generate(**gen_kwargs) + gen_time = time.time() - gen_time generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] + gen_len = len(generated_tokens[0]) + self.perf['gen_time'] = self.perf['gen_time'] + gen_time + self.perf['gen_len'] = self.perf['gen_len'] + gen_len + self.perf['eval_memory'] = torch.cuda.memory_allocated() # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: From f956f93585863f094333f4db70b15b9f04e7c662 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 22:07:46 +0800 Subject: [PATCH 22/70] add perf info --- examples/pytorch/llm/src/llm_sft.py | 1 + swift/trainers/trainers.py | 17 ++++++++++++++--- swift/tuners/base.py | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index dc05bc2c81..7ee5a717d3 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -502,6 +502,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): ) trainer.train(trainer_args.resume_from_checkpoint) + logger.info(trainer.perf) # ### Visualization if is_master(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index d738b6d817..23e9a3d775 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -21,14 +21,25 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): - super.__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.perf = { 'gen_time': 0., 'gen_len': 0, 'eval_memory': 0., - 'train_memory': 0., + 'train_memory': None, + 'model': self.model.get_trainable_parameters(), } + def train( + self, + *args, + **kwargs, + ): + training_output = super().train(*args, **kwargs) + if self.perf['train_memory'] is None: + self.perf['train_memory'] = torch.cuda.memory_allocated() + return training_output + def prediction_step( self, model: nn.Module, @@ -100,7 +111,7 @@ def prediction_step( gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = torch.cuda.memory_allocated() + self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: diff --git a/swift/tuners/base.py b/swift/tuners/base.py index b6f4d1c3db..0eedaba028 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -397,7 +397,8 @@ def get_trainable_parameters(self): if param.requires_grad: trainable_params += num_params return f'trainable params: {trainable_params:,d} || all params: {all_param:,d} ' \ - f'|| 
trainable%: {100 * trainable_params / all_param}' + f'|| trainable%: {100 * trainable_params / all_param}' \ + f'|| cuda memory: {sum([torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())])}' class Swift: From df194a2274016a78083b5b24f801d242958ebd87 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 13:39:33 +0800 Subject: [PATCH 23/70] fix --- examples/pytorch/llm/src/llm_sft.py | 107 ++-------------------- examples/pytorch/llm/src/utils/dataset.py | 15 +++ 2 files changed, 21 insertions(+), 101 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 7ee5a717d3..38f2e177a5 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -229,7 +229,7 @@ def llm_sft(args: SftArguments) -> None: adapter_config = AdapterConfig( dim=model.config.hidden_size, target_modules=MODEL_MAPPING[args.model_type].get( - 'adapter_TM', 'mlp'), + 'adapter_TM', ['mlp']), method_name='forward', hidden_pos=0, adapter_length=args.adapter_length, @@ -251,6 +251,7 @@ def llm_sft(args: SftArguments) -> None: show_layers(model) print_model_info(model) logger.info(str(model)) + logger.info(model.get_trainable_parameters()) # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) @@ -263,108 +264,12 @@ def llm_sft(args: SftArguments) -> None: args.dataset_seed) generation_config = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': args.max_length, - 'temperature': 0.95 + 'do_sample': True, + 'top_p': 0.7, + 'max_length': args.max_length, + 'temperature': 0.95 } - # args.max_source_length = 64 - # args.max_target_length = 64 - # prompt_column = 'query' - # response_column = 'response' - # history_column = None - # prefix = '' - # max_target_length = 128 - # - # def preprocess_function_eval(examples): - # inputs, targets = [], [] - # for i in range(len(examples[prompt_column])): - # if examples[prompt_column][i] and examples[response_column][i]: - # query = examples[prompt_column][i] - # if history_column is None or len(examples[history_column][i]) == 0: - # prompt = query - # else: - # prompt = '' - # history = examples[history_column][i] - # for turn_idx, (old_query, response) in enumerate(history): - # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - # turn_idx, old_query, response) - # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - # inputs.append(prompt) - # targets.append(examples[response_column][i]) - # - # inputs = [prefix + inp for inp in inputs] - # model_inputs = tokenizer( - # inputs, - # max_length=args.max_source_length, - # truncation=True, - # padding=True) - # labels = tokenizer( - # text_target=targets, max_length=max_target_length, truncation=True) - # - # if True: - # labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) - # for lb in label] - # for label in labels['input_ids']] - # model_inputs['labels'] = labels['input_ids'] - # - # return model_inputs - # - # def preprocess_function_train(examples): - # max_seq_length = args.max_source_length + args.max_target_length - # - # model_inputs = { - # 'input_ids': [], - # 'labels': [], - # } - # for i in range(len(examples[prompt_column])): - # if examples[prompt_column][i] and examples[response_column][i]: - # query, answer = examples[prompt_column][i], examples[ - # response_column][i] - # - # if history_column is None: - # prompt = query - # else: - # prompt = '' - # history = examples[history_column][i] - # for turn_idx, (old_query, response) in enumerate(history): - # prompt += '[Round 
{}]\n问:{}\n答:{}\n'.format( - # turn_idx, old_query, response) - # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - # - # prompt = prefix + prompt - # a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) - # b_ids = tokenizer.encode(text=answer, add_special_tokens=False) - # - # if len(a_ids) > args.max_source_length - 1: - # a_ids = a_ids[:args.max_source_length - 1] - # - # if len(b_ids) > args.max_target_length - 2: - # b_ids = b_ids[:args.max_target_length - 2] - # - # input_ids = tokenizer.build_inputs_with_special_tokens( - # a_ids, b_ids) - # - # if False: - # context_length = input_ids.index(tokenizer.bos_token_id) - # else: - # context_length = len(a_ids) + 2 - # mask_position = context_length - 1 - # labels = [-100] * context_length + input_ids[mask_position + 1:] - # - # pad_len = max_seq_length - len(input_ids) - # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len - # labels = labels + [tokenizer.pad_token_id] * pad_len - # if True: - # labels = [(lb if lb != tokenizer.pad_token_id else -100) - # for lb in labels] - # - # model_inputs['input_ids'].append(input_ids) - # model_inputs['labels'].append(labels) - # - # return model_inputs - preprocess_func = get_preprocess( args.template_type, tokenizer, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 2f0c73b819..3b9cf1c241 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -134,6 +134,20 @@ def get_instinwild_en_dataset() -> HfDataset: return _process_alpaca_dataset(dataset) +def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: + dataset_train: HfDataset = MsDataset.load( + 'modelscope/DuReader_robust-QG', split='train').to_hf_dataset().rename_columns({ + "text1": "query", + "text2": "response", + }) + dataset_val: HfDataset = MsDataset.load( + 'modelscope/DuReader_robust-QG', split='validation').to_hf_dataset().rename_columns({ + "text1": "query", + "text2": "response", + }) + return dataset_train, dataset_val + + def get_cot_en_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'YorickHe/CoT', split='train').to_hf_dataset() @@ -319,6 +333,7 @@ def get_gpt4all_en_dataset() -> HfDataset: # multi-modal 'coco-en': get_coco_en_dataset, 'advertise_gen': get_advertise_gen_dataset, + 'du_reader': get_du_reader_dataset, } From e7cf7f724f47e6e26ba5c9d4ae1cc1d0d225ed55 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 8 Sep 2023 14:49:04 +0800 Subject: [PATCH 24/70] fix --- examples/pytorch/llm/src/llm_sft.py | 31 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 38f2e177a5..2593843215 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -324,8 +324,8 @@ def llm_sft(args: SftArguments) -> None: eval_steps=args.eval_steps, dataloader_num_workers=args.dataloader_num_workers, load_best_model_at_end=True, - metric_for_best_model='loss', - greater_is_better=False, + metric_for_best_model='rouge-l', + greater_is_better=True, sortish_sampler=True, optim=args.optim, hub_model_id=args.hub_model_id, @@ -379,18 +379,21 @@ def _decode(tokens, ignore_pad_token_for_loss=False): if len(hypothesis) == 0 or ''.join(hypothesis) == '.': hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] reference = list(jieba.cut(label)) - rouge = Rouge() - scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) - result = 
scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v['f'] * 100, 4)) - bleu_score = sentence_bleu( - [list(label)], - list(pred), - smoothing_function=SmoothingFunction().method3) - score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + try: + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + except: + logger.error(f'eval error {hypothesis}, {reference}') for k, v in score_dict.items(): score_dict[k] = float(np.mean(v)) From 20b077297b6134f52e039199a7f434ae3b115504 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 8 Sep 2023 17:04:27 +0800 Subject: [PATCH 25/70] revert code --- examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh index ff0c147200..3baa73288a 100644 --- a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh @@ -10,7 +10,7 @@ torchrun \ --sft_type lora \ --output_dir runs \ --ddp_backend gloo \ - --dataset advertise_gen \ + --dataset alpaca-en,alpaca-zh \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 1024 \ From cebdd11549ca8035405d93203a2798e1e1d95c3b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 20:21:33 +0800 Subject: [PATCH 26/70] support activate/deactivate adapter --- swift/tuners/adapter.py | 32 +++++++++---- swift/tuners/base.py | 25 ++++++++-- swift/tuners/lora.py | 85 ++++++++++++++++++++++----------- swift/tuners/prompt.py | 30 ++++++++---- swift/tuners/restuning.py | 40 +++++++++++----- swift/tuners/side.py | 27 ++++++++--- swift/tuners/utils.py | 4 +- swift/utils/torch_utils.py | 12 +++++ tests/utils/test_torch_utils.py | 14 ++++++ 9 files changed, 199 insertions(+), 70 deletions(-) create mode 100644 tests/utils/test_torch_utils.py diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 19233e60eb..b60ffb4ad0 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -3,12 +3,12 @@ import re import types from dataclasses import dataclass, field -from typing import Union +from typing import Union, List import torch from torch import nn from transformers.activations import ACT2CLS - +from swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -71,7 +71,7 @@ def __post_init__(self): class Adapter: @staticmethod - def prepare_model(model: nn.Module, config: AdapterConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: AdapterConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `AdapterConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -84,19 +84,21 @@ def _forward(self, *args, **kwargs): if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): return args[0:config.hidden_pos] + args[ - config.hidden_pos] + getattr(self, 'adapter')(args[config.hidden_pos]) \ + config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) \ + args[config.hidden_pos + 1:] # noqa else: kwargs[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, 'adapter')( + config.hidden_pos] + getattr(self, 
f'adapter_{adapter_name}')( args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = getattr(self, 'adapter')(args) + args = getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): return _forward(self, attention_output) + # TODO The `config.method_name` method should not be replaced twice. + module.forward_origin = getattr(module, config.method_name) num_args_in_forward_chunk_fn = len( inspect.signature(module.forward_origin).parameters) @@ -109,12 +111,12 @@ def _feed_forward_chunk(self, attention_output): adapter_module = AdapterModule(config.dim, config.adapter_length, ACT2CLS[config.act_layer]) - setattr(module, 'adapter', adapter_module) + setattr(module, f'adapter_{adapter_name}', adapter_module) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name: str): return { key: value - for key, value in state_dict.items() if 'adapter' in key + for key, value in state_dict.items() if f'adapter_{adapter_name}' in key } def mark_trainable_callback(model): @@ -123,6 +125,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class AdapterModule(nn.Module): """The implementation of adapter tuning method. @@ -150,6 +158,7 @@ def __init__( self.activate = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() + self._activate = True def init_weights(self): @@ -160,7 +169,12 @@ def _init_weights(m): self.apply(_init_weights) + def activate(self, activate=True): + self._activate = activate + def forward(self, x, identity=None): + if not self.activate: + return 0. 
out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x diff --git a/swift/tuners/base.py b/swift/tuners/base.py index b6f4d1c3db..a513e67588 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -52,12 +52,12 @@ def __init__(self, self.adapters = {} if isinstance(config, SwiftConfig): - self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config) + self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) for adapter_name, config in config.items(): self.adapters[adapter_name] = self._prepare_model( - model, config) + model, config, adapter_name) self.model = model self.extra_state_keys = extra_state_keys or [] @@ -151,7 +151,7 @@ def state_dict(self, if kwargs.get('save_adapter', True): for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: - state_dicts.update(output.state_dict_callback(destination)) + state_dicts.update(output.state_dict_callback(destination, adapter_name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v @@ -260,10 +260,11 @@ def _prepare_model( cls, model: nn.Module, config: SwiftConfig, + adapter_name: str, ): assert (hasattr(config, SWIFT_TYPE_KEY)) from .mapping import SWIFT_MAPPING - return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config) + return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config, adapter_name) def create_or_update_model_card(self, output_dir: str): """ @@ -381,6 +382,22 @@ def save_pretrained(self, def base_model(self): return self.model + def activate_adapter(self, adapter_name): + if adapter_name not in self.adapters: + return + + from .mapping import SWIFT_MAPPING + SWIFT_MAPPING[self.adapters[adapter_name].config.swift_type][1]\ + .activate_adapter(self.base_model, adapter_name, True) + + def deactivate_adapter(self, adapter_name): + if adapter_name not in self.adapters: + return + + from .mapping import SWIFT_MAPPING + SWIFT_MAPPING[self.adapters[adapter_name].config.swift_type][1]\ + .activate_adapter(self.base_model, adapter_name, False) + def get_trainable_parameters(self): """ Get the content of trainable parameters in the model. 
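A minimal usage sketch of the per-adapter switches introduced in this commit, assuming a ModelScope structbert checkpoint and two illustrative adapter names ('my_lora', 'my_adapter'); only Swift.prepare_model with a config dict and the activate_adapter/deactivate_adapter methods added above are relied on.

from modelscope import Model
from swift import AdapterConfig, LoRAConfig, Swift

# illustrative checkpoint; any nn.Module accepted by Swift.prepare_model works
model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
config = {
    'my_lora': LoRAConfig(target_modules=['query', 'key', 'value']),
    'my_adapter': AdapterConfig(
        dim=model.config.hidden_size,
        target_modules=r'.*layer\.\d+$',
        method_name='feed_forward_chunk',
        hidden_pos=0),
}
model = Swift.prepare_model(model, config)

model.deactivate_adapter('my_lora')  # forward now only applies 'my_adapter'
model.activate_adapter('my_lora')    # both adapters are applied again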
diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 5cbb797970..4048d775a9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -15,6 +15,7 @@ from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module if is_bnb_available(): import bitsandbytes as bnb @@ -90,12 +91,13 @@ def __post_init__(self): class LoRA: @staticmethod - def prepare_model(model: nn.Module, config: LoRAConfig): + def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): """Prepare a model with `LoRAConfig`""" LoRA._dynamic_patch_lora( model, replace_modules=config.target_modules, r=config.r, + adapter_name=adapter_name, lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, merge_weights=config.merge_weights, @@ -103,8 +105,8 @@ def prepare_model(model: nn.Module, config: LoRAConfig): enable_lora=config.enable_lora, fan_in_fan_out=config.fan_in_fan_out) - def state_dict_callback(state_dict): - return lora_state_dict(state_dict, config.bias) + def state_dict_callback(state_dict, adapter_name): + return lora_state_dict(state_dict, model.lora_module_map, adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, config.bias) @@ -113,7 +115,16 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + if isinstance(module, LoRALayer): + module.activate(activate) + else: + module.active_adapter = 'default' if activate else 'invalid' + + @staticmethod + def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, **kwargs): """Dynamic patch lora to model @@ -126,7 +137,9 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - modules = [] + if not hasattr(model, 'lora_module_map'): + model.lora_module_map = {} + modules = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) AutoGPTQQuantLinear = get_auto_gptq_quant_linear( @@ -209,9 +222,15 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module = Embedding( num_embeddings=sub_module.num_embeddings, embedding_dim=sub_module.embedding_dim, + padding_idx=sub_module.padding_idx, + max_norm=sub_module.max_norm, + norm_type=sub_module.norm_type, + scale_grad_by_freq=sub_module.scale_grad_by_freq, + sparse=sub_module.sparse, r=kwargs['r'], lora_alpha=kwargs['lora_alpha'], - merge_weights=kwargs['merge_weights']) + merge_weights=kwargs['merge_weights'], + ) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -231,10 +250,11 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) + lora_module.adapter_name = adapter_name setattr(module, _key, lora_module) - modules.append(lora_module) + modules[module_key] = adapter_name - return modules + model.lora_module_map.update(modules) @staticmethod def unpatch_lora(model, config: LoRAConfig): @@ -247,11 +267,9 @@ def unpatch_lora(model, config: LoRAConfig): Args: model: The model called with `tune` function. config: The `LoRAConfig` to use. 
- - Returns: - The lora modules. """ - modules = [] + if not hasattr(model, 'lora_module_map'): + model.lora_module_map = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(config.replace_modules, (str, list)) replace_modules = config.replace_modules @@ -274,7 +292,18 @@ def unpatch_lora(model, config: LoRAConfig): origin_module = torch.nn.Linear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None) + bias=hasattr(sub_module, 'bias') and sub_module.bias is not None, + ) + elif isinstance(sub_module, Embedding): + origin_module = torch.nn.Embedding( + num_embeddings=sub_module.num_embeddings, + embedding_dim=sub_module.embedding_dim, + padding_idx=sub_module.padding_idx, + max_norm=sub_module.max_norm, + norm_type=sub_module.norm_type, + scale_grad_by_freq=sub_module.scale_grad_by_freq, + sparse=sub_module.sparse, + ) elif isinstance(sub_module, Conv2d): origin_module = torch.nn.Conv2d( sub_module.in_channels, @@ -289,19 +318,12 @@ def unpatch_lora(model, config: LoRAConfig): sub_module.merge_weights = True sub_module.eval() origin_module.weight = sub_module.weight - if sub_module.bias is not None: + if getattr(sub_module, 'bias', None) is not None: origin_module.bias = sub_module.bias origin_module.to(sub_module.weight.device).to( sub_module.weight.dtype) setattr(module, _key, origin_module) - modules.append(sub_module) - - model.state_dict_hook_handle.remove() - if hasattr(model, 'load_state_dict_hook_handle'): - model.load_state_dict_hook_handle.remove() - else: - model.load_state_dict = model.load_state_dict_origin - return modules + model.lora_module_map.pop(module_key, None) class LoRALayer: @@ -314,6 +336,7 @@ def __init__( merge_weights: bool, ): self.r = r + self.old_r = r self.lora_alpha = lora_alpha # Optional dropout if lora_dropout > 0.: @@ -324,6 +347,12 @@ def __init__( self.merged = False self.merge_weights = merge_weights + def activate(self, activate=True): + if activate: + self.r = self.old_r + else: + self.r = 0 + class Embedding(nn.Embedding, LoRALayer): # LoRA implemented in a dense layer @@ -694,7 +723,7 @@ def forward(self, x: torch.Tensor): return nn.Conv2d.forward(self, x) -def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: +def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'none') -> None: if bias == 'none': return elif bias == 'all': @@ -703,7 +732,7 @@ def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: p.requires_grad = True elif bias == 'lora_only': for m in model.modules(): - if isinstance(m, LoRALayer) and \ + if adapter_name == getattr(m, 'adapter_name', None) and \ hasattr(m, 'bias') and \ m.bias is not None: m.bias.requires_grad = True @@ -711,18 +740,18 @@ def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: raise NotImplementedError -def lora_state_dict(state_dict, bias: str = 'none') -> Dict[str, torch.Tensor]: +def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k} + return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k, None) == adapter_name} elif bias == 'all': return { k: state_dict[k] - for k in state_dict if 'lora_' in k or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get(k, None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k: + if 'lora_' in 
k and module_map.get(k, None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f426a4dd83..2600d841e4 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -3,12 +3,13 @@ import re import types from dataclasses import dataclass, field -from typing import Union +from typing import Union, List import torch from torch import nn from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module @dataclass @@ -77,7 +78,7 @@ def __post_init__(self): class Prompt: @staticmethod - def prepare_model(model: nn.Module, config: PromptConfig): + def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: @@ -91,7 +92,7 @@ def _forward(self, *args, **kwargs): input_embedding = kwargs[config.embedding_pos] input_embedding = getattr( - self, 'prompt').forward(input_embedding) + self, f'prompt_{adapter_name}').forward(input_embedding) if isinstance(config.embedding_pos, int): args = type(args)( args[0:config.embedding_pos] + (input_embedding, ) @@ -109,7 +110,7 @@ def _forward(self, *args, **kwargs): if attention_mask is not None: attention_mask = getattr( self, - 'prompt').patch_attention_mask(attention_mask) + f'prompt_{adapter_name}').patch_attention_mask(attention_mask) if isinstance(config.attention_mask_pos, int): args = type(args)( args[0:config.attention_mask_pos] @@ -121,7 +122,7 @@ def _forward(self, *args, **kwargs): forward_output = self.forward_origin(*args, **kwargs) if config.extract_embedding: forward_output = getattr( - self, 'prompt').extract(forward_output) + self, f'prompt_{adapter_name}').extract(forward_output) return forward_output @@ -136,13 +137,13 @@ def _forward(self, *args, **kwargs): config.prompt_length, config.attention_mask_value, config.attach_front) - setattr(module, 'prompt', prompt_module) + setattr(module, f'prompt_{adapter_name}', prompt_module) match_module_keys.append(module_key) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'prompt' in key + for key, value in state_dict.items() if f'prompt_{adapter_name}' in key } def mark_trainable_callback(model): @@ -151,6 +152,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class PromptModule(nn.Module): """The implementation of vision prompt tuning method. 
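A toy illustration of the per-adapter state-dict filtering pattern used by the callbacks in this commit; the keys below are made up, and only the f'prompt_{adapter_name}' naming convention from the code above is assumed.

def state_dict_callback(state_dict, adapter_name):
    # keep only parameters namespaced with this adapter's prompt modules
    return {
        key: value
        for key, value in state_dict.items()
        if f'prompt_{adapter_name}' in key
    }

full_state = {
    'encoder.layer.0.prompt_default.prompt_token': 'p0',
    'encoder.layer.0.prompt_other.prompt_token': 'p1',
    'encoder.layer.0.attention.self.query.weight': 'w0',
}
# only the 'default' adapter's parameters survive the filter
assert set(state_dict_callback(full_state, 'default')) == {
    'encoder.layer.0.prompt_default.prompt_token'}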
@@ -178,11 +185,13 @@ def __init__(self, self.prompt_length = prompt_length self.mask_values = mask_values self.attach_front = attach_front - + self._activate = True self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): + if not self._activate: + return x prompt_token = self.prompt_token.expand(x.shape[0], -1, -1) if self.layer_num == 0: @@ -199,6 +208,9 @@ def forward(self, x): dim=1) return x + def activate(self, activate=True): + self._activate = activate + def patch_attention_mask(self, m): prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index b72e000bcb..1f5d7f3a2f 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -5,11 +5,13 @@ from dataclasses import dataclass, field from typing import Union, Dict, Optional, List +import torch import torch.nn as nn from swift.utils.logger import get_logger from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module logger = get_logger() @@ -112,7 +114,7 @@ def __post_init__(self): class ResTuning: @staticmethod - def prepare_model(model: nn.Module, config: ResTuningConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: ResTuningConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `ResTuningConfig`""" def _forward_seq(self, input, *args, **kwargs): @@ -123,17 +125,21 @@ def _forward_seq(self, input, *args, **kwargs): def _forward_target(self, *args, **kwargs): if self.target_modules_hook == "input": - args_main = _forward_restuning(self) - args_main = self.forward_origin(args_main, **kwargs) + args = list(args) + _arg = args[0 if self.target_hidden_pos is None else self.target_hidden_pos] + args_main = _forward_restuning(self, _arg) + args[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + args_main = self.forward_origin(*args, **kwargs) else: _args_main = self.forward_origin(*args, **kwargs) - args_main = _forward_restuning(self) + _arg = _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] if isinstance(_args_main, (tuple, list)) else _args_main + args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[self.target_hidden_pos] = args_main + _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main args_main = _args_main return args_main - def _forward_restuning(self): + def _forward_restuning(self, origin_arg): probe_results = [] root_module_ins = self.root_module_ins_list[0] stem_module_ins_list = self.stem_module_ins_list @@ -150,7 +156,7 @@ def _forward_restuning(self): probe_results.append(st_mod.probe_input_data) else: probe_results.append(st_mod.probe_output_data) - args_main = getattr(top_module, 'restuning')(probe_results) + args_main = getattr(top_module, f'restuning_{adapter_name}')(probe_results, origin_arg) return args_main # 1. Matching the root module @@ -208,7 +214,7 @@ def _forward_restuning(self): restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, config.upsample_out_channels, config.zero_init_last, config.tuner_cfg) - setattr(top_module, 'restuning', restuning_module) + setattr(top_module, f'restuning_{adapter_name}', restuning_module) # 4. 
Matching the target module target_module_ins = None @@ -235,10 +241,10 @@ def _forward_restuning(self): if target_module_ins is None: raise Exception(f"Cannot match target modules") - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'restuning' in key + for key, value in state_dict.items() if f'restuning_{adapter_name}' in key } def mark_trainable_callback(model): @@ -247,6 +253,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class ResTuningBypassModule(nn.Module): """The implementation of ResTuningBypass method. @@ -263,6 +275,7 @@ def __init__( ): super(ResTuningBypassModule, self).__init__() + self._activate = True self.bypass_blocks = nn.Sequential(*[ ResTunerBypassBlock( dim=dims[i] if isinstance(dims, list) else dims, @@ -276,7 +289,12 @@ def __init__( ) for i in range(depth)]) - def forward(self, x_list, **kwargs): + def activate(self, activate=True): + self._activate = activate + + def forward(self, x_list, origin_arg, **kwargs): + if not self._activate: + return origin_arg x_bypass = detach_tensors(x_list.pop(0)) x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass x_list = detach_tensors(x_list) diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 9e4f043dd7..0101e74ab4 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -5,7 +5,7 @@ import copy from dataclasses import dataclass, field from functools import partial -from typing import Union, Callable, Any +from typing import Union, Callable, Any, List from collections import OrderedDict from itertools import repeat @@ -15,6 +15,7 @@ from swift.utils.logger import get_logger from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module logger = get_logger() @@ -61,7 +62,7 @@ def __post_init__(self): class Side: @staticmethod - def prepare_model(model: nn.Module, config: SideConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: SideConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `SideConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -77,9 +78,9 @@ def _forward(self, *args, **kwargs): args_main = self.forward_origin(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr(self, 'side')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) else: - args_main = getattr(self, 'side')(*args, args_main) + args_main = getattr(self, f'side_{adapter_name}')(*args, args_main) return args_main if isinstance(tgt_module, nn.Sequential): @@ -96,12 +97,12 @@ def forward_seq(self, input, *args, **kwargs): tgt_module.forward_origin = tgt_module.forward tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) - setattr(tgt_module, 'side', side_module) + setattr(tgt_module, f'side_{adapter_name}', side_module) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'side' in key + for key, value 
in state_dict.items() if f'side_{adapter_name}' in key } def mark_trainable_callback(model): @@ -110,6 +111,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class SideModule(nn.Module): """The implementation of vision side-tuning method. @@ -147,8 +154,14 @@ def __init__( else: raise ValueError(f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) + self._activate = True + + def activate(self, activate=True): + self._activate = activate def forward(self, x, x_main): + if not self._activate: + return x_main alpha_squashed = torch.sigmoid(self.alpha) x_side = self.side_net(x) x_out = alpha_squashed * x_main + (1 - alpha_squashed) * x_side diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 42faa94e84..0e0c4bed4f 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -109,10 +109,10 @@ class SwiftOutput: which is used to get the tuner's state dict among the model's state dict. This callback should receive a state dict, and returns a created state dict. Examples: - >>> def state_dict_callback(state_dict): + >>> def state_dict_callback(state_dict, adapter_name): >>> return { >>> key: value - >>> for key, value in state_dict.items() if 'adapter' in key + >>> for key, value in state_dict.items() if adapter_name in key >>> } mark_trainable_callback (`FunctionType`): A callback returned by the tuner which is used to mark the tuner's adapter's parameters to trainable. diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 03db127012..f2f1903273 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -98,6 +98,18 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: logger.info(''.join(s)) +def find_sub_module(module: torch.nn.Module, module_name: str) -> List[torch.nn.Module]: + _modules = list() + for name, sub_module in module.named_modules(): + if not name: + continue + if module_name == name or getattr(sub_module, 'adapter_name', None) == module_name: + _modules.append(sub_module) + else: + _modules.extend(find_sub_module(sub_module, module_name)) + return _modules + + def get_seed(random_state: RandomState) -> int: seed_max = np.iinfo(np.int32).max seed = random_state.randint(0, seed_max) diff --git a/tests/utils/test_torch_utils.py b/tests/utils/test_torch_utils.py new file mode 100644 index 0000000000..3517d7f475 --- /dev/null +++ b/tests/utils/test_torch_utils.py @@ -0,0 +1,14 @@ +import unittest +from modelscope import Model +from swift.utils.torch_utils import find_sub_module + + +class TestTorchUtils(unittest.TestCase): + + def test_find_sub_module(self): + model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + self.assertTrue(find_sub_module(model, 'query') is not None) + + +if __name__ == '__main__': + unittest.main() From 851124706b5c3083c1280e89ddcba9bc9d280050 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 20:49:10 +0800 Subject: [PATCH 27/70] fix indent --- swift/tuners/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 245bb3eeb2..37c4374760 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -167,7 +167,7 @@ def 
__init__( self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False - self._activate = True + self._activate = True def init_weights(self): From d27fe8d63dbb1e304ea9a0b3e94d42558be03787 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sat, 9 Sep 2023 14:17:43 +0800 Subject: [PATCH 28/70] fix --- .../pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh | 2 +- examples/pytorch/llm/src/llm_infer.py | 2 +- swift/tuners/adapter.py | 6 +++--- swift/tuners/base.py | 8 ++++---- swift/tuners/lora.py | 6 +++--- swift/tuners/restuning_components.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh index b47ece0d8c..46ad5c849f 100644 --- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh @@ -1,6 +1,6 @@ CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_infer.py \ - --model_type llama2-7b-chat \ + --model_type llama2-70b-chat \ --sft_type lora \ --ckpt_dir "runs/llama2-70b-chat/vx_xxx/checkpoint-xxx" \ --eval_human true \ diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index da88454821..30ccd22c68 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -110,7 +110,7 @@ def llm_infer(args: InferArguments) -> None: # ### Preparing lora if args.sft_type == 'lora': - model = Swift.from_pretrained(model, args.ckpt_dir) + model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True) show_layers(model) print_model_info(model) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 37c4374760..03b16d13e6 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -163,7 +163,7 @@ def __init__( self.adapter_length = adapter_length # self.adapter_type = adapter_type self.ln1 = nn.Linear(dim, adapter_length) - self.activate = act_layer() + self.act = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False @@ -186,13 +186,13 @@ def forward(self, x, identity=None): return 0. if not self._prepared: self.ln1.to(x.device) - self.activate.to(x.device) + self.act.to(x.device) self.ln2.to(x.device) self._prepared = True x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) - out = self.ln2(self.activate(self.ln1(x))) + out = self.ln2(self.act(self.ln1(x))) if identity is None: identity = x identity = identity.to(out.dtype) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8243625dee..11a96ef9fa 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -318,14 +318,14 @@ def create_or_update_model_card(self, output_dir: str): def save_pretrained(self, save_directory: str, safe_serialization: bool = False, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, **kwargs): """Save the adapters to a local directory. Args: save_directory (`str`): The directory to use. safe_serialization (`bool`): Use safe tensors to save the weights, default False. - adapter_name(`Union[str, List[str]]`): The adapters to be saved, default is `default`. + adapter_name(`Union[str, List[str]]`): The adapters to be saved, default is `None` to save all. 
""" if os.path.isfile(save_directory): raise ValueError( @@ -335,9 +335,9 @@ def save_pretrained(self, self.create_or_update_model_card(save_directory) adapter_names = adapter_name if isinstance(adapter_name, - list) else [adapter_name] + list) or adapter_name is None else [adapter_name] for adapter_name, output in self.adapters.items(): - if adapter_name not in adapter_names: + if adapter_names is not None and adapter_name not in adapter_names: continue # save only the trainable weights diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 4048d775a9..5e649a9e93 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -742,16 +742,16 @@ def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'non def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k, None) == adapter_name} + return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name} elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get(k, None) == adapter_name) or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k, None) == adapter_name: + if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index a3ab2dfe28..db50a945a2 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -60,7 +60,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self.layer_num = layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 17 + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 32 self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None From fc2a1103d2b77f7d43975f2e42fd27c2d7d2ae42 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 16:15:12 +0800 Subject: [PATCH 29/70] fix bugs --- swift/tuners/adapter.py | 18 +++++++------ swift/tuners/lora.py | 6 ++--- swift/tuners/prompt.py | 4 +-- swift/tuners/restuning.py | 4 +-- swift/tuners/side.py | 9 ++++--- tests/tuners/test_swift_base.py | 45 +++++++++++++++++++++++++++++++-- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 03b16d13e6..1d070af1f4 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -91,15 +91,17 @@ def _forward(self, *args, **kwargs): args = self.forward_origin(*args, **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): - return args[0:config.hidden_pos] + args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) \ - + args[config.hidden_pos + 1:] # noqa + _type = type(args) + args = list(args) + args[config.hidden_pos] = args[ + config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) + return 
_type(args) else: - kwargs[config.hidden_pos] = args[ + args[config.hidden_pos] = args[ config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')( args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = getattr(self, f'adapter_{adapter_name}')(args) + args = args + getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): @@ -135,9 +137,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'adapter_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class AdapterModule(nn.Module): @@ -182,7 +184,7 @@ def activate(self, activate=True): self._activate = activate def forward(self, x, identity=None): - if not self.activate: + if not self._activate: return 0. if not self._prepared: self.ln1.to(x.device) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 5e649a9e93..8cffafa4b6 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -118,10 +118,10 @@ def mark_trainable_callback(model): def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: - if isinstance(module, LoRALayer): - module.activate(activate) + if isinstance(_module, LoRALayer): + _module.activate(activate) else: - module.active_adapter = 'default' if activate else 'invalid' + _module.active_adapter = 'default' if activate else 'invalid' @staticmethod def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index abf22455e1..a18e176fe7 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -159,9 +159,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'prompt_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class PromptModule(nn.Module): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 1f5d7f3a2f..cc385d700c 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -255,9 +255,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'restuning_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class ResTuningBypassModule(nn.Module): diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 0101e74ab4..2e509e9e11 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -80,7 +80,10 @@ def _forward(self, *args, **kwargs): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) else: - args_main = getattr(self, f'side_{adapter_name}')(*args, args_main) + _type = type(args_main) + args_main = list(args_main) + args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + 
args_main = _type(args_main) return args_main if isinstance(tgt_module, nn.Sequential): @@ -113,9 +116,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'side_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class SideModule(nn.Module): diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 7676a2a283..c515fab9f8 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -10,7 +10,8 @@ from modelscope.models.nlp.structbert import (SbertConfig, SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME - +from torch import nn +import math from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig @@ -27,15 +28,55 @@ def tearDown(self): super().tearDown() def test_swift_lora_forward(self): + + from swift.tuners.lora import Linear + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') preprocessor = Preprocessor.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) + outputs = model(**inputs) model = Swift.prepare_model(model, config=lora_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + + def test_swift_adapter_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') inputs = preprocessor('how are you') + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) outputs = model(**inputs) - self.assertTrue(hasattr(outputs, 'logits')) + model = Swift.prepare_model(model, config=adapter_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) From 8528a71aae11050a6fdb2954039363480242ddfd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 22:07:26 +0800 Subject: [PATCH 30/70] fix --- swift/tuners/prompt.py | 9 ++++- swift/tuners/restuning.py | 10 ++--- swift/tuners/restuning_components.py | 6 +-- 
tests/tuners/test_swift_base.py | 60 ++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a18e176fe7..ec21650c3a 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -80,6 +80,7 @@ class Prompt: @staticmethod def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): module_keys = [key for key, _ in model.named_modules()] + match_module_keys = [] for module_key in module_keys: if isinstance(config.target_modules, str): target_module_found = re.fullmatch(config.target_modules, @@ -144,6 +145,7 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, f'prompt_{adapter_name}', prompt_module) + match_module_keys.append(module_key) def state_dict_callback(state_dict, adapter_name): return { @@ -217,9 +219,14 @@ def activate(self, activate=True): self._activate = activate def patch_attention_mask(self, m): + if not self._activate: + return m prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) - return torch.cat((prefix_attention_mask, m), dim=-1) + if self.attach_front: + return torch.cat((prefix_attention_mask, m), dim=-1) + else: + return torch.cat((m, prefix_attention_mask), dim=-1) def extract(self, x): if self.attach_front: diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index cc385d700c..bc16ce40b9 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -171,9 +171,9 @@ def _forward_restuning(self, origin_arg): logger.warning( f"Type of {type(root_module)} may not be supported because of its customized forward") if config.root_modules_hook == "input": - root_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + root_module.register_forward_pre_hook(probe_input_pre_hook) else: - root_module.register_forward_hook(probe_output_hook, with_kwargs=True) + root_module.register_forward_hook(probe_output_hook) root_module.root_modules_hook = config.root_modules_hook root_module_ins_list.append(root_module) break @@ -194,11 +194,11 @@ def _forward_restuning(self, origin_arg): logger.warning( f"Type of {type(stem_module)} may not be supported because of its customized forward") if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: - stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + stem_module.register_forward_pre_hook(probe_input_pre_hook) if config.stem_modules_hook == "input": - stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + stem_module.register_forward_pre_hook(probe_input_pre_hook) else: - stem_module.register_forward_hook(probe_output_hook, with_kwargs=True) + stem_module.register_forward_hook(probe_output_hook) stem_module.stem_modules_hook = config.stem_modules_hook stem_module_ins_list.append(stem_module) if isinstance(config.stem_modules, list): diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index db50a945a2..9c99543c37 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -306,13 +306,13 @@ def probe_tensors(module, feats, name): setattr(module, name, feats) -def probe_input_pre_hook(self, args, kwargs): +def probe_input_pre_hook(self, args): input = args[0] probe_tensors(self, input, 'probe_input_data') - return args, kwargs + return args -def probe_output_hook(self, args, kwargs, result): +def probe_output_hook(self, args, result): output = result 
probe_tensors(self, output, 'probe_output_data') return output diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index c515fab9f8..1abfd0994f 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -12,7 +12,7 @@ from peft.utils import WEIGHTS_NAME from torch import nn import math -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig +from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig, PromptConfig, ResTuningConfig class TestSwift(unittest.TestCase): @@ -78,6 +78,53 @@ def test_swift_adapter_forward(self): self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + def test_swift_prompt_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + prompt_config = PromptConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + embedding_pos=0, + attention_mask_pos=1) + outputs = model(**inputs) + model = Swift.prepare_model(model, config=prompt_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + + def test_swift_restuner_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + root_modules=r'.*layer.0$', + stem_modules=r'.*layer\.\d+$', + target_modules=r'.*pooler', + target_modules_hook='input', + tuner_cfg="res_adapter", + ) + outputs = model(**inputs) + model = Swift.prepare_model(model, config=restuner_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) model2 = copy.deepcopy(model) @@ -154,7 +201,7 @@ def test_swift_side(self): print( f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' ) - + result = model(torch.ones((1, 3, 224, 224))).logits side_config = SideConfig( dim=768, target_modules=r'vit', @@ -162,7 +209,14 @@ def test_swift_side(self): hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) - result = model(torch.ones((1, 3, 224, 224))).logits + result_activate = model(torch.ones((1, 3, 224, 224))).logits + model.deactivate_adapter('default') + result_deactivate = model(torch.ones((1, 3, 224, 224))).logits + model.activate_adapter('default') + result_reactivate = 
model(torch.ones((1, 3, 224, 224))).logits + self.assertTrue(torch.allclose(result, result_deactivate)) + self.assertTrue(not torch.allclose(result, result_activate)) + self.assertTrue(torch.allclose(result_activate, result_reactivate)) print( f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' ) From 7b56a778826829f588b83a66744d782e3e12a3e9 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 22:13:54 +0800 Subject: [PATCH 31/70] fix inference --- swift/trainers/trainers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 144ca95807..298593c11d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -104,6 +104,8 @@ def prediction_step( generation_inputs = inputs[self.model.main_input_name] gen_kwargs["input_ids"] = generation_inputs + gen_kwargs["pad_token_id"] = self.tokenizer.pad_token_id + gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id gen_time = time.time() generated_tokens = self.model.generate(**gen_kwargs) gen_time = time.time() - gen_time From 95ffddf278f4c63314de0c3f568ea43571468109 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 08:08:17 +0800 Subject: [PATCH 32/70] update code --- tests/tuners/test_swift_base.py | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 1abfd0994f..5992cddcbe 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -192,39 +192,48 @@ def test_swift_multiple_adapters(self): all( torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) - def test_swift_side(self): - from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained( - 'google/vit-base-patch16-224') + + def test_swift_side_bert(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') model2 = copy.deepcopy(model) - result_origin = model(torch.ones((1, 3, 224, 224))).logits + result_origin = model(**inputs).logits print( - f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' + f'test_swift_side_bert result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' ) - result = model(torch.ones((1, 3, 224, 224))).logits + side_config = SideConfig( - dim=768, - target_modules=r'vit', - side_module_name='fcn4', - hidden_pos='last_hidden_state') + dim=model.config.hidden_size, + target_modules=r'.*encoder.encoder', + side_module_name='mlp', + hidden_pos='last_hidden_state' + ) model = Swift.prepare_model(model, config=side_config) - result_activate = model(torch.ones((1, 3, 224, 224))).logits + result_activate = model(**inputs).logits model.deactivate_adapter('default') - result_deactivate = model(torch.ones((1, 3, 224, 224))).logits + result_deactivate = model(**inputs).logits model.activate_adapter('default') - result_reactivate = model(torch.ones((1, 3, 224, 224))).logits - self.assertTrue(torch.allclose(result, result_deactivate)) - self.assertTrue(not torch.allclose(result, result_activate)) + result_reactivate = model(**inputs).logits + self.assertTrue(torch.allclose(result_origin, result_deactivate)) + self.assertTrue(not torch.allclose(result_origin, 
result_activate)) self.assertTrue(torch.allclose(result_activate, result_reactivate)) print( - f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' + f'test_swift_side_bert result shape: {result_origin.shape}, result sum: {torch.sum(result_origin)}' ) + self.assertTrue(isinstance(model, SwiftModel)) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + self.assertTrue( + os.path.exists( + os.path.join(self.tmp_dir, 'default', WEIGHTS_NAME))) model2 = Swift.from_pretrained(model2, self.tmp_dir) + state_dict = model.state_dict() state_dict2 = model2.state_dict() for key in state_dict: From 3193aae9653f4a53419b4e23fb5c8da346c8ff49 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 12:32:49 +0800 Subject: [PATCH 33/70] fix and pass pre-commit --- examples/pytorch/llm/src/llm_infer.py | 3 +- examples/pytorch/llm/src/llm_sft.py | 48 ++--- examples/pytorch/llm/src/utils/dataset.py | 38 ++-- examples/pytorch/llm/src/utils/model.py | 50 ++--- examples/pytorch/llm/src/utils/preprocess.py | 31 +-- swift/__init__.py | 23 +- swift/trainers/trainers.py | 188 ++++++++--------- swift/tuners/adapter.py | 31 ++- swift/tuners/base.py | 13 +- swift/tuners/lora.py | 39 ++-- swift/tuners/prompt.py | 28 ++- swift/tuners/restuning.py | 208 ++++++++++++------- swift/tuners/restuning_components.py | 201 ++++++++++++------ swift/tuners/side.py | 93 +++++---- swift/utils/torch_utils.py | 6 +- tests/tuners/test_swift_base.py | 52 +++-- tests/tuners/test_swift_restuning.py | 68 +++--- tests/utils/test_torch_utils.py | 5 +- 18 files changed, 668 insertions(+), 457 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 30ccd22c68..0d7730aa10 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -110,7 +110,8 @@ def llm_infer(args: InferArguments) -> None: # ### Preparing lora if args.sft_type == 'lora': - model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True) + model = Swift.from_pretrained( + model, args.ckpt_dir, inference_mode=True) show_layers(model) print_model_info(model) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 9f52ce9bb6..f0acda895c 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -4,30 +4,29 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List -from typing import Optional +from typing import Dict, List, Optional import jieba import numpy as np import torch import torch.distributed as dist -from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) -from rouge import Rouge +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge.rouge import Rouge from transformers import BitsAndBytesConfig, GenerationConfig - -from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) -from swift.hub import HubApi, ModelScopeConfig -from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, - seed_everything) -from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_dist, 
is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) +from swift import (AdapterConfig, HubStrategy, LoRAConfig, ResTuningConfig, + Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, + SwiftConfig, get_logger) +from swift.hub import HubApi, ModelScopeConfig +from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, + seed_everything) +from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset + logger = get_logger() @@ -37,8 +36,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field( - default='lora') + sft_type: str = field(default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -110,7 +108,7 @@ class SftArguments: default=None, metadata={ 'help': - 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' + 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' }) # other @@ -118,7 +116,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen-7b')" }) def __post_init__(self): @@ -241,8 +239,7 @@ def llm_sft(args: SftArguments) -> None: elif sft_type == 'restuner': restuner_config = ResTuningConfig( dims=model.config.hidden_size, - **MODEL_MAPPING[args.model_type]['restuner_TM'] - ) + **MODEL_MAPPING[args.model_type]['restuner_TM']) logger.info(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) @@ -330,8 +327,9 @@ def llm_sft(args: SftArguments) -> None: eval_steps=args.eval_steps, dataloader_num_workers=args.dataloader_num_workers, load_best_model_at_end=True, - metric_for_best_model='rouge-l', - greater_is_better=True, + metric_for_best_model='rouge-l' + if args.predict_with_generate else 'loss', + greater_is_better=args.predict_with_generate, sortish_sampler=True, optim=args.optim, hub_model_id=args.hub_model_id, @@ -344,8 +342,8 @@ def llm_sft(args: SftArguments) -> None: gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, generation_config=GenerationConfig.from_dict(generation_config), - local_rank=local_rank, - **kwargs) + local_rank=local_rank, + **kwargs) if args.gradient_checkpointing: # fix: gradients will be None @@ -389,7 +387,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): try: rouge = Rouge() scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) + ' '.join(reference)) result = scores[0] for k, v in result.items(): @@ -399,7 +397,8 @@ def _decode(tokens, ignore_pad_token_for_loss=False): list(pred), smoothing_function=SmoothingFunction().method3) score_dict['bleu-4'].append(round(bleu_score * 100, 4)) - except: + except Exception as e: + logger.error(e) logger.error(f'eval error {hypothesis}, {reference}') for k, v in score_dict.items(): @@ -413,7 +412,8 @@ def _decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics if args.predict_with_generate else None, + compute_metrics=compute_metrics + if args.predict_with_generate else None, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 
08adb9da4d..595dacf83a 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -45,14 +45,18 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='train').to_hf_dataset().rename_columns({ - "content": "query", - "summary": "response", + 'lvjianjin/AdvertiseGen', + split='train').to_hf_dataset().rename_columns({ + 'content': 'query', + 'summary': 'response', }) dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset().rename_columns({ - "content": "query", - "summary": "response", + 'lvjianjin/AdvertiseGen', + split='validation').to_hf_dataset().rename_columns({ + 'content': + 'query', + 'summary': + 'response', }) return dataset_train, dataset_val @@ -137,14 +141,16 @@ def get_instinwild_en_dataset() -> HfDataset: def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', split='train').to_hf_dataset().rename_columns({ - "text1": "query", - "text2": "response", + 'modelscope/DuReader_robust-QG', + split='train').to_hf_dataset().rename_columns({ + 'text1': 'query', + 'text2': 'response', }) dataset_val: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', split='validation').to_hf_dataset().rename_columns({ - "text1": "query", - "text2": "response", + 'modelscope/DuReader_robust-QG', + split='validation').to_hf_dataset().rename_columns({ + 'text1': 'query', + 'text2': 'response', }) return dataset_train, dataset_val @@ -368,15 +374,17 @@ def get_cmnli_zh_dataset() -> HfDataset: } -def get_dataset(dataset_name_list: List[str]) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: +def get_dataset( + dataset_name_list: List[str] +) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: """Returns a dataset to be split or a train-val dataset tuple""" dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) - assert(all(isinstance(dataset, tuple) for dataset in dataset_list) - or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) + assert (all(isinstance(dataset, tuple) for dataset in dataset_list) + or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) if not isinstance(dataset_list[0], tuple): dataset = concatenate_datasets(dataset_list) else: diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index aac85025bf..10e1c7f8b9 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -187,39 +187,39 @@ class AdapterTM(NamedTuple): class ResTunerTM(NamedTuple): # default lora target modules. 
qkv baichuan = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*model.norm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*model.norm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } chatglm2 = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*final_layernorm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*final_layernorm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } llama2 = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*model.norm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*model.norm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } qwen = { - "root_modules": r'.*transformer.h.0$', - "stem_modules": r'.*transformer.h\.\d+$', - "target_modules": r'.*transformer.ln_f', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*transformer.h.0$', + 'stem_modules': r'.*transformer.h\.\d+$', + 'target_modules': r'.*transformer.ln_f', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } polylm = { - "root_modules": r'.*transformer.h.0$', - "stem_modules": r'.*transformer.h\.\d+$', - "target_modules": r'.*transformer.ln_f', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*transformer.h.0$', + 'stem_modules': r'.*transformer.h\.\d+$', + 'target_modules': r'.*transformer.ln_f', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index c4e44f5637..92decc5f1b 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -13,7 +13,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'default_generation': { + 'default-generation': { 'prefix': [], 'prompt': ['{{query}}'], 'suffix': [['eos_token_id']], @@ -37,7 +37,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'chatglm2_generation': { + 'chatglm2-generation': { 'prefix': [[64790, 64792]], 'prompt': ['{{query}}'], 'suffix': [['eos_token_id']], @@ -124,14 +124,14 @@ def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context], def _preprocess( - template_type: str, - tokenizer: PreTrainedTokenizer, - query: str, - response: Optional[str] = None, - history: Optional[History] = None, - system: Optional[str] = None, - max_length: Optional[int] = None, - validate_generation=True, # do cross-validation with `model.generate()` + template_type: str, + tokenizer: PreTrainedTokenizer, + query: str, + response: Optional[str] = None, + history: Optional[History] = None, + system: Optional[str] = None, + max_length: Optional[int] = None, + validate_generation=True, # do cross-validation with `model.generate()` ) -> Dict[str, List[int]]: if history is None: history = [] @@ -187,11 +187,11 @@ def _preprocess( def get_preprocess( - template_type: str, - tokenizer: PreTrainedTokenizer, - system: Optional[str] = None, - max_length: Optional[int] = None, - validate_generation=False, + template_type: str, + tokenizer: PreTrainedTokenizer, + system: 
Optional[str] = None, + max_length: Optional[int] = None, + validate_generation=False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -201,4 +201,5 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, custom_system, max_length, validate_generation) + return preprocess diff --git a/swift/__init__.py b/swift/__init__.py index e41615c414..6e866d6515 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -5,15 +5,16 @@ if TYPE_CHECKING: from .version import __version__, __release_datetime__ - from .tuners import ( - Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, - SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, - ResTuningConfig, SideConfig, - PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, - PeftModelForTokenClassification, PrefixTuningConfig, - PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, - get_peft_config, get_peft_model, get_peft_model_state_dict, Prompt, - PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift) + from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, + LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, + PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, PrefixTuningConfig, + PromptEncoderConfig, PromptLearningConfig, + PromptTuningConfig, get_peft_config, get_peft_model, + get_peft_model_state_dict, Prompt, PromptConfig, + PromptModule, SwiftConfig, SwiftOutput, Swift) from .hub import snapshot_download, push_to_hub, push_to_hub_async, push_to_hub_in_queue from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, @@ -30,8 +31,8 @@ 'tuners': [ 'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING', 'LoraConfig', 'PeftConfig', - 'ResTuningConfig', 'SideConfig', - 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', + 'ResTuningConfig', 'SideConfig', 'PeftModel', + 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', 'PeftModelForTokenClassification', 'PrefixTuningConfig', 'PromptEncoderConfig', 'PromptLearningConfig', diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 298593c11d..b31da08f2d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import time from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -8,7 +9,6 @@ from transformers import Trainer as HfTrainer from transformers import trainer from transformers.deepspeed import is_deepspeed_zero3_enabled -import time from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin @@ -22,12 +22,18 @@ class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.perf = { - 'gen_time': 0., - 'gen_len': 0, - 'eval_memory': 0., - 'train_memory': None, - 'model': self.model.get_trainable_parameters(), + self.perf: Dict[str, Any] = { + 'gen_time': + 0., + 'gen_len': + 0, + 'eval_memory': + 0., + 'train_memory': + 0., + 'model': + self.model.get_trainable_parameters() if hasattr( + self.model, 'get_trainable_parameters') else None, } def train( @@ -37,127 +43,107 @@ def train( ): training_output = super().train(*args, **kwargs) if self.perf['train_memory'] is None: - self.perf['train_memory'] = torch.cuda.memory_allocated() + self.perf['train_memory'] = sum([ + torch.cuda.memory_allocated(i) + for i in range(torch.cuda.device_count()) + ]) return training_output def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). 
- """ - + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + **gen_kwargs, + ) -> Tuple[Optional[float], Optional[torch.Tensor], + Optional[torch.Tensor]]: if not self.args.predict_with_generate or prediction_loss_only: return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) + model, + inputs, + prediction_loss_only=prediction_loss_only, + ignore_keys=ignore_keys) - has_labels = "labels" in inputs + has_labels = 'labels' in inputs inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self.model.generation_config.to_dict().copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) + # Priority (handled in generate): + # gen_kwargs > model.generation_config > default GenerationConfig() + + if len(gen_kwargs) == 0 and hasattr(self, '_gen_kwargs'): + gen_kwargs = self._gen_kwargs.copy() + + if gen_kwargs.get('max_length') is None and gen_kwargs.get( + 'max_new_tokens') is None: + gen_kwargs['max_length'] = self.model.config.max_length + gen_kwargs['num_beams'] = ( + gen_kwargs['num_beams'] if gen_kwargs.get('num_beams') is not None + else self.model.config.num_beams) default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - if "attention_mask" in inputs: - gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) - if "position_ids" in inputs: - gen_kwargs["position_ids"] = inputs.get("position_ids", None) - if "global_attention_mask" in inputs: - gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) - - # prepare generation inputs - # some encoder-decoder models can have varying encoder's and thus - # varying model input names - if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + gen_kwargs['synced_gpus'] = ( + gen_kwargs['synced_gpus'] if gen_kwargs.get('synced_gpus') + is not None else default_synced_gpus) + + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ('labels' in inputs and 'decoder_input_ids' in inputs and + inputs['labels'].shape == inputs['decoder_input_ids'].shape): + inputs = { + k: v + for k, v in inputs.items() if k != 'decoder_input_ids' + } + + gen_kwargs['pad_token_id'] = self.tokenizer.pad_token_id + gen_kwargs['eos_token_id'] = self.tokenizer.eos_token_id + gen_time = time.time() + generated_tokens = self.model.generate(**inputs, **gen_kwargs) + gen_time = time.time() - gen_time + + if hasattr( + self.model, 'encoder' + ) and self.model.encoder.main_input_name != self.model.main_input_name: generation_inputs = inputs[self.model.encoder.main_input_name] else: generation_inputs = inputs[self.model.main_input_name] - gen_kwargs["input_ids"] = generation_inputs - gen_kwargs["pad_token_id"] = self.tokenizer.pad_token_id - gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id - gen_time = time.time() - generated_tokens = self.model.generate(**gen_kwargs) - gen_time 
= time.time() - gen_time generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), self.perf['eval_memory']) + self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), + self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded - if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) - - loss = None + if gen_kwargs.get('max_length') is not None and generated_tokens.shape[ + -1] < gen_kwargs['max_length']: + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs['max_length']) + elif gen_kwargs.get('max_new_tokens' + ) is not None and generated_tokens.shape[-1] < ( + gen_kwargs['max_new_tokens'] + 1): + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs['max_new_tokens'] + 1) if self.args.prediction_loss_only: - return (loss, None, None) + return None, None, None if has_labels: - labels = inputs["labels"] - if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + labels = inputs['labels'] + if gen_kwargs.get('max_length') is not None and labels.shape[ + -1] < gen_kwargs['max_length']: + labels = self._pad_tensors_to_max_len(labels, + gen_kwargs['max_length']) + elif gen_kwargs.get( + 'max_new_tokens') is not None and labels.shape[-1] < ( + gen_kwargs['max_new_tokens'] + 1): + labels = self._pad_tensors_to_max_len( + labels, (gen_kwargs['max_new_tokens'] + 1)) else: labels = None - return (loss, generated_tokens, labels) - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id - else: - raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor + return None, generated_tokens, labels # monkey patching diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 1d070af1f4..12f3d30641 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -3,11 +3,12 @@ import re import types from dataclasses import dataclass, field -from typing import Union, List +from typing import List, Union import torch from torch import nn from transformers.activations import ACT2CLS + from 
swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -71,7 +72,8 @@ def __post_init__(self): class Adapter: @staticmethod - def prepare_model(model: nn.Module, config: AdapterConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: AdapterConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `AdapterConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -94,14 +96,18 @@ def _forward(self, *args, **kwargs): _type = type(args) args = list(args) args[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) - return _type(args) + config.hidden_pos] + getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) + args = _type(args) else: args[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + config.hidden_pos] + getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = args + getattr(self, f'adapter_{adapter_name}')(args) + args = args + getattr(self, f'adapter_{adapter_name}')( + args) return args def _feed_forward_chunk(self, attention_output): @@ -126,7 +132,8 @@ def _feed_forward_chunk(self, attention_output): def state_dict_callback(state_dict, adapter_name: str): return { key: value - for key, value in state_dict.items() if f'adapter_{adapter_name}' in key + for key, value in state_dict.items() + if f'adapter_{adapter_name}' in key } def mark_trainable_callback(model): @@ -136,8 +143,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'adapter_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'adapter_{adapter_name}') for _module in modules: _module.activate(activate) @@ -191,7 +200,7 @@ def forward(self, x, identity=None): self.act.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) out = self.ln2(self.act(self.ln1(x))) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 11a96ef9fa..dd3f984dc0 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -52,7 +52,8 @@ def __init__(self, self.adapters = {} if isinstance(config, SwiftConfig): - self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config, DEFAULT_ADAPTER) + self.adapters[DEFAULT_ADAPTER] = self._prepare_model( + model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) for adapter_name, config in config.items(): @@ -151,7 +152,8 @@ def state_dict(self, if kwargs.get('save_adapter', True): for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: - state_dicts.update(output.state_dict_callback(destination, adapter_name)) + state_dicts.update( + output.state_dict_callback(destination, adapter_name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v @@ -264,7 +266,8 @@ def _prepare_model( ): assert (hasattr(config, SWIFT_TYPE_KEY)) from .mapping import SWIFT_MAPPING - return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config, adapter_name) + return SWIFT_MAPPING[config.swift_type][1].prepare_model( + model, config, adapter_name) def 
create_or_update_model_card(self, output_dir: str): """ @@ -334,8 +337,8 @@ def save_pretrained(self, os.makedirs(save_directory, exist_ok=True) self.create_or_update_model_card(save_directory) - adapter_names = adapter_name if isinstance(adapter_name, - list) or adapter_name is None else [adapter_name] + adapter_names = adapter_name if isinstance( + adapter_name, list) or adapter_name is None else [adapter_name] for adapter_name, output in self.adapters.items(): if adapter_names is not None and adapter_name not in adapter_names: continue diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 8cffafa4b6..1a52628bc9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -14,8 +14,8 @@ is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb @@ -106,7 +106,8 @@ def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict, adapter_name): - return lora_state_dict(state_dict, model.lora_module_map, adapter_name, config.bias) + return lora_state_dict(state_dict, model.lora_module_map, + adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, config.bias) @@ -115,7 +116,8 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: if isinstance(_module, LoRALayer): @@ -124,8 +126,8 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool) _module.active_adapter = 'default' if activate else 'invalid' @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, - **kwargs): + def _dynamic_patch_lora(model, replace_modules, use_merged_linear, + adapter_name, **kwargs): """Dynamic patch lora to model Args: @@ -230,7 +232,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, r=kwargs['r'], lora_alpha=kwargs['lora_alpha'], merge_weights=kwargs['merge_weights'], - ) + ) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -292,7 +294,8 @@ def unpatch_lora(model, config: LoRAConfig): origin_module = torch.nn.Linear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, 'bias') and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, ) elif isinstance(sub_module, Embedding): origin_module = torch.nn.Embedding( @@ -723,7 +726,9 @@ def forward(self, x: torch.Tensor): return nn.Conv2d.forward(self, x) -def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'none') -> None: +def mark_lora_as_trainable(model: nn.Module, + adapter_name: str, + bias: str = 'none') -> None: if bias == 'none': return elif bias == 'all': @@ -740,18 +745,28 @@ def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'non raise NotImplementedError -def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: +def lora_state_dict(state_dict, + module_map: Dict, + adapter_name: str, + bias: str 
= 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name} + return { + k: state_dict[k] + for k in state_dict + if 'lora_' in k and module_map.get(k[:k.find('lora_') + - 1], None) == adapter_name + } elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name) or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get( + k[:k.find('lora_') - 1], None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name: + if 'lora_' in k and module_map.get(k[:k.find('lora_') - 1], + None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index ec21650c3a..3c64479369 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -3,13 +3,13 @@ import re import types from dataclasses import dataclass, field -from typing import Union, List +from typing import List, Union import torch from torch import nn -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput @dataclass @@ -78,7 +78,8 @@ def __post_init__(self): class Prompt: @staticmethod - def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): + def prepare_model(model: nn.Module, config: PromptConfig, + adapter_name: str): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: @@ -99,7 +100,8 @@ def _forward(self, *args, **kwargs): input_embedding = kwargs[config.embedding_pos] input_embedding = getattr( - self, f'prompt_{adapter_name}').forward(input_embedding) + self, + f'prompt_{adapter_name}').forward(input_embedding) if isinstance(config.embedding_pos, int): args = type(args)( args[0:config.embedding_pos] + (input_embedding, ) @@ -117,7 +119,8 @@ def _forward(self, *args, **kwargs): if attention_mask is not None: attention_mask = getattr( self, - f'prompt_{adapter_name}').patch_attention_mask(attention_mask) + f'prompt_{adapter_name}').patch_attention_mask( + attention_mask) if isinstance(config.attention_mask_pos, int): args = type(args)( args[0:config.attention_mask_pos] @@ -129,7 +132,8 @@ def _forward(self, *args, **kwargs): forward_output = self.forward_origin(*args, **kwargs) if config.extract_embedding: forward_output = getattr( - self, f'prompt_{adapter_name}').extract(forward_output) + self, + f'prompt_{adapter_name}').extract(forward_output) return forward_output @@ -150,7 +154,8 @@ def _forward(self, *args, **kwargs): def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'prompt_{adapter_name}' in key + for key, value in state_dict.items() + if f'prompt_{adapter_name}' in key } def mark_trainable_callback(model): @@ -160,8 +165,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'prompt_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'prompt_{adapter_name}') for 
_module in modules: _module.activate(activate) @@ -199,7 +206,8 @@ def __init__(self, def forward(self, x): if not self._activate: return x - prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) + prompt_token = self.prompt_token.expand(x.shape[0], -1, + -1).to(x.device) if self.layer_num == 0: if self.attach_front: diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index bc16ce40b9..d8ddbc5aab 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -3,15 +3,16 @@ import re import types from dataclasses import dataclass, field -from typing import Union, Dict, Optional, List +from typing import Dict, List, Optional, Union import torch import torch.nn as nn from swift.utils.logger import get_logger -from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .restuning_components import (ResTuner, detach_tensors, + probe_input_pre_hook, probe_output_hook) +from .utils import SwiftConfig, SwiftOutput logger = get_logger() @@ -46,11 +47,12 @@ class ResTuningConfig(SwiftConfig): root_modules: str = field( default=None, metadata={ - 'help': 'The root module to be replaced, can a regex string (use the first matching module) or full match format' + 'help': + 'The root module to be replaced, can a regex string (use the first matching module) or full match format' }) root_modules_hook: str = field( - default="input", + default='input', metadata={ 'help': 'The hook type of root modules, can be "input" or "output"' }) @@ -58,11 +60,12 @@ class ResTuningConfig(SwiftConfig): stem_modules: Optional[Union[List[str], str]] = field( default=None, metadata={ - 'help': 'The stem modules to be replaced, can a regex string or name list of full match format' + 'help': + 'The stem modules to be replaced, can a regex string or name list of full match format' }) stem_modules_hook: str = field( - default="output", + default='output', metadata={ 'help': 'The hook type of stem modules, can be "input" or "output"' }) @@ -70,25 +73,30 @@ class ResTuningConfig(SwiftConfig): target_modules: str = field( default=None, metadata={ - 'help': 'The target module to be replaced, can a regex string (use the first matching module) or full match format' + 'help': + 'The target module to be replaced, can a regex string (use the first matching module) or full match format' }) target_modules_hook: str = field( - default="input", + default='input', metadata={ - 'help': 'The hook type of target modules, can be "input" or "output"' + 'help': + 'The hook type of target modules, can be "input" or "output"' }) target_hidden_pos: str = field( default=None, metadata={ 'help': - 'The position of the hidden state for target modules output' + 'The position of the hidden state for target modules output' }) tuner_cfg: Optional[Union[List[Dict], Dict, str]] = field( default=None, - metadata={'help': 'The configuration of the tuning module, can a string or customized config'}) + metadata={ + 'help': + 'The configuration of the tuning module, can a string or customized config' + }) use_upsample: bool = field( default=False, @@ -96,15 +104,16 @@ class ResTuningConfig(SwiftConfig): upsample_out_channels: List[int] = field( default=None, - metadata={'help': 'The number of output channels when "use_upsample" is set to "True"'}) + metadata={ + 'help': + 'The number of output channels when "use_upsample" is set to "True"' + }) zero_init_last: bool = field( - 
default=False, - metadata={'help': 'Zero init last weight'}) + default=False, metadata={'help': 'Zero init last weight'}) use_bypass: bool = field( - default=True, - metadata={'help': 'Whether to use bypass'}) + default=True, metadata={'help': 'Whether to use bypass'}) def __post_init__(self): from .mapping import SwiftTuners @@ -114,28 +123,36 @@ def __post_init__(self): class ResTuning: @staticmethod - def prepare_model(model: nn.Module, config: ResTuningConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: ResTuningConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `ResTuningConfig`""" def _forward_seq(self, input, *args, **kwargs): for idx, module in enumerate(self): - if idx >= len(self.origin_module_keys): continue + if idx >= len(self.origin_module_keys): + continue input = module(input) return input def _forward_target(self, *args, **kwargs): - if self.target_modules_hook == "input": + if self.target_modules_hook == 'input': args = list(args) - _arg = args[0 if self.target_hidden_pos is None else self.target_hidden_pos] + _arg = args[0 if self.target_hidden_pos is None else self. + target_hidden_pos] args_main = _forward_restuning(self, _arg) - args[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + args[0 if self.target_hidden_pos is None else self. + target_hidden_pos] = args_main args_main = self.forward_origin(*args, **kwargs) else: _args_main = self.forward_origin(*args, **kwargs) - _arg = _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] if isinstance(_args_main, (tuple, list)) else _args_main + _arg = _args_main[0 if self.target_hidden_pos is None else self + .target_hidden_pos] if isinstance( + _args_main, + (tuple, list)) else _args_main args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + _args_main[0 if self.target_hidden_pos is None else self. + target_hidden_pos] = args_main args_main = _args_main return args_main @@ -156,7 +173,9 @@ def _forward_restuning(self, origin_arg): probe_results.append(st_mod.probe_input_data) else: probe_results.append(st_mod.probe_output_data) - args_main = getattr(top_module, f'restuning_{adapter_name}')(probe_results, origin_arg) + args_main = getattr(top_module, + f'restuning_{adapter_name}')(probe_results, + origin_arg) return args_main # 1. 
Matching the root module @@ -166,19 +185,23 @@ def _forward_restuning(self, origin_arg): for module_key in module_keys: if re.fullmatch(config.root_modules, module_key): root_module = model.get_submodule(module_key) - logger.info(f"Matching root module [{module_key}] of type {type(root_module)}") + logger.info( + f'Matching root module [{module_key}] of type {type(root_module)}' + ) if isinstance(root_module, (nn.ModuleList, nn.ModuleDict)): logger.warning( - f"Type of {type(root_module)} may not be supported because of its customized forward") - if config.root_modules_hook == "input": - root_module.register_forward_pre_hook(probe_input_pre_hook) + f'Type of {type(root_module)} may not be supported because of its customized forward' + ) + if config.root_modules_hook == 'input': + root_module.register_forward_pre_hook( + probe_input_pre_hook) else: root_module.register_forward_hook(probe_output_hook) root_module.root_modules_hook = config.root_modules_hook root_module_ins_list.append(root_module) break if len(root_module_ins_list) == 0: - logger.error(f"Cannot match root modules") + logger.error('Cannot match root modules') # 2. Matching the stem module stem_module_ins_list = [] @@ -188,32 +211,40 @@ def _forward_restuning(self, origin_arg): (isinstance(config.stem_modules, list) and module_key in config.stem_modules): stem_module = model.get_submodule(module_key) if isinstance(config.stem_modules, list): - stem_module_ins_index.append(config.stem_modules.index(module_key)) - logger.info(f"Matching stem module [{module_key}] of type {type(stem_module)}") + stem_module_ins_index.append( + config.stem_modules.index(module_key)) + logger.info( + f'Matching stem module [{module_key}] of type {type(stem_module)}' + ) if isinstance(stem_module, (nn.ModuleList, nn.ModuleDict)): logger.warning( - f"Type of {type(stem_module)} may not be supported because of its customized forward") - if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: + f'Type of {type(stem_module)} may not be supported because of its customized forward' + ) + if len(root_module_ins_list) == 0 and len( + stem_module_ins_list) == 0: stem_module.register_forward_pre_hook(probe_input_pre_hook) - if config.stem_modules_hook == "input": + if config.stem_modules_hook == 'input': stem_module.register_forward_pre_hook(probe_input_pre_hook) else: stem_module.register_forward_hook(probe_output_hook) stem_module.stem_modules_hook = config.stem_modules_hook stem_module_ins_list.append(stem_module) if isinstance(config.stem_modules, list): - stem_module_ins_list = [stem_module_ins_list[stem_module_ins_index.index(i)] for i in - range(len(stem_module_ins_index))] + stem_module_ins_list = [ + stem_module_ins_list[stem_module_ins_index.index(i)] + for i in range(len(stem_module_ins_index)) + ] depth = len(stem_module_ins_list) if len(stem_module_ins_list) == 0: - raise Exception(f"Cannot match source modules") + raise Exception('Cannot match source modules') # 3. Init restuning module if len(stem_module_ins_list) != 0: top_module = model.get_submodule('') - restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, - config.upsample_out_channels, config.zero_init_last, - config.tuner_cfg) + restuning_module = ResTuningBypassModule( + config.dims, depth, config.use_upsample, + config.upsample_out_channels, config.zero_init_last, + config.tuner_cfg) setattr(top_module, f'restuning_{adapter_name}', restuning_module) # 4. 
Matching the target module @@ -221,10 +252,13 @@ def _forward_restuning(self, origin_arg): for module_key in module_keys: if re.fullmatch(config.target_modules, module_key): tgt_module = model.get_submodule(module_key) - logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + logger.info( + f'Matching target module [{module_key}] of type {type(tgt_module)}' + ) if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): raise Exception( - f"Type of {type(tgt_module)} may not be supported because of its customized forward") + f'Type of {type(tgt_module)} may not be supported because of its customized forward' + ) tgt_module.target_modules_hook = config.target_modules_hook tgt_module.target_hidden_pos = config.target_hidden_pos @@ -233,18 +267,22 @@ def _forward_restuning(self, origin_arg): target_module_ins = tgt_module if isinstance(tgt_module, nn.Sequential): - tgt_module.origin_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) - tgt_module.forward_origin = types.MethodType(_forward_seq, tgt_module) + tgt_module.origin_module_keys = copy.deepcopy( + list(tgt_module._modules.keys())) + tgt_module.forward_origin = types.MethodType( + _forward_seq, tgt_module) else: tgt_module.forward_origin = tgt_module.forward - tgt_module.forward = types.MethodType(_forward_target, tgt_module) + tgt_module.forward = types.MethodType(_forward_target, + tgt_module) if target_module_ins is None: - raise Exception(f"Cannot match target modules") + raise Exception('Cannot match target modules') def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'restuning_{adapter_name}' in key + for key, value in state_dict.items() + if f'restuning_{adapter_name}' in key } def mark_trainable_callback(model): @@ -254,8 +292,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'restuning_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'restuning_{adapter_name}') for _module in modules: _module.activate(activate) @@ -265,13 +305,13 @@ class ResTuningBypassModule(nn.Module): """ def __init__( - self, - dims, - depth, - use_upsample=False, - upsample_out_channels=None, - zero_init_last=False, - tuner_cfg=None, + self, + dims, + depth, + use_upsample=False, + upsample_out_channels=None, + zero_init_last=False, + tuner_cfg=None, ): super(ResTuningBypassModule, self).__init__() @@ -282,12 +322,13 @@ def __init__( layer_num=i, depth=depth, use_upsample=use_upsample, - upsample_out_channels=upsample_out_channels[i] if isinstance(upsample_out_channels, - list) else upsample_out_channels, + upsample_out_channels=upsample_out_channels[i] if isinstance( + upsample_out_channels, list) else upsample_out_channels, zero_init_last=zero_init_last, - tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list) else tuner_cfg - ) - for i in range(depth)]) + tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list + ) else tuner_cfg) + for i in range(depth) + ]) def activate(self, activate=True): self._activate = activate @@ -296,17 +337,29 @@ def forward(self, x_list, origin_arg, **kwargs): if not self._activate: return origin_arg x_bypass = detach_tensors(x_list.pop(0)) - x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass + x_bypass = x_bypass[0] if 
isinstance(x_bypass, + (list, tuple)) else x_bypass x_list = detach_tensors(x_list) - x_list = [_x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list] + x_list = [ + _x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list + ] for i, (bp_blk, x_stem) in enumerate(zip(self.bypass_blocks, x_list)): - target_size = x_list[i + 1].shape[2:] if i < len(x_list) - 1 else None + target_size = x_list[ + i + 1].shape[2:] if i < len(x_list) - 1 else None x_bypass = bp_blk(x_stem, x_bypass, target_size, **kwargs) return x_bypass class ResTunerBypassBlock(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_last=False, tuner_cfg=None, **kwargs): + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + use_upsample=False, + zero_init_last=False, + tuner_cfg=None, + **kwargs): super().__init__() self.layer_num = layer_num self.depth = depth @@ -314,16 +367,21 @@ def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_la if isinstance(tuner_cfg, str): lateral_cfg = tuner_cfg vertical_cfg = tuner_cfg - aux_cfg = "upsample" if use_upsample and layer_num != depth - 1 else None + aux_cfg = 'upsample' if use_upsample and layer_num != depth - 1 else None elif isinstance(tuner_cfg, dict): - lateral_cfg = tuner_cfg['lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None - vertical_cfg = tuner_cfg['vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None + lateral_cfg = tuner_cfg[ + 'lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None + vertical_cfg = tuner_cfg[ + 'vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None aux_cfg = tuner_cfg['aux_cfg'] if 'aux_cfg' in tuner_cfg else None - self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "lateral", lateral_cfg, **kwargs) - self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "vertical", vertical_cfg, **kwargs) + self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'lateral', lateral_cfg, **kwargs) + self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'vertical', vertical_cfg, **kwargs) if aux_cfg and len(aux_cfg) != 0: - self.aux_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "aux", aux_cfg, **kwargs) + self.aux_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'aux', aux_cfg, **kwargs) def forward(self, x_stem, x_bypass, target_size=None, **kwargs): x_lateral = self.lateral_tuner(x_stem) @@ -332,10 +390,4 @@ def forward(self, x_stem, x_bypass, target_size=None, **kwargs): x_bypass_out = x_lateral + x_vertical if hasattr(self, 'aux_tuner'): x_bypass_out = self.aux_tuner(x_bypass_out, target_size) - - # logger.info(f"x_main:{x_stem.shape} / {torch.sum(x_stem)}, x_side:{x_bypass.shape} / {torch.sum(x_bypass)}") - # logger.info(f"x_lateral:{x_lateral.shape} / {torch.sum(x_lateral)}, x_vertical:{x_vertical.shape} / {torch.sum(x_vertical)}") - # logger.info(f"x_bypass_out: {x_bypass_out.shape} / {torch.sum(x_bypass_out)}") - return x_bypass_out - diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index 9c99543c37..e7f02aa5d8 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import math + import torch import torch.nn as nn import torch.nn.functional as F @@ -11,8 +12,15 @@ class ResTuner(nn.Module): - def __init__( - self, dim=None, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg={}, **kwargs): + + def __init__(self, + dim=None, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg={}, + **kwargs): super().__init__() self.dim = dim self.layer_num = layer_num @@ -20,32 +28,51 @@ def __init__( self.stage = stage self.tuner_cfg = tuner_cfg - if (isinstance(tuner_cfg, str) and tuner_cfg == "res_adapter") or \ - (isinstance(tuner_cfg, dict) and "res_adapter" in tuner_cfg): - tuner_cfg = tuner_cfg['res_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg - self.tuner = ResAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, - stage=stage, tuner_cfg=tuner_cfg, **kwargs) - elif (isinstance(tuner_cfg, str) and tuner_cfg == "res_group_adapter") or \ - (isinstance(tuner_cfg, dict) and "res_group_adapter" in tuner_cfg): - tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg - self.tuner = ResGroupAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, - stage=stage, tuner_cfg=tuner_cfg, **kwargs) - elif (isinstance(tuner_cfg, str) and tuner_cfg == "upsample") or \ - (isinstance(tuner_cfg, dict) and "upsample" in tuner_cfg): - tuner_cfg = tuner_cfg['upsample'] if isinstance(tuner_cfg, dict) else tuner_cfg + if (isinstance(tuner_cfg, str) and tuner_cfg == 'res_adapter') or \ + (isinstance(tuner_cfg, dict) and 'res_adapter' in tuner_cfg): + tuner_cfg = tuner_cfg['res_adapter'] if isinstance( + tuner_cfg, dict) else tuner_cfg + self.tuner = ResAdapter( + dim=dim, + layer_num=layer_num, + depth=depth, + zero_init_last=zero_init_last, + stage=stage, + tuner_cfg=tuner_cfg, + **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == 'res_group_adapter') or \ + (isinstance(tuner_cfg, dict) and 'res_group_adapter' in tuner_cfg): + tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance( + tuner_cfg, dict) else tuner_cfg + self.tuner = ResGroupAdapter( + dim=dim, + layer_num=layer_num, + depth=depth, + zero_init_last=zero_init_last, + stage=stage, + tuner_cfg=tuner_cfg, + **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == 'upsample') or \ + (isinstance(tuner_cfg, dict) and 'upsample' in tuner_cfg): + tuner_cfg = tuner_cfg['upsample'] if isinstance( + tuner_cfg, dict) else tuner_cfg if 'upsample_out_channels' in kwargs: out_channels = kwargs['upsample_out_channels'] use_conv = True if out_channels else False else: out_channels = dim use_conv = False - self.tuner = Upsample(channels=dim, use_conv=use_conv, out_channels=out_channels, tuner_cfg=tuner_cfg, - **kwargs) + self.tuner = Upsample( + channels=dim, + use_conv=use_conv, + out_channels=out_channels, + tuner_cfg=tuner_cfg, + **kwargs) else: self.tuner = Identity() def forward(self, x, *args, **kwargs): - if self.tuner_cfg == "zero" or "zero" in self.tuner_cfg: + if self.tuner_cfg == 'zero' or 'zero' in self.tuner_cfg: x_out = 0.0 else: x_out = self.tuner(x, *args, **kwargs) @@ -53,30 +80,45 @@ def forward(self, x, *args, **kwargs): class ResAdapter(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg=None, + act_layer=nn.GELU, **kwargs): super(ResAdapter, self).__init__() self.dim = dim self.layer_num = 
layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 32 - self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None - self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None - - self.adapter_length = self.adapter_length[self.layer_num] if isinstance(self.adapter_length, - list) else self.adapter_length - assert isinstance(self.adapter_length, int) or ( - isinstance(self.adapter_length, tuple) and len(self.adapter_length) == 3) + self.adapter_length = tuner_cfg[ + 'adapter_length'] if 'adapter_length' in tuner_cfg else 32 + self.adapter_type = tuner_cfg[ + 'adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg[ + 'adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_length = self.adapter_length[ + self.layer_num] if isinstance(self.adapter_length, + list) else self.adapter_length + assert isinstance(self.adapter_length, + int) or (isinstance(self.adapter_length, tuple) + and len(self.adapter_length) == 3) if isinstance(self.adapter_length, int): self.ln1 = nn.Linear(dim, self.adapter_length) else: - self.ln1 = nn.Linear(self.adapter_length[0], self.adapter_length[1]) + self.ln1 = nn.Linear(self.adapter_length[0], + self.adapter_length[1]) self.activate = act_layer() if isinstance(self.adapter_length, int): self.ln2 = nn.Linear(self.adapter_length, dim) else: - self.ln2 = nn.Linear(self.adapter_length[1], self.adapter_length[2]) + self.ln2 = nn.Linear(self.adapter_length[1], + self.adapter_length[2]) dim = self.adapter_length[2] self._xavier_init_weights(self.ln1) @@ -109,46 +151,64 @@ def forward(self, x): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) x_shortcut = x if len(x_shortcut.size()) == 4: B, C, N1, N2 = x.size() - x = x.view(x_shortcut.size()[0], x_shortcut.size()[1], -1).permute(0, 2, 1) + x = x.view(x_shortcut.size()[0], + x_shortcut.size()[1], -1).permute(0, 2, 1) x_adapter = self.ln2(self.activate(self.ln1(x))) if self.adapter_weight: - x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + x_adapter = apply_data_weight(x_adapter, self.scaling, + self.adapter_weight) if len(x_shortcut.size()) == 4: - x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], - x_shortcut.size()[2], x_shortcut.size()[3]) + x_adapter = x_adapter.permute(0, 2, + 1).view(x_shortcut.size()[0], + x_adapter.size()[-1], + x_shortcut.size()[2], + x_shortcut.size()[3]) x_out = x_shortcut + x_adapter return x_out.to(x_dtype) class ResGroupAdapter(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg=None, + act_layer=nn.GELU, **kwargs): super(ResGroupAdapter, self).__init__() self.dim = dim self.layer_num = layer_num self.depth = depth - self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None - self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + self.adapter_type = tuner_cfg[ + 'adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg[ + 'adapter_weight'] if 'adapter_weight' in tuner_cfg else None self.adapter_dim = tuner_cfg['dim'] if 'dim' in tuner_cfg else dim self.adapter_head = tuner_cfg['head'] if 
'head' in tuner_cfg else 4 - self.adapter_scale_factor = tuner_cfg['scale_factor'] if 'scale_factor' in tuner_cfg else 2 + self.adapter_scale_factor = tuner_cfg[ + 'scale_factor'] if 'scale_factor' in tuner_cfg else 2 assert self.adapter_dim % self.adapter_head == 0, 'adapter dim should be divisible by adapter head' self.dim_mlp = self.adapter_dim // self.adapter_head - self.ln1 = nn.Linear(self.dim_mlp, self.dim_mlp * self.adapter_scale_factor) - self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, self.dim_mlp) + self.ln1 = nn.Linear(self.dim_mlp, + self.dim_mlp * self.adapter_scale_factor) + self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, + self.dim_mlp) self.activate = act_layer() self._kaiming_init_weights(self.ln1) @@ -180,29 +240,35 @@ def forward(self, x): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) x_shortcut = x batch, inner_dim, height, width = x.shape - x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, + inner_dim) - x_adapter = rearrange(x_adapter, "b n (c h) -> (b h) n c", h=self.adapter_head) + x_adapter = rearrange( + x_adapter, 'b n (c h) -> (b h) n c', h=self.adapter_head) x_adapter = self.ln2(self.activate(self.ln1(x_adapter))) - x_adapter = rearrange(x_adapter, "(b h) n c -> b n (c h)", h=self.adapter_head) + x_adapter = rearrange( + x_adapter, '(b h) n c -> b n (c h)', h=self.adapter_head) if self.adapter_weight: - x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + x_adapter = apply_data_weight(x_adapter, self.scaling, + self.adapter_weight) - x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() + x_adapter = x_adapter.reshape(batch, height, width, + -1).permute(0, 3, 1, 2).contiguous() x_out = x_shortcut + x_adapter return x_out.to(x_dtype) class Identity(nn.Module): + def __init__(self): super().__init__() @@ -219,16 +285,23 @@ class Upsample(nn.Module): upsampling occurs in the inner-two dimensions. 
""" - def __init__(self, channels, use_conv=False, out_channels=None, padding=1, **kwargs): + def __init__(self, + channels, + use_conv=False, + out_channels=None, + padding=1, + **kwargs): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv if use_conv: - self.conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=padding) + self.conv = nn.Conv2d( + self.channels, self.out_channels, 3, padding=padding) self.init_weights() def init_weights(self): + def _init_weights(m): if isinstance(m, nn.Conv2d): nn.init.zeros_(m.weight) @@ -239,9 +312,11 @@ def _init_weights(m): def forward(self, x, target_size=None, *args, **kwargs): assert x.shape[1] == self.channels if target_size is None: - x = F.interpolate(x.float(), scale_factor=2, mode="nearest").type_as(x) + x = F.interpolate( + x.float(), scale_factor=2, mode='nearest').type_as(x) else: - x = F.interpolate(x.float(), target_size, mode="nearest").type_as(x) + x = F.interpolate( + x.float(), target_size, mode='nearest').type_as(x) if self.use_conv: x = self.conv(x) return x @@ -250,27 +325,27 @@ def forward(self, x, target_size=None, *args, **kwargs): def init_weight_type(dim, weight_type): if weight_type is None: scaling = None - elif weight_type == "gate": + elif weight_type == 'gate': scaling = nn.Linear(dim, 1) - elif weight_type == "scale": + elif weight_type == 'scale': scaling = nn.Parameter(torch.Tensor(1)) scaling.data.fill_(1) - elif weight_type == "scale_kv": + elif weight_type == 'scale_kv': scaling_k = nn.Parameter(torch.Tensor(1)) scaling_k.data.fill_(1) scaling_v = nn.Parameter(torch.Tensor(1)) scaling_v.data.fill_(1) scaling = (scaling_k, scaling_v) - elif weight_type == "scale_channel": + elif weight_type == 'scale_channel': scaling = nn.Parameter(torch.Tensor(dim)) scaling.data.fill_(1) - elif weight_type == "scale_kv_channel": + elif weight_type == 'scale_kv_channel': scaling_k = nn.Parameter(torch.Tensor(dim)) scaling_k.data.fill_(1) scaling_v = nn.Parameter(torch.Tensor(dim)) scaling_v.data.fill_(1) scaling = (scaling_k, scaling_v) - elif weight_type and weight_type.startswith("scalar"): + elif weight_type and weight_type.startswith('scalar'): scaling = float(weight_type.split('_')[-1]) else: scaling = None @@ -278,9 +353,11 @@ def init_weight_type(dim, weight_type): def apply_data_weight(data, scaling, weight_type): - if weight_type in ["gate"]: - scaling = torch.mean(torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) - elif weight_type in ["scale", "scale_channel"] or weight_type.startswith('scalar'): + if weight_type in ['gate']: + scaling = torch.mean( + torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) + elif weight_type in ['scale', 'scale_channel' + ] or weight_type.startswith('scalar'): scaling = scaling else: scaling = None @@ -291,7 +368,10 @@ def apply_data_weight(data, scaling, weight_type): def detach_tensors(feats): if type(feats) in [list, tuple]: - feats = [detach_tensors(feat) if feat is not None else None for feat in feats] + feats = [ + detach_tensors(feat) if feat is not None else None + for feat in feats + ] elif isinstance(feats, dict): feats = {key: detach_tensors(val) for key, val in feats.items()} elif isinstance(feats, torch.Tensor): @@ -316,4 +396,3 @@ def probe_output_hook(self, args, result): output = result probe_tensors(self, output, 'probe_output_data') return output - diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 2e509e9e11..5f25f879a1 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -1,21 
+1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import copy import inspect import re import types -import copy +from collections import OrderedDict from dataclasses import dataclass, field from functools import partial -from typing import Union, Callable, Any, List -from collections import OrderedDict from itertools import repeat +from typing import Any, Callable, List, Union import torch -from torch import nn import torchvision +from torch import nn from swift.utils.logger import get_logger -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput logger = get_logger() @@ -45,13 +45,14 @@ class SideConfig(SwiftConfig): }) side_module_name: str = field( - default=1., metadata={'help': 'The name of the additive side networks'}) + default=1., + metadata={'help': 'The name of the additive side networks'}) hidden_pos: Union[str, int] = field( default=0, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' }) def __post_init__(self): @@ -62,40 +63,51 @@ def __post_init__(self): class Side: @staticmethod - def prepare_model(model: nn.Module, config: SideConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: SideConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `SideConfig`""" module_keys = [key for key, _ in model.named_modules()] for module_key in module_keys: if re.fullmatch(config.target_modules, module_key): # noqa tgt_module = model.get_submodule(module_key) - logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + logger.info( + f'Matching target module [{module_key}] of type {type(tgt_module)}' + ) if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): raise Exception( - f"Type of {type(tgt_module)} may not be supported because of its customized forward") + f'Type of {type(tgt_module)} may not be supported because of its customized forward' + ) def _forward(self, *args, **kwargs): args_main = self.forward_origin(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr( + self, f'side_{adapter_name}')( + *args, args_main[config.hidden_pos]) else: _type = type(args_main) args_main = list(args_main) - args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr( + self, f'side_{adapter_name}')( + *args, args_main[config.hidden_pos]) args_main = _type(args_main) return args_main if isinstance(tgt_module, nn.Sequential): - tgt_module.tgt_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + tgt_module.tgt_module_keys = copy.deepcopy( + list(tgt_module._modules.keys())) def forward_seq(self, input, *args, **kwargs): for idx, module in enumerate(self): - if idx >= len(tgt_module.tgt_module_keys): continue + if idx >= len(tgt_module.tgt_module_keys): + continue input = module(input) return input - tgt_module.forward_origin = types.MethodType(forward_seq, tgt_module) + tgt_module.forward_origin = types.MethodType( + forward_seq, tgt_module) else: tgt_module.forward_origin = tgt_module.forward tgt_module.forward = types.MethodType(_forward, tgt_module) @@ -105,7 +117,8 @@ def 
forward_seq(self, input, *args, **kwargs): def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'side_{adapter_name}' in key + for key, value in state_dict.items() + if f'side_{adapter_name}' in key } def mark_trainable_callback(model): @@ -115,8 +128,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'side_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'side_{adapter_name}') for _module in modules: _module.activate(activate) @@ -134,11 +149,7 @@ class SideModule(nn.Module): side_module_name: The name of the additive side networks. """ - def __init__( - self, - dim, - side_module_name='fcn4' - ): + def __init__(self, dim, side_module_name='fcn4'): super(SideModule, self).__init__() side_module_name = side_module_name.lower() @@ -149,13 +160,13 @@ def __init__( elif side_module_name == 'alexnet': mm = torchvision.models.alexnet(pretrained=True) self.side_net = nn.Sequential( - OrderedDict([ - ('features', mm.features), ('avgpool', mm.avgpool), - ('flatten', nn.Flatten()), - ('fc', nn.Linear(9216, dim, bias=False)) - ])) + OrderedDict([('features', mm.features), + ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, dim, bias=False))])) else: - raise ValueError(f'Unsupported side_module_name: {side_module_name}') + raise ValueError( + f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) self._activate = True @@ -237,27 +248,29 @@ class Mlp(nn.Module): """ def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=None, - bias=True, - drop=0., - use_conv=False, + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0., + use_conv=False, ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features bias = tuple(repeat(bias, 2)) drop_probs = tuple(repeat(drop, 2)) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + linear_layer = partial( + nn.Conv2d, kernel_size=1) if use_conv else nn.Linear self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.norm = norm_layer( + hidden_features) if norm_layer is not None else nn.Identity() self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) self.drop2 = nn.Dropout(drop_probs[1]) diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index f2f1903273..a8c6153f0d 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -98,12 +98,14 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: logger.info(''.join(s)) -def find_sub_module(module: torch.nn.Module, module_name: str) -> List[torch.nn.Module]: +def find_sub_module(module: torch.nn.Module, + module_name: str) -> List[torch.nn.Module]: _modules = list() for name, sub_module in module.named_modules(): if not name: continue - if module_name == name or getattr(sub_module, 'adapter_name', None) == module_name: + if module_name == name or 
getattr(sub_module, 'adapter_name', + None) == module_name: _modules.append(sub_module) else: _modules.extend(find_sub_module(sub_module, module_name)) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 5992cddcbe..79082f5c92 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -1,4 +1,5 @@ import copy +import math import os import shutil import tempfile @@ -11,8 +12,9 @@ SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME from torch import nn -import math -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig, PromptConfig, ResTuningConfig + +from swift import (AdapterConfig, LoRAConfig, PromptConfig, ResTuningConfig, + SideConfig, Swift, SwiftModel, push_to_hub) class TestSwift(unittest.TestCase): @@ -30,6 +32,7 @@ def tearDown(self): def test_swift_lora_forward(self): from swift.tuners.lora import Linear + def reset_parameters(self): nn.Linear.reset_parameters(self) if hasattr(self, 'lora_A'): @@ -52,9 +55,12 @@ def reset_parameters(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_adapter_forward(self): model = Model.from_pretrained( @@ -74,9 +80,12 @@ def test_swift_adapter_forward(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_prompt_forward(self): model = Model.from_pretrained( @@ -96,9 +105,12 @@ def test_swift_prompt_forward(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_restuner_forward(self): model = Model.from_pretrained( @@ -112,7 +124,7 @@ def test_swift_restuner_forward(self): stem_modules=r'.*layer\.\d+$', target_modules=r'.*pooler', target_modules_hook='input', - tuner_cfg="res_adapter", + tuner_cfg='res_adapter', ) outputs = model(**inputs) model = Swift.prepare_model(model, config=restuner_config) @@ -121,9 +133,12 @@ def test_swift_restuner_forward(self): outputs_deactivate = model(**inputs) 
model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) @@ -202,15 +217,14 @@ def test_swift_side_bert(self): model2 = copy.deepcopy(model) result_origin = model(**inputs).logits print( - f'test_swift_side_bert result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' - ) + f'test_swift_side_bert result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') side_config = SideConfig( dim=model.config.hidden_size, target_modules=r'.*encoder.encoder', side_module_name='mlp', - hidden_pos='last_hidden_state' - ) + hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) result_activate = model(**inputs).logits diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py index 421544d0df..43522fbbe3 100644 --- a/tests/tuners/test_swift_restuning.py +++ b/tests/tuners/test_swift_restuning.py @@ -6,8 +6,7 @@ import torch -from swift import ResTuningConfig -from swift import Swift, SwiftModel +from swift import ResTuningConfig, Swift, SwiftModel class TestSwiftResTuning(unittest.TestCase): @@ -37,18 +36,24 @@ def model_comparison(self, model, model2): model_key = list(model.state_dict().keys()) model2_key = list(model2.state_dict().keys()) self.assertTrue(model_key == model2_key) - model_val = torch.sum(torch.stack([torch.sum(val) for val in model.state_dict().values()])) - model2_val = torch.sum(torch.stack([torch.sum(val) for val in model2.state_dict().values()])) + model_val = torch.sum( + torch.stack( + [torch.sum(val) for val in model.state_dict().values()])) + model2_val = torch.sum( + torch.stack( + [torch.sum(val) for val in model2.state_dict().values()])) self.assertTrue(torch.isclose(model_val, model2_val)) def test_swift_restuning_vit(self): from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + model = AutoModelForImageClassification.from_pretrained( + 'google/vit-base-patch16-224') model_swift_1 = copy.deepcopy(model) model_swift_2 = copy.deepcopy(model) result_origin = model(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + f'test_swift_restuning_vit result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') # load type - 1 self.set_random_seed() @@ -57,15 +62,17 @@ def test_swift_restuning_vit(self): root_modules=r'.*vit.encoder.layer.0$', stem_modules=r'.*vit.encoder.layer\.\d+$', target_modules=r'.*vit.layernorm', - target_modules_hook="input", - tuner_cfg="res_adapter", + target_modules_hook='input', + tuner_cfg='res_adapter', ) - model_swift_1 = Swift.prepare_model(model_swift_1, config=restuning_config_1) + model_swift_1 = Swift.prepare_model( + model_swift_1, config=restuning_config_1) self.assertTrue(isinstance(model_swift_1, 
SwiftModel)) print(model_swift_1.get_trainable_parameters()) result_swift_1 = model_swift_1(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, result_swift_1 sum: {torch.sum(result_swift_1)}") + f'test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, ' + f'result_swift_1 sum: {torch.sum(result_swift_1)}') # load type - 2 self.set_random_seed() @@ -74,18 +81,21 @@ def test_swift_restuning_vit(self): root_modules=r'.*vit.encoder.layer.0$', stem_modules=r'.*vit.encoder.layer\.\d+$', target_modules=r'.*vit.encoder', - target_modules_hook="output", - target_hidden_pos="last_hidden_state", - tuner_cfg="res_adapter", + target_modules_hook='output', + target_hidden_pos='last_hidden_state', + tuner_cfg='res_adapter', ) - model_swift_2 = Swift.prepare_model(model_swift_2, config=restuning_config_2) + model_swift_2 = Swift.prepare_model( + model_swift_2, config=restuning_config_2) self.assertTrue(isinstance(model_swift_2, SwiftModel)) print(model_swift_2.get_trainable_parameters()) result_swift_2 = model_swift_2(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, result_swift_2 sum: {torch.sum(result_swift_2)}") + f'test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, ' + f'result_swift_2 sum: {torch.sum(result_swift_2)}') - self.assertTrue(all(torch.isclose(result_swift_1, result_swift_2).flatten())) + self.assertTrue( + all(torch.isclose(result_swift_1, result_swift_2).flatten())) model_swift_1.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) @@ -94,37 +104,43 @@ def test_swift_restuning_vit(self): def test_swift_restuning_diffusers_sd(self): from diffusers import UNet2DConditionModel - model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + model = UNet2DConditionModel.from_pretrained( + 'runwayml/stable-diffusion-v1-5', subfolder='unet') model.requires_grad_(False) model2 = copy.deepcopy(model) self.set_random_seed() input_data = { - "sample": torch.ones((1, 4, 64, 64)), - "timestep": 10, - "encoder_hidden_states": torch.ones((1, 77, 768)) + 'sample': torch.ones((1, 4, 64, 64)), + 'timestep': 10, + 'encoder_hidden_states': torch.ones((1, 77, 768)) } result_origin = model(**input_data).sample print( - f"test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + f'test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') self.set_random_seed() restuning_config = ResTuningConfig( dims=[1280, 1280, 1280, 640, 320], root_modules='mid_block', - stem_modules=['mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', 'up_blocks.3'], + stem_modules=[ + 'mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', + 'up_blocks.3' + ], target_modules='conv_norm_out', - tuner_cfg="res_group_adapter", + tuner_cfg='res_group_adapter', use_upsample=True, upsample_out_channels=[1280, 1280, 640, 320, None], - zero_init_last=True - ) + zero_init_last=True) model = Swift.prepare_model(model, config=restuning_config) self.assertTrue(isinstance(model, SwiftModel)) print(model.get_trainable_parameters()) result = model(**input_data).sample - print(f"test_swift_restuning_diffusers_sd result shape: {result.shape}, result sum: {torch.sum(result)}") + print( + f'test_swift_restuning_diffusers_sd result shape: {result.shape}, 
result sum: {torch.sum(result)}' + ) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) model2 = Swift.from_pretrained(model2, self.tmp_dir) diff --git a/tests/utils/test_torch_utils.py b/tests/utils/test_torch_utils.py index 3517d7f475..106f5148eb 100644 --- a/tests/utils/test_torch_utils.py +++ b/tests/utils/test_torch_utils.py @@ -1,12 +1,15 @@ import unittest + from modelscope import Model + from swift.utils.torch_utils import find_sub_module class TestTorchUtils(unittest.TestCase): def test_find_sub_module(self): - model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') self.assertTrue(find_sub_module(model, 'query') is not None) From c13ea0e86ffe497b6fb424f1c456f4a7dd2fae9b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 17:21:53 +0800 Subject: [PATCH 34/70] fix --- swift/trainers/trainers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index b31da08f2d..7ed0774d13 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -74,6 +74,8 @@ def prediction_step( if len(gen_kwargs) == 0 and hasattr(self, '_gen_kwargs'): gen_kwargs = self._gen_kwargs.copy() + if hasattr(self.model, 'generation_config'): + gen_kwargs.update(self.model.generation_config.to_dict()) if gen_kwargs.get('max_length') is None and gen_kwargs.get( 'max_new_tokens') is None: From 00ad79ba5238bb6b8b6f41597c3f934360ba4f9c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 20:01:46 +0800 Subject: [PATCH 35/70] add logger --- swift/tuners/adapter.py | 6 ++++++ swift/tuners/lora.py | 5 +++-- swift/tuners/prompt.py | 6 ++++++ swift/tuners/restuning.py | 2 +- swift/tuners/side.py | 6 ++++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 12f3d30641..90abd3e4c1 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -9,9 +9,12 @@ from torch import nn from transformers.activations import ACT2CLS +from swift import get_logger from swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput +logger = get_logger() + @dataclass class AdapterConfig(SwiftConfig): @@ -128,6 +131,9 @@ def _feed_forward_chunk(self, attention_output): config.adapter_length, ACT2CLS[config.act_layer]) setattr(module, f'adapter_{adapter_name}', adapter_module) + logger.info( + f'Adapter modules(module_key): {module_key}.adapter_{adapter_name}' + ) def state_dict_callback(state_dict, adapter_name: str): return { diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 1a52628bc9..13301134dc 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -1,7 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
-import logging import math import re from dataclasses import dataclass, field @@ -14,6 +13,7 @@ is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config +from swift import get_logger from ..utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -28,7 +28,7 @@ if is_auto_gptq_available(): from peft.tuners.lora import QuantLinear -logger = logging.getLogger() +logger = get_logger() @dataclass @@ -257,6 +257,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, modules[module_key] = adapter_name model.lora_module_map.update(modules) + logger.info(f'Lora modules(module_key -> adapter_name): {modules}') @staticmethod def unpatch_lora(model, config: LoRAConfig): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 3c64479369..00e5c56863 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -8,9 +8,12 @@ import torch from torch import nn +from swift import get_logger from ..utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput +logger = get_logger() + @dataclass class PromptConfig(SwiftConfig): @@ -149,6 +152,9 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, f'prompt_{adapter_name}', prompt_module) + logger.info( + f'Prompt modules(module_key): {module_key}.prompt_{adapter_name}' + ) match_module_keys.append(module_key) def state_dict_callback(state_dict, adapter_name): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index d8ddbc5aab..38842a78ce 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from swift.utils.logger import get_logger +from swift import get_logger from ..utils.torch_utils import find_sub_module from .restuning_components import (ResTuner, detach_tensors, probe_input_pre_hook, probe_output_hook) diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 5f25f879a1..8d3c869730 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -1,13 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import copy -import inspect import re import types from collections import OrderedDict from dataclasses import dataclass, field from functools import partial from itertools import repeat -from typing import Any, Callable, List, Union +from typing import List, Union import torch import torchvision @@ -113,6 +112,9 @@ def forward_seq(self, input, *args, **kwargs): tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) + logger.info( + f'Side modules(module_key): {module_key}.side_{adapter_name}' + ) def state_dict_callback(state_dict, adapter_name): return { From f3a5126ece358157b7b7eaf59c5562109221de63 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 20:57:46 +0800 Subject: [PATCH 36/70] add perf item --- examples/pytorch/llm/src/llm_sft.py | 3 +++ swift/trainers/trainers.py | 23 +++++++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bec7ee7069..cf88dd3a11 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -420,6 +420,9 @@ def _decode(tokens, ignore_pad_token_for_loss=False): ) trainer.train(trainer_args.resume_from_checkpoint) + for i in range(torch.cuda.device_count()): + trainer.perf['memory'][f'device:{i}'] = torch.cuda.max_memory_reserved( + i) logger.info(trainer.perf) # ### Visualization diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 7ed0774d13..b1a2df0de7 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -27,26 +27,19 @@ def __init__(self, *args, **kwargs): 0., 'gen_len': 0, - 'eval_memory': - 0., - 'train_memory': + 'memory': {}, + 'train_time': 0., 'model': self.model.get_trainable_parameters() if hasattr( self.model, 'get_trainable_parameters') else None, } - def train( - self, - *args, - **kwargs, - ): - training_output = super().train(*args, **kwargs) - if self.perf['train_memory'] is None: - self.perf['train_memory'] = sum([ - torch.cuda.memory_allocated(i) - for i in range(torch.cuda.device_count()) - ]) + def training_step(self, *args, **kwargs) -> torch.Tensor: + train_time = time.time() + training_output = super().training_step(*args, **kwargs) + train_time = time.time() - train_time + self.perf['train_time'] = self.perf['train_time'] + train_time return training_output def prediction_step( @@ -114,8 +107,6 @@ def prediction_step( gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), - self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get('max_length') is not None and generated_tokens.shape[ From 5655f901f45c1b67fb51295acdcb9f28a47f4e56 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 22:10:17 +0800 Subject: [PATCH 37/70] fix comments --- examples/pytorch/llm/src/llm_sft.py | 75 ++++--------------- examples/pytorch/llm/src/utils/__init__.py | 1 + .../pytorch/llm/src/utils/metric_utils.py | 59 +++++++++++++++ examples/pytorch/llm/src/utils/preprocess.py | 2 +- swift/tuners/adapter.py | 4 +- swift/tuners/lora.py | 6 +- swift/tuners/prompt.py | 4 +- swift/tuners/restuning.py | 4 +- swift/tuners/side.py | 4 +- 9 files changed, 88 insertions(+), 71 deletions(-) create mode 100644 
examples/pytorch/llm/src/utils/metric_utils.py diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index cf88dd3a11..336f7391a7 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -6,12 +6,9 @@ from functools import partial from typing import Dict, List, Optional -import jieba -import numpy as np import torch import torch.distributed as dist -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from rouge.rouge import Rouge +from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, find_all_linear_for_lora, get_dataset, @@ -36,7 +33,8 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field(default='lora') + sft_type: str = field( + default='lora', metadata={'choices': ['lora', 'full']}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -119,6 +117,13 @@ class SftArguments: "This parameter is used only when model_type.startswith('qwen-7b')" }) + # generation config, only useful when `predict_with_generate=True` + do_sample: bool = True + top_p: float = 0.7 + max_new_tokens: int = None + temperature: float = 0.95 + top_k: int = 20 + def __post_init__(self): if is_dist(): rank, local_rank, _, _ = get_dist_setting() @@ -263,10 +268,11 @@ def llm_sft(args: SftArguments) -> None: args.dataset_seed) generation_config = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': args.max_length, - 'temperature': 0.95 + 'do_sample': args.do_sample, + 'top_p': args.top_p, + 'max_new_tokens': args.max_new_tokens, + 'temperature': args.temperature, + 'top_k': args.top_k, } preprocess_func = get_preprocess( @@ -359,55 +365,6 @@ def llm_sft(args: SftArguments) -> None: logger.info(f'trainer_args: {trainer_args}') - def compute_metrics(prediction): - preds, labels = prediction[0], prediction[1] - - score_dict = { - 'rouge-1': [], - 'rouge-2': [], - 'rouge-l': [], - 'bleu-4': [] - } - - def _decode(tokens, ignore_pad_token_for_loss=False): - if ignore_pad_token_for_loss: - tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) - tokens = np.where(tokens < tokenizer.vocab_size, tokens, - tokenizer.pad_token_id) - return [ - t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) - ] - - for pred, label in zip(preds, labels): - pred = ''.join(_decode(pred, False)) - label = ''.join(_decode(label, True)) - hypothesis = list(jieba.cut(pred)) - if len(hypothesis) == 0 or ''.join(hypothesis) == '.': - hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] - reference = list(jieba.cut(label)) - try: - rouge = Rouge() - scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) - result = scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v['f'] * 100, 4)) - bleu_score = sentence_bleu( - [list(label)], - list(pred), - smoothing_function=SmoothingFunction().method3) - score_dict['bleu-4'].append(round(bleu_score * 100, 4)) - except Exception as e: - logger.error(e) - logger.error(f'eval error {hypothesis}, {reference}') - - for k, v in score_dict.items(): - score_dict[k] = float(np.mean(v)) - return score_dict - trainer = Seq2SeqTrainer( model=model, args=trainer_args, @@ -415,7 +372,7 @@ def 
_decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics + compute_metrics=partial(compute_nlg_metrics, tokenizer=tokenizer) if args.predict_with_generate else None, ) diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 935cec0479..ef4909dab0 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -1,4 +1,5 @@ from .dataset import DATASET_MAPPING, get_dataset, process_dataset +from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess from .utils import (broadcast_string, download_dataset, diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py new file mode 100644 index 0000000000..0220128212 --- /dev/null +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import jieba +import numpy as np +from swift import get_logger +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu +from rouge.rouge import Rouge + +logger = get_logger() + + +def compute_nlg_metrics(tokenizer, prediction): + preds, labels = prediction[0], prediction[1] + + score_dict = { + 'rouge-1': [], + 'rouge-2': [], + 'rouge-l': [], + 'bleu-4': [] + } + + def _decode(tokens, ignore_pad_token_for_loss=False): + if ignore_pad_token_for_loss: + tokens = np.where(tokens != -100, tokens, + tokenizer.pad_token_id) + tokens = np.where(tokens < tokenizer.vocab_size, tokens, + tokenizer.pad_token_id) + return [ + t for t in tokenizer.batch_decode( + tokens, skip_special_tokens=True) + ] + + for pred, label in zip(preds, labels): + pred = ''.join(_decode(pred, False)) + label = ''.join(_decode(label, True)) + hypothesis = list(jieba.cut(pred)) + if len(hypothesis) == 0 or ''.join(hypothesis) == '.': + hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] + reference = list(jieba.cut(label)) + try: + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + except Exception as e: + logger.error(e) + logger.error(f'eval error {hypothesis}, {reference}') + + for k, v in score_dict.items(): + score_dict[k] = float(np.mean(v)) + return score_dict diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 92decc5f1b..d75d131c48 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -191,7 +191,7 @@ def get_preprocess( tokenizer: PreTrainedTokenizer, system: Optional[str] = None, max_length: Optional[int] = None, - validate_generation=False, + validate_generation: Optional[bool] = False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 90abd3e4c1..d458e75f6e 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -154,7 +154,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( 
module, f'adapter_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class AdapterModule(nn.Module): @@ -195,7 +195,7 @@ def _init_weights(m): self.apply(_init_weights) - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x, identity=None): diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 13301134dc..f4600d331c 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -110,7 +110,7 @@ def state_dict_callback(state_dict, adapter_name): adapter_name, config.bias) def mark_trainable_callback(model): - mark_lora_as_trainable(model, config.bias) + mark_lora_as_trainable(model, adapter_name, config.bias) return SwiftOutput(config, state_dict_callback, mark_trainable_callback) @@ -121,7 +121,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: if isinstance(_module, LoRALayer): - _module.activate(activate) + _module.set_activation(activate) else: _module.active_adapter = 'default' if activate else 'invalid' @@ -351,7 +351,7 @@ def __init__( self.merged = False self.merge_weights = merge_weights - def activate(self, activate=True): + def set_activation(self, activate=True): if activate: self.r = self.old_r else: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 00e5c56863..8d0bd6c796 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -176,7 +176,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'prompt_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class PromptModule(nn.Module): @@ -229,7 +229,7 @@ def forward(self, x): dim=1) return x - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def patch_attention_mask(self, m): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 38842a78ce..7858561fa6 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -297,7 +297,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'restuning_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class ResTuningBypassModule(nn.Module): @@ -330,7 +330,7 @@ def __init__( for i in range(depth) ]) - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x_list, origin_arg, **kwargs): diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 8d3c869730..0bf2b548ad 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -135,7 +135,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'side_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class SideModule(nn.Module): @@ -172,7 +172,7 @@ def __init__(self, dim, side_module_name='fcn4'): self.alpha = nn.Parameter(torch.tensor(0.0)) self._activate = True - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x, x_main): From 87427514b628cf85d1c716ad532e42d8ffea75d8 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 22:23:48 +0800 Subject: [PATCH 38/70] 
fix comments --- examples/pytorch/llm/src/llm_sft.py | 8 ++++---- examples/pytorch/llm/src/utils/metric_utils.py | 17 ++++++----------- examples/pytorch/llm/src/utils/model.py | 4 ++-- examples/pytorch/llm/src/utils/preprocess.py | 6 ++---- swift/tuners/adapter.py | 12 ++++++------ 5 files changed, 20 insertions(+), 27 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 336f7391a7..d5bc692e71 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -275,20 +275,20 @@ def llm_sft(args: SftArguments) -> None: 'top_k': args.top_k, } - preprocess_func = get_preprocess( + preprocess_func_train = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=False) - train_dataset = train_dataset.map(preprocess_func) - preprocess_func = get_preprocess( + train_dataset = train_dataset.map(preprocess_func_train) + preprocess_func_eval = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=True) - val_dataset = val_dataset.map(preprocess_func) + val_dataset = val_dataset.map(preprocess_func_eval) del dataset # Data analysis stat_dataset(train_dataset) diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index 0220128212..9d96c8a1e0 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -2,32 +2,27 @@ import jieba import numpy as np -from swift import get_logger from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge.rouge import Rouge +from swift import get_logger + logger = get_logger() def compute_nlg_metrics(tokenizer, prediction): preds, labels = prediction[0], prediction[1] - score_dict = { - 'rouge-1': [], - 'rouge-2': [], - 'rouge-l': [], - 'bleu-4': [] - } + score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} def _decode(tokens, ignore_pad_token_for_loss=False): if ignore_pad_token_for_loss: - tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) + tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id) tokens = np.where(tokens < tokenizer.vocab_size, tokens, tokenizer.pad_token_id) return [ - t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) + t + for t in tokenizer.batch_decode(tokens, skip_special_tokens=True) ] for pred, label in zip(preds, labels): diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index 10e1c7f8b9..abfe1140e4 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -176,7 +176,7 @@ class LoRATM(NamedTuple): class AdapterTM(NamedTuple): - # default lora target modules. qkv + # default adapter target modules. baichuan = ['mlp'] chatglm2 = ['mlp'] llama2 = ['mlp'] @@ -185,7 +185,7 @@ class AdapterTM(NamedTuple): class ResTunerTM(NamedTuple): - # default lora target modules. qkv + # default res-tuning config. 
     baichuan = {
         'root_modules': r'.*layers.0$',
         'stem_modules': r'.*layers\.\d+$',
diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py
index d75d131c48..61158fe09b 100644
--- a/examples/pytorch/llm/src/utils/preprocess.py
+++ b/examples/pytorch/llm/src/utils/preprocess.py
@@ -131,7 +131,8 @@ def _preprocess(
     history: Optional[History] = None,
     system: Optional[str] = None,
     max_length: Optional[int] = None,
-    validate_generation=True,  # do cross-validation with `model.generate()`
+    validate_generation: Optional[
+        bool] = True,  # do cross-validation with `model.generate()`
 ) -> Dict[str, List[int]]:
     if history is None:
         history = []
@@ -180,9 +181,6 @@ def _preprocess(
     if labels is not None:
         labels = labels[-max_length:]

-    # if validate_generation:
-    #     input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids
-
     return {'input_ids': input_ids, 'labels': labels}
diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py
index d458e75f6e..490a108d1c 100644
--- a/swift/tuners/adapter.py
+++ b/swift/tuners/adapter.py
@@ -179,9 +179,9 @@ def __init__(
         self.dim = dim
         self.adapter_length = adapter_length
         # self.adapter_type = adapter_type
-        self.ln1 = nn.Linear(dim, adapter_length)
+        self.linear1 = nn.Linear(dim, adapter_length)
         self.act = act_layer()
-        self.ln2 = nn.Linear(adapter_length, dim)
+        self.linear2 = nn.Linear(adapter_length, dim)
         self.init_weights()
         self._prepared = False
         self._activate = True
@@ -202,14 +202,14 @@ def forward(self, x, identity=None):
         if not self._activate:
             return 0.
         if not self._prepared:
-            self.ln1.to(x.device)
+            self.linear1.to(x.device)
             self.act.to(x.device)
-            self.ln2.to(x.device)
+            self.linear2.to(x.device)
             self._prepared = True

         x_dtype = x.dtype
-        x = x.to(self.ln1.weight.dtype)
-        out = self.ln2(self.act(self.ln1(x)))
+        x = x.to(self.linear1.weight.dtype)
+        out = self.linear2(self.act(self.linear1(x)))
         if identity is None:
             identity = x
         identity = identity.to(out.dtype)

From 8470274f46e71f6824b371aa91b250db7c5314cd Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Sun, 10 Sep 2023 22:39:31 +0800
Subject: [PATCH 39/70] update readme

---
 README.md    | 5 ++++-
 README_CN.md | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e5d72aa36c..bdc5d17eb6 100644
--- a/README.md
+++ b/README.md
@@ -21,12 +21,15 @@ Currently supported approches (and counting):
 1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685)
 2. Adapter: [Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751)
 3. Prompt Tuning: [Visual Prompt Tuning](https://arxiv.org/abs/2203.12119)
-4. All tuners offered on [Peft](https://github.com/huggingface/peft).
+4. Side: [Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503)
+5. ResTuning-Bypass
+6. All tuners offered on [Peft](https://github.com/huggingface/peft)

 Key features:
 1. By integrating the ModelScope library, models can be readily obatined via a model-id.
 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result.
+3. Support calling `activate_adapter` or `deactivate_adapter` to activate/deactivate a single tuner. Users can use one model with different tuners in different threads in a time-sharing manner.
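The activate/deactivate behaviour described in the README line above is exercised by `test_swift_multiple_adapters_switching`, added later in this series. A minimal sketch of that usage follows, assuming the same ModelScope checkpoint and tuner configs used in the tests; it is an illustrative aside, not part of the patch itself.

    from modelscope import Model, Preprocessor
    from swift import AdapterConfig, LoRAConfig, Swift

    # Illustrative checkpoint, borrowed from the test suite in this series.
    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
    model = Model.from_pretrained(model_id)
    preprocessor = Preprocessor.from_pretrained(model_id)
    inputs = preprocessor('how are you')

    # Attach two tuners at once; each dict key becomes an adapter name.
    model = Swift.prepare_model(
        model,
        config={
            'lora': LoRAConfig(target_modules=['query', 'key', 'value']),
            'adapter': AdapterConfig(
                dim=model.config.hidden_size,
                target_modules=r'.*layer\.\d+$',
                method_name='feed_forward_chunk',
                hidden_pos=0),
        })

    # Tuners can be toggled independently, e.g. per thread in a
    # time-sharing setup.
    model.deactivate_adapter('adapter')  # only LoRA is applied here
    outputs_lora_only = model(**inputs)
    model.activate_adapter('adapter')
    model.deactivate_adapter('lora')  # only the adapter is applied here
    outputs_adapter_only = model(**inputs)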
## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 878c90ad31..1ebe678276 100644 --- a/README_CN.md +++ b/README_CN.md @@ -20,11 +20,14 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 1. LoRA:[LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685) 2. Adapter:[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751) 3. Prompt Tuning: [Visual Prompt Tuning](https://arxiv.org/abs/2203.12119) -4. 所有在[Peft](https://github.com/huggingface/peft)上提供的tuners。 +4. Side: [Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503) +5. ResTuning-Bypass +6. 所有在[Peft](https://github.com/huggingface/peft)上提供的tuners 关键特点: 1. 通过集成ModelScope库,可以通过model id轻松获取模型。 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 +3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以用一个模型在不同线程中分时使用不同的tuners。 ## 大模型微调的例子 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) From f1d6de31f8c112a214b55a855c125f5faa55153e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 23:10:37 +0800 Subject: [PATCH 40/70] Fixbug --- swift/tuners/adapter.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 490a108d1c..cfbe13fdd4 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -98,19 +98,16 @@ def _forward(self, *args, **kwargs): if isinstance(config.hidden_pos, int): _type = type(args) args = list(args) - args[config.hidden_pos] = args[ - config.hidden_pos] + getattr( - self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + args[config.hidden_pos] = getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) args = _type(args) else: - args[config.hidden_pos] = args[ - config.hidden_pos] + getattr( - self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + args[config.hidden_pos] = getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = args + getattr(self, f'adapter_{adapter_name}')( - args) + args = getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): From 564b7d72071c02eadd7e08478a485bd7dbb8e6c7 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 00:08:06 +0800 Subject: [PATCH 41/70] fix --- examples/pytorch/llm/src/utils/metric_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index 9d96c8a1e0..d4f964a5e6 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -10,7 +10,7 @@ logger = get_logger() -def compute_nlg_metrics(tokenizer, prediction): +def compute_nlg_metrics(prediction, tokenizer): preds, labels = prediction[0], prediction[1] score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} From a6cf6321764d6bf99ae634657631110188b062a6 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 17:07:04 +0800 Subject: [PATCH 42/70] fix comments --- examples/pytorch/llm/src/llm_sft.py | 16 +++++++++------- swift/trainers/trainers.py | 7 +++++++ swift/utils/torch_utils.py | 3 +-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py 
index d5bc692e71..95410ab487 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -33,8 +33,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field( - default='lora', metadata={'choices': ['lora', 'full']}) + sft_type: str = field(default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -76,7 +75,6 @@ class SftArguments: gradient_checkpointing: bool = False batch_size: int = 1 - eval_batch_size: int = 1 num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 @@ -137,6 +135,11 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) + from swift import SWIFT_MAPPING + all_types = [key.lower() for key in SWIFT_MAPPING.keys()] + ['full'] + sft_type = [_type.strip() for _type in self.sft_type.split(',')] + assert all([_type.lower() in all_types for _type in sft_type]), \ + f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 'full': assert self.quantization_bit is None, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' @@ -270,6 +273,7 @@ def llm_sft(args: SftArguments) -> None: generation_config = { 'do_sample': args.do_sample, 'top_p': args.top_p, + 'max_length': None, 'max_new_tokens': args.max_new_tokens, 'temperature': args.temperature, 'top_k': args.top_k, @@ -315,7 +319,8 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.eval_batch_size, + per_device_eval_batch_size=1 + if args.predict_with_generate else args.batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -377,9 +382,6 @@ def llm_sft(args: SftArguments) -> None: ) trainer.train(trainer_args.resume_from_checkpoint) - for i in range(torch.cuda.device_count()): - trainer.perf['memory'][f'device:{i}'] = torch.cuda.max_memory_reserved( - i) logger.info(trainer.perf) # ### Visualization diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index b1a2df0de7..5c2de223af 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -34,12 +34,19 @@ def __init__(self, *args, **kwargs): self.model.get_trainable_parameters() if hasattr( self.model, 'get_trainable_parameters') else None, } + self._iter_perf = 0 def training_step(self, *args, **kwargs) -> torch.Tensor: train_time = time.time() training_output = super().training_step(*args, **kwargs) train_time = time.time() - train_time self.perf['train_time'] = self.perf['train_time'] + train_time + self._iter_perf += 1 + if self._iter_perf > 20 and not self.perf[ + 'memory'] and torch.cuda.device_count() > 0: + for i in range(torch.cuda.device_count()): + self.perf['memory'][ + f'device:{i}'] = torch.cuda.memory_reserved(i) return training_output def prediction_step( diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index a8c6153f0d..7a177ce903 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -92,9 +92,8 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: s = [ f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', f'{n_buffers:.4f}M Buffers, ', - f'Trainable percentage: {100 * n_grads / 
n_params:.2f}%' + f'Trainable percentage: {100 * n_grads / n_params:.2f}%.' ] - s += '.' logger.info(''.join(s)) From 985f4a0a4f5f85b59424fbec23ac033e4741edf5 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 20:13:02 +0800 Subject: [PATCH 43/70] fix CI --- examples/pytorch/llm/src/llm_sft.py | 63 +++++------------ examples/pytorch/llm/src/utils/swift_utils.py | 54 +++++++++++++++ swift/__init__.py | 22 +++--- swift/tuners/__init__.py | 8 +-- swift/tuners/adapter.py | 2 +- swift/tuners/prompt.py | 2 +- tests/tuners/test_swift_base.py | 69 +++++++++++++++++++ 7 files changed, 156 insertions(+), 64 deletions(-) create mode 100644 examples/pytorch/llm/src/utils/swift_utils.py diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 95410ab487..c82e35ffdc 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -4,21 +4,21 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List, Optional +from typing import List, Optional import torch import torch.distributed as dist from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics +from examples.pytorch.llm.src.utils.swift_utils import prepare_model from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, find_all_linear_for_lora, get_dataset, - get_dist_setting, get_model_tokenizer, get_preprocess, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers) - -from swift import (AdapterConfig, HubStrategy, LoRAConfig, ResTuningConfig, - Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, - SwiftConfig, get_logger) + broadcast_string, get_dataset, get_dist_setting, + get_model_tokenizer, get_preprocess, is_dist, is_master, + plot_images, process_dataset, select_bnb, select_dtype, + show_layers) + +from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, + Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -135,8 +135,12 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - from swift import SWIFT_MAPPING - all_types = [key.lower() for key in SWIFT_MAPPING.keys()] + ['full'] + from swift import SwiftTuners + all_types = [ + SwiftTuners.LORA.lower(), + SwiftTuners.ADAPTER.lower(), + SwiftTuners.RESTUNING.lower() + ] + ['full'] sft_type = [_type.strip() for _type in self.sft_type.split(',')] assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' @@ -215,42 +219,7 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - swift_config: Dict[str, SwiftConfig] = dict() - for sft_type in args.sft_type.split(','): - if sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}' - ) - - lora_config = LoRAConfig( - r=args.lora_rank, - target_modules=args.lora_target_modules, - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') - 
swift_config['lora'] = lora_config - elif sft_type == 'adapter': - adapter_config = AdapterConfig( - dim=model.config.hidden_size, - target_modules=MODEL_MAPPING[args.model_type].get( - 'adapter_TM', ['mlp']), - method_name='forward', - hidden_pos=0, - adapter_length=args.adapter_length, - ) - logger.info(f'adapter_config: {adapter_config}') - swift_config['adapter'] = adapter_config - elif sft_type == 'restuner': - restuner_config = ResTuningConfig( - dims=model.config.hidden_size, - **MODEL_MAPPING[args.model_type]['restuner_TM']) - logger.info(f'restuner_config: {restuner_config}') - swift_config['restuner'] = restuner_config - model = Swift.prepare_model(model, swift_config) + model = prepare_model(model, args) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py new file mode 100644 index 0000000000..ee286a1b75 --- /dev/null +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict, Any + +import torch.nn + +from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, + Swift, + SwiftConfig, get_logger) +from .model import (MODEL_MAPPING) +from .utils import find_all_linear_for_lora +from swift import SwiftTuners + +logger = get_logger() + + +def prepare_model(model: torch.nn.Module, + args: Any, + ): + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: + if sft_type.lower() == SwiftTuners.LORA.lower(): + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}' + ) + + lora_config = LoRAConfig( + r=args.lora_rank, + target_modules=args.lora_target_modules, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout_p) + logger.info(f'lora_config: {lora_config}') + swift_config['lora'] = lora_config + elif sft_type.lower() == SwiftTuners.ADAPTER.lower(): + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=MODEL_MAPPING[args.model_type].get( + 'adapter_TM', ['mlp']), + method_name='forward', + hidden_pos=0, + adapter_length=args.adapter_length, + ) + logger.info(f'adapter_config: {adapter_config}') + swift_config['adapter'] = adapter_config + elif sft_type.lower() == SwiftTuners.RESTUNING.lower(): + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + **MODEL_MAPPING[args.model_type]['restuner_TM']) + logger.info(f'restuner_config: {restuner_config}') + swift_config['restuner'] = restuner_config + return Swift.prepare_model(model, swift_config) diff --git a/swift/__init__.py b/swift/__init__.py index 6e866d6515..9049f2e70d 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -5,16 +5,15 @@ if TYPE_CHECKING: from .version import __version__, __release_datetime__ - from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, - LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, - PeftConfig, PeftModel, PeftModelForCausalLM, - ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, - PeftModelForSequenceClassification, - PeftModelForTokenClassification, PrefixTuningConfig, - PromptEncoderConfig, PromptLearningConfig, - PromptTuningConfig, get_peft_config, get_peft_model, - get_peft_model_state_dict, Prompt, PromptConfig, - PromptModule, 
SwiftConfig, SwiftOutput, Swift) + from .tuners import ( + Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, + SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, PeftModelForTokenClassification, + PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, + PromptTuningConfig, get_peft_config, get_peft_model, + get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, + SwiftConfig, SwiftOutput, Swift, SwiftTuners) from .hub import snapshot_download, push_to_hub, push_to_hub_async, push_to_hub_in_queue from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, @@ -38,7 +37,8 @@ 'PromptEncoderConfig', 'PromptLearningConfig', 'PromptTuningConfig', 'get_peft_config', 'get_peft_model', 'get_peft_model_state_dict', 'Prompt', 'PromptConfig', - 'PromptModule', 'SwiftConfig', 'SwiftOutput', 'Swift' + 'PromptModule', 'SwiftConfig', 'SwiftOutput', 'Swift', + 'SwiftTuners' ], 'trainers': [ 'EvaluationStrategy', 'FSDPOption', 'HPSearchBackend', diff --git a/swift/tuners/__init__.py b/swift/tuners/__init__.py index 6ebb813e90..1ecc496850 100644 --- a/swift/tuners/__init__.py +++ b/swift/tuners/__init__.py @@ -7,9 +7,9 @@ from .adapter import Adapter, AdapterConfig, AdapterModule from .base import SwiftModel, Swift from .lora import LoRA, LoRAConfig - from .mapping import SWIFT_MAPPING + from .mapping import SWIFT_MAPPING, SwiftTuners from .side import Side, SideConfig, SideModule - from .restuning import ResTuning, ResTuningConfig, ResTuningModule + from .restuning import ResTuning, ResTuningConfig, ResTuningBypassModule from .peft import (LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, @@ -24,9 +24,9 @@ 'adapter': ['Adapter', 'AdapterConfig', 'AdapterModule'], 'base': ['SwiftModel', 'Swift'], 'lora': ['LoRA', 'LoRAConfig'], - 'mapping': ['SWIFT_MAPPING'], + 'mapping': ['SWIFT_MAPPING', 'SwiftTuners'], 'side': ['Side', 'SideConfig', 'SideModule'], - 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningModule'], + 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningBypassModule'], 'peft': [ 'LoraConfig', 'PeftConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index cfbe13fdd4..1da707b1a0 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -197,7 +197,7 @@ def set_activation(self, activate=True): def forward(self, x, identity=None): if not self._activate: - return 0. 
+ return x if not self._prepared: self.linear1.to(x.device) self.act.to(x.device) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 8d0bd6c796..a9d223aa20 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -38,7 +38,7 @@ class PromptConfig(SwiftConfig): extract_embedding: Whether the embedding is extracted at final stage to keep the same dims with inputs """ - dim: int = field( + dim: Union[int, List[int]] = field( default=None, metadata={'help': 'The dimension of the hidden states'}) target_modules: str = field( diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 79082f5c92..3f3c19ccc6 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -4,6 +4,7 @@ import shutil import tempfile import unittest +from concurrent.futures import ThreadPoolExecutor from time import time import torch @@ -208,6 +209,74 @@ def test_swift_multiple_adapters(self): torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) + def test_swift_multiple_adapters_switching(self): + from swift.tuners.lora import Linear + from swift.tuners.adapter import AdapterModule + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.ones_(self.lora_A) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + + def init_weights(self): + + def _init_weights(m): + if isinstance(m, nn.Linear): + nn.init.ones_(m.weight) + nn.init.ones_(m.bias) + + self.apply(_init_weights) + + AdapterModule.init_weights = init_weights + + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + model1 = copy.deepcopy(model) + model2 = copy.deepcopy(model) + model1 = Swift.prepare_model( + model1, + config={ + 'lora': LoRAConfig(target_modules=['query', 'key', 'value']) + }) + model2 = Swift.prepare_model( + model2, + config={ + 'adapter': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) + }) + model = Swift.prepare_model( + model, + config={ + 'lora': + LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) + }) + model.deactivate_adapter('adapter') + outputs1 = model(**inputs) + outputs2 = model1(**inputs) + self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + model.activate_adapter('adapter') + model.deactivate_adapter('lora') + outputs1 = model(**inputs) + outputs2 = model2(**inputs) + self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + def test_swift_side_bert(self): model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') From 985780c4d6c468a8d3504dc4695a8eeb6af608ad Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 23:51:47 +0800 Subject: [PATCH 44/70] support thread local --- examples/pytorch/llm/src/utils/swift_utils.py | 20 ++-- swift/tuners/adapter.py | 12 +-- swift/tuners/base.py | 15 +++ swift/tuners/lora.py | 101 ++++++++++++++---- swift/tuners/prompt.py | 14 ++- swift/tuners/restuning.py | 12 +-- swift/tuners/side.py | 12 +-- swift/tuners/utils.py | 19 ++++ tests/tuners/test_swift_base.py | 24 +++++ 9 
files changed, 169 insertions(+), 60 deletions(-) diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index ee286a1b75..3f11634f00 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -1,21 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Dict, Any +from typing import Any, Dict import torch.nn -from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, - Swift, - SwiftConfig, get_logger) -from .model import (MODEL_MAPPING) +from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, Swift, + SwiftConfig, SwiftTuners, get_logger) +from .model import MODEL_MAPPING from .utils import find_all_linear_for_lora -from swift import SwiftTuners logger = get_logger() -def prepare_model(model: torch.nn.Module, - args: Any, - ): +def prepare_model( + model: torch.nn.Module, + args: Any, +): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): @@ -24,8 +23,7 @@ def prepare_model(model: torch.nn.Module, args.lora_target_modules = find_all_linear_for_lora( model, args.quantization_bit, args.model_type) logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}' - ) + f'Setting lora_target_modules: {args.lora_target_modules}') lora_config = LoRAConfig( r=args.lora_rank, diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 1da707b1a0..26b6adabf8 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -11,7 +11,7 @@ from swift import get_logger from swift.utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -151,10 +151,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'adapter_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class AdapterModule(nn.Module): +class AdapterModule(nn.Module, ActivationMixin): """The implementation of adapter tuning method. Adapters project input tokens by an MLP layer. 
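
The `AdapterModule(nn.Module, ActivationMixin)` declaration above (and the matching changes to `PromptModule`, `SideModule` and `ResTuningBypassModule` later in this patch) relies on Python's method resolution order to initialize both bases: `super(AdapterModule, self).__init__()` runs `nn.Module.__init__`, while the extra `super(nn.Module, self).__init__()` starts the lookup after `nn.Module` and therefore reaches `ActivationMixin.__init__`. A minimal, self-contained sketch of the pattern; the class and attribute names below are illustrative rather than copied from the patch:

```python
import torch.nn as nn


class ActivationMixin:
    """Tracks whether a tuner module is currently switched on."""

    def __init__(self):
        self._activated = True

    def set_activation(self, activate: bool = True):
        self._activated = activate

    def is_activated(self) -> bool:
        return self._activated


class DemoAdapter(nn.Module, ActivationMixin):
    # MRO: DemoAdapter -> nn.Module -> ActivationMixin -> object
    def __init__(self, dim: int):
        super(DemoAdapter, self).__init__()  # initializes nn.Module
        super(nn.Module, self).__init__()    # continues past nn.Module to ActivationMixin
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        # A deactivated adapter behaves as an identity mapping.
        if not self.is_activated():
            return x
        return x + self.proj(x)
```

This mirrors the `return x` fix from patch 43: turning an adapter off must leave the host module's hidden states untouched rather than zeroing them.
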
@@ -173,6 +174,7 @@ def __init__( act_layer=nn.GELU, ): super(AdapterModule, self).__init__() + super(nn.Module, self).__init__() self.dim = dim self.adapter_length = adapter_length # self.adapter_type = adapter_type @@ -181,7 +183,6 @@ def __init__( self.linear2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False - self._activate = True def init_weights(self): @@ -192,11 +193,8 @@ def _init_weights(m): self.apply(_init_weights) - def set_activation(self, activate=True): - self._activate = activate - def forward(self, x, identity=None): - if not self._activate: + if not self.is_activated(): return x if not self._prepared: self.linear1.to(x.device) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index dd3f984dc0..fbae7f93bd 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -385,8 +385,21 @@ def save_pretrained(self, def base_model(self): return self.model + def set_active_adapters(self, adapter_names: List[str]): + if not adapter_names: + return + + adapter_names = set(adapter_names) + for adapter_name in (adapter_names & set(self.adapters.keys())): + self.activate_adapter(adapter_name) + + for adapter_name in (set(self.adapters.keys()) - adapter_names): + self.deactivate_adapter(adapter_name) + def activate_adapter(self, adapter_name): if adapter_name not in self.adapters: + logger.warning( + f'{adapter_name} not in adapters: {self.adapters.keys()}') return from .mapping import SWIFT_MAPPING @@ -395,6 +408,8 @@ def activate_adapter(self, adapter_name): def deactivate_adapter(self, adapter_name): if adapter_name not in self.adapters: + logger.warning( + f'{adapter_name} not in adapters: {self.adapters.keys()}') return from .mapping import SWIFT_MAPPING diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index f4600d331c..af78a831d9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -15,19 +15,87 @@ from swift import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb from peft.tuners.lora import Linear8bitLt + class Linear8bitLtSwift(ActivationMixin, Linear8bitLt): + + def __init__( + self, + adapter_name, + in_features, + out_features, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, in_features, out_features, r, + lora_alpha, lora_dropout, **kwargs) + super(Linear8bitLtSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return bnb.nn.Linear8bitLt.forward(self, x) + return super().forward(x) + + if is_bnb_4bit_available(): from peft.tuners.lora import Linear4bit + class Linear4bitSwift(ActivationMixin, Linear4bit): + + def __init__( + self, + adapter_name, + in_features, + out_features, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, in_features, out_features, r, + lora_alpha, lora_dropout, **kwargs) + super(Linear4bitSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return bnb.nn.Linear4bit.forward(self, x) + return super().forward(x) + + if is_auto_gptq_available(): from peft.tuners.lora import QuantLinear + class QuantLinearSwift(ActivationMixin, QuantLinear): + + def __init__( + self, + adapter_name, + quant_linear_module, + r: int = 0, + lora_alpha: int = 1, + 
lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, quant_linear_module, r, + lora_alpha, lora_dropout, **kwargs) + super(QuantLinearSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return self.quant_linear_module(x) + return super().forward(x) + + logger = get_logger() @@ -120,10 +188,8 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: - if isinstance(_module, LoRALayer): - _module.set_activation(activate) - else: - _module.active_adapter = 'default' if activate else 'invalid' + _module: ActivationMixin + _module.set_activation(activate) @staticmethod def _dynamic_patch_lora(model, replace_modules, use_merged_linear, @@ -174,7 +240,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, 'index': sub_module.index, }) - lora_module = Linear8bitLt( + lora_module = Linear8bitLtSwift( 'default', sub_module.in_features, sub_module.out_features, @@ -193,7 +259,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, 'quant_type': sub_module.weight.quant_type, }) - lora_module = Linear4bit( + lora_module = Linear4bitSwift( 'default', sub_module.in_features, sub_module.out_features, @@ -202,7 +268,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, **four_bit_kwargs) elif AutoGPTQQuantLinear is not None and isinstance( sub_module, AutoGPTQQuantLinear): - lora_module = QuantLinear('default', sub_module, **kwargs) + lora_module = QuantLinearSwift('default', sub_module, + **kwargs) sub_module.weight = sub_module.qweight elif isinstance(sub_module, torch.nn.Linear): if use_merged_linear: @@ -330,7 +397,7 @@ def unpatch_lora(model, config: LoRAConfig): model.lora_module_map.pop(module_key, None) -class LoRALayer: +class LoRALayer(ActivationMixin): def __init__( self, @@ -339,8 +406,8 @@ def __init__( lora_dropout: float, merge_weights: bool, ): + super().__init__() self.r = r - self.old_r = r self.lora_alpha = lora_alpha # Optional dropout if lora_dropout > 0.: @@ -351,12 +418,6 @@ def __init__( self.merged = False self.merge_weights = merge_weights - def set_activation(self, activate=True): - if activate: - self.r = self.old_r - else: - self.r = 0 - class Embedding(nn.Embedding, LoRALayer): # LoRA implemented in a dense layer @@ -420,7 +481,7 @@ def eval(self): self.merged = True def forward(self, x: torch.Tensor): - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): result = nn.Embedding.forward(self, x) if self.r > 0: after_A = F.embedding(x, self.lora_A.T, self.padding_idx, @@ -511,7 +572,7 @@ def forward(self, x: torch.Tensor): def T(w): return w.T if self.fan_in_fan_out else w - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): result = F.linear(x, T(self.weight), bias=self.bias) if self.r > 0: x_dtype = x.dtype @@ -631,7 +692,7 @@ def forward(self, x: torch.Tensor): def T(w): return w.T if self.fan_in_fan_out else w - if self.merged: + if self.merged or not self.is_activated(): return F.linear(x, T(self.weight), bias=self.bias) else: result = F.linear(x, T(self.weight), bias=self.bias) @@ -713,7 +774,7 @@ def eval(self): self.merged = True def forward(self, x: torch.Tensor): - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): return F.conv2d( x, self.weight + # noqa diff 
--git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a9d223aa20..661eb4dbbe 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -10,7 +10,7 @@ from swift import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -176,10 +176,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'prompt_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class PromptModule(nn.Module): +class PromptModule(nn.Module, ActivationMixin): """The implementation of vision prompt tuning method. Visual prompt tuning (VPT) is proposed to initialize tunable prompt tokens @@ -200,17 +201,17 @@ def __init__(self, mask_values=0., attach_front=True): super(PromptModule, self).__init__() + super(nn.Module, self).__init__() self.dim = dim self.layer_num = layer_num self.prompt_length = prompt_length self.mask_values = mask_values self.attach_front = attach_front - self._activate = True self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): - if not self._activate: + if not self.is_activated(): return x prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) @@ -229,11 +230,8 @@ def forward(self, x): dim=1) return x - def set_activation(self, activate=True): - self._activate = activate - def patch_attention_mask(self, m): - if not self._activate: + if not self.is_activated(): return m prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 7858561fa6..e40290e06d 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -12,7 +12,7 @@ from ..utils.torch_utils import find_sub_module from .restuning_components import (ResTuner, detach_tensors, probe_input_pre_hook, probe_output_hook) -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -297,10 +297,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'restuning_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class ResTuningBypassModule(nn.Module): +class ResTuningBypassModule(nn.Module, ActivationMixin): """The implementation of ResTuningBypass method. 
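
Across the LoRA, prompt, res-tuning and side modules, the forward paths are now gated on `is_activated()`: a deactivated tuner falls back to the plain frozen computation instead of contributing its delta. A simplified sketch of that gate for a LoRA-style linear layer; the parameter names and the `lora_alpha / r` scaling follow the usual LoRA formulation and are not taken verbatim from the patch:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedLoRALinear(nn.Linear):
    """nn.Linear with a low-rank update that can be switched off per call."""

    def __init__(self, in_features, out_features, r=4, lora_alpha=16, **kwargs):
        super().__init__(in_features, out_features, **kwargs)
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        nn.init.kaiming_uniform_(self.lora_A, a=5 ** 0.5)
        self.scaling = lora_alpha / r
        self._activated = True

    def set_activation(self, activate=True):
        self._activated = activate

    def forward(self, x):
        result = F.linear(x, self.weight, self.bias)
        if not self._activated:
            return result  # behave exactly like the frozen base layer
        return result + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling


layer = GatedLoRALinear(8, 8)
x = torch.randn(2, 8)
layer.set_activation(False)
assert torch.allclose(layer(x), F.linear(x, layer.weight, layer.bias))
```

The bitsandbytes and auto-gptq wrappers introduced in this commit apply the same idea, delegating to the quantized base layer's own forward when the adapter is deactivated.
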
""" @@ -314,8 +315,8 @@ def __init__( tuner_cfg=None, ): super(ResTuningBypassModule, self).__init__() + super(nn.Module, self).__init__() - self._activate = True self.bypass_blocks = nn.Sequential(*[ ResTunerBypassBlock( dim=dims[i] if isinstance(dims, list) else dims, @@ -330,11 +331,8 @@ def __init__( for i in range(depth) ]) - def set_activation(self, activate=True): - self._activate = activate - def forward(self, x_list, origin_arg, **kwargs): - if not self._activate: + if not self.is_activated(): return origin_arg x_bypass = detach_tensors(x_list.pop(0)) x_bypass = x_bypass[0] if isinstance(x_bypass, diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 0bf2b548ad..5e766b72b0 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -14,7 +14,7 @@ from swift.utils.logger import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -135,10 +135,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'side_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class SideModule(nn.Module): +class SideModule(nn.Module, ActivationMixin): """The implementation of vision side-tuning method. Side-Tuning only needs to train one side network and @@ -153,6 +154,7 @@ class SideModule(nn.Module): def __init__(self, dim, side_module_name='fcn4'): super(SideModule, self).__init__() + super(nn.Module, self).__init__() side_module_name = side_module_name.lower() if side_module_name == 'fcn4': @@ -170,13 +172,9 @@ def __init__(self, dim, side_module_name='fcn4'): raise ValueError( f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) - self._activate = True - - def set_activation(self, activate=True): - self._activate = activate def forward(self, x, x_main): - if not self._activate: + if not self.is_activated(): return x_main alpha_squashed = torch.sigmoid(self.alpha) x_side = self.side_net(x) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 0e0c4bed4f..8dee34b0b0 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -2,8 +2,10 @@ # Copyright 2023-present the HuggingFace Inc. team. 
import os +import threading from dataclasses import asdict, dataclass, field from types import FunctionType +from typing import Dict import json from peft.utils import CONFIG_NAME @@ -125,3 +127,20 @@ class SwiftOutput: config: SwiftConfig = None state_dict_callback: FunctionType = None mark_trainable_callback: FunctionType = None + + +class ActivationMixin: + + USE_UNIQUE_THREAD = 'USE_UNIQUE_THREAD' + + def __init__(self): + self._thread_inf: Dict[int, bool] = {} + self._unique_thread = os.environ.get(ActivationMixin.USE_UNIQUE_THREAD) + + def set_activation(self, activate=True): + tid = 0 if self._unique_thread else threading.get_ident() + self._thread_inf[tid] = activate + + def is_activated(self): + tid = 0 if self._unique_thread else threading.get_ident() + return self._thread_inf.get(tid, True) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 3f3c19ccc6..ba2f0f100e 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -277,6 +277,30 @@ def _init_weights(m): outputs2 = model2(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + def thread_func1(): + model.set_active_adapters(['lora']) + outputs_single = model1(**inputs) + outputs_t1 = model(**inputs) + self.assertTrue( + torch.allclose(outputs_single.logits, outputs_t1.logits)) + + def thread_func2(): + model.set_active_adapters(['adapter']) + outputs_single = model2(**inputs) + outputs_t2 = model(**inputs) + self.assertTrue( + torch.allclose(outputs_single.logits, outputs_t2.logits)) + + with ThreadPoolExecutor(2) as executor: + f1 = executor.submit(thread_func1) + f2 = executor.submit(thread_func2) + e1 = f1.exception() + e2 = f2.exception() + if e1 is not None: + raise e1 + if e2 is not None: + raise e2 + def test_swift_side_bert(self): model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') From 9a2777c01a32daf410fa0517fb979850005a736e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 11:18:05 +0800 Subject: [PATCH 45/70] fix CI --- tests/tuners/test_swift_restuning.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py index 43522fbbe3..016c8d7361 100644 --- a/tests/tuners/test_swift_restuning.py +++ b/tests/tuners/test_swift_restuning.py @@ -6,7 +6,7 @@ import torch -from swift import ResTuningConfig, Swift, SwiftModel +from swift import ResTuningConfig, Swift, SwiftModel, snapshot_download class TestSwiftResTuning(unittest.TestCase): @@ -45,9 +45,9 @@ def model_comparison(self, model, model2): self.assertTrue(torch.isclose(model_val, model2_val)) def test_swift_restuning_vit(self): + model_dir = snapshot_download('AI-ModelScope/vit-base-patch16-224') from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained( - 'google/vit-base-patch16-224') + model = AutoModelForImageClassification.from_pretrained(model_dir) model_swift_1 = copy.deepcopy(model) model_swift_2 = copy.deepcopy(model) result_origin = model(torch.ones((1, 3, 224, 224))).logits @@ -103,9 +103,10 @@ def test_swift_restuning_vit(self): self.model_comparison(model_swift_1, model_loaded) def test_swift_restuning_diffusers_sd(self): + model_dir = snapshot_download('AI-ModelScope/stable-diffusion-v1-5') from diffusers import UNet2DConditionModel model = UNet2DConditionModel.from_pretrained( - 'runwayml/stable-diffusion-v1-5', subfolder='unet') + model_dir, subfolder='unet') 
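
The `ActivationMixin` added to `swift/tuners/utils.py` keys its activation flags by `threading.get_ident()`, so two threads can drive one wrapped model with different adapters active at the same time; the `ThreadPoolExecutor` test above exercises exactly that. Setting the `USE_UNIQUE_THREAD` environment variable collapses every thread onto a single key. A standalone sketch of the mechanism, independent of the SWIFT classes:

```python
import os
import threading
from concurrent.futures import ThreadPoolExecutor


class ThreadLocalSwitch:
    USE_UNIQUE_THREAD = 'USE_UNIQUE_THREAD'

    def __init__(self):
        self._state = {}  # thread id -> activated flag
        self._unique_thread = bool(
            int(os.environ.get(self.USE_UNIQUE_THREAD, '0')))

    def _key(self):
        return 0 if self._unique_thread else threading.get_ident()

    def set_activation(self, activate=True):
        self._state[self._key()] = activate

    def is_activated(self):
        return self._state.get(self._key(), True)  # active by default


switch = ThreadLocalSwitch()


def worker(flag):
    switch.set_activation(flag)
    return switch.is_activated()  # each thread reads only its own flag


with ThreadPoolExecutor(2) as pool:
    print(list(pool.map(worker, [True, False])))  # -> [True, False]
```
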
model.requires_grad_(False) model2 = copy.deepcopy(model) self.set_random_seed() From b5f46d256860f10a6d4bd9f711aaf5e058624d23 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 22:24:22 +0800 Subject: [PATCH 46/70] fix bug --- swift/tuners/base.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index fbae7f93bd..a4ee75ba8a 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -198,7 +198,7 @@ def load_state_file(path): def from_pretrained(cls, model: nn.Module, model_id: str = None, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, inference_mode: bool = False, revision: str = None, **kwargs): @@ -230,6 +230,12 @@ def from_pretrained(cls, ) if not os.path.exists(model_id): model_dir = snapshot_download(model_id, revision=revision) + if adapter_name is None: + adapter_name = [ + sub_dir for sub_dir in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, sub_dir)) and + os.path.isfile(os.path.join(model_dir, sub_dir, CONFIG_NAME)) + ] for _name in adapter_name if isinstance(adapter_name, list) else [adapter_name]: sub_folder = os.path.join(model_dir, _name) @@ -466,7 +472,7 @@ def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, @staticmethod def from_pretrained(model: nn.Module, model_id: str = None, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, revision: str = None, **kwargs): """Prepare a model by a model_id in the ModelScope hub or a local dir. @@ -489,8 +495,9 @@ def from_pretrained(model: nn.Module, _json = json.load(f) is_peft_model = PEFT_TYPE_KEY in _json - _name = adapter_name if isinstance(adapter_name, - str) else adapter_name[0] + _name = adapter_name if isinstance( + adapter_name, str) or adapter_name is None else adapter_name[0] + _name = _name or '' if os.path.exists(os.path.join(model_id, _name, CONFIG_NAME)): with open(os.path.join(model_id, _name, CONFIG_NAME), 'r') as f: _json = json.load(f) @@ -500,7 +507,7 @@ def from_pretrained(model: nn.Module, model, model_id, revision=revision, - adapter_name=adapter_name, + adapter_name=adapter_name or 'default', **kwargs) else: return SwiftModel.from_pretrained( From 3da517b7144fa57ed777dbac7ddfbc8ab31cf330 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 22:34:45 +0800 Subject: [PATCH 47/70] fix bug --- swift/tuners/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 8dee34b0b0..b43c23b6fe 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -135,7 +135,7 @@ class ActivationMixin: def __init__(self): self._thread_inf: Dict[int, bool] = {} - self._unique_thread = os.environ.get(ActivationMixin.USE_UNIQUE_THREAD) + self._unique_thread = int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0')) def set_activation(self, activate=True): tid = 0 if self._unique_thread else threading.get_ident() From a9426f38c4c2d37f06c44c56144f298a263883b2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 13:09:49 +0800 Subject: [PATCH 48/70] support tuner on one module --- swift/tuners/adapter.py | 6 +++--- swift/tuners/base.py | 12 +++++++++--- swift/tuners/lora.py | 3 +++ swift/tuners/prompt.py | 4 ++-- swift/tuners/restuning.py | 13 +++++++------ swift/tuners/side.py | 10 +++++----- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py 
index 26b6adabf8..0a33011dc6 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -93,7 +93,7 @@ def prepare_model(model: nn.Module, config: AdapterConfig, module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): - args = self.forward_origin(*args, **kwargs) + args = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): _type = type(args) @@ -115,9 +115,9 @@ def _feed_forward_chunk(self, attention_output): # TODO The `config.method_name` method should not be replaced twice. - module.forward_origin = getattr(module, config.method_name) + setattr(module, f'forward_origin_{adapter_name}', getattr(module, config.method_name)) num_args_in_forward_chunk_fn = len( - inspect.signature(module.forward_origin).parameters) + inspect.signature(getattr(module, f'forward_origin_{adapter_name}')).parameters) if config.method_name == 'feed_forward_chunk' and num_args_in_forward_chunk_fn == 1: setattr(module, config.method_name, types.MethodType(_feed_forward_chunk, module)) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index a4ee75ba8a..c7c98cc06a 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -28,7 +28,7 @@ class SwiftModel(nn.Module): """The Swift wrapper model. Args: - model (`torch.nn.Module`) A module to be tuned by Swift. + model (`Union[nn.Module, 'SwiftModel']`) A module to be tuned by Swift. config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of adapter_name: SwiftConfig. If it's a config class, the adapter_name will be `default` extra_state_keys (`List[str]`, `optional`) A list of regex to match the extra state keys to be saved. @@ -36,12 +36,19 @@ class SwiftModel(nn.Module): """ def __init__(self, - model: nn.Module, + model: Union[nn.Module, 'SwiftModel'], config: Union[SwiftConfig, Dict[str, SwiftConfig]], extra_state_keys: List[str] = None, inference_mode: bool = False, **kwargs): super().__init__() + self.adapters = {} + if isinstance(model, SwiftModel): + self.adapters = model.adapters + extra_state_keys = extra_state_keys or [] + extra_state_keys.extend(model.extra_state_keys) + model = model.base_model + if (getattr(model, 'hf_device_map', None) is not None) and ( len(set(model.hf_device_map.values()) & {'cpu', 'disk'}) > 0): from accelerate.hooks import remove_hook_from_submodules @@ -50,7 +57,6 @@ def __init__(self, for _, p in model.named_parameters(): p.requires_grad = False - self.adapters = {} if isinstance(config, SwiftConfig): self.adapters[DEFAULT_ADAPTER] = self._prepare_model( model, config, DEFAULT_ADAPTER) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index af78a831d9..9bdef953d7 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -12,6 +12,7 @@ from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config +from peft.tuners.lora import LoraLayer from swift import get_logger from ..utils.torch_utils import find_sub_module @@ -311,6 +312,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) + elif isinstance(sub_module, (LoRALayer, LoraLayer)): + if lora_module is not None: lora_module.weight = sub_module.weight diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 661eb4dbbe..35e0217e4f 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -132,7 +132,7 @@ def 
_forward(self, *args, **kwargs): else: kwargs[config.attention_mask_pos] = attention_mask - forward_output = self.forward_origin(*args, **kwargs) + forward_output = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if config.extract_embedding: forward_output = getattr( self, @@ -140,7 +140,7 @@ def _forward(self, *args, **kwargs): return forward_output - module.forward_origin = module.forward + setattr(module, f'forward_origin_{adapter_name}', module.forward) module.forward = types.MethodType(_forward, module) if isinstance(config.dim, list): input_dim = config.dim[len(match_module_keys)] diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index e40290e06d..cf06e20307 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -142,9 +142,9 @@ def _forward_target(self, *args, **kwargs): args_main = _forward_restuning(self, _arg) args[0 if self.target_hidden_pos is None else self. target_hidden_pos] = args_main - args_main = self.forward_origin(*args, **kwargs) + args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) else: - _args_main = self.forward_origin(*args, **kwargs) + _args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) _arg = _args_main[0 if self.target_hidden_pos is None else self .target_hidden_pos] if isinstance( _args_main, @@ -266,13 +266,14 @@ def _forward_restuning(self, origin_arg): tgt_module.stem_module_ins_list = stem_module_ins_list target_module_ins = tgt_module - if isinstance(tgt_module, nn.Sequential): + if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'origin_module_keys'): tgt_module.origin_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) - tgt_module.forward_origin = types.MethodType( - _forward_seq, tgt_module) + + setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( + _forward_seq, tgt_module)) else: - tgt_module.forward_origin = tgt_module.forward + setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) tgt_module.forward = types.MethodType(_forward_target, tgt_module) if target_module_ins is None: diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 5e766b72b0..d03db26fe5 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -79,7 +79,7 @@ def prepare_model(model: nn.Module, config: SideConfig, ) def _forward(self, *args, **kwargs): - args_main = self.forward_origin(*args, **kwargs) + args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr( @@ -94,7 +94,7 @@ def _forward(self, *args, **kwargs): args_main = _type(args_main) return args_main - if isinstance(tgt_module, nn.Sequential): + if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'tgt_module_keys'): tgt_module.tgt_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) @@ -105,10 +105,10 @@ def forward_seq(self, input, *args, **kwargs): input = module(input) return input - tgt_module.forward_origin = types.MethodType( - forward_seq, tgt_module) + setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( + forward_seq, tgt_module)) else: - tgt_module.forward_origin = tgt_module.forward + setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) From 
d40c6aa74c6b3c5f8b2f898c23b33bb6b939f94c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:36:52 +0800 Subject: [PATCH 49/70] fix lora --- swift/tuners/adapter.py | 11 ++++++--- swift/tuners/lora.py | 50 +++++++++++++++++++------------------- swift/tuners/prompt.py | 7 ++++-- swift/tuners/restuning.py | 17 ++++++++----- swift/tuners/side.py | 14 +++++++---- swift/tuners/utils.py | 3 ++- swift/utils/torch_utils.py | 3 +-- 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 0a33011dc6..3beffcfca8 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -93,7 +93,9 @@ def prepare_model(model: nn.Module, config: AdapterConfig, module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): - args = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args = getattr(self, + f'forward_origin_{adapter_name}')(*args, + **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): _type = type(args) @@ -115,9 +117,12 @@ def _feed_forward_chunk(self, attention_output): # TODO The `config.method_name` method should not be replaced twice. - setattr(module, f'forward_origin_{adapter_name}', getattr(module, config.method_name)) + setattr(module, f'forward_origin_{adapter_name}', + getattr(module, config.method_name)) num_args_in_forward_chunk_fn = len( - inspect.signature(getattr(module, f'forward_origin_{adapter_name}')).parameters) + inspect.signature( + getattr(module, + f'forward_origin_{adapter_name}')).parameters) if config.method_name == 'feed_forward_chunk' and num_args_in_forward_chunk_fn == 1: setattr(module, config.method_name, types.MethodType(_feed_forward_chunk, module)) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 9bdef953d7..26ab294ef3 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -4,6 +4,7 @@ import math import re from dataclasses import dataclass, field +from types import MethodType from typing import Dict, List import torch @@ -11,8 +12,8 @@ import torch.nn.functional as F from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) -from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from peft.tuners.lora import LoraLayer +from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from swift import get_logger from ..utils.torch_utils import find_sub_module @@ -175,8 +176,7 @@ def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict, adapter_name): - return lora_state_dict(state_dict, model.lora_module_map, - adapter_name, config.bias) + return lora_state_dict(state_dict, adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, adapter_name, config.bias) @@ -187,7 +187,8 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module( + module, f'loramodule_{adapter_name}') for _module in modules: _module: ActivationMixin _module.set_activation(activate) @@ -206,8 +207,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - if not hasattr(model, 'lora_module_map'): - model.lora_module_map = {} modules = {} module_keys = [key for key, _ in 
model.named_modules()] assert isinstance(replace_modules, (str, list)) @@ -222,10 +221,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, module_key.endswith(target_key) for target_key in replace_modules) if target_module_found: # noqa - parts = module_key.split('.') - module = model.get_submodule('.'.join(parts[:-1])) sub_module = model.get_submodule(module_key) - _key = parts[-1] lora_module = None if getattr(model, 'is_loaded_in_8bit', False) and isinstance( @@ -312,8 +308,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) - elif isinstance(sub_module, (LoRALayer, LoraLayer)): + def _forward(self, *args, **kwargs): + for _name, _module in sub_module.named_modules(): + if f'loramodule_{adapter_name}' in _name and _module.is_activated( + ): + return _module.forward(*args, **kwargs) + return self.forward_origin(*args, **kwargs) if lora_module is not None: lora_module.weight = sub_module.weight @@ -322,11 +323,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) - lora_module.adapter_name = adapter_name - setattr(module, _key, lora_module) + setattr(sub_module, f'loramodule_{adapter_name}', + lora_module) + if not hasattr(sub_module, 'forward_origin'): + sub_module.forward_origin = sub_module.forward + sub_module.forward = MethodType(_forward, sub_module) modules[module_key] = adapter_name - model.lora_module_map.update(modules) logger.info(f'Lora modules(module_key -> adapter_name): {modules}') @staticmethod @@ -341,8 +344,6 @@ def unpatch_lora(model, config: LoRAConfig): model: The model called with `tune` function. config: The `LoRAConfig` to use. 
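
The reworked `_dynamic_patch_lora` above no longer replaces the target submodule. It attaches the tuner as a `loramodule_{adapter_name}` child and reroutes the module's `forward` through a dispatcher that calls whichever attached LoRA module is currently activated, falling back to the saved `forward_origin` otherwise. A reduced sketch of that dispatch pattern; the helper name and the duck-typed `is_activated` check are illustrative:

```python
from types import MethodType

import torch.nn as nn


def attach_tuner(sub_module: nn.Module, adapter_name: str, tuner: nn.Module):
    """Attach `tuner` to `sub_module` and reroute forward without replacing it."""
    # Registering via setattr makes the tuner a child module, so it is found
    # by named_modules() and included in state_dict()/parameters().
    setattr(sub_module, f'loramodule_{adapter_name}', tuner)

    if not hasattr(sub_module, 'forward_origin'):
        sub_module.forward_origin = sub_module.forward

        def _forward(self, *args, **kwargs):
            # Dispatch to the first activated tuner attached to this module.
            for name, module in self.named_modules():
                if 'loramodule_' in name and getattr(
                        module, 'is_activated', lambda: False)():
                    return module(*args, **kwargs)
            return self.forward_origin(*args, **kwargs)

        sub_module.forward = MethodType(_forward, sub_module)
```

Because the original module and its patched `forward` stay in place, several adapters can be attached to the same layer and toggled independently.
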
""" - if not hasattr(model, 'lora_module_map'): - model.lora_module_map = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(config.replace_modules, (str, list)) replace_modules = config.replace_modules @@ -397,7 +398,6 @@ def unpatch_lora(model, config: LoRAConfig): origin_module.to(sub_module.weight.device).to( sub_module.weight.dtype) setattr(module, _key, origin_module) - model.lora_module_map.pop(module_key, None) class LoRALayer(ActivationMixin): @@ -420,6 +420,8 @@ def __init__( # Mark the weight as unmerged self.merged = False self.merge_weights = merge_weights + if not self._unique_thread: + self.merge_weights = False class Embedding(nn.Embedding, LoRALayer): @@ -801,8 +803,8 @@ def mark_lora_as_trainable(model: nn.Module, if 'bias' in n: p.requires_grad = True elif bias == 'lora_only': - for m in model.modules(): - if adapter_name == getattr(m, 'adapter_name', None) and \ + for n, m in model.named_modules(): + if f'loramodule_{adapter_name}' in n and \ hasattr(m, 'bias') and \ m.bias is not None: m.bias.requires_grad = True @@ -811,27 +813,25 @@ def mark_lora_as_trainable(model: nn.Module, def lora_state_dict(state_dict, - module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': return { k: state_dict[k] for k in state_dict - if 'lora_' in k and module_map.get(k[:k.find('lora_') - - 1], None) == adapter_name + if f'loramodule_{adapter_name}' in k and 'lora_' in k } elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get( - k[:k.find('lora_') - 1], None) == adapter_name) or 'bias' in k + for k in state_dict + if ('lora_' in k and f'loramodule_{adapter_name}' in k) or ( + 'bias' in k and f'loramodule_{adapter_name}' not in k) } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k[:k.find('lora_') - 1], - None) == adapter_name: + if f'loramodule_{adapter_name}' in k and 'lora_' in k: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 35e0217e4f..f306ea3d79 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -132,7 +132,9 @@ def _forward(self, *args, **kwargs): else: kwargs[config.attention_mask_pos] = attention_mask - forward_output = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + forward_output = getattr( + self, f'forward_origin_{adapter_name}')(*args, + **kwargs) if config.extract_embedding: forward_output = getattr( self, @@ -140,7 +142,8 @@ def _forward(self, *args, **kwargs): return forward_output - setattr(module, f'forward_origin_{adapter_name}', module.forward) + setattr(module, f'forward_origin_{adapter_name}', + module.forward) module.forward = types.MethodType(_forward, module) if isinstance(config.dim, list): input_dim = config.dim[len(match_module_keys)] diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index cf06e20307..4a77887ac9 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -142,9 +142,12 @@ def _forward_target(self, *args, **kwargs): args_main = _forward_restuning(self, _arg) args[0 if self.target_hidden_pos is None else self. 
target_hidden_pos] = args_main - args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args_main = getattr(self, + f'forward_origin_{adapter_name}')(*args, + **kwargs) else: - _args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + _args_main = getattr(self, f'forward_origin_{adapter_name}')( + *args, **kwargs) _arg = _args_main[0 if self.target_hidden_pos is None else self .target_hidden_pos] if isinstance( _args_main, @@ -266,14 +269,16 @@ def _forward_restuning(self, origin_arg): tgt_module.stem_module_ins_list = stem_module_ins_list target_module_ins = tgt_module - if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'origin_module_keys'): + if isinstance(tgt_module, nn.Sequential) and not hasattr( + tgt_module, 'origin_module_keys'): tgt_module.origin_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) - setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( - _forward_seq, tgt_module)) + setattr(tgt_module, f'forward_origin_{adapter_name}', + types.MethodType(_forward_seq, tgt_module)) else: - setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) + setattr(tgt_module, f'forward_origin_{adapter_name}', + tgt_module.forward) tgt_module.forward = types.MethodType(_forward_target, tgt_module) if target_module_ins is None: diff --git a/swift/tuners/side.py b/swift/tuners/side.py index d03db26fe5..3c40baede9 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -79,7 +79,9 @@ def prepare_model(model: nn.Module, config: SideConfig, ) def _forward(self, *args, **kwargs): - args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args_main = getattr( + self, f'forward_origin_{adapter_name}')(*args, + **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr( @@ -94,7 +96,8 @@ def _forward(self, *args, **kwargs): args_main = _type(args_main) return args_main - if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'tgt_module_keys'): + if isinstance(tgt_module, nn.Sequential) and not hasattr( + tgt_module, 'tgt_module_keys'): tgt_module.tgt_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) @@ -105,10 +108,11 @@ def forward_seq(self, input, *args, **kwargs): input = module(input) return input - setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( - forward_seq, tgt_module)) + setattr(tgt_module, f'forward_origin_{adapter_name}', + types.MethodType(forward_seq, tgt_module)) else: - setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) + setattr(tgt_module, f'forward_origin_{adapter_name}', + tgt_module.forward) tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index b43c23b6fe..7289773532 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -135,7 +135,8 @@ class ActivationMixin: def __init__(self): self._thread_inf: Dict[int, bool] = {} - self._unique_thread = int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0')) + self._unique_thread = bool( + int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0'))) def set_activation(self, activate=True): tid = 0 if self._unique_thread else threading.get_ident() diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 7a177ce903..867c8d4513 100644 --- 
a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -103,8 +103,7 @@ def find_sub_module(module: torch.nn.Module, for name, sub_module in module.named_modules(): if not name: continue - if module_name == name or getattr(sub_module, 'adapter_name', - None) == module_name: + if module_name == name: _modules.append(sub_module) else: _modules.extend(find_sub_module(sub_module, module_name)) From e7fa13eebb826b05324bbeae00310938ccb3ed8f Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:40:48 +0800 Subject: [PATCH 50/70] fixbug --- swift/tuners/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index c7c98cc06a..01ceebcb46 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -159,7 +159,7 @@ def state_dict(self, for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: state_dicts.update( - output.state_dict_callback(destination, adapter_name)) + output.state_dict_callback(destination, name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v From f317637471fc9e48176c12f74a59956f91ed7a28 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:52:58 +0800 Subject: [PATCH 51/70] update unittest --- tests/tuners/test_swift_base.py | 46 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index ba2f0f100e..dd7496f138 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -243,12 +243,19 @@ def _init_weights(m): model1 = Swift.prepare_model( model1, config={ - 'lora': LoRAConfig(target_modules=['query', 'key', 'value']) + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter1': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) }) model2 = Swift.prepare_model( model2, config={ - 'adapter': + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter2': AdapterConfig( dim=model.config.hidden_size, target_modules=r'.*layer\.\d+$', @@ -258,34 +265,43 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora': - LoRAConfig(target_modules=['query', 'key', 'value']), - 'adapter': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0) + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter1': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), + 'adapter2': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), }) - model.deactivate_adapter('adapter') + model.deactivate_adapter('adapter2') + model.deactivate_adapter('lora2') outputs1 = model(**inputs) outputs2 = model1(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) - model.activate_adapter('adapter') - model.deactivate_adapter('lora') + model.activate_adapter('adapter2') + model.activate_adapter('lora2') + model.deactivate_adapter('adapter1') + model.deactivate_adapter('lora1') outputs1 = model(**inputs) outputs2 = model2(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) def thread_func1(): - model.set_active_adapters(['lora']) + 
model.set_active_adapters(['lora1', 'adapter1']) outputs_single = model1(**inputs) outputs_t1 = model(**inputs) self.assertTrue( torch.allclose(outputs_single.logits, outputs_t1.logits)) def thread_func2(): - model.set_active_adapters(['adapter']) + model.set_active_adapters(['lora2', 'adapter2']) outputs_single = model2(**inputs) outputs_t2 = model(**inputs) self.assertTrue( From cf737ec818c9f1325a5916d3c81efe12487f5658 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 15:36:52 +0800 Subject: [PATCH 52/70] fix bug --- swift/tuners/base.py | 4 ++-- swift/tuners/lora.py | 3 +-- tests/tuners/test_swift_base.py | 32 ++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 01ceebcb46..8ce17d76ec 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -62,9 +62,9 @@ def __init__(self, model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) - for adapter_name, config in config.items(): + for adapter_name, _config in config.items(): self.adapters[adapter_name] = self._prepare_model( - model, config, adapter_name) + model, _config, adapter_name) self.model = model self.extra_state_keys = extra_state_keys or [] diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 26ab294ef3..3819d1940a 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -311,8 +311,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, def _forward(self, *args, **kwargs): for _name, _module in sub_module.named_modules(): - if f'loramodule_{adapter_name}' in _name and _module.is_activated( - ): + if 'loramodule_' in _name and _module.is_activated(): return _module.forward(*args, **kwargs) return self.forward_origin(*args, **kwargs) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index dd7496f138..53e15cfa66 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -243,7 +243,8 @@ def _init_weights(m): model1 = Swift.prepare_model( model1, config={ - 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter1': AdapterConfig( dim=model.config.hidden_size, @@ -254,7 +255,8 @@ def _init_weights(m): model2 = Swift.prepare_model( model2, config={ - 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter2': AdapterConfig( dim=model.config.hidden_size, @@ -265,20 +267,22 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), - 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': + LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter1': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0), + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), 'adapter2': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0), + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), }) model.deactivate_adapter('adapter2') 
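
The updated tests exercise the multi-adapter workflow end to end: several tuners are installed on one model and toggled per adapter name, either individually or through `set_active_adapters`, which activates the listed adapters and deactivates all others. A condensed usage sketch, assuming the same ModelScope backbone as in the tests (the `Model` import path is an assumption):

```python
from modelscope import Model

from swift import AdapterConfig, LoRAConfig, Swift

model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
model = Swift.prepare_model(
    model,
    config={
        'lora1': LoRAConfig(target_modules=['query', 'key', 'value']),
        'adapter1': AdapterConfig(
            dim=model.config.hidden_size,
            target_modules=r'.*layer\.\d+$',
            method_name='feed_forward_chunk',
            hidden_pos=0),
    })

model.deactivate_adapter('adapter1')  # run with LoRA only
model.activate_adapter('adapter1')    # bring the adapter back
model.set_active_adapters(['lora1'])  # activate these, deactivate the rest
```
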
model.deactivate_adapter('lora2') From e64e30256c697a5ba2646115571fd46eccdea137 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 15:39:48 +0800 Subject: [PATCH 53/70] update unittest --- tests/tuners/test_swift_base.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 53e15cfa66..b2a3a7e3a7 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -267,10 +267,13 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora1': - LoRAConfig(target_modules=['query', 'key', 'value']), - 'lora2': - LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + }) + + model = Swift.prepare_model( + model, + config={ 'adapter1': AdapterConfig( dim=model.config.hidden_size, @@ -284,6 +287,7 @@ def _init_weights(m): method_name='feed_forward_chunk', hidden_pos=0), }) + model.deactivate_adapter('adapter2') model.deactivate_adapter('lora2') outputs1 = model(**inputs) From 479c8661ec0edc760ebbc19bb1d02a349499a6f2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:00:03 +0800 Subject: [PATCH 54/70] fix type claim --- swift/tuners/base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8ce17d76ec..8eaa43aec7 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -452,13 +452,13 @@ class Swift: """The Wrapper to use both Peft and Swift tuners.""" @staticmethod - def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, - Dict[str, SwiftConfig]], - **kwargs): + def prepare_model(model: Union[nn.Module, 'SwiftModel'], + config: Union[SwiftConfig, PeftConfig, + Dict[str, SwiftConfig]], **kwargs): """Prepare a model by the input config. Args: - model(`nn.Module`): The model to be tuned. + model(`Union[nn.Module, 'SwiftModel']`): The model to be tuned. config(`Union[SwiftConfig, PeftConfig, Dict[str, SwiftConfig]]`): The config or config dict, can be either SwiftConfigs or PeftConfigs **kwargs: @@ -476,7 +476,7 @@ def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, raise ValueError(f'Unsupported swift config type: {config.__class__}') @staticmethod - def from_pretrained(model: nn.Module, + def from_pretrained(model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, revision: str = None, @@ -484,7 +484,7 @@ def from_pretrained(model: nn.Module, """Prepare a model by a model_id in the ModelScope hub or a local dir. Args: - model(`nn.Module`): The model to be tuned. + model(`Union[nn.Module, 'SwiftModel']`): The model to be tuned. model_id(`str`): The model id of the modelhub or a local dir containing the configs/weights. adapter_name(`str`, `optional`): The adapter_name to use. revision(`str`, `optional`): The model revision if the model_id is a model id of the modelhub. 
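
The two-stage `Swift.prepare_model` calls in the test above work because `SwiftModel.__init__` now carries over the adapter registry when it receives an already-wrapped `SwiftModel`, and patch 54 updates the type annotations to reflect that. A hedged sketch of that staged preparation; `backbone` stands in for any model exposing `config.hidden_size`:

```python
from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel

# Stage 1: install two LoRA adapters on a plain nn.Module.
model = Swift.prepare_model(
    backbone,
    config={
        'lora1': LoRAConfig(target_modules=['query', 'key', 'value']),
        'lora2': LoRAConfig(target_modules=['query', 'key', 'value']),
    })

# Stage 2: wrap the resulting SwiftModel again; the adapter registry is kept.
model = Swift.prepare_model(
    model,
    config={
        'adapter1': AdapterConfig(
            dim=backbone.config.hidden_size,
            target_modules=r'.*layer\.\d+$',
            method_name='feed_forward_chunk',
            hidden_pos=0),
    })

assert isinstance(model, SwiftModel)
assert set(model.adapters) == {'lora1', 'lora2', 'adapter1'}
```
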
From 9cf191735fcd2b60cdae66c8d3c4c360581fbbbd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:10:16 +0800 Subject: [PATCH 55/70] add test --- tests/tuners/test_swift_base.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index b2a3a7e3a7..c81fc17b34 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -142,11 +142,16 @@ def test_swift_restuner_forward(self): torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): - model = SbertForSequenceClassification(SbertConfig()) + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + input = preprocessor('this is a test') model2 = copy.deepcopy(model) lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, config=lora_config) self.assertTrue(isinstance(model, SwiftModel)) + output1 = model(**input) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) self.assertTrue( @@ -154,7 +159,8 @@ def test_swift_lora_injection(self): os.path.join(self.tmp_dir, 'default', WEIGHTS_NAME))) model2 = Swift.from_pretrained(model2, self.tmp_dir) - + output2 = model2(**input) + self.assertTrue(torch.allclose(output1.logits, output2.logits)) state_dict = model.state_dict() state_dict2 = model2.state_dict() for key in state_dict: From ddc815c2a59d8b6fa1f9738e0fd4806d72f8ff0f Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:17:46 +0800 Subject: [PATCH 56/70] add test --- tests/tuners/test_swift_base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index c81fc17b34..83dd5fa44a 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -142,6 +142,18 @@ def test_swift_restuner_forward(self): torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): + + from swift.tuners.lora import Linear + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.ones_(self.lora_A) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') preprocessor = Preprocessor.from_pretrained( From 0868f61378ac80b26e1b27c591f07472f02230f5 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 21:33:54 +0800 Subject: [PATCH 57/70] add docs --- README.md | 2 +- README_CN.md | 2 +- docs/Get Started/1.Introduction.md | 103 ++++++++++++++++ docs/Get Started/2.Installation.md | 25 ++++ docs/Get Started/3.Use in train and infer.md | 123 +++++++++++++++++++ docs/Get Started/4.examples.md | 4 + docs/Modules/1.Interface.md | 70 +++++++++++ docs/Modules/2.lora.md | 17 +++ docs/Modules/3.Restuning.md | 21 ++++ docs/Modules/4.adapter.md | 15 +++ docs/Modules/5.side.md | 13 ++ docs/Modules/6.prompt.md | 17 +++ swift/tuners/lora.py | 1 - swift/tuners/restuning.py | 26 ++-- 14 files changed, 424 insertions(+), 15 deletions(-) create mode 100644 docs/Get Started/1.Introduction.md create mode 100644 docs/Get Started/2.Installation.md create mode 100644 docs/Get Started/3.Use in train and 
infer.md create mode 100644 docs/Get Started/4.examples.md create mode 100644 docs/Modules/1.Interface.md create mode 100644 docs/Modules/2.lora.md create mode 100644 docs/Modules/3.Restuning.md create mode 100644 docs/Modules/4.adapter.md create mode 100644 docs/Modules/5.side.md create mode 100644 docs/Modules/6.prompt.md diff --git a/README.md b/README.md index bdc5d17eb6..722d0b9a17 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Key features: 1. By integrating the ModelScope library, models can be readily obatined via a model-id. 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result. -3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with different tuners in different threads in a time-sharing manner. +3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with multiple tuners in different threads. ## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 1ebe678276..4b6d7f4379 100644 --- a/README_CN.md +++ b/README_CN.md @@ -27,7 +27,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 关键特点: 1. 通过集成ModelScope库,可以通过model id轻松获取模型。 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 -3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以用一个模型在不同线程中分时使用不同的tuners。 +3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以在推理时用一个模型在不同线程中使用多种tuners而互不干扰。 ## 大模型微调的例子 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/docs/Get Started/1.Introduction.md b/docs/Get Started/1.Introduction.md new file mode 100644 index 0000000000..36c4c32409 --- /dev/null +++ b/docs/Get Started/1.Introduction.md @@ -0,0 +1,103 @@ +# 介绍 + +Swift是一个提供LLM模型轻量级训练和推理的开源框架。Swift提供的主要能力是`efficient tuners`,tuners是运行时动态加载到模型上的额外结构,在训练时将原模型的参数冻结,只训练tuner部分,这样可以达到快速训练、降低显存使用的目的。比如,最常用的tuner是LoRA。 + +总之,在这个框架中提供了以下特性: + +- **具备SOTA特性的Efficient Tuners**:用于结合大模型实现轻量级(在商业级显卡上)训练和推理,并取得较好效果 +- **使用ModelScope Hub的Trainer**:基于`transformers trainer`提供,支持LLM模型的训练,并支持将训练后的模型上传到[ModelScope Hub](https://www.modelscope.cn/models)中 +- **可运行的模型Examples**:针对热门大模型提供的训练脚本和推理脚本,并针对热门开源数据集提供了预处理逻辑,可直接运行使用 + +# 快速开始 + +在本章节会介绍如何快速安装swift并设定好运行环境,并跑通一个用例。 + +安装swift的方式非常简单,用户只需要在python>=3.8环境中运行: + +```shell +pip install ms-swift +``` + +下面的代码使用LoRA在分类任务上训练了`bert-base-uncased`模型: + +**运行下面的代码前请额外安装modelscope: ** + +```shell +pip install modelscope>=1.9.0 +``` + +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from modelscope import AutoModelForSequenceClassification, AutoTokenizer, MsDataset +from transformers import default_data_collator + +from swift import Trainer, LoRAConfig, Swift, TrainingArguments + + +model = AutoModelForSequenceClassification.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +tokenizer = AutoTokenizer.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) +model = Swift.prepare_model(model, config=lora_config) + +train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train').to_hf_dataset().select(range(100)) +val_dataset = MsDataset.load('clue', subset_name='afqmc', split='validation').to_hf_dataset().select(range(100)) + + +def tokenize_function(examples): + return tokenizer(examples["sentence1"], 
examples["sentence2"], + padding="max_length", truncation=True, max_length=128) + + +train_dataset = train_dataset.map(tokenize_function) +val_dataset = val_dataset.map(tokenize_function) + +arguments = TrainingArguments( + output_dir='./outputs', + per_device_train_batch_size=16, +) + +trainer = Trainer(model, arguments, train_dataset=train_dataset, + eval_dataset=val_dataset, + data_collator=default_data_collator,) + +trainer.train() +``` + +在上面的例子中,我们使用了`bert-base-uncased`作为基模型,将LoRA模块patch到了['query', 'key', 'value']三个Linear上,进行了一次训练。 + +训练结束后可以看到outputs文件夹,它的文件结构如下: + +> outputs +> +> ​ |-- checkpoint-xx +> +> ​ |-- configuration.json +> +> ​ |-- default +> +> ​ |-- adapter_config.json +> +> ​ |-- adapter_model.bin +> +> ​ |-- ... + +可以使用该文件夹执行推理: + +```python +from modelscope import AutoModelForSequenceClassification, AutoTokenizer +from swift import Trainer, LoRAConfig, Swift + + +model = AutoModelForSequenceClassification.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +tokenizer = AutoTokenizer.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) +model = Swift.from_pretrained(model, model_id='./outputs/checkpoint-21') + +print(model(**tokenizer('this is a test', return_tensors='pt'))) +``` \ No newline at end of file diff --git a/docs/Get Started/2.Installation.md b/docs/Get Started/2.Installation.md new file mode 100644 index 0000000000..740d67bcc8 --- /dev/null +++ b/docs/Get Started/2.Installation.md @@ -0,0 +1,25 @@ +# 安装和使用 + +## Wheel包安装 + +可以使用pip进行安装: + +```shell +pip install ms-swift +``` + +## 源代码安装 + +```shell +git clone https://github.com/modelscope/swift.git +cd swift +pip install -e . +``` + +## Notebook环境 + +Swift支持训练的绝大多数模型都可以在`A10`显卡上使用,用户可以使用ModelScope官方提供的免费显卡资源: + +1. 进入[ModelScope](https://www.modelscope.cn)官方网站并登录 +2. 点击左侧的`我的Notebook`并开启一个免费GPU实例 +3. 愉快地薅A10显卡羊毛 \ No newline at end of file diff --git a/docs/Get Started/3.Use in train and infer.md b/docs/Get Started/3.Use in train and infer.md new file mode 100644 index 0000000000..bcb68e3b15 --- /dev/null +++ b/docs/Get Started/3.Use in train and infer.md @@ -0,0 +1,123 @@ +# Swift API + +## 在训练中使用Swift + +调用`Swift.prepare_model()`来将tuners添加到模型上: + +```python +from modelscope import Model +from swift import Swift, LoRAConfig +import torch +model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto') +lora_config = LoRAConfig( + r=16, + target_modules=['query_key_value'], + lora_alpha=32, + lora_dropout=0.) +model = Swift.prepare_model(model, lora_config) +# use model to do other things +``` + +也可以同时使用多个tuners: + +```python +from modelscope import Model +from swift import Swift, LoRAConfig, AdapterConfig +import torch +model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto') +lora_config = LoRAConfig( + r=16, + target_modules=['query_key_value'], + lora_alpha=32, + lora_dropout=0.) 
+adapter_config = AdapterConfig(
+    dim=model.config.hidden_size,
+    target_modules=['mlp'],
+    method_name='forward',
+    hidden_pos=0,
+    adapter_length=32,
+    )
+model = Swift.prepare_model(model, {'first_tuner': lora_config, 'second_tuner': adapter_config})
+# use model to do other things
+```
+
+在使用多个tuners时,传入的第二个参数需要是Dict,key是tuner名字,value是tuner配置。
+
+训练后可以调用:
+
+```python
+model.save_pretrained(save_directory='./output')
+```
+
+来存储模型checkpoint。模型的checkpoint文件只会包括tuners的权重,不会包含模型本身的权重。存储后的结构如下:
+
+> outputs
+>
+> ​ |-- configuration.json
+>
+> ​ |-- first_tuner
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- second_tuner
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- ...
+
+如果只传入单独的config,则会使用默认的名称`default`:
+
+> outputs
+>
+> ​ |-- configuration.json
+>
+> ​ |-- default
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- ...
+
+## 在推理时使用Swift
+
+使用`Swift.from_pretrained()`来拉起训练后存储的checkpoint:
+
+```python
+from modelscope import Model
+from swift import Swift
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+model = Swift.from_pretrained(model, './output')
+```
+
+## 加载多个tuners并在不同线程中并行使用
+
+在模型提供服务时,很可能出现一个模型同时服务多个http线程的情况,其中每个线程代表了一类用户请求。Swift支持在不同线程中激活不同tuners:
+
+```python
+from modelscope import Model
+from swift import Swift
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+# 假设output中存在训练完成的a、b、c、d四个tuners
+model = Swift.from_pretrained(model, './output')
+
+# 假设两类请求,一类使用a、b、c三个tuner,另一类使用a、c、d三个tuner
+type_1 = ['a', 'b', 'c']
+type_2 = ['a', 'c', 'd']
+
+def request(_input, _type):
+    if _type == 'type_1':
+        model.set_active_adapters(type_1)
+    elif _type == 'type_2':
+        model.set_active_adapters(type_2)
+    return model(**_input)
+
+```
+
+在不同线程中使用同样一个tuner是安全的。
diff --git a/docs/Get Started/4.examples.md b/docs/Get Started/4.examples.md
new file mode 100644
index 0000000000..3c2e531aa1
--- /dev/null
+++ b/docs/Get Started/4.examples.md
@@ -0,0 +1,4 @@
+# LLM训练方案
+
+Swift提供了完整的LLM训练方案,可以查看[Examples的README](../../examples/pytorch/llm/README_CN.md).
+ diff --git a/docs/Modules/1.Interface.md b/docs/Modules/1.Interface.md new file mode 100644 index 0000000000..11d39c0379 --- /dev/null +++ b/docs/Modules/1.Interface.md @@ -0,0 +1,70 @@ +# 接口介绍 + +## Swift + +##### Swift.prepare_model(model: Union[nn.Module, 'SwiftModel'], config: Union[SwiftConfig, PeftConfig, Dict[str, SwiftConfig]], **kwargs) + +>该静态方法随机初始化指定类型的tuners +> +>model: 需要加载tuner的模型,可以是SwiftModel,后添加的tuners会和前面SwiftModel中的一起生效 +> +>config:加载的tuner的config,可以是SwiftConfig或PeftConfig,或者带有名称的config的dict。如果不传递名称则名称默认为`default` +> +>kwargs: +> +>​ extra_state_keys: List[str] 需要被额外存储到文件的原始模型weights的key +> +>​ inference_mode: bool 是否以推理模式启动 + +SwiftConfig的具体参数可以查看每个tuner的文档。 + +##### Swift.from_pretrained(model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, revision: str = None, **kwargs) + +> 该静态方法拉起之前存储过的tuners的checkpoint +> +> model: 需要加载tuner的模型,可以是SwiftModel,后添加的tuners会和前面SwiftModel中的一起生效 +> +> model_id:已存储的tuners的本地目录或modelscope hub id。 +> +> adapter_name:需要被拉起的adapter名称,默认为None代表全部拉起 +> +> kwargs: +> +> ​ inference_mode: bool 是否以推理模式启动 +> +> ​ revision: model_id的revision +> +> ​ extra_state_keys: 下次save_pretrained时额外存储的weights + +## SwiftModel + +在`Swift.prepare_model`或`Swift.from_pretrained`拉起后,都会返回一个`SwiftModel`类型的实例。该实例包装了实际传入的模型。 + +##### save_pretrained(self, save_directory: str, safe_serialization: bool = False, adapter_name: Union[str, List[str]] = None, **kwargs) + +> 实例方法,将模型存储到本地磁盘中,可直接被Swift.from_pretrained拉起 +> +> save_directory:存储的目录 +> +> safe_serialization: 是否存储safe_tensors +> +> adapter_name:待存储的adapter名称,默认为None代表全部存储 + +##### set_active_adapters(self, adapter_names: List[str]) + +> 实例方法,设置模型在当前线程中生效的所有adapter。如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_names:adapter名称列表 + +##### activate_adapter(self, adapter_name) + +> 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_name:adapter名称 + +##### deactivate_adapter(self, adapter_name) + +> 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_name:adapter名称 + diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md new file mode 100644 index 0000000000..49909e8e48 --- /dev/null +++ b/docs/Modules/2.lora.md @@ -0,0 +1,17 @@ +# LoRA + +LoRA是[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 论文提供的轻量级训练组件。LoRA可以添加到Linear、Embedding、Conv2d等算子上生效。 + +>```python +>LoRAConfig ( +> r: int LoRA结构的秩 +> target_modules: Union[List[str], str] MLP结构的module_key,如果是str类型则进行full_match统配查找,如果是List,则进行末尾匹配 +> lora_alpha: int LoRA结构的权重比例,lora_alpha/r的值是lora结构的权重 +> lora_dropout: float LoRA结构的dropout比例 +> merge_weights: bool 在推理时是否将loRA权重合并到原始weights上 +> use_merged_linear: bool 是否是merged linear结构 +> enable_lora: List[bool]: 如果是use_merged_linear,哪些module需要添加LoRA结构 +> bias: str 偏置是否参与训练和存储,可以为`none`:所有偏置不参与训练, `all`:所有模块的偏置均参与训练, `lora_only`:仅loRA结构的偏置参与训练 +>) +>``` + diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md new file mode 100644 index 0000000000..c380771742 --- /dev/null +++ b/docs/Modules/3.Restuning.md @@ -0,0 +1,21 @@ +# Restuning + +Restuning是[Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding Tuner from Backbone]()论文提供的轻量级训练组件。Restuning工作在深度学习模型多层结构的layer上。 + +>```python +>ResTuningConfig ( +>dims: Union[List[int], int] layers输出的hidden_state的维度,可以传入List以适配上采样或下采样 +>root_modules: str 提供root hidden_state的模块的正则表达式 +>root_modules_hook: str 可以为`input`或`output`,表示hidden_state从root_module的输入或输出中取到 +>stem_modules: 
Union[List[str], str 提供root hidden_state的模块的正则表达式(str)或完整module路径(List) +>stem_modules_hook: str 可以为`input`或`output`,表示hidden_state从stem_module的输入或输出中取到 +>target_modules: str target module的正则表达式 +>target_modules_hook: str 可以为`input`或`output` hidden_state从target_module的输入或输出中取到 +>target_hidden_pos: Union[int, str] target_module forward输入或输出中hidden_state的index +>tuner_cfg: restuning模块中子tuner的配置,可以传入str或dict +>use_upsample: bool 是否加入上采样模块 +>upsample_out_channels: List[int] 如果进行上采样,上采样的通道数 +>zero_init_last: bool 是否对tuner的最后一层Linear进行全零初始化 +>) +>``` + diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md new file mode 100644 index 0000000000..e07af189cc --- /dev/null +++ b/docs/Modules/4.adapter.md @@ -0,0 +1,15 @@ +# Adapter + +Adapter是[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751) 论文提供的轻量级训练组件。一般添加到MLP结构之后生效。 + +>```python +>AdapterConfig ( +> dim: int MLP结构输出中hidden_state的dim,一般等于模型的hidden_size +> target_modules: Union[List[str], str] MLP结构的module_key,如果是str类型则进行full_match统配查找,如果是List,则进行末尾匹配 +> hidden_pos: Union[str, int] MLP输出结构中hidden_state的位置,如果是tuple/list则传入int,如果是dict则传入str类型的key +> method_name: str MLP结构的前向方法,Adapter默认会patch到该方法上,在forward调用后使用其hidden_state输入tuner,默认是forward。 +> adapter_length: int adapter结构中间层长度,默认为128 +> act_layer: str 激活算子,默认为gelu +>) +>``` + diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md new file mode 100644 index 0000000000..2ad0fe587a --- /dev/null +++ b/docs/Modules/5.side.md @@ -0,0 +1,13 @@ +# Side + +Side是[Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503) 论文提供的轻量级训练组件。Side可以添加到MLP结构上。 + +>```python +>SideConfig ( +>dim: int hidden_state的维度 +>target_modules: str 需要嵌入的位置的正则表达式 +>side_module_name: str side module的名字,可以是fcn4,mlp,alexnet +>hidden_pos: Union[str, int] hidden_state在MLP结构中的位置,如果MLP输出为tuple/list,则hidden_pos需要是一个int,否则需要是一个str +>) +>``` + diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md new file mode 100644 index 0000000000..9d93121503 --- /dev/null +++ b/docs/Modules/6.prompt.md @@ -0,0 +1,17 @@ +# Prompt + +Prompt是[Visual Prompt Tuning](https://arxiv.org/abs/2106.09685) 论文提供的轻量级训练组件。Prompt可以添加到每个layer的输入上,为hidden_state添加prompt embedding。 + +>```python +>PromptConfig ( +> dim: int layer输入参数中hidden_state的维度 +> target_modules: Union[str, List[str]]:可以是需要嵌入prompt的layer的正则表达式(字符串类型),如果是List,则匹配这些layers名称的末尾 +> embedding_pos: Union[str, int] layer输入参数中hidden_state的位置,如果是tuple/list则是int类型,如果是dict则是str类型 +> attention_mask_pos: Union[str, int] layer输入参数中attention_mask的位置,如果是tuple/list则是int类型,如果是dict则是str类型 +> attention_mask_value: Union[float, int, bool] prompt部分的attention值,默认为0.0 +> prompt_length: int prompt的长度 +> attach_front: bool prompt和hidden_state组合的方式,True代表将prompt concat到hidden_state的前面,反之则concat到后面 +> extract_embedding: bool 是否在最后的layer结束后将hidden_state中的prompt部分移除 +>) +>``` + diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 3819d1940a..bb7882aee2 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -12,7 +12,6 @@ import torch.nn.functional as F from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) -from peft.tuners.lora import LoraLayer from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from swift import get_logger diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 4a77887ac9..4744e55b38 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -84,7 +84,7 @@ class 
ResTuningConfig(SwiftConfig): 'The hook type of target modules, can be "input" or "output"' }) - target_hidden_pos: str = field( + target_hidden_pos: Union[int, str] = field( default=None, metadata={ 'help': @@ -118,6 +118,7 @@ class ResTuningConfig(SwiftConfig): def __post_init__(self): from .mapping import SwiftTuners self.swift_type = SwiftTuners.RESTUNING + self.target_hidden_pos = 0 if self.target_hidden_pos is None else self.target_hidden_pos class ResTuning: @@ -136,26 +137,27 @@ def _forward_seq(self, input, *args, **kwargs): def _forward_target(self, *args, **kwargs): if self.target_modules_hook == 'input': - args = list(args) - _arg = args[0 if self.target_hidden_pos is None else self. - target_hidden_pos] + if isinstance(self.target_hidden_pos, int): + args = list(args) + _arg = args[self.target_hidden_pos] + else: + _arg = kwargs[self.target_hidden_pos] args_main = _forward_restuning(self, _arg) - args[0 if self.target_hidden_pos is None else self. - target_hidden_pos] = args_main + if isinstance(self.target_hidden_pos, int): + args[self.target_hidden_pos] = args_main + else: + kwargs[self.target_hidden_pos] = args_main args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) else: _args_main = getattr(self, f'forward_origin_{adapter_name}')( *args, **kwargs) - _arg = _args_main[0 if self.target_hidden_pos is None else self - .target_hidden_pos] if isinstance( - _args_main, - (tuple, list)) else _args_main + _arg = _args_main[self.target_hidden_pos] if isinstance( + _args_main, (tuple, list, dict)) else _args_main args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[0 if self.target_hidden_pos is None else self. - target_hidden_pos] = args_main + _args_main[self.target_hidden_pos] = args_main args_main = _args_main return args_main From 539cea805c2b0f5f0279480364a567e8de825601 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 23:27:52 +0800 Subject: [PATCH 58/70] update doc --- README.md | 2 ++ README_CN.md | 2 ++ docs/Get Started/3.Use in train and infer.md | 2 +- docs/Modules/{1.Interface.md => 1.swift.md} | 0 docs/Modules/2.lora.md | 16 +++++++++ docs/Modules/3.Restuning.md | 21 +++++++++++ docs/Modules/4.adapter.md | 17 +++++++++ docs/Modules/5.side.md | 18 ++++++++++ docs/Modules/6.prompt.md | 18 ++++++++++ docs/Modules/7.peft.md | 38 ++++++++++++++++++++ 10 files changed, 133 insertions(+), 1 deletion(-) rename docs/Modules/{1.Interface.md => 1.swift.md} (100%) create mode 100644 docs/Modules/7.peft.md diff --git a/README.md b/README.md index 722d0b9a17..17efba9455 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Key features: 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result. 3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with multiple tuners in different threads. +Users can check the [documentation of Swift](./docs/Get Started/1.Introduction.md) to get detail tutorials. + ## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 4b6d7f4379..61ac291017 100644 --- a/README_CN.md +++ b/README_CN.md @@ -29,6 +29,8 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 3. 
支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以在推理时用一个模型在不同线程中使用多种tuners而互不干扰。
+用户可以查看 [Swift官方文档](./docs/Get Started/1.Introduction.md) 来了解详细信息。
+
 ## 大模型微调的例子
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
diff --git a/docs/Get Started/3.Use in train and infer.md b/docs/Get Started/3.Use in train and infer.md
index bcb68e3b15..2209cecfc6 100644
--- a/docs/Get Started/3.Use in train and infer.md
+++ b/docs/Get Started/3.Use in train and infer.md
@@ -120,4 +120,4 @@ def request(_input, _type):
 
 ```
 
-在不同线程中使用同样一个tuner是安全的。
+在不同线程中使用同一个tuner是安全的。
diff --git a/docs/Modules/1.Interface.md b/docs/Modules/1.swift.md
similarity index 100%
rename from docs/Modules/1.Interface.md
rename to docs/Modules/1.swift.md
diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md
index 49909e8e48..55d47831bf 100644
--- a/docs/Modules/2.lora.md
+++ b/docs/Modules/2.lora.md
@@ -15,3 +15,19 @@ LoRA是[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/ab
 >)
 >```
 
+一个使用LoRA的例子如下:
+
+```python
+from modelscope import Model
+from swift import Swift, LoRAConfig
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+lora_config = LoRAConfig(
+    r=16,
+    target_modules=['query_key_value'],
+    lora_alpha=32,
+    lora_dropout=0.)
+model = Swift.prepare_model(model, lora_config)
+# use model to do other things
+```
+
diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md
index c380771742..c2635b385f 100644
--- a/docs/Modules/3.Restuning.md
+++ b/docs/Modules/3.Restuning.md
@@ -19,3 +19,24 @@ Restuning是[Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding
 >)
 >```
 
+一个使用Restuning的例子如下:
+
+```python
+from swift import (ResTuningConfig, Swift, snapshot_download)
+
+model_dir = snapshot_download('AI-ModelScope/vit-base-patch16-224')
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained(model_dir)
+restuning_config_1 = ResTuningConfig(
+    dims=768,
+    root_modules=r'.*vit.encoder.layer.0$',
+    stem_modules=r'.*vit.encoder.layer\.\d+$',
+    target_modules=r'.*vit.layernorm',
+    target_modules_hook='input',
+    tuner_cfg='res_adapter',
+)
+model = Swift.prepare_model(model, config=restuning_config_1)
+# use model to do other things
+```
+
diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md
index e07af189cc..52b553b715 100644
--- a/docs/Modules/4.adapter.md
+++ b/docs/Modules/4.adapter.md
@@ -13,3 +13,20 @@ Adapter是[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1
 >)
 >```
 
+一个使用adapter的例子如下:
+
+```python
+from modelscope import Model
+from swift import Swift, AdapterConfig
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+adapter_config = AdapterConfig(
+    dim=model.config.hidden_size,
+    target_modules=['mlp'],
+    method_name='forward',
+    hidden_pos=0,
+    )
+model = Swift.prepare_model(model, adapter_config)
+# use model to do other things
+```
+
diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md
index 2ad0fe587a..a33b970513 100644
--- a/docs/Modules/5.side.md
+++ b/docs/Modules/5.side.md
@@ -11,3 +11,21 @@ Side是[Side-Tuning: A Baseline for Network Adaptation via Additive Side Network
 >)
 >```
 
+一个使用Side的例子如下:
+
+```python
+from modelscope import Model
+
+from swift import (SideConfig, Swift)
+
+model = Model.from_pretrained(
+    'damo/nlp_structbert_sentence-similarity_chinese-base')
+side_config = SideConfig(
+
dim=model.config.hidden_size, + target_modules=r'.*encoder.encoder', + side_module_name='mlp', + hidden_pos='last_hidden_state') +model = Swift.prepare_model(model, side_config) +# use model to do other things +``` + diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md index 9d93121503..54d521b8f4 100644 --- a/docs/Modules/6.prompt.md +++ b/docs/Modules/6.prompt.md @@ -15,3 +15,21 @@ Prompt是[Visual Prompt Tuning](https://arxiv.org/abs/2106.09685) 论文提供 >) >``` +一个使用Prompt的例子如下: + +```python +from modelscope import Model + +from swift import (PromptConfig, Swift) + +model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') +prompt_config = PromptConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + embedding_pos=0, + attention_mask_pos=1) +model = Swift.prepare_model(model, config=prompt_config) +# use model to do other things +``` + diff --git a/docs/Modules/7.peft.md b/docs/Modules/7.peft.md new file mode 100644 index 0000000000..c3cba7862c --- /dev/null +++ b/docs/Modules/7.peft.md @@ -0,0 +1,38 @@ +# 对Peft的兼容性 + +为了支持习惯Peft的用户,Swift提供了对于Peft的兼容性。用户可以从swift中import peft组件: + +>PeftModel +>PeftConfig +>PeftModelForSeq2SeqLM +>PeftModelForSequenceClassification +>PeftModelForTokenClassification +>PeftModelForCausalLM +>PromptEncoderConfig +>PromptTuningConfig +>PrefixTuningConfig +>PromptLearningConfig +>LoraConfig +>get_peft_config +>get_peft_model_state_dict +>get_peft_model + +以上组件均可以从swift中import: + +```python +from swift import PeftModel, PeftConfig +``` + +Swift类也支持初始化Peft的tuner: + +```python +from modelscope.models.nlp import SbertForSequenceClassification +from modelscope.models.nlp.structbert import SbertConfig + +from swift import LoraConfig, Swift +model = SbertForSequenceClassification(SbertConfig()) +lora_config = LoraConfig(target_modules=['query', 'key', 'value']) +model = Swift.prepare_model(model, lora_config) +``` + +Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 \ No newline at end of file From c10c44350bebf1cddba55ca51330543a0bca63e4 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 23:28:30 +0800 Subject: [PATCH 59/70] pre-commit passed --- docs/Get Started/1.Introduction.md | 6 +++--- docs/Get Started/2.Installation.md | 2 +- docs/Get Started/4.examples.md | 1 - docs/Modules/1.swift.md | 1 - docs/Modules/2.lora.md | 1 - docs/Modules/3.Restuning.md | 1 - docs/Modules/4.adapter.md | 1 - docs/Modules/5.side.md | 1 - docs/Modules/6.prompt.md | 1 - docs/Modules/7.peft.md | 2 +- 10 files changed, 5 insertions(+), 12 deletions(-) diff --git a/docs/Get Started/1.Introduction.md b/docs/Get Started/1.Introduction.md index 36c4c32409..14f68b2d0c 100644 --- a/docs/Get Started/1.Introduction.md +++ b/docs/Get Started/1.Introduction.md @@ -48,7 +48,7 @@ val_dataset = MsDataset.load('clue', subset_name='afqmc', split='validation').to def tokenize_function(examples): - return tokenizer(examples["sentence1"], examples["sentence2"], + return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=128) @@ -60,7 +60,7 @@ arguments = TrainingArguments( per_device_train_batch_size=16, ) -trainer = Trainer(model, arguments, train_dataset=train_dataset, +trainer = Trainer(model, arguments, train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=default_data_collator,) @@ -100,4 +100,4 @@ lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.from_pretrained(model, model_id='./outputs/checkpoint-21') 
print(model(**tokenizer('this is a test', return_tensors='pt'))) -``` \ No newline at end of file +``` diff --git a/docs/Get Started/2.Installation.md b/docs/Get Started/2.Installation.md index 740d67bcc8..7bc620c51d 100644 --- a/docs/Get Started/2.Installation.md +++ b/docs/Get Started/2.Installation.md @@ -22,4 +22,4 @@ Swift支持训练的绝大多数模型都可以在`A10`显卡上使用,用户 1. 进入[ModelScope](https://www.modelscope.cn)官方网站并登录 2. 点击左侧的`我的Notebook`并开启一个免费GPU实例 -3. 愉快地薅A10显卡羊毛 \ No newline at end of file +3. 愉快地薅A10显卡羊毛 diff --git a/docs/Get Started/4.examples.md b/docs/Get Started/4.examples.md index 3c2e531aa1..80240e2679 100644 --- a/docs/Get Started/4.examples.md +++ b/docs/Get Started/4.examples.md @@ -1,4 +1,3 @@ # LLM训练方案 Swift提供了完整的LLM训练方案,可以查看[Examples的README](../../examples/pytorch/llm/README_CN.md). - diff --git a/docs/Modules/1.swift.md b/docs/Modules/1.swift.md index 11d39c0379..0d5b35c9ab 100644 --- a/docs/Modules/1.swift.md +++ b/docs/Modules/1.swift.md @@ -67,4 +67,3 @@ SwiftConfig的具体参数可以查看每个tuner的文档。 > 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 > > adapter_name:adapter名称 - diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md index 55d47831bf..013c4da7ee 100644 --- a/docs/Modules/2.lora.md +++ b/docs/Modules/2.lora.md @@ -30,4 +30,3 @@ lora_config = LoRAConfig( model = Swift.prepare_model(model, lora_config) # use model to do other things ``` - diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md index c2635b385f..4beb11a022 100644 --- a/docs/Modules/3.Restuning.md +++ b/docs/Modules/3.Restuning.md @@ -39,4 +39,3 @@ restuning_config_1 = ResTuningConfig( model = Swift.prepare_model(model, config=restuning_config_1) # use model to do other things ``` - diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md index 52b553b715..10ab21c665 100644 --- a/docs/Modules/4.adapter.md +++ b/docs/Modules/4.adapter.md @@ -29,4 +29,3 @@ adapter_config = AdapterConfig( model = Swift.prepare_model(model, adapter_config) # use model to do other things ``` - diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md index a33b970513..6c49e2fad3 100644 --- a/docs/Modules/5.side.md +++ b/docs/Modules/5.side.md @@ -28,4 +28,3 @@ side_config = SideConfig( model = Swift.prepare_model(model, side_config) # use model to do other things ``` - diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md index 54d521b8f4..a9578911d5 100644 --- a/docs/Modules/6.prompt.md +++ b/docs/Modules/6.prompt.md @@ -32,4 +32,3 @@ prompt_config = PromptConfig( model = Swift.prepare_model(model, config=prompt_config) # use model to do other things ``` - diff --git a/docs/Modules/7.peft.md b/docs/Modules/7.peft.md index c3cba7862c..aadfa08023 100644 --- a/docs/Modules/7.peft.md +++ b/docs/Modules/7.peft.md @@ -35,4 +35,4 @@ lora_config = LoraConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, lora_config) ``` -Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 \ No newline at end of file +Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 From 0faee0f553ef7f8cfc4cfbfb02158a3ede98bb9b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 10:50:32 +0800 Subject: [PATCH 60/70] fix --- swift/tuners/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f306ea3d79..56605f5896 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -217,7 +217,7 @@ def forward(self, x): if not self.is_activated(): return x prompt_token = 
self.prompt_token.expand(x.shape[0], -1, - -1).to(x.device) + -1).to(x.device, x.dtype) if self.layer_num == 0: if self.attach_front: From caf83a5df21f93dab62f5f1618a3783c45187ed3 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 11:35:54 +0800 Subject: [PATCH 61/70] fix bug --- swift/tuners/lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index bb7882aee2..760db2d314 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -309,7 +309,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, **kwargs) def _forward(self, *args, **kwargs): - for _name, _module in sub_module.named_modules(): + for _name, _module in self.named_modules(): if 'loramodule_' in _name and _module.is_activated(): return _module.forward(*args, **kwargs) return self.forward_origin(*args, **kwargs) From aacecfeff56cdff741bcc8c7d5cfcb7c240d9839 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 14:54:36 +0800 Subject: [PATCH 62/70] fix bugs --- examples/pytorch/llm/src/llm_infer.py | 5 ++--- examples/pytorch/llm/src/llm_sft.py | 11 +++++------ examples/pytorch/llm/src/utils/__init__.py | 3 ++- examples/pytorch/llm/src/utils/swift_utils.py | 5 ++--- swift/trainers/trainers.py | 1 + 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index bd7e28868a..61c23ca3c1 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -37,13 +37,12 @@ class InferArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset + dataset_sample: int = 20000 # -1: all dataset dataset_test_size: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 - quantization_bit: Optional[int] = field( - default=None, metadata={'choices': {4, 8}}) + quantization_bit: int = field(default=0, metadata={'choices': {0, 4, 8}}) bnb_4bit_comp_dtype: str = field( default=None, metadata={'choices': {'fp16', 'bf16', 'fp32'}}) bnb_4bit_quant_type: str = field( diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 401d7c3199..424fe3e89b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -9,15 +9,14 @@ import json import torch import torch.distributed as dist -from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics -from examples.pytorch.llm.src.utils.swift_utils import prepare_model from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, check_json_format, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, - show_layers, sort_by_max_length) + show_layers, sort_by_max_length, + compute_nlg_metrics, prepare_model) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -36,7 +35,7 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'adapter choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -54,7 +53,7 @@ class SftArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset + dataset_sample: int = 20000 # -1: all dataset dataset_test_size: float = 0.01 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -271,7 +270,7 @@ def llm_sft(args: SftArguments) -> None: tokenizer, args.system, args.max_length, - validate_generation=True) + validate_generation=args.predict_with_generate) val_dataset = val_dataset.map(preprocess_func_eval) del dataset if args.test_oom_error: diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 56b16ead20..ceb60765f1 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -1,7 +1,8 @@ -from .dataset import DATASET_MAPPING, get_dataset, process_dataset +from .dataset import DATASET_MAPPING, get_dataset from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess +from .swift_utils import prepare_model from .utils import (broadcast_string, check_json_format, download_dataset, find_all_linear_for_lora, get_dist_setting, inference, is_dist, is_local_master, is_master, plot_images, diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 3f11634f00..8d931017c6 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict -import torch.nn +from torch.nn import Module from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, Swift, SwiftConfig, SwiftTuners, get_logger) @@ -12,8 +12,7 @@ def prepare_model( - model: torch.nn.Module, - args: Any, + model: Module, args ): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 5c2de223af..a659ec8747 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -22,6 +22,7 @@ class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # performance self.perf: Dict[str, Any] = { 'gen_time': 0., From 38bd482706571efb9a60b241fabd3f53aa8aac58 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 16:11:54 +0800 Subject: [PATCH 63/70] update sh --- examples/pytorch/llm/README.md | 31 ++- examples/pytorch/llm/README_CN.md | 34 ++- .../baichuan2_7b_chat/lora_ddp/infer.sh | 5 +- .../scripts/baichuan2_7b_chat/lora_ddp/sft.sh | 7 +- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 5 +- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 7 +- .../full_mp}/infer.sh | 8 +- .../qwen_7b_chat/{full => full_mp}/sft.sh | 8 +- .../{full => full_mp_ddp}/infer.sh | 5 +- .../full_mp_ddp}/sft.sh | 27 ++- .../llm/scripts/qwen_7b_chat/lora/infer.sh | 6 +- .../llm/scripts/qwen_7b_chat/lora/sft.sh | 8 +- .../scripts/qwen_7b_chat/lora_ddp/infer.sh | 4 +- .../llm/scripts/qwen_7b_chat/lora_ddp/sft.sh | 10 +- .../llm/scripts/qwen_vl_chat/lora/infer.sh | 17 -- .../llm/scripts/qwen_vl_chat/lora/sft.sh | 31 --- .../llm/scripts/qwen_vl_chat/qlora/infer.sh | 19 -- .../llm/scripts/qwen_vl_chat/qlora/sft.sh | 33 --- .../scripts/qwen_vl_chat/qlora_ddp/infer.sh | 19 -- .../llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh | 39 ---- .../llm/scripts/seqgpt_560m/full/infer.sh | 4 +- .../llm/scripts/seqgpt_560m/full/sft.sh | 6 +- examples/pytorch/llm/src/llm_sft.py | 27 ++- examples/pytorch/llm/src/utils/__init__.py | 10 +- examples/pytorch/llm/src/utils/dataset.py | 204 ++++++++++++++++-- examples/pytorch/llm/src/utils/model.py | 2 +- examples/pytorch/llm/src/utils/utils.py | 136 +++++++++++- swift/trainers/trainers.py | 5 +- 28 files changed, 448 insertions(+), 269 deletions(-) rename examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => qwen_7b_chat/full_mp}/infer.sh (76%) rename examples/pytorch/llm/scripts/qwen_7b_chat/{full => full_mp}/sft.sh (87%) rename examples/pytorch/llm/scripts/qwen_7b_chat/{full => full_mp_ddp}/infer.sh (74%) rename examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => qwen_7b_chat/full_mp_ddp}/sft.sh (59%) delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 76bbbae4a0..20dcdedc55 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -25,15 +25,16 @@ 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k 8. 
other: polylm-13b, seqgpt-560m -3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ... +3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ... 4. supported datasets: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. multi-modal: coco-en + 4. other: cls-fudan-news-zh, ner-jave-zh 5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation ## Prepare the Environment -Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization) +Experimental environment: V100, A10, 3090, A100, ... (V100 does not support bf16, quantization) ```bash # Installing miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -63,38 +64,50 @@ pip install . ## Run SFT and Inference Performace: full(nice) > lora > qlora + Training GPU memory: qlora(low,3090) > lora > full(2*A100) ```bash # Clone the repository and enter the code directory. git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# sft lora and infer qwen-7b-chat, Requires 27GB GPU memory. +# sft lora and infer qwen-7b-chat, Requires 38GB GPU memory. # You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed. # If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory. +# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*38GB GPU memory. +# Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh +# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*16GB GPU memory. +# Recommended experimental environment: V100, A10, 3090 +bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh + # sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory. # If you want to use quantification, you need to `pip install bitsandbytes -U` -# Recommended experimental environment: 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory. +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# sft(full) and infer qwen-7b-chat, Requires 100GB GPU memory. +# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory. 
# Recommended experimental environment: A100 -bash scripts/qwen_7b_chat/full/sft.sh -bash scripts/qwen_7b_chat/full/infer.sh +bash scripts/qwen_7b_chat/full_mp/sft.sh +bash scripts/qwen_7b_chat/full_mp/infer.sh +# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory. +# Recommended experimental environment: A100 +bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh # For more scripts, please see `scripts/` folder. ``` diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index c66c2eab57..0e23d44619 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -28,13 +28,14 @@ 8. other: polylm-13b, seqgpt-560m 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ... 4. 支持的数据集: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. 多模态: coco-en + 4. 其他: cls-fudan-news-zh, ner-jave-zh 5. 支持的对话模板: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation ## 准备实验环境 -实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化) +实验环境: V100, A10, 3090, A100均可. (V100不支持bf16, 量化) ```bash # 安装miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -65,37 +66,50 @@ pip install . ## 微调和推理 性能: full(优) > lora > qlora + 训练显存: qlora(低,3090) > lora > full(2*A100) ```bash # clone仓库并进入代码目录 git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# 微调(lora)+推理 qwen-7b-chat, 需要27GB显存. +# 微调(lora)+推理 qwen-7b-chat, 需要38GB显存. # 你可以通过设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. # 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*27GB显存. +# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存. +# 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh -# 微调(qlora)+推理 qwen-7b-chat, 需要13GB显存. +# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存. +# 推荐的实验环境: V100, 3090, A10 +bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh + +# 微调(qlora)+推理 qwen-7b-chat, 需要9GB显存. # 如果你想要使用量化, 你需要`pip install bitsandbytes -U` -# 推荐的实验环境: 3090 +# 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh -# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*13GB显存. +# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存. +# 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# 微调(full)+推理 qwen-7b-chat, 需要100G显存. +# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75G显存. +# 推荐的实验环境: A100 +bash scripts/qwen_7b_chat/full_mp/sft.sh +bash scripts/qwen_7b_chat/full_mp/infer.sh + +# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*75G显存. 
# 推荐的实验环境: A100 -bash scripts/qwen_7b_chat/full/sft.sh -bash scripts/qwen_7b_chat/full/infer.sh +bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh # 更多的scripts脚本, 可以看`scripts`文件夹. ``` diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh index e62aa4b203..ca53acdf99 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type baichuan \ --dtype bf16 \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ + --max_length 4096 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh index ea219e0759..c315d78850 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh @@ -1,4 +1,5 @@ # Experimental environment: 2 * A100 +# 2 * 44GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -11,10 +12,10 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample 20000 \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 4096 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index 96aa910f23..85d856ad36 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatglm2 \ --dtype bf16 \ --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset code-python-zh \ + --dataset_sample -1 \ + --max_length 8192 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index 7ec0bb88d9..b85eac5572 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 2 * 50GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -10,13 +12,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ + --dataset code-python-zh \ --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. 
\ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh similarity index 76% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh index b6c221155d..17e53a8c82 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh @@ -1,14 +1,14 @@ CUDA_VISIBLE_DEVICES=0 \ python src/llm_infer.py \ --model_type qwen-7b-chat \ - --sft_type lora \ + --sft_type full \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ - --max_length 2048 \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh similarity index 87% rename from examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh index 182e287faf..2a961f7e72 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A100 -# 100GB GPU memory +# 2 * 75GB GPU memory CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -7,10 +7,10 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0.01 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh similarity index 74% rename from examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh index 9ef3c08124..f99464d035 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh similarity index 59% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh index 7f4c9c37bd..de95dda252 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh @@ -1,37 +1,34 @@ -# Experimental environment: 2 * A100 +# Experimental environment: 4 * A100 +# 4 * 75GB GPU memory nproc_per_node=2 -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ torchrun \ --nproc_per_node=$nproc_per_node \ --master_port 29500 \ src/llm_sft.py \ --model_type qwen-7b-chat \ - --sft_type lora \ + --sft_type full \ --template_type chatml \ --dtype bf16 \ 
--output_dir runs \ - --ddp_backend nccl \ - --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules ALL \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ - --weight_decay 0. \ - --learning_rate 1e-4 \ + --weight_decay 0.01 \ + --learning_rate 2e-5 \ --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ - --max_grad_norm 0.5 \ + --max_grad_norm 1 \ --warmup_ratio 0.03 \ --eval_steps 100 \ --save_steps 100 \ + --only_save_model true \ --save_total_limit 2 \ --logging_steps 10 \ --use_flash_attn true \ --push_to_hub false \ - --hub_model_id qwen-7b-chat-qlora \ + --hub_model_id qwen-7b-chat-full \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh index 5aae79e72d..6382b5d34f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh @@ -5,7 +5,11 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ + --max_length 2048 \ + --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh index 025f728cb1..0d1d205a1a 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 38GB GPU memory CUDA_VISIBLE_DEVICES=0 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -5,14 +7,14 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. 
\ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh index 27d3c0cbb3..8d5674bef4 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,9 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh index fd92b9a941..82f0838235 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh @@ -1,6 +1,6 @@ # Experimental environment: 2 * A100 -# 2 * 27GB GPU memory -# use_flash_attn=false: 2 * 31GB GPU memory +# 2 * 38GB GPU memory +# use_flash_attn=false: 2 * 70GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -13,14 +13,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh deleted file mode 100644 index 9c2299bb25..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh +++ /dev/null @@ -1,17 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --max_new_tokens 1024 \ - --use_flash_attn true \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh deleted file mode 100644 index 8eb51200b9..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh +++ /dev/null @@ -1,31 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing false \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps 16 \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn true \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-lora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh deleted file mode 100644 index e3c68d9770..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh +++ /dev/null @@ -1,19 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --max_new_tokens 1024 \ - --use_flash_attn false \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh deleted file mode 100644 index 8f23629c6c..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh +++ /dev/null @@ -1,33 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing true \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps 16 \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn false \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-qlora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh deleted file mode 100644 index e3c68d9770..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh +++ /dev/null @@ -1,19 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --max_new_tokens 1024 \ - --use_flash_attn false \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh deleted file mode 100644 index ff512f36ab..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh +++ /dev/null @@ -1,39 +0,0 @@ -# Experimental environment: 3090 -nproc_per_node=2 -CUDA_VISIBLE_DEVICES=0,1 \ -torchrun \ - --nproc_per_node=$nproc_per_node \ - --master_port 29500 \ - src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --ddp_backend nccl \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing false \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn false \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-qlora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh index a1f095bc58..cb3e4b7062 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh @@ -6,8 +6,8 @@ python src/llm_infer.py \ --dtype bf16 \ --ckpt_dir "runs/seqgpt-560m/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset cmnli-zh \ - --dataset_sample 20000 \ + --dataset ner-jave-zh \ + --dataset_sample -1 \ --max_length 1024 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh index 9c5e30b8e7..5d0ada5770 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh @@ -5,9 +5,9 @@ python src/llm_sft.py \ --template_type default-generation \ --dtype bf16 \ --output_dir runs \ - --dataset cmnli-zh \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ + --dataset ner-jave-zh \ + --dataset_sample -1 \ + --num_train_epochs 3 \ --max_length 1024 \ --gradient_checkpointing false \ --batch_size 32 \ diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 424fe3e89b..bcd6bf7304 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,11 +11,11 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, + broadcast_string, check_json_format, dataset_map, find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_dist, is_master, - plot_images, process_dataset, select_bnb, select_dtype, - show_layers, sort_by_max_length, + get_model_tokenizer, get_preprocess, is_ddp_plus_mp, + is_dist, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers, sort_by_max_length, compute_nlg_metrics, prepare_model) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, @@ -39,7 +39,6 @@ class SftArguments: template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' - # DDP + MP(device_map) is not supported ddp_backend: Optional[str] = field( default=None, metadata={'choices': ['nccl', 'gloo', 'mpi', 'ccl']}) @@ -76,6 +75,7 @@ class SftArguments: gradient_checkpointing: bool = False batch_size: int = 1 + eval_batch_size: Optional[int] = None num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 @@ -120,7 +120,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen')" }) # generation config, only useful when `predict_with_generate=True` @@ -153,7 +153,7 @@ def __post_init__(self): assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 
'full': - assert self.quantization_bit is None, 'not supported' + assert self.quantization_bit != 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: self.learning_rate = 2e-5 @@ -198,6 +198,11 @@ def __post_init__(self): if self.use_flash_attn is None: self.use_flash_attn = 'auto' self.train_sampler_random = not self.test_oom_error + if self.eval_batch_size is None: + if self.predict_with_generate: + self.eval_batch_size = 1 + else: + self.eval_batch_size = batch_size def llm_sft(args: SftArguments) -> None: @@ -209,7 +214,7 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Model and Tokenizer kwargs = {'low_cpu_mem_usage': True} - if is_dist(): + if is_dist() and not is_ddp_plus_mp(): kwargs['device_map'] = {'': local_rank} else: kwargs['device_map'] = 'auto' @@ -274,7 +279,7 @@ def llm_sft(args: SftArguments) -> None: val_dataset = val_dataset.map(preprocess_func_eval) del dataset if args.test_oom_error: - train_dataset = sort_by_max_length(train_dataset) + train_dataset = sort_by_max_length(train_dataset, 20000) # Data analysis stat_dataset(train_dataset) stat_dataset(val_dataset) @@ -344,10 +349,10 @@ def llm_sft(args: SftArguments) -> None: **kwargs) if args.gradient_checkpointing: - # fix: gradients will be None - model.config.use_cache = True model.enable_input_require_grads() if is_dist(): + # Compatible with https://github.com/huggingface/transformers/pull/25903 + training_args._frozen = False if args.gradient_checkpointing: training_args.ddp_find_unused_parameters = False training_args.ddp_broadcast_buffers = False diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index ceb60765f1..07953bfc8a 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -3,8 +3,8 @@ from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess from .swift_utils import prepare_model -from .utils import (broadcast_string, check_json_format, download_dataset, - find_all_linear_for_lora, get_dist_setting, inference, - is_dist, is_local_master, is_master, plot_images, - process_dataset, select_bnb, select_dtype, show_layers, - sort_by_max_length) +from .utils import (broadcast_string, check_json_format, dataset_map, + download_dataset, find_all_linear_for_lora, + get_dist_setting, inference, is_ddp_plus_mp, is_dist, + is_local_master, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers, sort_by_max_length) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 3a2294a395..79f537857b 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,9 +3,10 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional import json +import numpy as np from datasets import Dataset as HfDataset from datasets import concatenate_datasets from modelscope import MsDataset @@ -373,7 +374,7 @@ def get_jd_zh_dataset() -> HfDataset: 'Sentiment Classification', False) -def _process_dureader_robust(dataset: HfDataset) -> HfDataset: +def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset: prompt = """Task: Question Generation Context: {context} Answer: {answer} @@ -396,34 +397,191 @@ def get_dureader_robust_qg_zh_dataset() -> HfDataset: 
dataset_dict['validation'].to_hf_dataset(), dataset_dict['test'].to_hf_dataset() ]) - return _process_dureader_robust(dataset) + return _preprocess_dureader_robust(dataset) + + +def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset: + query = [] + response = [] + for d in tqdm(dataset): + r = d['output'] + if r is None: + continue + if subset_name == 'zh': + q = d['instruction'] + else: + q = d['input'] + if q is None: + continue + query.append(q) + response.append(r) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_medical_dataset(subset_name: str, + dataset_sample: int = -1) -> HfDataset: + """ + mode: Literal['en', zh] + """ + dataset_dict = MsDataset.load( + 'huangjintao/medical_zh', subset_name=subset_name) + dataset: HfDataset = concatenate_datasets([ + dataset_dict['train'].to_hf_dataset(), + dataset_dict['val'].to_hf_dataset(), + dataset_dict['test'].to_hf_dataset(), + ]) + if dataset_sample != -1: + idxs = np.random.permutation(dataset_sample) + dataset = dataset.select(idxs) + return _preprocess_medical(dataset, subset_name) + + +def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset: + query = [] + response = [] + history: List[History] = [] + for d in tqdm(dataset): + conversation = ast.literal_eval(d['conversation']) + query.append(conversation[-1]['human']) + response.append(conversation[-1]['assistant']) + h = [] + for c in conversation[:-1]: + h.append((c['human'], c['assistant'])) + history.append(h) + return HfDataset.from_dict({ + 'query': query, + 'response': response, + 'history': history + }) + + +def get_sharegpt_dataset(subset_name_list: List[str]) -> HfDataset: + dataset_list = [] + for subset_name in subset_name_list: + dataset = MsDataset.load( + 'huangjintao/sharegpt', subset_name=subset_name, + split='train').to_hf_dataset() + dataset_list.append(dataset) + dataset = concatenate_datasets(dataset_list) + return _preprocess_sharegpt(dataset) + + +_sharegpt_zh_subset_list = ['common-zh', 'computer-zh', 'unknow-zh'] + +_sharegpt_en_subset_list = ['common-en', 'computer-en'] + + +def get_sharegpt_all_zh_dataset(): + """multi-round chat""" + return get_sharegpt_dataset(_sharegpt_zh_subset_list) + + +def get_sharegpt_all_en_dataset(): + """multi-round chat""" + return get_sharegpt_dataset(_sharegpt_en_subset_list) + + +def get_cls_fudan_news_zh() -> HfDataset: + """Sequence Classification """ + dataset = MsDataset.load('damo/zh_cls_fudan-news').to_hf_dataset() + return HfDataset.from_dict({ + 'query': dataset['prompt'], + 'response': dataset['answer'] + }) + + +def get_ner_jave_zh() -> HfDataset: + """Named Entity Recognition""" + dataset = MsDataset.load('damo/zh_ner-JAVE').to_hf_dataset() + return HfDataset.from_dict({ + 'query': dataset['prompt'], + 'response': dataset['answer'] + }) + + +def _preprocess_code_python_dataset(dataset: HfDataset) -> HfDataset: + query = [] + response = [] + for d in tqdm(dataset): + chat_rounds = ast.literal_eval(d['chat_rounds']) + assert len(chat_rounds) == 2 + query.append(chat_rounds[-2]['content']) + response.append(chat_rounds[-1]['content']) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_code_python_zh_dataset() -> HfDataset: + dataset = MsDataset.load( + 'codefuse-ai/CodeExercise-Python-27k').to_hf_dataset() + return _preprocess_code_python_dataset(dataset) DATASET_MAPPING = { # nlp chat - 'alpaca-en': get_alpaca_gpt4_en_dataset, - 'alpaca-zh': get_alpaca_gpt4_zh_dataset, - 'finance-en': get_finance_en_dataset, - 'multi-alpaca-all': 
get_multi_alpaca_all, - 'code-en': get_code_alpaca_en_dataset, - 'instinwild-en': get_instinwild_en_dataset, - 'instinwild-zh': get_instinwild_zh_dataset, - 'cot-en': get_cot_en_dataset, - 'cot-zh': get_cot_zh_dataset, - 'damo-agent-mini-zh': partial(get_damo_agent_zh_dataset, use_mini=True), - 'damo-agent-zh': get_damo_agent_zh_dataset, # containing normal chat - 'firefly-all-zh': get_firefly_all_zh_dataset, - 'poetry-zh': get_poetry_zh_dataset, - 'instruct-en': get_instruct_en_dataset, - 'gpt4all-en': get_gpt4all_en_dataset, + 'alpaca-en': + get_alpaca_gpt4_en_dataset, + 'alpaca-zh': + get_alpaca_gpt4_zh_dataset, + 'finance-en': + get_finance_en_dataset, + 'multi-alpaca-all': + get_multi_alpaca_all, + 'code-en': + get_code_alpaca_en_dataset, + 'instinwild-en': + get_instinwild_en_dataset, + 'instinwild-zh': + get_instinwild_zh_dataset, + 'cot-en': + get_cot_en_dataset, + 'cot-zh': + get_cot_zh_dataset, + 'firefly-all-zh': + get_firefly_all_zh_dataset, + 'poetry-zh': + get_poetry_zh_dataset, + 'instruct-en': + get_instruct_en_dataset, + 'gpt4all-en': + get_gpt4all_en_dataset, + 'medical-en': + partial(get_medical_dataset, subset_name='en'), + 'medical-zh': + partial(get_medical_dataset, subset_name='zh'), + 'medical-mini-zh': + partial(get_medical_dataset, subset_name='zh', dataset_sample=100000), + 'code-python-zh': + get_code_python_zh_dataset, + + # multi-round chat + 'damo-agent-mini-zh': + partial(get_damo_agent_zh_dataset, use_mini=True), + 'damo-agent-zh': + get_damo_agent_zh_dataset, # containing normal chat + 'sharegpt-en': + get_sharegpt_all_en_dataset, + 'sharegpt-zh': + get_sharegpt_all_zh_dataset, + # nlp text-generation (please use model:base, template:default-generation) - 'cmnli-zh': get_cmnli_zh_dataset, - 'jd-zh': get_jd_zh_dataset, - 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - # multi-modal chat - 'coco-en': get_coco_en_dataset, + 'cmnli-zh': + get_cmnli_zh_dataset, + 'jd-zh': + get_jd_zh_dataset, + 'dureader-robust-zh': + get_dureader_robust_qg_zh_dataset, 'advertise_gen': get_advertise_gen_dataset, 'du_reader': get_du_reader_dataset, + + # multi-modal chat + 'coco-en': + get_coco_en_dataset, + + # other (e.g. example dataset for specific model) + 'cls-fudan-news-zh': + get_cls_fudan_news_zh, # seqgpt-560m + 'ner-jave-zh': + get_ner_jave_zh, # seqgpt-560m } diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index abfe1140e4..7d3741ebd8 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -392,7 +392,7 @@ class ResTunerTM(NamedTuple): }, 'baichuan2-7b-chat': { 'model_id': 'baichuan-inc/Baichuan2-7B-Chat', - 'revision': 'v1.0.0', + 'revision': 'v1.0.1', 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, }, diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index 18f1e77e71..e6306d36f3 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -1,20 +1,29 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed from huggingface/transformers. 
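The utils.py changes that follow add the DDP + MP (device_map) support that llm_sft.py now gates on via is_ddp_plus_mp(): when every local rank can be given two or more GPUs, each DDP replica is mapped over its own disjoint slice of devices instead of a single GPU. A minimal sketch of that grouping rule, mirroring the device_ids computation in _infer_auto_device_map_patch below; the helper name gpu_slice_for_rank is illustrative and not part of the patch:

    import torch

    def gpu_slice_for_rank(local_rank: int, local_world_size: int) -> list:
        # Stride-based grouping, as in _infer_auto_device_map_patch:
        # with 4 GPUs and local_world_size=2, rank 0 -> [0, 2] and rank 1 -> [1, 3].
        n_gpu = torch.cuda.device_count()
        assert n_gpu % local_world_size == 0, 'GPU count must be a multiple of the local world size'
        return list(range(local_rank, n_gpu, local_world_size))

Each rank then builds its max_memory map only over its own slice, so the model replicas produced by accelerate's infer_auto_device_map never share a device.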
+import heapq import logging import os import shutil +from functools import wraps from tempfile import TemporaryDirectory -from typing import Any, List, Mapping, Optional, Sequence, Tuple +from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, + Tuple, Union) import matplotlib.pyplot as plt import numpy as np import requests import torch import torch.distributed as dist +from accelerate.utils.modeling import (get_balanced_memory, + infer_auto_device_map) from datasets import Dataset as HfDataset +from modelscope import MsDataset from modelscope.utils.config_ds import MS_CACHE_HOME from modelscope.utils.logger import get_logger as get_ms_logger +from torch import device as Device from torch import dtype as Dtype from torch.nn import Linear, Module +from torch.nn.parallel import DistributedDataParallel as DDP from tqdm.auto import tqdm from transformers import GenerationConfig, TextStreamer, trainer @@ -252,10 +261,10 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float, return dataset['train'], dataset['test'] -def sort_by_max_length(dataset: HfDataset) -> HfDataset: - dataset_len = [len(d['input_ids']) for d in dataset] - idx = sorted( - range(len(dataset)), key=lambda i: dataset_len[i], reverse=True) +def sort_by_max_length(dataset: HfDataset, num_dataset: int) -> HfDataset: + dataset_len = [len(d['input_ids']) for d in tqdm(dataset)] + idx = heapq.nlargest( + num_dataset, range(len(dataset_len)), key=lambda i: dataset_len[i]) input_ids = [] labels = [] for i in tqdm(idx): @@ -282,6 +291,107 @@ def check_json_format(obj: Any) -> Any: return res +_old_msdataset_load = MsDataset.load + + +@wraps(_old_msdataset_load) +def _msdataset_ddp_load(*args, **kwargs): + if is_dist() and not is_local_master(): + dist.barrier() + dataset = _old_msdataset_load(*args, **kwargs) + if is_dist() and is_local_master(): + dist.barrier() + + if is_dist(): + dist.barrier() + return dataset + + +def is_ddp_plus_mp() -> bool: + if not is_dist(): + return False + n_gpu = torch.cuda.device_count() + local_world_size = get_dist_setting()[3] + assert n_gpu % local_world_size == 0 + if n_gpu // local_world_size >= 2: + logger.info('Using DDP + MP(device_map)') + return True + return False + + +def _get_max_memory(device_ids: List[int]) -> Dict[Union[int, str], int]: + """add feat in accelerate to support DDP + MP""" + import psutil + # Make sure CUDA is initialized on each GPU to have the right memory info. 
+ for i in device_ids: + _ = torch.tensor([0], device=i) + + device_ids_set = set(device_ids) + max_memory = {} + for i in range(torch.cuda.device_count()): + max_memory[i] = 0 + if i in device_ids_set: + max_memory[i] = torch.cuda.mem_get_info(i)[0] + max_memory['cpu'] = psutil.virtual_memory().available + return max_memory + + +def _sync_max_memory( + max_memory: Dict[Union[int, str], int]) -> Dict[Union[int, str], int]: + """Make sure that the model structure of MP(device_map) is the same, when using DDP.""" + max_memory_list = [ + v for k, v in max_memory.items() if (v > 0 and k != 'cpu') + ] + _, local_rank, world_size, _ = get_dist_setting() + src_tensor = torch.tensor(max_memory_list).to(local_rank) + tgt_tensor_list = [torch.zeros_like(src_tensor) for _ in range(world_size)] + dist.all_gather(tgt_tensor_list, src_tensor) + tgt_tensor = torch.stack(tgt_tensor_list, dim=0) + new_max_memory_iter = iter(tgt_tensor.min(dim=0)[0].tolist()) + new_max_memory = {} + for k, v in max_memory.items(): + new_max_memory[k] = v + if v > 0 and k != 'cpu': + new_max_memory[k] = next(new_max_memory_iter) + return new_max_memory + + +@wraps(infer_auto_device_map) +def _infer_auto_device_map_patch( + model: Module, + max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, + **kwargs) -> Dict[str, Union[int, str, Device]]: + """The auxiliary function for supports DDP+MP. Monkey Patching. + add feat in accelerate to support DDP + MP""" + verbose = kwargs.pop('verbose', False) + n_gpu = torch.cuda.device_count() + _, local_rank, _, local_world_size = get_dist_setting() + device_ids = list(range(local_rank, n_gpu, local_world_size)) + max_memory = _get_max_memory(device_ids) + max_memory = _sync_max_memory(max_memory) + max_memory = get_balanced_memory( + model, max_memory, low_zero=False, **kwargs) + max_memory = {k: v for k, v in max_memory.items() if v > 0} + return infer_auto_device_map(model, max_memory, verbose=verbose, **kwargs) + + +def dataset_map( + dataset: HfDataset, preprocess_func: Callable[[Dict[str, Any]], + Dict[str, + Optional[List[int]]]] +) -> HfDataset: + # faster than dataset.map + input_ids = [] + labels = [] + for d in tqdm(dataset): + d = preprocess_func(d) + if d['input_ids'] is None: + continue + input_ids.append(d['input_ids']) + labels.append(d['labels']) + return HfDataset.from_dict({'input_ids': input_ids, 'labels': labels}) + + logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s') logger.handlers[0].setFormatter(logger_format) @@ -296,3 +406,19 @@ def check_json_format(obj: Any) -> Any: # monkey patching trainer.DEFAULT_PROGRESS_CALLBACK = ProgressCallbackNew trainer.DEFAULT_CALLBACKS = [DefaultFlowCallbackNew] +MsDataset.load = _msdataset_ddp_load +if is_ddp_plus_mp(): + import transformers + import accelerate + _old_ddp_init = DDP.__init__ + accelerate.accelerator.torch.nn.parallel.DistributedDataParallel.__init__ = ( + lambda self, model, device_ids, output_device, *args, **kwargs: + _old_ddp_init(self, model, *args, **kwargs)) + transformers.modeling_utils.get_balanced_memory = lambda *args, **kwargs: None + transformers.modeling_utils.infer_auto_device_map = _infer_auto_device_map_patch + _old_accelerator_init = trainer.Accelerator.__init__ + trainer.Accelerator.__init__ = ( + lambda self, device_placement=False, *args, **kwargs: + _old_accelerator_init( + self, device_placement=device_placement, *args, **kwargs)) + trainer.Accelerator.verify_device_map = lambda *args, **kwargs: False diff --git a/swift/trainers/trainers.py 
b/swift/trainers/trainers.py index a659ec8747..1f4a4c2f46 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,7 +8,10 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -from transformers.deepspeed import is_deepspeed_zero3_enabled +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin From a522bbfd7494e4bf9675c3dec664608f0f473a02 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 16:35:00 +0800 Subject: [PATCH 64/70] update --- examples/pytorch/llm/src/llm_sft.py | 4 ++-- examples/pytorch/llm/src/utils/dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bcd6bf7304..3430c20ca4 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -153,7 +153,7 @@ def __post_init__(self): assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 'full': - assert self.quantization_bit != 0, 'not supported' + assert self.quantization_bit == 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: self.learning_rate = 2e-5 @@ -202,7 +202,7 @@ def __post_init__(self): if self.predict_with_generate: self.eval_batch_size = 1 else: - self.eval_batch_size = batch_size + self.eval_batch_size = self.batch_size def llm_sft(args: SftArguments) -> None: diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 79f537857b..9366574a6a 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,7 +3,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import json import numpy as np @@ -570,7 +570,7 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise_gen': get_advertise_gen_dataset, + 'advertise-gen': get_advertise_gen_dataset, 'du_reader': get_du_reader_dataset, # multi-modal chat From c3cab0db0afb9c038edec570b465fd830ad39eba Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 16:43:02 +0800 Subject: [PATCH 65/70] fix bug --- examples/pytorch/llm/src/llm_sft.py | 17 ++++---- examples/pytorch/llm/src/utils/dataset.py | 6 ++- examples/pytorch/llm/src/utils/swift_utils.py | 4 +- swift/trainers/trainers.py | 9 +++-- swift/tuners/adapter.py | 24 +++++------ swift/tuners/base.py | 20 +++++++--- swift/tuners/lora.py | 40 ++++++++++--------- swift/tuners/prompt.py | 17 ++++---- swift/tuners/restuning.py | 23 ++++++----- swift/tuners/side.py | 34 ++++++++++------ 10 files changed, 111 insertions(+), 83 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 3430c20ca4..ef72981dfa 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,12 +11,12 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, 
MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, dataset_map, - find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_ddp_plus_mp, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length, - compute_nlg_metrics, prepare_model) + broadcast_string, check_json_format, compute_nlg_metrics, + dataset_map, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_ddp_plus_mp, is_dist, is_master, plot_images, + prepare_model, process_dataset, select_bnb, select_dtype, + show_layers, sort_by_max_length) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -35,7 +35,10 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={ + 'help': + f'tuner choices: {["lora", "full", "adapter", "restuning"]}' + }) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 9366574a6a..3b418d2d7f 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -570,8 +570,10 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise-gen': get_advertise_gen_dataset, - 'du_reader': get_du_reader_dataset, + 'advertise-gen': + get_advertise_gen_dataset, + 'du_reader': + get_du_reader_dataset, # multi-modal chat 'coco-en': diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 8d931017c6..0c56972aca 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -11,9 +11,7 @@ logger = get_logger() -def prepare_model( - model: Module, args -): +def prepare_model(model: Module, args): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 1f4a4c2f46..c51eae8841 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,14 +8,15 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -try: - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -except ImportError: - from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled + class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): pass diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 3beffcfca8..98f829525a 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -26,10 +26,12 @@ class AdapterConfig(SwiftConfig): See http://arxiv.org/abs/1902.00751 Args: - dim: The dimension of the hidden states - target_modules: The feedforward module to be replaced, in regex format - hidden_pos: The position of the hidden 
state to passed into the adapter, can be int (args) or str (kwargs) - method_name: The method to be replaced, default to replace the forward method + dim(`int`): The dimension of the hidden states + target_modules(`Union[str, List[str]]`): The feedforward module to be replaced. + in regex format if this argument is str, else will match with `end with` if List[str]. + hidden_pos(`Union[str, int]`): The position of the hidden state to be passed into the adapter, + can be int (args) or str (kwargs) + method_name(`str`): The method to be replaced, default is `forward` adapter_length: The length of the adapter length (intermediate length) act_layer: The activation layer of the adapter """ @@ -37,25 +39,24 @@ class AdapterConfig(SwiftConfig): dim: int = field( default=None, metadata={'help': 'The dimension of the hidden states'}) - target_modules: str = field( + target_modules: Union[str, List[str]] = field( default=None, metadata={ - 'help': 'The feedforward module to be replaced, in regex format' + 'help': + 'The feedforward module to be replaced. in regex format if this argument is str, ' + 'else will match with `end with` if List[str].' }) hidden_pos: Union[str, int] = field( default=None, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state to be passed into the adapter, can be int (args) or str (kwargs)' }) method_name: str = field( default='forward', - metadata={ - 'help': - 'The method to be replaced, default to replace the forward method' - }) + metadata={'help': 'The method to be replaced, default is `forward`'}) adapter_length: int = field( default=128, @@ -182,7 +183,6 @@ def __init__( super(nn.Module, self).__init__() self.dim = dim self.adapter_length = adapter_length - # self.adapter_type = adapter_type self.linear1 = nn.Linear(dim, adapter_length) self.act = act_layer() self.linear2 = nn.Linear(adapter_length, dim) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8eaa43aec7..8ad9807e09 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -29,7 +29,7 @@ class SwiftModel(nn.Module): Args: model (`Union[nn.Module, 'SwiftModel']`) A module to be tuned by Swift. - config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of adapter_name: SwiftConfig. + config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of {adapter_name: SwiftConfig}. If it's a config class, the adapter_name will be `default` extra_state_keys (`List[str]`, `optional`) A list of regex to match the extra state keys to be saved. inference_mode (bool, `optional`): Load model at inference mode, default False. @@ -202,7 +202,7 @@ def load_state_file(path): @classmethod def from_pretrained(cls, - model: nn.Module, + model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, inference_mode: bool = False, @@ -211,9 +211,11 @@ def from_pretrained(cls, """Load a set of tuners and corresponding weights by a model_id. Args: - model (`torch.nn.Module`): The model to be tuned. - model_id (`str`): The model_id or a local model dir to use to tune the model. + model (`Union[torch.nn.Module, 'SwiftModel']`): The model to be tuned, + if the model is already a `SwiftModel` it will be un-wrapped and re-wrapped.. + model_id (`str`): The model_id or a local model dir of tuners to use to tune the model. adapter_name (`Union[str, List[str]]`): The adapter_names saved in the model repo to load. 
+ Default `None`, means load all tuners saved in the model_id inference_mode (`bool`): Use in the inference mode or not. revision (`str`): The model revision to use. **kwargs: @@ -247,6 +249,10 @@ def from_pretrained(cls, sub_folder = os.path.join(model_dir, _name) config_file = os.path.join(sub_folder, CONFIG_NAME) + if not os.path.isfile(config_file): + logger.warning(f'{_name} is not a valid tuner') + continue + with open(config_file, 'r') as file: json_object = json.load(file) @@ -315,7 +321,6 @@ def create_or_update_model_card(self, output_dir: str): lines.append( f'{training_procedure_heading}\n{training_config_text}') - # Adds peft version framework_block_heading = '### Framework versions\n' from swift.version import __version__ if framework_block_heading in lines: @@ -326,6 +331,11 @@ def create_or_update_model_card(self, output_dir: str): lines.append( f'{framework_block_heading}\n\n- SWIFT {__version__}\n') + base_model_heading = '### Base model information\n' + lines.append( + f'{base_model_heading}\n\n- BaseModel Class {self.base_model.__class__.__name__}\n' + ) + # write the lines back to README.md with open(os.path.join(output_dir, 'README.md'), 'w') as f: f.writelines(lines) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 760db2d314..15a6594aa1 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -5,7 +5,7 @@ import re from dataclasses import dataclass, field from types import MethodType -from typing import Dict, List +from typing import Dict, List, Union import torch import torch.nn as nn @@ -106,19 +106,20 @@ class LoRAConfig(SwiftConfig): The configuration class for the loRA module. Args: - r: The rank of the LoRA module - target_modules: The modules to be replaced by LoRA, can be the end of the module name or a regex string - lora_alpha: The factor to add the lora weights - lora_dropout: The dropout rate of the lora module - merge_weights: Whether to merge weights when validating - use_merged_linear: Whether to replace with merged linear layer - enable_lora: The modules need to be turned on when using the merged linear layer - fan_in_fan_out: Set this to True if the layer to replace stores weight like (fan_in, fan_out) - bias: Bias type. Values ca be "none", "all" or "lora_only" + r(int): The rank of the LoRA module + target_modules(List[str]): The modules to be replaced by LoRA, + can be the end of the module name or a regex string + lora_alpha(float): The factor to add the lora weights + lora_dropout(float): The dropout rate of the lora module + merge_weights(bool): Whether to merge weights when validating + use_merged_linear(bool): Whether to replace with merged linear layer + enable_lora(List[bool]): The modules need to be turned on when using the merged linear layer + fan_in_fan_out(bool): Set this to True if the layer to replace stores weight like (fan_in, fan_out) + bias(str): Bias type. 
Values ca be "none", "all" or "lora_only" """ r: int = field(default=6, metadata={'help': 'The rank of the LoRA module'}) - target_modules: List = field( + target_modules: List[str] = field( default=None, metadata={ 'help': @@ -193,18 +194,19 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, _module.set_activation(activate) @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, - adapter_name, **kwargs): + def _dynamic_patch_lora(model: torch.nn.Module, + replace_modules: Union[str, List[str]], + use_merged_linear: bool, adapter_name: str, + **kwargs): """Dynamic patch lora to model Args: - model: The torch.nn.Module containing the target module to be patched. - replace_modules: The module names to be replaced, the replacing strategy is `end with`. - use_merged_linear: Whether to replace with merged linear layer + model(`torch.nn.Module`): The torch.nn.Module containing the target module to be patched. + replace_modules(`Union[str, List[str]]`): The module names to be replaced, + the replacing strategy is `end with`. + use_merged_linear(bool): Whether to replace with merged linear layer. + adapter_name(str): The adapter name. **kwargs: The arguments passed from `tune` which are needed by lora. - - Returns: - The lora modules """ modules = {} module_keys = [key for key, _ in model.named_modules()] diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 56605f5896..141c196fdb 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -28,14 +28,15 @@ class PromptConfig(SwiftConfig): Here we apply the VPT to other fields. Args: - dim: The dimension of the hidden states - target_modules: The layer module to be replaced, in regex format - embedding_pos: The position of the embedding tensor - attention_mask_pos: The position of the attention mask - attention_mask_value: The value to pad to the attention mask - prompt_length: The length of the prompt tokens - attach_front: When set to True, prompt is attached in front of the embedding - extract_embedding: Whether the embedding is extracted at final stage to keep the same dims with inputs + dim(`Union[int, List[int]]`): The dimension of the hidden states, use list if there are up-sample blocks + or down-sample blocks + target_modules(str): The layer module to be replaced, in regex format + embedding_pos(Union[str, int]): The position of the embedding tensor + attention_mask_pos(Union[str, int]): The position of the attention mask + attention_mask_value(Union[float, int, bool]): The value to pad to the attention mask + prompt_length(int): The length of the prompt tokens + attach_front(bool): When set to True, prompt is attached in front of the embedding + extract_embedding(bool): Whether the embedding is extracted at final stage to keep the same dims with inputs """ dim: Union[int, List[int]] = field( diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 4744e55b38..d808551971 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -28,16 +28,19 @@ class ResTuningConfig(SwiftConfig): See Args: - dims: The dimensions of the hidden states - root_modules: The root module to be replaced, can a regex string - root_modules_hook: The hook type of root modules, can be "input" or "output" - stem_modules: The stem modules to be replaced, can a regex string or name list of full match format - stem_modules_hook: The hook type of stem modules, can be "input" or "output" - target_modules: The target module to be replaced, can a regex string - 
target_modules_hook: The hook type of target modules, can be "input" or "output" - tuner_cfg: The configuration of the tuning module, can a string or customized config - use_upsample: Whether to use auxiliary upsample module - use_bypass: Whether to use bypass + dims(`Union[List[int], int]`): The dimensions of the hidden states + root_modules(`str`): The root module to be replaced, can a regex string + root_modules_hook(`str`): The hook type of root modules, can be "input" or "output" + stem_modules(`Union[List[str], str]`): The stem modules to be replaced, + can a regex string or name list of full match format + stem_modules_hook(`Union[List[str], str]`): The hook type of stem modules, can be "input" or "output" + target_modules(`str`): The target module to be replaced, can a regex string + target_modules_hook(`str`): The hook type of target modules, can be "input" or "output" + tuner_cfg(`Union[List[Dict], Dict, str]`): The configuration of the tuning module, + can a string or customized config + use_upsample(bool): Whether to use auxiliary upsample module + upsample_out_channels(List[int]): The channels if `use_upsample` + zero_init_last(bool): Use zero to initialize the last Linear in every sub tuner. """ diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 3c40baede9..168cc2bb2c 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -44,14 +44,21 @@ class SideConfig(SwiftConfig): }) side_module_name: str = field( - default=1., + default='fcn4', metadata={'help': 'The name of the additive side networks'}) - hidden_pos: Union[str, int] = field( + source_hidden_pos: Union[str, int] = field( default=0, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state input to the target module, can be int (args) or str (kwargs)' + }) + + target_hidden_pos: Union[str, int] = field( + default=0, + metadata={ + 'help': + 'The position of the hidden state output from the target module, can be int (args) or str (kwargs)' }) def __post_init__(self): @@ -82,18 +89,19 @@ def _forward(self, *args, **kwargs): args_main = getattr( self, f'forward_origin_{adapter_name}')(*args, **kwargs) + + if isinstance(config.source_hidden_pos, int): + x = args[config.source_hidden_pos] + else: + x = kwargs[config.source_hidden_pos] + + x_main = args_main[config.target_modules] \ + if isinstance(args_main, (tuple, list, dict)) else args_main + out = getattr(self, f'side_{adapter_name}')(x, x_main) if isinstance(args_main, (tuple, list, dict)): - if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr( - self, f'side_{adapter_name}')( - *args, args_main[config.hidden_pos]) + args_main[config.target_modules] = out else: - _type = type(args_main) - args_main = list(args_main) - args_main[config.hidden_pos] = getattr( - self, f'side_{adapter_name}')( - *args, args_main[config.hidden_pos]) - args_main = _type(args_main) + args_main = out return args_main if isinstance(tgt_module, nn.Sequential) and not hasattr( From 14cbaac763db8b09993af9306082bb204ba315cc Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 16:43:46 +0800 Subject: [PATCH 66/70] fix arg --- tests/tuners/test_swift_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 83dd5fa44a..f6deec9f86 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -359,7 +359,7 @@ def test_swift_side_bert(self): 
dim=model.config.hidden_size, target_modules=r'.*encoder.encoder', side_module_name='mlp', - hidden_pos='last_hidden_state') + target_hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) result_activate = model(**inputs).logits From 903cb34564836d0ba8e399f0a0b164bf4b895b0d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 17:40:54 +0800 Subject: [PATCH 67/70] fix bugs --- examples/pytorch/llm/src/llm_sft.py | 26 ++++++++++--------- examples/pytorch/llm/src/utils/dataset.py | 6 +++-- examples/pytorch/llm/src/utils/swift_utils.py | 4 +-- swift/trainers/trainers.py | 9 ++++--- swift/utils/torch_utils.py | 11 ++++---- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 3430c20ca4..bbf9b7895f 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,12 +11,12 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, dataset_map, - find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_ddp_plus_mp, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length, - compute_nlg_metrics, prepare_model) + broadcast_string, check_json_format, compute_nlg_metrics, + dataset_map, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_ddp_plus_mp, is_dist, is_master, plot_images, + prepare_model, process_dataset, select_bnb, select_dtype, + show_layers, sort_by_max_length) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -35,7 +35,10 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={ + 'help': + f'tuner choices: {["lora", "full", "adapter", "restuning"]}' + }) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -234,15 +237,15 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - model = prepare_model(model, args) + if args.sft_type != 'full': + model = prepare_model(model, args) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) show_layers(model) print_model_info(model) - logger.info(str(model)) - logger.info(model.get_trainable_parameters()) + logger.info(model) # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) @@ -311,8 +314,7 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=1 - if args.predict_with_generate else args.batch_size, + per_device_eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 9366574a6a..3b418d2d7f 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -570,8 +570,10 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 
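For reference alongside the DATASET_MAPPING entries being reformatted here: each value is a zero-argument callable, with functools.partial used whenever a loader needs fixed arguments (as the medical and damo-agent-mini-zh entries show). A hedged sketch of registering a new entry under that convention; the dataset name and loader below are invented for illustration only:

    from functools import partial

    from datasets import Dataset as HfDataset

    def get_my_corpus_dataset(subset_name: str) -> HfDataset:
        # Illustrative loader: like the other getters in dataset.py, it should
        # return a dataset with 'query' and 'response' columns.
        data = {'query': [f'hello ({subset_name})'], 'response': ['world']}
        return HfDataset.from_dict(data)

    DATASET_MAPPING['my-corpus-zh'] = partial(get_my_corpus_dataset, subset_name='zh')

Keeping the mapping flat like this is what lets --dataset accept a comma-separated list of names in the sft/infer scripts.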
'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise-gen': get_advertise_gen_dataset, - 'du_reader': get_du_reader_dataset, + 'advertise-gen': + get_advertise_gen_dataset, + 'du_reader': + get_du_reader_dataset, # multi-modal chat 'coco-en': diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 8d931017c6..63484a1e9f 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -11,9 +11,7 @@ logger = get_logger() -def prepare_model( - model: Module, args -): +def prepare_model(model: Module, args) -> Module: swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 1f4a4c2f46..c51eae8841 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,14 +8,15 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -try: - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -except ImportError: - from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled + class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): pass diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 867c8d4513..b51453df2e 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -89,12 +89,11 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: n_params /= 1e6 n_grads /= 1e6 n_buffers /= 1e6 - s = [ - f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', - f'{n_buffers:.4f}M Buffers, ', - f'Trainable percentage: {100 * n_grads / n_params:.2f}%.' 
- ] - logger.info(''.join(s)) + s = (f'{name}: ' + f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable ' + f'[{100 * n_grads / n_params:.4f}%]), ' + f'{n_buffers:.4f}M Buffers.') + logger.info(s) def find_sub_module(module: torch.nn.Module, From 2aa0182e718d04fc29996e88627c1357605246c2 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 11:51:20 +0800 Subject: [PATCH 68/70] temporary commit --- examples/pytorch/llm/src/llm_infer.py | 17 ++--- examples/pytorch/llm/src/llm_sft.py | 40 +++++----- examples/pytorch/llm/src/utils/dataset.py | 92 +++++++++++------------ examples/pytorch/llm/src/utils/utils.py | 4 +- 4 files changed, 71 insertions(+), 82 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 61c23ca3c1..0783fd9858 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -26,7 +26,7 @@ class InferArguments: template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx' - eval_human: bool = False # False: eval test_dataset + eval_human: bool = False # False: eval val_dataset seed: int = 42 dtype: str = field( @@ -38,7 +38,7 @@ class InferArguments: metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 dataset_sample: int = 20000 # -1: all dataset - dataset_test_size: float = 0.01 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -138,14 +138,13 @@ def llm_infer(args: InferArguments) -> None: inference(input_ids, model, tokenizer, streamer, generation_config, args.skip_prompt) else: - dataset = get_dataset(args.dataset.split(',')) - _, test_dataset = process_dataset(dataset, args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - mini_test_dataset = test_dataset.select( - range(min(10, test_dataset.shape[0]))) + _, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, + args.dataset_sample, args.dataset_seed) + mini_val_dataset = val_dataset.select( + range(min(10, val_dataset.shape[0]))) del dataset - for data in mini_test_dataset: + for data in mini_val_dataset: response = data['response'] data['response'] = None input_ids = preprocess_func(data)['input_ids'] diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bbf9b7895f..1482e5f7ca 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -56,7 +56,7 @@ class SftArguments: metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 dataset_sample: int = 20000 # -1: all dataset - dataset_test_size: float = 0.01 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
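A note on the dataset_test_size -> dataset_test_ratio rename above: in this commit get_dataset takes the split arguments directly and hands back a (train, val) pair, so llm_infer.py and llm_sft.py no longer call process_dataset themselves. A minimal sketch of the new calling convention as the call sites in this patch use it; the dataset names are only examples, and it assumes the utils package from examples/pytorch/llm/src is importable:

    from utils import get_dataset

    train_dataset, val_dataset = get_dataset(
        ['alpaca-en', 'alpaca-zh'],  # any keys of DATASET_MAPPING
        dataset_test_ratio=0.01,
        dataset_sample=20000,        # -1 keeps the full dataset
        dataset_seed=42)
    print(len(train_dataset), len(val_dataset))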
max_length: Optional[int] = 2048 @@ -127,11 +127,11 @@ class SftArguments: }) # generation config, only useful when `predict_with_generate=True` + max_new_tokens: int = 1024 do_sample: bool = True - top_p: float = 0.7 - max_new_tokens: int = None - temperature: float = 0.95 - top_k: int = 20 + temperature: float = 0.9 + top_k: int = 50 + top_p: float = 0.9 def __post_init__(self): if is_dist(): @@ -248,23 +248,17 @@ def llm_sft(args: SftArguments) -> None: logger.info(model) # ### Loading Dataset - dataset = get_dataset(args.dataset.split(',')) - if isinstance(dataset, tuple): - train_dataset, val_dataset = dataset - else: - train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - - generation_config = { - 'do_sample': args.do_sample, - 'top_p': args.top_p, - 'max_length': None, - 'max_new_tokens': args.max_new_tokens, - 'temperature': args.temperature, - 'top_k': args.top_k, - } + train_dataset, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, args.dataset_sample, + args.dataset_seed) + generation_config = GenerationConfig( + do_sample=args.do_sample, + max_length=None, + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + ) preprocess_func_train = get_preprocess( args.template_type, @@ -346,7 +340,7 @@ def llm_sft(args: SftArguments) -> None: ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, - generation_config=GenerationConfig.from_dict(generation_config), + generation_config=generation_config, local_rank=local_rank, **kwargs) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 3b418d2d7f..664b7981a9 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from .preprocess import History -from .utils import download_dataset +from .utils import download_dataset, process_dataset def _preprocess_alpaca_dataset( @@ -41,23 +41,17 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() return _preprocess_alpaca_dataset(dataset) +def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: + for d in dataset: + pass def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', - split='train').to_hf_dataset().rename_columns({ - 'content': 'query', - 'summary': 'response', - }) + 'lvjianjin/AdvertiseGen',split='train').to_hf_dataset() dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', - split='validation').to_hf_dataset().rename_columns({ - 'content': - 'query', - 'summary': - 'response', - }) - return dataset_train, dataset_val + 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset() + return (_preprocess_advertise_gen_dataset(dataset_train), + _preprocess_advertise_gen_dataset(dataset_val)) def get_alpaca_gpt4_zh_dataset() -> HfDataset: @@ -184,12 +178,12 @@ def _preprocess_mutimodal_dataset(dataset: HfDataset, prompt: str, def get_coco_en_dataset() -> HfDataset: dataset_dict = MsDataset.load('modelscope/coco_2014_caption') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['validation'].to_hf_dataset() - ]) - return _preprocess_mutimodal_dataset(dataset, 'please describe the image', - 'image', 
'caption')
+    train_dataset = dataset_dict['train'].to_hf_dataset()
+    val_dataset = dataset_dict['validation'].to_hf_dataset()
+    return tuple(
+        _preprocess_mutimodal_dataset(dataset, 'please describe the image',
+                                      'image', 'caption')
+        for dataset in (train_dataset, val_dataset))
 
 
 def _filter_agent_dataset(dataset: List[Dict[str, Any]],
@@ -392,12 +386,14 @@ def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset:
 def get_dureader_robust_qg_zh_dataset() -> HfDataset:
     """Question Generation"""
     dataset_dict = MsDataset.load('modelscope/DuReader_robust-QG')
-    dataset: HfDataset = concatenate_datasets([
+    train_dataset: HfDataset = concatenate_datasets([
         dataset_dict['train'].to_hf_dataset(),
         dataset_dict['validation'].to_hf_dataset(),
-        dataset_dict['test'].to_hf_dataset()
     ])
-    return _preprocess_dureader_robust(dataset)
+    val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
+    return tuple(
+        _preprocess_dureader_robust(dataset)
+        for dataset in (train_dataset, val_dataset))
 
 
 def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset:
@@ -419,21 +415,22 @@ def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset:
 
 
 def get_medical_dataset(subset_name: str,
-                        dataset_sample: int = -1) -> HfDataset:
+                        train_dataset_sample: int = -1) -> HfDataset:
     """
    mode: Literal['en', zh]
     """
     dataset_dict = MsDataset.load(
         'huangjintao/medical_zh', subset_name=subset_name)
-    dataset: HfDataset = concatenate_datasets([
+    train_dataset: HfDataset = concatenate_datasets([
         dataset_dict['train'].to_hf_dataset(),
         dataset_dict['val'].to_hf_dataset(),
-        dataset_dict['test'].to_hf_dataset(),
     ])
+    val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
-    if dataset_sample != -1:
-        idxs = np.random.permutation(dataset_sample)
-        dataset = dataset.select(idxs)
-    return _preprocess_medical(dataset, subset_name)
+    if train_dataset_sample != -1:
+        idxs = np.random.permutation(train_dataset_sample)
+        train_dataset = train_dataset.select(idxs)
+    return tuple(_preprocess_medical(dataset, subset_name)
+                 for dataset in (train_dataset, val_dataset))
 
 
 def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset:
@@ -549,7 +546,7 @@ def get_code_python_zh_dataset() -> HfDataset:
     'medical-zh':
     partial(get_medical_dataset, subset_name='zh'),
     'medical-mini-zh':
-    partial(get_medical_dataset, subset_name='zh', dataset_sample=100000),
+    partial(get_medical_dataset, subset_name='zh', train_dataset_sample=100000),
 
     'code-python-zh':
     get_code_python_zh_dataset,
@@ -588,26 +585,25 @@ def get_code_python_zh_dataset() -> HfDataset:
 
 
 def get_dataset(
-    dataset_name_list: List[str]
-) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]:
-    """Returns a dataset to be split or a train-val dataset tuple"""
-    dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = []
+    dataset_name_list: List[str],
+    dataset_test_ratio: float,
+    dataset_sample: int,
+    dataset_seed: int
+) -> Tuple[HfDataset, HfDataset]:
+    """Returns the concatenated train and validation datasets."""
+    train_dataset_list: List[HfDataset] = []
+    val_dataset_list: List[HfDataset] = []
     for dataset_name in dataset_name_list:
         get_function = DATASET_MAPPING[dataset_name]
-        dataset_list.append(get_function())
-
-    assert (all(isinstance(dataset, tuple) for dataset in dataset_list)
-            or all(isinstance(dataset, HfDataset) for dataset in dataset_list))
-    if not isinstance(dataset_list[0], tuple):
-        dataset = concatenate_datasets(dataset_list)
-    else:
-        train_datasets = [dataset[0] for dataset in dataset_list]
-        val_datasets = [dataset[1] for dataset in dataset_list]
-        if len(train_datasets) > 1:
-            train_dataset = concatenate_datasets(train_datasets)
-            val_dataset = concatenate_datasets(val_datasets)
+        dataset = get_function()
+        if isinstance(dataset, (list, tuple)):
+            train_dataset = dataset[0]
+            val_dataset = dataset[1]
         else:
-            train_dataset = train_datasets[0]
-            val_dataset = val_datasets[0]
-        dataset = (train_dataset, val_dataset)
-    return dataset
+            train_dataset, val_dataset = process_dataset(dataset, dataset_test_ratio, dataset_sample, dataset_seed)
+        train_dataset_list.append(train_dataset)
+        val_dataset_list.append(val_dataset)
+
+    train_dataset = concatenate_datasets(train_dataset_list)
+    val_dataset = concatenate_datasets(val_dataset_list)
+    return train_dataset, val_dataset
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py
index e6306d36f3..70167db4f8 100644
--- a/examples/pytorch/llm/src/utils/utils.py
+++ b/examples/pytorch/llm/src/utils/utils.py
@@ -249,7 +249,7 @@ def download_files(url: str, local_path: str, cookies) -> None:
             f.write(data)
 
 
-def process_dataset(dataset: HfDataset, dataset_test_size: float,
+def process_dataset(dataset: HfDataset, dataset_test_ratio: float,
                     dataset_sample: int,
                     dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
     random_state = np.random.RandomState(dataset_seed)
@@ -257,7 +257,7 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float,
         index = random_state.permutation(len(dataset))[:dataset_sample]
         dataset = dataset.select(index)
     dataset = dataset.train_test_split(
-        dataset_test_size, seed=get_seed(random_state))
+        dataset_test_ratio, seed=get_seed(random_state))
     return dataset['train'], dataset['test']
 

From 6a57b109ae15d60f7c0f5d3c79b1e8ce99a04bcc Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Fri, 15 Sep 2023 12:37:33 +0800
Subject: [PATCH 69/70] fix bugs

---
 .../baichuan2_7b_chat/lora_ddp/infer.sh       |  2 +-
 examples/pytorch/llm/src/llm_sft.py           | 24 ++++++++-------
 examples/pytorch/llm/src/utils/__init__.py    |  1 +
 examples/pytorch/llm/src/utils/dataset.py     | 30 +------------------
 .../pytorch/llm/src/utils/metric_utils.py     |  6 ++--
 5 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
index ce54c3ffaa..6988d4a37d 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -8,7 +8,7 @@ python src/llm_infer.py \
    --eval_human false \
    --dataset damo-agent-mini-zh \
    --max_length 4096 \
-    --max_new_tokens 1024 \
+    --max_new_tokens 2048 \
    --temperature 0.9 \
    --top_k 50 \
    --top_p 0.9 \
diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py
index 7d2d5fd483..517886d589 100644
--- a/examples/pytorch/llm/src/llm_sft.py
+++ b/examples/pytorch/llm/src/llm_sft.py
@@ -12,12 +12,12 @@ import torch.distributed as dist
 from transformers import BitsAndBytesConfig, GenerationConfig
 
 from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING,
-                   broadcast_string, check_json_format, dataset_map,
-                   find_all_linear_for_lora, get_dataset, get_dist_setting,
-                   get_model_tokenizer, get_preprocess, is_ddp_plus_mp,
-                   is_dist, is_master, plot_images, select_bnb, select_dtype,
-                   compute_nlg_metrics, prepare_model,
-                   show_layers, sort_by_max_length)
+                   broadcast_string, check_json_format, compute_nlg_metrics,
+                   dataset_map, find_all_linear_for_lora, get_dataset,
+                   get_dist_setting, get_model_tokenizer, get_preprocess,
+                   is_ddp_plus_mp, is_dist, is_master, plot_images,
+                   prepare_model, select_bnb, select_dtype, show_layers,
+                   sort_by_max_length)
 
 from swift import (HubStrategy, Seq2SeqTrainer,
Seq2SeqTrainingArguments, Swift, get_logger) @@ -270,16 +270,20 @@ def llm_sft(args: SftArguments) -> None: val_dataset = val_dataset.select(val_idxs) logger.info(f'train_dataset: {train_dataset}') logger.info(f'val_dataset: {val_dataset}') - preprocess_func_train = get_preprocess(args.template_type, tokenizer, - args.system, args.max_length, validate_generation=False) + preprocess_func_train = get_preprocess( + args.template_type, + tokenizer, + args.system, + args.max_length, + validate_generation=False) preprocess_func_eval = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=args.predict_with_generate) - train_dataset = dataset_map(train_dataset, preprocess_func) - val_dataset = dataset_map(val_dataset, preprocess_func) + train_dataset = dataset_map(train_dataset, preprocess_func_train) + val_dataset = dataset_map(val_dataset, preprocess_func_eval) if args.test_oom_error: train_dataset = sort_by_max_length(train_dataset, 20000) # Data analysis diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 671d38b56d..341293902d 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -2,6 +2,7 @@ from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess +from .swift_utils import prepare_model from .utils import (broadcast_string, check_json_format, dataset_map, download_dataset, find_all_linear_for_lora, get_dist_setting, inference, is_ddp_plus_mp, is_dist, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index ba70b76884..6804a9dca4 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -14,7 +14,7 @@ from swift.utils import get_seed from .preprocess import History -from .utils import download_dataset, process_dataset +from .utils import download_dataset def _preprocess_alpaca_dataset( @@ -42,18 +42,6 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() return _preprocess_alpaca_dataset(dataset) -def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: - for d in dataset: - pass - -def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: - dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen',split='train').to_hf_dataset() - dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset() - return (_preprocess_advertise_gen_dataset(dataset_train), - _preprocess_advertise_gen_dataset(dataset_val)) - def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: prompt = """Task: Generating advertisements based on keywords. 
@@ -156,22 +144,6 @@ def get_instinwild_en_dataset() -> HfDataset: return _preprocess_alpaca_dataset(dataset) -def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: - dataset_train: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', - split='train').to_hf_dataset().rename_columns({ - 'text1': 'query', - 'text2': 'response', - }) - dataset_val: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', - split='validation').to_hf_dataset().rename_columns({ - 'text1': 'query', - 'text2': 'response', - }) - return dataset_train, dataset_val - - def get_cot_en_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'YorickHe/CoT', split='train').to_hf_dataset() diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index d4f964a5e6..2e8df7d53d 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -1,9 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import jieba import numpy as np -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from rouge.rouge import Rouge from swift import get_logger @@ -11,6 +8,9 @@ def compute_nlg_metrics(prediction, tokenizer): + import jieba + from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu + from rouge.rouge import Rouge preds, labels = prediction[0], prediction[1] score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} From 166d3c038f5dd126c16e7a4c702a7706f82b356b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 14:16:20 +0800 Subject: [PATCH 70/70] merge branch --- examples/pytorch/llm/src/llm_sft.py | 30 +++++++------------ examples/pytorch/llm/src/utils/swift_utils.py | 6 ++-- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 517886d589..5d484e423b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -35,11 +35,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( - default='lora', - metadata={ - 'help': - f'tuner choices: {["lora", "full", "adapter", "restuning"]}' - }) + default='lora', metadata={'choices': ['lora', 'full']}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -75,7 +71,6 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0. 
- adapter_length: int = 32 gradient_checkpointing: bool = False batch_size: int = 1 @@ -147,16 +142,12 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - from swift import SwiftTuners - all_types = [ - SwiftTuners.LORA.lower(), - SwiftTuners.ADAPTER.lower(), - SwiftTuners.RESTUNING.lower() - ] + ['full'] - sft_type = [_type.strip() for _type in self.sft_type.split(',')] - assert all([_type.lower() in all_types for _type in sft_type]), \ - f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' - if self.sft_type == 'full': + if self.sft_type == 'lora': + if self.learning_rate is None: + self.learning_rate = 1e-4 + if self.only_save_model is None: + self.only_save_model = False + elif self.sft_type == 'full': assert self.quantization_bit == 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: @@ -164,10 +155,8 @@ def __post_init__(self): if self.only_save_model is None: self.only_save_model = True else: - if self.learning_rate is None: - self.learning_rate = 1e-4 - if self.only_save_model is None: - self.only_save_model = False + raise ValueError(f'sft_type: {self.sft_type}') + if self.template_type is None: self.template_type = MODEL_MAPPING[self.model_type].get( 'template', 'default') @@ -239,6 +228,7 @@ def llm_sft(args: SftArguments) -> None: if args.resume_from_ckpt is None: if args.sft_type != 'full': + # lora model = prepare_model(model, args) else: model = Swift.from_pretrained( diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 63484a1e9f..ee8ef3b489 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -27,7 +27,7 @@ def prepare_model(model: Module, args) -> Module: target_modules=args.lora_target_modules, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') + logger.debug(f'lora_config: {lora_config}') swift_config['lora'] = lora_config elif sft_type.lower() == SwiftTuners.ADAPTER.lower(): adapter_config = AdapterConfig( @@ -38,12 +38,12 @@ def prepare_model(model: Module, args) -> Module: hidden_pos=0, adapter_length=args.adapter_length, ) - logger.info(f'adapter_config: {adapter_config}') + logger.debug(f'adapter_config: {adapter_config}') swift_config['adapter'] = adapter_config elif sft_type.lower() == SwiftTuners.RESTUNING.lower(): restuner_config = ResTuningConfig( dims=model.config.hidden_size, **MODEL_MAPPING[args.model_type]['restuner_TM']) - logger.info(f'restuner_config: {restuner_config}') + logger.debug(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config return Swift.prepare_model(model, swift_config)
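
Usage sketch (not part of the patch series above; a minimal illustration only).
Assuming the post-PATCH-70 state of the example package, get_dataset() always
returns a (train, val) pair and Swift attaches a LoRA tuner through a config
dict keyed 'lora'. The names get_dataset, Swift, LoRAConfig and that key come
from the diffs above; the dataset keys, hyper-parameter values and the `model`
object are illustrative placeholders.

    # Minimal sketch; run from examples/pytorch/llm/src so `utils` is importable.
    from swift import LoRAConfig, Swift
    from utils import get_dataset

    # Concatenated train/val splits built from one or more DATASET_MAPPING keys.
    train_dataset, val_dataset = get_dataset(
        ['medical-mini-zh', 'code-python-zh'],  # keys listed in dataset.py above
        dataset_test_ratio=0.01,
        dataset_sample=20000,
        dataset_seed=42)

    # Attach a LoRA tuner to a model created elsewhere (e.g. via get_model_tokenizer()).
    lora_config = LoRAConfig(
        r=8,                                            # illustrative rank
        target_modules=['q_proj', 'k_proj', 'v_proj'],  # illustrative modules
        lora_alpha=32,
        lora_dropout=0.)
    model = Swift.prepare_model(model, {'lora': lora_config})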