From ff9f473acf100891eba94cc46c0eaf0740a85018 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 00:01:02 +0800 Subject: [PATCH 01/70] try to instead lora with peft lora --- swift/tuners/lora.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 69719c9df1..3848dfbc72 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -77,17 +77,15 @@ class LoRA: @staticmethod def prepare_model(model: nn.Module, config: LoRAConfig): - """Prepare a model with `LoRAConfig`""" - LoRA._dynamic_patch_lora( - model, - replace_modules=config.target_modules, + from peft import LoraConfig, LoraModel + LoraModel(model, LoraConfig( r=config.r, - lora_alpha=config.lora_alpha, + target_modules=config.target_modules, + lora_alpha=int(config.lora_alpha), lora_dropout=config.lora_dropout, - merge_weights=config.merge_weights, - use_merged_linear=config.use_merged_linear, - enable_lora=config.enable_lora, - fan_in_fan_out=config.fan_in_fan_out) + fan_in_fan_out=config.fan_in_fan_out, + bias=config.bias, + ), 'default') def state_dict_callback(state_dict): return lora_state_dict(state_dict, config.bias) From 5b00f1128b186e3fb51df2f04e663e11f081f35e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 10:51:11 +0800 Subject: [PATCH 02/70] Revert "try to instead lora with peft lora" This reverts commit ff9f473acf100891eba94cc46c0eaf0740a85018. --- swift/tuners/lora.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 3848dfbc72..69719c9df1 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -77,15 +77,17 @@ class LoRA: @staticmethod def prepare_model(model: nn.Module, config: LoRAConfig): - from peft import LoraConfig, LoraModel - LoraModel(model, LoraConfig( + """Prepare a model with `LoRAConfig`""" + LoRA._dynamic_patch_lora( + model, + replace_modules=config.target_modules, r=config.r, - target_modules=config.target_modules, - lora_alpha=int(config.lora_alpha), + lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, - fan_in_fan_out=config.fan_in_fan_out, - bias=config.bias, - ), 'default') + merge_weights=config.merge_weights, + use_merged_linear=config.use_merged_linear, + enable_lora=config.enable_lora, + fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict): return lora_state_dict(state_dict, config.bias) From 3d1a618d8690f3eba93d55af179c3022ec748246 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 11:44:56 +0800 Subject: [PATCH 03/70] try to add bnb & gptq linear --- requirements/framework.txt | 2 +- swift/tuners/lora.py | 73 ++++++++++++++++++++++++++++----- tests/tuners/test_swift_base.py | 10 +++++ 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/requirements/framework.txt b/requirements/framework.txt index 4247a138db..c4ecc554c0 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -3,7 +3,7 @@ datasets diffusers>=0.18.0 numpy pandas -peft +peft>=0.5.0 requests safetensors tensorboard diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 69719c9df1..c95ed4438b 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -10,10 +10,23 @@ import torch import torch.nn as nn import torch.nn.functional as F - from .utils import SwiftConfig, SwiftOutput +from peft.utils import get_auto_gptq_quant_linear, get_quantization_config + +from peft.import_utils import is_bnb_available, is_bnb_4bit_available, 
is_auto_gptq_available + +if is_bnb_available(): + import bitsandbytes as bnb + + from peft.tuners.lora import Linear8bitLt -logger = logging.getLogger(__name__) +if is_bnb_4bit_available(): + from peft.tuners.lora import Linear4bit + +if is_auto_gptq_available(): + from peft.tuners.lora import QuantLinear + +logger = logging.getLogger() @dataclass @@ -38,7 +51,7 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' + 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' }) lora_alpha: float = field( default=1., metadata={'help': 'The factor to add the lora weights'}) @@ -54,13 +67,13 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules need to be turned on when using the merged linear layer' + 'The modules need to be turned on when using the merged linear layer' }) fan_in_fan_out: bool = field( default=False, metadata={ 'help': - 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' + 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' }) bias: str = field( default='none', @@ -146,6 +159,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, sub_module.out_features, bias=sub_module.bias is not None, **kwargs) + elif isinstance(sub_module, torch.nn.Embedding): + lora_module = Embedding( + num_embeddings=sub_module.num_embeddings, + embedding_dim=sub_module.embedding_dim, + r=kwargs['r'], + lora_alpha=kwargs['lora_alpha'], + merge_weights=kwargs['merge_weights']) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -157,6 +177,37 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) + elif kwargs.pop('loaded_in_8bit', False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + eight_bit_kwargs = kwargs.copy() + eight_bit_kwargs.update( + { + "has_fp16_weights": sub_module.state.has_fp16_weights, + "memory_efficient_backward": sub_module.state.memory_efficient_backward, + "threshold": sub_module.state.threshold, + "index": sub_module.index, + } + ) + lora_module = Linear8bitLt( + 'default', sub_module.in_features, sub_module.out_features, + bias=kwargs.pop('bias', False), **eight_bit_kwargs + ) + elif kwargs.pop('loaded_in_4bit', False) and is_bnb_4bit_available() and isinstance(sub_module, + bnb.nn.Linear4bit): + four_bit_kwargs = kwargs.copy() + four_bit_kwargs.update( + { + "compute_dtype": sub_module.compute_dtype, + "compress_statistics": sub_module.weight.compress_statistics, + "quant_type": sub_module.weight.quant_type, + } + ) + lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, + bias=kwargs.pop('bias', False), **four_bit_kwargs) + + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) + if AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + lora_module = QuantLinear('default', sub_module, **kwargs) + sub_module.weight = sub_module.qweight if lora_module is not None: lora_module.weight = sub_module.weight @@ -238,11 +289,11 @@ def unpatch_lora(model, config: LoRAConfig): class LoRALayer: def __init__( - self, - r: int, - lora_alpha: int, - lora_dropout: float, - merge_weights: bool, + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, ): self.r = r self.lora_alpha = lora_alpha @@ 
-458,7 +509,7 @@ def __init__(self, self.weight.requires_grad = False # Compute the indices self.lora_ind = self.weight.new_zeros( - (out_features, ), dtype=torch.bool).view(len(enable_lora), -1) + (out_features,), dtype=torch.bool).view(len(enable_lora), -1) self.lora_ind[enable_lora, :] = True self.lora_ind = self.lora_ind.view(-1) self.reset_parameters() diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 715fd0c743..1dff38d84f 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -6,6 +6,7 @@ from time import time import torch +from modelscope import Model, Preprocessor from modelscope.models.nlp.structbert import (SbertConfig, SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME @@ -25,6 +26,15 @@ def tearDown(self): shutil.rmtree(self.tmp_dir) super().tearDown() + def test_swift_lora_forward(self): + model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) + model = Swift.prepare_model(model, config=lora_config) + inputs = preprocessor('how are you') + outputs = model(**inputs) + self.assertTrue('logits' in outputs) + def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) model2 = copy.deepcopy(model) From 18de6ffc06c764fc6dc785026b5c8ae317fd1817 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 12:07:11 +0800 Subject: [PATCH 04/70] add more code --- examples/pytorch/llm/src/llm_sft.py | 7 +++---- tests/tuners/test_swift_base.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 696a887dc8..2f08c0be58 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoraConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, @@ -207,12 +207,11 @@ def llm_sft(args: SftArguments) -> None: logger.info( f'Setting lora_target_modules: {args.lora_target_modules}') if args.resume_from_ckpt is None: - lora_config = LoraConfig( + lora_config = LoRAConfig( r=args.lora_rank, target_modules=args.lora_target_modules, lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p, - task_type='CAUSAL_LM') + lora_dropout=args.lora_dropout_p) logger.info(f'lora_config: {lora_config}') model = Swift.prepare_model(model, lora_config) else: diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 1dff38d84f..763aeab626 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -33,7 +33,7 @@ def test_swift_lora_forward(self): model = Swift.prepare_model(model, config=lora_config) inputs = preprocessor('how are you') outputs = model(**inputs) - self.assertTrue('logits' in outputs) + self.assertTrue(hasattr(outputs, 'logits')) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) From e836d27ea12de693ae3e5a03b806328415615462 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 5 Sep 2023 15:21:08 +0800 Subject: [PATCH 05/70] fix bug 
--- examples/pytorch/llm/src/llm_sft.py | 2 +- swift/tuners/lora.py | 70 +++++++++++++++-------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f08c0be58..a5b86533b8 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoraConfig, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index c95ed4438b..f46e42d826 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -130,6 +130,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): replace_modules = [replace_modules] + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) for module_key in module_keys: if isinstance(replace_modules, str): @@ -145,19 +146,48 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, _key = parts[-1] lora_module = None - if isinstance(sub_module, torch.nn.Linear): + if getattr(model, "is_loaded_in_8bit", False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + eight_bit_kwargs = kwargs.copy() + eight_bit_kwargs.update( + { + "has_fp16_weights": sub_module.state.has_fp16_weights, + "memory_efficient_backward": sub_module.state.memory_efficient_backward, + "threshold": sub_module.state.threshold, + "index": sub_module.index, + } + ) + lora_module = Linear8bitLt( + 'default', sub_module.in_features, sub_module.out_features, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **eight_bit_kwargs + ) + elif getattr(model, "is_loaded_in_4bit", False) and is_bnb_4bit_available() and isinstance(sub_module, + bnb.nn.Linear4bit): + four_bit_kwargs = kwargs.copy() + four_bit_kwargs.update( + { + "compute_dtype": sub_module.compute_dtype, + "compress_statistics": sub_module.weight.compress_statistics, + "quant_type": sub_module.weight.quant_type, + } + ) + lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **four_bit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + lora_module = QuantLinear('default', sub_module, **kwargs) + sub_module.weight = sub_module.qweight + elif isinstance(sub_module, torch.nn.Linear): if use_merged_linear: lora_module = MergedLinear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **kwargs) else: kwargs.pop('enable_lora', None) lora_module = Linear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None, + bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **kwargs) elif isinstance(sub_module, torch.nn.Embedding): lora_module = Embedding( @@ -177,45 +207,17 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) - elif kwargs.pop('loaded_in_8bit', False) and isinstance(sub_module, bnb.nn.Linear8bitLt): - 
eight_bit_kwargs = kwargs.copy() - eight_bit_kwargs.update( - { - "has_fp16_weights": sub_module.state.has_fp16_weights, - "memory_efficient_backward": sub_module.state.memory_efficient_backward, - "threshold": sub_module.state.threshold, - "index": sub_module.index, - } - ) - lora_module = Linear8bitLt( - 'default', sub_module.in_features, sub_module.out_features, - bias=kwargs.pop('bias', False), **eight_bit_kwargs - ) - elif kwargs.pop('loaded_in_4bit', False) and is_bnb_4bit_available() and isinstance(sub_module, - bnb.nn.Linear4bit): - four_bit_kwargs = kwargs.copy() - four_bit_kwargs.update( - { - "compute_dtype": sub_module.compute_dtype, - "compress_statistics": sub_module.weight.compress_statistics, - "quant_type": sub_module.weight.quant_type, - } - ) - lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, - bias=kwargs.pop('bias', False), **four_bit_kwargs) - - AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) - if AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): - lora_module = QuantLinear('default', sub_module, **kwargs) - sub_module.weight = sub_module.qweight if lora_module is not None: lora_module.weight = sub_module.weight if sub_module.bias is not None: lora_module.bias = sub_module.bias + if getattr(sub_module, "state", None) is not None: + lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) modules.append(lora_module) + return modules @staticmethod From 09e267c35a578e5be1449edac19d1646301f3288 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 16:17:39 +0800 Subject: [PATCH 06/70] lint code --- examples/pytorch/llm/src/llm_sft.py | 2 +- swift/tuners/lora.py | 98 +++++++++++++++++------------ swift/tuners/prompt.py | 2 +- swift/utils/torch_utils.py | 6 +- tests/tuners/test_swift_base.py | 6 +- 5 files changed, 67 insertions(+), 47 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index a5b86533b8..2f08c0be58 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -13,7 +13,7 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoraConfig, LoRAConfig, Seq2SeqTrainer, +from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index f46e42d826..e5e315385f 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -10,10 +10,11 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .utils import SwiftConfig, SwiftOutput +from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, + is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config -from peft.import_utils import is_bnb_available, is_bnb_4bit_available, is_auto_gptq_available +from .utils import SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb @@ -51,7 +52,7 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' + 'The modules to be replaced by LoRA, can be the end of the module name or a regex string' }) lora_alpha: float = field( default=1., 
metadata={'help': 'The factor to add the lora weights'}) @@ -67,13 +68,13 @@ class LoRAConfig(SwiftConfig): default=None, metadata={ 'help': - 'The modules need to be turned on when using the merged linear layer' + 'The modules need to be turned on when using the merged linear layer' }) fan_in_fan_out: bool = field( default=False, metadata={ 'help': - 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' + 'Set this to True if the layer to replace stores weight like (fan_in, fan_out)' }) bias: str = field( default='none', @@ -130,7 +131,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): replace_modules = [replace_modules] - AutoGPTQQuantLinear = get_auto_gptq_quant_linear(get_quantization_config(model, method="gptq")) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear( + get_quantization_config(model, method='gptq')) for module_key in module_keys: if isinstance(replace_modules, str): @@ -146,33 +148,47 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, _key = parts[-1] lora_module = None - if getattr(model, "is_loaded_in_8bit", False) and isinstance(sub_module, bnb.nn.Linear8bitLt): + if getattr(model, 'is_loaded_in_8bit', False) and isinstance( + sub_module, bnb.nn.Linear8bitLt): eight_bit_kwargs = kwargs.copy() - eight_bit_kwargs.update( - { - "has_fp16_weights": sub_module.state.has_fp16_weights, - "memory_efficient_backward": sub_module.state.memory_efficient_backward, - "threshold": sub_module.state.threshold, - "index": sub_module.index, - } - ) + eight_bit_kwargs.update({ + 'has_fp16_weights': + sub_module.state.has_fp16_weights, + 'memory_efficient_backward': + sub_module.state.memory_efficient_backward, + 'threshold': + sub_module.state.threshold, + 'index': + sub_module.index, + }) lora_module = Linear8bitLt( - 'default', sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **eight_bit_kwargs - ) - elif getattr(model, "is_loaded_in_4bit", False) and is_bnb_4bit_available() and isinstance(sub_module, - bnb.nn.Linear4bit): + 'default', + sub_module.in_features, + sub_module.out_features, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, + **eight_bit_kwargs) + elif getattr(model, 'is_loaded_in_4bit', + False) and is_bnb_4bit_available() and isinstance( + sub_module, bnb.nn.Linear4bit): four_bit_kwargs = kwargs.copy() - four_bit_kwargs.update( - { - "compute_dtype": sub_module.compute_dtype, - "compress_statistics": sub_module.weight.compress_statistics, - "quant_type": sub_module.weight.quant_type, - } - ) - lora_module = Linear4bit('default', sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, **four_bit_kwargs) - elif AutoGPTQQuantLinear is not None and isinstance(sub_module, AutoGPTQQuantLinear): + four_bit_kwargs.update({ + 'compute_dtype': + sub_module.compute_dtype, + 'compress_statistics': + sub_module.weight.compress_statistics, + 'quant_type': + sub_module.weight.quant_type, + }) + lora_module = Linear4bit( + 'default', + sub_module.in_features, + sub_module.out_features, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, + **four_bit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance( + sub_module, AutoGPTQQuantLinear): lora_module = QuantLinear('default', sub_module, **kwargs) sub_module.weight = sub_module.qweight elif isinstance(sub_module, 
torch.nn.Linear): @@ -180,14 +196,16 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module = MergedLinear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, **kwargs) else: kwargs.pop('enable_lora', None) lora_module = Linear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, "bias") and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, **kwargs) elif isinstance(sub_module, torch.nn.Embedding): lora_module = Embedding( @@ -210,9 +228,9 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if lora_module is not None: lora_module.weight = sub_module.weight - if sub_module.bias is not None: + if getattr(sub_module, 'bias', None) is not None: lora_module.bias = sub_module.bias - if getattr(sub_module, "state", None) is not None: + if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) @@ -291,11 +309,11 @@ def unpatch_lora(model, config: LoRAConfig): class LoRALayer: def __init__( - self, - r: int, - lora_alpha: int, - lora_dropout: float, - merge_weights: bool, + self, + r: int, + lora_alpha: int, + lora_dropout: float, + merge_weights: bool, ): self.r = r self.lora_alpha = lora_alpha @@ -511,7 +529,7 @@ def __init__(self, self.weight.requires_grad = False # Compute the indices self.lora_ind = self.weight.new_zeros( - (out_features,), dtype=torch.bool).view(len(enable_lora), -1) + (out_features, ), dtype=torch.bool).view(len(enable_lora), -1) self.lora_ind[enable_lora, :] = True self.lora_ind = self.lora_ind.view(-1) self.reset_parameters() diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a255c36cff..f426a4dd83 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -61,7 +61,7 @@ class PromptConfig(SwiftConfig): 'help': 'When set to True, prompt is attached in front of the embedding' }) - + extract_embedding: bool = field( default=False, metadata={ diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 4fca0a28d5..dc0d6d395f 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -90,9 +90,9 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: n_grads /= 1e6 n_buffers /= 1e6 s = [ - f'{name}: ', - f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', - f'{n_buffers:.4f}M Buffers', + f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', + f'{n_buffers:.4f}M Buffers, ', + f'Trainable percentage: {100 * n_grads / n_params:.2f}%' ] s += '.' 
logger.info(''.join(s)) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 763aeab626..ce1ccb3307 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -27,8 +27,10 @@ def tearDown(self): super().tearDown() def test_swift_lora_forward(self): - model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') - preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, config=lora_config) inputs = preprocessor('how are you') From 07e93c221fc60aa36cfdc995baaf2cb4f0d201bf Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 16:30:33 +0800 Subject: [PATCH 07/70] wip --- examples/pytorch/llm/src/llm_sft.py | 41 ++++++++++++++++------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f08c0be58..39a3a8c429 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,7 +2,7 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import List, Optional +from typing import List, Optional, Dict import torch import torch.distributed as dist @@ -14,7 +14,7 @@ select_bnb, select_dtype, show_layers) from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, get_logger) + Seq2SeqTrainingArguments, Swift, get_logger, SwiftConfig) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -198,25 +198,30 @@ def llm_sft(args: SftArguments) -> None: model, tokenizer = get_model_tokenizer( args.model_type, torch_dtype=args.torch_dtype, **kwargs) + if args.resume_from_ckpt is None: + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in args.sft_type.split(','): + if sft_type == 'lora': + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}') + + lora_config = LoRAConfig( + r=args.lora_rank, + target_modules=args.lora_target_modules, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout_p) + logger.info(f'lora_config: {lora_config}') # ### Preparing lora if args.sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}') - if args.resume_from_ckpt is None: - lora_config = LoRAConfig( - r=args.lora_rank, - target_modules=args.lora_target_modules, - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') + model = Swift.prepare_model(model, lora_config) - else: - model = Swift.from_pretrained( - model, args.resume_from_ckpt, is_trainable=True) + else: + model = Swift.from_pretrained( + model, args.resume_from_ckpt, is_trainable=True) show_layers(model) 
print_model_info(model) From a9ff3127aab736ebf72c6610593b2bbcbdefa031 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 5 Sep 2023 17:15:09 +0800 Subject: [PATCH 08/70] 1. prompt&adapter support endwith match 2. llm_sft supports mix tuners --- examples/pytorch/llm/src/llm_sft.py | 42 +++++++++++++++---------- examples/pytorch/llm/src/utils/model.py | 27 ++++++++++++++++ swift/tuners/adapter.py | 10 +++++- swift/tuners/prompt.py | 9 +++++- 4 files changed, 70 insertions(+), 18 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 39a3a8c429..6da37e4ef4 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,7 +2,7 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import List, Optional, Dict +from typing import Dict, List, Optional import torch import torch.distributed as dist @@ -13,8 +13,8 @@ is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) -from swift import (HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, get_logger, SwiftConfig) +from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, + Seq2SeqTrainingArguments, Swift, SwiftConfig, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -68,6 +68,7 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0.1 + adapter_length: int = 128 gradient_checkpointing: bool = True batch_size: int = 1 @@ -199,15 +200,16 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - swift_config: Dict[str, SwiftConfig] = dict() - for sft_type in args.sft_type.split(','): - if sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}') + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in args.sft_type.split(','): + if sft_type == 'lora': + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}' + ) lora_config = LoRAConfig( r=args.lora_rank, @@ -215,10 +217,18 @@ def llm_sft(args: SftArguments) -> None: lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout_p) logger.info(f'lora_config: {lora_config}') - # ### Preparing lora - if args.sft_type == 'lora': - - model = Swift.prepare_model(model, lora_config) + swift_config['lora'] = lora_config + elif sft_type == 'adapter': + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=MODEL_MAPPING[model.config.model_type].get( + 'adapter_TM', 'mlp'), + method_name='forward', + hidden_pos=0, + adapter_length=args.adapter_length, + ) + swift_config['adapter'] = adapter_config + model = Swift.prepare_model(model, swift_config) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index bf6cc4c797..d16b76bfa0 100644 --- 
a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -172,6 +172,15 @@ class LoRATM(NamedTuple): polylm = ['c_attn'] +class AdapterTM(NamedTuple): + # default lora target modules. qkv + baichuan = ['mlp'] + chatglm2 = ['mlp'] + llama2 = ['mlp'] + qwen = ['mlp'] + polylm = ['mlp'] + + # Model Home: 'https://modelscope.cn/models/{model_id}/summary' # keys: 'model_id', 'revision', 'get_function', 'template', # 'ignore_file_pattern', 'lora_TM' @@ -181,6 +190,7 @@ class LoRATM(NamedTuple): 'revision': 'v1.0.5', 'get_function': get_model_tokenizer_qwen, 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-7b-chat': { 'model_id': 'qwen/Qwen-7B-Chat', @@ -188,12 +198,14 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_qwen, 'template': 'chatml', 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-vl': { 'model_id': 'qwen/Qwen-VL', 'revision': 'v1.0.2', 'get_function': get_model_tokenizer_qwen_vl, 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'qwen-vl-chat': { 'model_id': 'qwen/Qwen-VL-Chat', @@ -201,23 +213,27 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_qwen_vl, 'template': 'chatml', 'lora_TM': LoRATM.qwen, + 'adapter_TM': AdapterTM.qwen, }, 'baichuan-7b': { 'model_id': 'baichuan-inc/baichuan-7B', 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', 'revision': 'v1.0.5', 'get_function': get_model_tokenizer_baichuan13b, 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'baichuan-13b-chat': { 'model_id': 'baichuan-inc/Baichuan-13B-Chat', 'revision': 'v1.0.8', 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, + 'adapter_TM': AdapterTM.baichuan, }, 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', @@ -225,18 +241,21 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_chatglm2, 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, + 'adapter_TM': AdapterTM.chatglm2, }, 'chatglm2-6b-32k': { 'model_id': 'ZhipuAI/chatglm2-6b-32k', 'revision': 'v1.0.0', 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, + 'adapter_TM': AdapterTM.chatglm2, }, 'llama2-7b': { 'model_id': 'modelscope/Llama-2-7b-ms', 'revision': 'v1.0.2', 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-13b': { 'model_id': 'modelscope/Llama-2-13b-ms', @@ -244,12 +263,14 @@ class LoRATM(NamedTuple): 'get_function': get_model_tokenizer_llama2, 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-70b': { 'model_id': 'modelscope/Llama-2-70b-ms', 'revision': 'v1.0.0', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-7b-chat': { 'model_id': 'modelscope/Llama-2-7b-chat-ms', @@ -257,6 +278,7 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-13b-chat': { 'model_id': 'modelscope/Llama-2-13b-chat-ms', @@ -265,6 +287,7 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'llama2-70b-chat': { 'model_id': 'modelscope/Llama-2-70b-chat-ms', @@ -273,24 +296,28 @@ class LoRATM(NamedTuple): 'template': 'llama', 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 
'openbuddy-llama2-13b': { 'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', 'revision': 'v1.0.0', 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'openbuddy-llama-65b': { 'model_id': 'OpenBuddy/openbuddy-llama-65b-v8-bf16', 'revision': 'v1.0.0', 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, + 'adapter_TM': AdapterTM.llama2, }, 'polylm-13b': { 'model_id': 'damo/nlp_polylm_13b_text_generation', 'revision': 'v1.0.3', 'get_function': get_model_tokenizer_polylm, 'lora_TM': LoRATM.polylm, + 'adapter_TM': AdapterTM.polylm, }, } diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 19233e60eb..c6885a6050 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -76,7 +76,15 @@ def prepare_model(model: nn.Module, config: AdapterConfig) -> SwiftOutput: module_keys = [key for key, _ in model.named_modules()] for module_key in module_keys: - if re.fullmatch(config.target_modules, module_key): # noqa + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, + module_key) + else: + target_module_found = any( + module_key.endswith(target_key) + for target_key in config.target_modules) + + if target_module_found: # noqa module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f426a4dd83..1f5c4b1b14 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -81,7 +81,14 @@ def prepare_model(model: nn.Module, config: PromptConfig): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: - if re.fullmatch(config.target_modules, module_key): # noqa + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, + module_key) + else: + target_module_found = any( + module_key.endswith(target_key) + for target_key in config.target_modules) + if target_module_found: # noqa module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): From 30b3e8a96d15545b398004296a8e7d9f094b5b06 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 00:27:36 +0800 Subject: [PATCH 09/70] add restuner --- swift/__init__.py | 2 + swift/tuners/__init__.py | 4 + swift/tuners/lora.py | 2 - swift/tuners/mapping.py | 8 +- swift/tuners/restuning.py | 323 +++++++++++++++++++++++++++ swift/tuners/restuning_components.py | 301 +++++++++++++++++++++++++ swift/tuners/side.py | 255 +++++++++++++++++++++ 7 files changed, 892 insertions(+), 3 deletions(-) create mode 100644 swift/tuners/restuning.py create mode 100644 swift/tuners/restuning_components.py create mode 100644 swift/tuners/side.py diff --git a/swift/__init__.py b/swift/__init__.py index d4ab2b8c64..e41615c414 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -8,6 +8,7 @@ from .tuners import ( Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, PeftModelForTokenClassification, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, @@ -29,6 +30,7 @@ 'tuners': [ 'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING', 'LoraConfig', 'PeftConfig', + 'ResTuningConfig', 'SideConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', 
'PeftModelForTokenClassification', 'PrefixTuningConfig', diff --git a/swift/tuners/__init__.py b/swift/tuners/__init__.py index bed8803d70..6ebb813e90 100644 --- a/swift/tuners/__init__.py +++ b/swift/tuners/__init__.py @@ -8,6 +8,8 @@ from .base import SwiftModel, Swift from .lora import LoRA, LoRAConfig from .mapping import SWIFT_MAPPING + from .side import Side, SideConfig, SideModule + from .restuning import ResTuning, ResTuningConfig, ResTuningModule from .peft import (LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, @@ -23,6 +25,8 @@ 'base': ['SwiftModel', 'Swift'], 'lora': ['LoRA', 'LoRAConfig'], 'mapping': ['SWIFT_MAPPING'], + 'side': ['Side', 'SideConfig', 'SideModule'], + 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningModule'], 'peft': [ 'LoraConfig', 'PeftConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index e5e315385f..5cbb797970 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -129,8 +129,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, modules = [] module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) - if isinstance(replace_modules, str): - replace_modules = [replace_modules] AutoGPTQQuantLinear = get_auto_gptq_quant_linear( get_quantization_config(model, method='gptq')) diff --git a/swift/tuners/mapping.py b/swift/tuners/mapping.py index 1f91c542ef..b958cc1305 100644 --- a/swift/tuners/mapping.py +++ b/swift/tuners/mapping.py @@ -3,16 +3,22 @@ from .adapter import Adapter, AdapterConfig from .lora import LoRA, LoRAConfig from .prompt import Prompt, PromptConfig +from .restuning import ResTuning, ResTuningConfig +from .side import Side, SideConfig class SwiftTuners: ADAPTER = 'ADAPTER' PROMPT = 'PROMPT' LORA = 'LORA' + SIDE = 'SIDE' + RESTUNING = 'RESTUNING' SWIFT_MAPPING = { SwiftTuners.ADAPTER: (AdapterConfig, Adapter), SwiftTuners.PROMPT: (PromptConfig, Prompt), - SwiftTuners.LORA: (LoRAConfig, LoRA) + SwiftTuners.LORA: (LoRAConfig, LoRA), + SwiftTuners.SIDE: (SideConfig, Side), + SwiftTuners.RESTUNING: (ResTuningConfig, ResTuning), } diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py new file mode 100644 index 0000000000..b72e000bcb --- /dev/null +++ b/swift/tuners/restuning.py @@ -0,0 +1,323 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import re +import types +from dataclasses import dataclass, field +from typing import Union, Dict, Optional, List + +import torch.nn as nn + +from swift.utils.logger import get_logger +from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner +from .utils import SwiftConfig, SwiftOutput + +logger = get_logger() + + +@dataclass +class ResTuningConfig(SwiftConfig): + """ + The configuration class for the ResTuning module. + + ResTuning is a flexible parameter-efficient and memory-efficient tuning paradigm framework. 
+ 'Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding Tuner from Backbone' + by Jiang et al.(2023) + See + + Args: + dims: The dimensions of the hidden states + root_modules: The root module to be replaced, can a regex string + root_modules_hook: The hook type of root modules, can be "input" or "output" + stem_modules: The stem modules to be replaced, can a regex string or name list of full match format + stem_modules_hook: The hook type of stem modules, can be "input" or "output" + target_modules: The target module to be replaced, can a regex string + target_modules_hook: The hook type of target modules, can be "input" or "output" + tuner_cfg: The configuration of the tuning module, can a string or customized config + use_upsample: Whether to use auxiliary upsample module + use_bypass: Whether to use bypass + + """ + + dims: Optional[Union[List[int], int]] = field( + default=None, metadata={'help': 'The dimensions of the hidden states'}) + + root_modules: str = field( + default=None, + metadata={ + 'help': 'The root module to be replaced, can a regex string (use the first matching module) or full match format' + }) + + root_modules_hook: str = field( + default="input", + metadata={ + 'help': 'The hook type of root modules, can be "input" or "output"' + }) + + stem_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + 'help': 'The stem modules to be replaced, can a regex string or name list of full match format' + }) + + stem_modules_hook: str = field( + default="output", + metadata={ + 'help': 'The hook type of stem modules, can be "input" or "output"' + }) + + target_modules: str = field( + default=None, + metadata={ + 'help': 'The target module to be replaced, can a regex string (use the first matching module) or full match format' + }) + + target_modules_hook: str = field( + default="input", + metadata={ + 'help': 'The hook type of target modules, can be "input" or "output"' + }) + + target_hidden_pos: str = field( + default=None, + metadata={ + 'help': + 'The position of the hidden state for target modules output' + }) + + tuner_cfg: Optional[Union[List[Dict], Dict, str]] = field( + default=None, + metadata={'help': 'The configuration of the tuning module, can a string or customized config'}) + + use_upsample: bool = field( + default=False, + metadata={'help': 'Whether to use auxiliary upsample module'}) + + upsample_out_channels: List[int] = field( + default=None, + metadata={'help': 'The number of output channels when "use_upsample" is set to "True"'}) + + zero_init_last: bool = field( + default=False, + metadata={'help': 'Zero init last weight'}) + + use_bypass: bool = field( + default=True, + metadata={'help': 'Whether to use bypass'}) + + def __post_init__(self): + from .mapping import SwiftTuners + self.swift_type = SwiftTuners.RESTUNING + + +class ResTuning: + + @staticmethod + def prepare_model(model: nn.Module, config: ResTuningConfig) -> SwiftOutput: + """Prepare a model with `ResTuningConfig`""" + + def _forward_seq(self, input, *args, **kwargs): + for idx, module in enumerate(self): + if idx >= len(self.origin_module_keys): continue + input = module(input) + return input + + def _forward_target(self, *args, **kwargs): + if self.target_modules_hook == "input": + args_main = _forward_restuning(self) + args_main = self.forward_origin(args_main, **kwargs) + else: + _args_main = self.forward_origin(*args, **kwargs) + args_main = _forward_restuning(self) + if type(_args_main) != type(args_main): + _args_main[self.target_hidden_pos] = 
args_main + args_main = _args_main + return args_main + + def _forward_restuning(self): + probe_results = [] + root_module_ins = self.root_module_ins_list[0] + stem_module_ins_list = self.stem_module_ins_list + top_module = model.get_submodule('') + if root_module_ins: + if root_module_ins.root_modules_hook == 'input': + probe_results.append(root_module_ins.probe_input_data) + else: + probe_results.append(root_module_ins.probe_output_data) + for i, st_mod in enumerate(stem_module_ins_list): + if i == 0 and root_module_ins is None: + probe_results.append(st_mod.probe_input_data) + if st_mod.stem_modules_hook == 'input': + probe_results.append(st_mod.probe_input_data) + else: + probe_results.append(st_mod.probe_output_data) + args_main = getattr(top_module, 'restuning')(probe_results) + return args_main + + # 1. Matching the root module + module_keys = [key for key, _ in model.named_modules()] + root_module_ins_list = [] + if config.root_modules: + for module_key in module_keys: + if re.fullmatch(config.root_modules, module_key): + root_module = model.get_submodule(module_key) + logger.info(f"Matching root module [{module_key}] of type {type(root_module)}") + if isinstance(root_module, (nn.ModuleList, nn.ModuleDict)): + logger.warning( + f"Type of {type(root_module)} may not be supported because of its customized forward") + if config.root_modules_hook == "input": + root_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + else: + root_module.register_forward_hook(probe_output_hook, with_kwargs=True) + root_module.root_modules_hook = config.root_modules_hook + root_module_ins_list.append(root_module) + break + if len(root_module_ins_list) == 0: + logger.error(f"Cannot match root modules") + + # 2. Matching the stem module + stem_module_ins_list = [] + stem_module_ins_index = [] + for module_key in module_keys: + if (isinstance(config.stem_modules, str) and re.fullmatch(config.stem_modules, module_key)) or \ + (isinstance(config.stem_modules, list) and module_key in config.stem_modules): + stem_module = model.get_submodule(module_key) + if isinstance(config.stem_modules, list): + stem_module_ins_index.append(config.stem_modules.index(module_key)) + logger.info(f"Matching stem module [{module_key}] of type {type(stem_module)}") + if isinstance(stem_module, (nn.ModuleList, nn.ModuleDict)): + logger.warning( + f"Type of {type(stem_module)} may not be supported because of its customized forward") + if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: + stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + if config.stem_modules_hook == "input": + stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + else: + stem_module.register_forward_hook(probe_output_hook, with_kwargs=True) + stem_module.stem_modules_hook = config.stem_modules_hook + stem_module_ins_list.append(stem_module) + if isinstance(config.stem_modules, list): + stem_module_ins_list = [stem_module_ins_list[stem_module_ins_index.index(i)] for i in + range(len(stem_module_ins_index))] + depth = len(stem_module_ins_list) + if len(stem_module_ins_list) == 0: + raise Exception(f"Cannot match source modules") + + # 3. Init restuning module + if len(stem_module_ins_list) != 0: + top_module = model.get_submodule('') + restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, + config.upsample_out_channels, config.zero_init_last, + config.tuner_cfg) + setattr(top_module, 'restuning', restuning_module) + + # 4. 
Matching the target module + target_module_ins = None + for module_key in module_keys: + if re.fullmatch(config.target_modules, module_key): + tgt_module = model.get_submodule(module_key) + logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): + raise Exception( + f"Type of {type(tgt_module)} may not be supported because of its customized forward") + + tgt_module.target_modules_hook = config.target_modules_hook + tgt_module.target_hidden_pos = config.target_hidden_pos + tgt_module.root_module_ins_list = root_module_ins_list + tgt_module.stem_module_ins_list = stem_module_ins_list + target_module_ins = tgt_module + + if isinstance(tgt_module, nn.Sequential): + tgt_module.origin_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + tgt_module.forward_origin = types.MethodType(_forward_seq, tgt_module) + else: + tgt_module.forward_origin = tgt_module.forward + tgt_module.forward = types.MethodType(_forward_target, tgt_module) + if target_module_ins is None: + raise Exception(f"Cannot match target modules") + + def state_dict_callback(state_dict): + return { + key: value + for key, value in state_dict.items() if 'restuning' in key + } + + def mark_trainable_callback(model): + return + + return SwiftOutput(config, state_dict_callback, + mark_trainable_callback) + + +class ResTuningBypassModule(nn.Module): + """The implementation of ResTuningBypass method. + """ + + def __init__( + self, + dims, + depth, + use_upsample=False, + upsample_out_channels=None, + zero_init_last=False, + tuner_cfg=None, + ): + super(ResTuningBypassModule, self).__init__() + + self.bypass_blocks = nn.Sequential(*[ + ResTunerBypassBlock( + dim=dims[i] if isinstance(dims, list) else dims, + layer_num=i, + depth=depth, + use_upsample=use_upsample, + upsample_out_channels=upsample_out_channels[i] if isinstance(upsample_out_channels, + list) else upsample_out_channels, + zero_init_last=zero_init_last, + tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list) else tuner_cfg + ) + for i in range(depth)]) + + def forward(self, x_list, **kwargs): + x_bypass = detach_tensors(x_list.pop(0)) + x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass + x_list = detach_tensors(x_list) + x_list = [_x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list] + for i, (bp_blk, x_stem) in enumerate(zip(self.bypass_blocks, x_list)): + target_size = x_list[i + 1].shape[2:] if i < len(x_list) - 1 else None + x_bypass = bp_blk(x_stem, x_bypass, target_size, **kwargs) + return x_bypass + + +class ResTunerBypassBlock(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_last=False, tuner_cfg=None, **kwargs): + super().__init__() + self.layer_num = layer_num + self.depth = depth + + if isinstance(tuner_cfg, str): + lateral_cfg = tuner_cfg + vertical_cfg = tuner_cfg + aux_cfg = "upsample" if use_upsample and layer_num != depth - 1 else None + elif isinstance(tuner_cfg, dict): + lateral_cfg = tuner_cfg['lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None + vertical_cfg = tuner_cfg['vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None + aux_cfg = tuner_cfg['aux_cfg'] if 'aux_cfg' in tuner_cfg else None + + self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "lateral", lateral_cfg, **kwargs) + self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "vertical", vertical_cfg, **kwargs) + if aux_cfg and len(aux_cfg) != 0: + self.aux_tuner = ResTuner(dim, 
layer_num, depth, zero_init_last, "aux", aux_cfg, **kwargs) + + def forward(self, x_stem, x_bypass, target_size=None, **kwargs): + x_lateral = self.lateral_tuner(x_stem) + x_vertical = self.vertical_tuner(x_bypass) + + x_bypass_out = x_lateral + x_vertical + if hasattr(self, 'aux_tuner'): + x_bypass_out = self.aux_tuner(x_bypass_out, target_size) + + # logger.info(f"x_main:{x_stem.shape} / {torch.sum(x_stem)}, x_side:{x_bypass.shape} / {torch.sum(x_bypass)}") + # logger.info(f"x_lateral:{x_lateral.shape} / {torch.sum(x_lateral)}, x_vertical:{x_vertical.shape} / {torch.sum(x_vertical)}") + # logger.info(f"x_bypass_out: {x_bypass_out.shape} / {torch.sum(x_bypass_out)}") + + return x_bypass_out + diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py new file mode 100644 index 0000000000..f6aefb0610 --- /dev/null +++ b/swift/tuners/restuning_components.py @@ -0,0 +1,301 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + +from swift.utils.logger import get_logger + +logger = get_logger() + + +class ResTuner(nn.Module): + def __init__( + self, dim=None, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg={}, **kwargs): + super().__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + self.stage = stage + self.tuner_cfg = tuner_cfg + + if (isinstance(tuner_cfg, str) and tuner_cfg == "res_adapter") or \ + (isinstance(tuner_cfg, dict) and "res_adapter" in tuner_cfg): + tuner_cfg = tuner_cfg['res_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg + self.tuner = ResAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, + stage=stage, tuner_cfg=tuner_cfg, **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == "res_group_adapter") or \ + (isinstance(tuner_cfg, dict) and "res_group_adapter" in tuner_cfg): + tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg + self.tuner = ResGroupAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, + stage=stage, tuner_cfg=tuner_cfg, **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == "upsample") or \ + (isinstance(tuner_cfg, dict) and "upsample" in tuner_cfg): + tuner_cfg = tuner_cfg['upsample'] if isinstance(tuner_cfg, dict) else tuner_cfg + if 'upsample_out_channels' in kwargs: + out_channels = kwargs['upsample_out_channels'] + use_conv = True if out_channels else False + else: + out_channels = dim + use_conv = False + self.tuner = Upsample(channels=dim, use_conv=use_conv, out_channels=out_channels, tuner_cfg=tuner_cfg, + **kwargs) + else: + self.tuner = Identity() + + def forward(self, x, *args, **kwargs): + if self.tuner_cfg == "zero" or "zero" in self.tuner_cfg: + x_out = 0.0 + else: + x_out = self.tuner(x, *args, **kwargs) + return x_out + + +class ResAdapter(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + **kwargs): + super(ResAdapter, self).__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 10 + self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_length = self.adapter_length[self.layer_num] if 
isinstance(self.adapter_length, + list) else self.adapter_length + assert isinstance(self.adapter_length, int) or ( + isinstance(self.adapter_length, tuple) and len(self.adapter_length) == 3) + if isinstance(self.adapter_length, int): + self.ln1 = nn.Linear(dim, self.adapter_length) + else: + self.ln1 = nn.Linear(self.adapter_length[0], self.adapter_length[1]) + self.activate = act_layer() + if isinstance(self.adapter_length, int): + self.ln2 = nn.Linear(self.adapter_length, dim) + else: + self.ln2 = nn.Linear(self.adapter_length[1], self.adapter_length[2]) + dim = self.adapter_length[2] + + self._xavier_init_weights(self.ln1) + if zero_init_last and layer_num == depth - 1: + self._zero_init_weights(self.ln2) + else: + self._xavier_init_weights(self.ln2) + + self.scaling = init_weight_type(dim, self.adapter_weight) + + def _zero_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + def _kaiming_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + nn.init.normal_(m.bias) + + def _xavier_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.normal_(m.bias, std=1e-6) + + def forward(self, x): + x_shortcut = x + if len(x_shortcut.size()) == 4: + B, C, N1, N2 = x.size() + x = x.view(x_shortcut.size()[0], x_shortcut.size()[1], -1).permute(0, 2, 1) + + x_adapter = self.ln2(self.activate(self.ln1(x))) + + if self.adapter_weight: + x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + + if len(x_shortcut.size()) == 4: + x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], + x_shortcut.size()[2], x_shortcut.size()[3]) + x_out = x_shortcut + x_adapter + return x_out + + +class ResGroupAdapter(nn.Module): + def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + **kwargs): + super(ResGroupAdapter, self).__init__() + self.dim = dim + self.layer_num = layer_num + self.depth = depth + + self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_dim = tuner_cfg['dim'] if 'dim' in tuner_cfg else dim + self.adapter_head = tuner_cfg['head'] if 'head' in tuner_cfg else 4 + self.adapter_scale_factor = tuner_cfg['scale_factor'] if 'scale_factor' in tuner_cfg else 2 + + assert self.adapter_dim % self.adapter_head == 0, 'adapter dim should be divisible by adapter head' + self.dim_mlp = self.adapter_dim // self.adapter_head + + self.ln1 = nn.Linear(self.dim_mlp, self.dim_mlp * self.adapter_scale_factor) + self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, self.dim_mlp) + self.activate = act_layer() + + self._kaiming_init_weights(self.ln1) + if zero_init_last and layer_num == depth - 1: + self._zero_init_weights(self.ln2) + else: + self._kaiming_init_weights(self.ln2) + self.scaling = init_weight_type(dim, self.adapter_weight) + + def _zero_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + def _kaiming_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5)) + nn.init.normal_(m.bias) + + def _xavier_init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + nn.init.normal_(m.bias, std=1e-6) + + def forward(self, x): + x_shortcut = x + + batch, 
inner_dim, height, width = x.shape + + x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + + x_adapter = rearrange(x_adapter, "b n (c h) -> (b h) n c", h=self.adapter_head) + x_adapter = self.ln2(self.activate(self.ln1(x_adapter))) + x_adapter = rearrange(x_adapter, "(b h) n c -> b n (c h)", h=self.adapter_head) + + if self.adapter_weight: + x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + + x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() + x_out = x_shortcut + x_adapter + + return x_out + + +class Identity(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, inputs, *args, **kwargs): + return inputs + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, **kwargs): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + if use_conv: + self.conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=padding) + self.init_weights() + + def init_weights(self): + def _init_weights(m): + if isinstance(m, nn.Conv2d): + nn.init.zeros_(m.weight) + nn.init.zeros_(m.bias) + + self.apply(_init_weights) + + def forward(self, x, target_size=None, *args, **kwargs): + assert x.shape[1] == self.channels + if target_size is None: + x = F.interpolate(x.float(), scale_factor=2, mode="nearest").type_as(x) + else: + x = F.interpolate(x.float(), target_size, mode="nearest").type_as(x) + if self.use_conv: + x = self.conv(x) + return x + + +def init_weight_type(dim, weight_type): + if weight_type is None: + scaling = None + elif weight_type == "gate": + scaling = nn.Linear(dim, 1) + elif weight_type == "scale": + scaling = nn.Parameter(torch.Tensor(1)) + scaling.data.fill_(1) + elif weight_type == "scale_kv": + scaling_k = nn.Parameter(torch.Tensor(1)) + scaling_k.data.fill_(1) + scaling_v = nn.Parameter(torch.Tensor(1)) + scaling_v.data.fill_(1) + scaling = (scaling_k, scaling_v) + elif weight_type == "scale_channel": + scaling = nn.Parameter(torch.Tensor(dim)) + scaling.data.fill_(1) + elif weight_type == "scale_kv_channel": + scaling_k = nn.Parameter(torch.Tensor(dim)) + scaling_k.data.fill_(1) + scaling_v = nn.Parameter(torch.Tensor(dim)) + scaling_v.data.fill_(1) + scaling = (scaling_k, scaling_v) + elif weight_type and weight_type.startswith("scalar"): + scaling = float(weight_type.split('_')[-1]) + else: + scaling = None + return scaling + + +def apply_data_weight(data, scaling, weight_type): + if weight_type in ["gate"]: + scaling = torch.mean(torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) + elif weight_type in ["scale", "scale_channel"] or weight_type.startswith('scalar'): + scaling = scaling + else: + scaling = None + if scaling is not None: + data = data * scaling + return data + + +def detach_tensors(feats): + if type(feats) in [list, tuple]: + feats = [detach_tensors(feat) if feat is not None else None for feat in feats] + elif isinstance(feats, dict): + feats = {key: detach_tensors(val) for key, val in feats.items()} + elif isinstance(feats, torch.Tensor): + feats = feats.detach().float() + else: + feats = feats.detach() + return feats + + +def 
probe_tensors(module, feats, name): + feats = detach_tensors(feats) + setattr(module, name, feats) + + +def probe_input_pre_hook(self, args, kwargs): + input = args[0] + probe_tensors(self, input, 'probe_input_data') + return args, kwargs + + +def probe_output_hook(self, args, kwargs, result): + output = result + probe_tensors(self, output, 'probe_output_data') + return output + diff --git a/swift/tuners/side.py b/swift/tuners/side.py new file mode 100644 index 0000000000..f5aabb7a69 --- /dev/null +++ b/swift/tuners/side.py @@ -0,0 +1,255 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import inspect +import re +import types +import copy +from dataclasses import dataclass, field +from functools import partial +from typing import Union, Callable, Any +from collections import OrderedDict +from itertools import repeat + +import torch +from torch import nn +import torchvision + +from swift.utils.logger import get_logger +from .utils import SwiftConfig, SwiftOutput + +logger = get_logger() + + +@dataclass +class SideConfig(SwiftConfig): + """ + The configuration class for the side module. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Args: + target_modules: The feedforward module to be replaced, in regex format + """ + + dim: int = field( + default=None, metadata={'help': 'The dimension of the hidden states'}) + + target_modules: str = field( + default=None, + metadata={ + 'help': 'The target module to be replaced, in full match format' + }) + + side_module_name: float = field( + default=1., metadata={'help': 'The name of the additive side networks'}) + + hidden_pos: Union[str, int] = field( + default=0, + metadata={ + 'help': + 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + }) + + def __post_init__(self): + from .mapping import SwiftTuners + self.swift_type = SwiftTuners.SIDE + + +class Side: + + @staticmethod + def prepare_model(model: nn.Module, config: SideConfig) -> SwiftOutput: + """Prepare a model with `SideConfig`""" + module_keys = [key for key, _ in model.named_modules()] + + for module_key in module_keys: + if re.fullmatch(config.target_modules, module_key): # noqa + tgt_module = model.get_submodule(module_key) + logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): + raise Exception( + f"Type of {type(tgt_module)} may not be supported because of its customized forward") + + def _forward(self, *args, **kwargs): + args_main = self.forward_origin(*args, **kwargs) + if isinstance(args_main, (tuple, list, dict)): + if isinstance(config.hidden_pos, str): + args_main[config.hidden_pos] = getattr(self, 'side')(*args, args_main[config.hidden_pos]) + else: + args_main = getattr(self, 'side')(*args, args_main) + return args_main + + if isinstance(tgt_module, nn.Sequential): + tgt_module.tgt_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + + def forward_seq(self, input, *args, **kwargs): + for idx, module in enumerate(self): + if idx >= len(tgt_module.tgt_module_keys): continue + input = module(input) + return input + + tgt_module.forward_origin = types.MethodType(forward_seq, tgt_module) + else: + tgt_module.forward_origin = tgt_module.forward + tgt_module.forward = types.MethodType(_forward, tgt_module) + side_module = 
SideModule(config.dim, config.side_module_name) + setattr(tgt_module, 'side', side_module) + + def state_dict_callback(state_dict): + return { + key: value + for key, value in state_dict.items() if 'side' in key + } + + def mark_trainable_callback(model): + return + + return SwiftOutput(config, state_dict_callback, + mark_trainable_callback) + + +class SideModule(nn.Module): + """The implementation of vision side-tuning method. + + Side-Tuning only needs to train one side network and + weights the output of pre-trained model and side network. + 'Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks' + by Zhang et al.(2019) + See https://arxiv.org/abs/1912.13503 + + Attributes: + side_module_name: The name of the additive side networks. + """ + + def __init__( + self, + dim, + side_module_name='fcn4' + ): + super(SideModule, self).__init__() + + side_module_name = side_module_name.lower() + if side_module_name == 'fcn4': + self.side_net = FCN4(out_dims=dim) + elif side_module_name == 'mlp': + self.side_net = Mlp(dim) + elif side_module_name == 'alexnet': + mm = torchvision.models.alexnet(pretrained=True) + self.side_net = nn.Sequential( + OrderedDict([ + ('features', mm.features), ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, dim, bias=False)) + ])) + else: + raise ValueError(f'Unsupported side_module_name: {side_module_name}') + self.alpha = nn.Parameter(torch.tensor(0.0)) + + def forward(self, x, x_main): + alpha_squashed = torch.sigmoid(self.alpha) + x_side = self.side_net(x) + x_out = alpha_squashed * x_main + (1 - alpha_squashed) * x_side + return x_out + + +class FCN4(nn.Module): + """The implementation of simple FCN4 network for side network. + """ + + def __init__(self, out_dims=-1, **kwargs): + super(FCN4, self).__init__(**kwargs) + + self.conv1 = nn.Sequential( + nn.Conv2d( + 3, + 16, + kernel_size=3, + stride=1, + padding=1, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv2 = nn.Sequential( + nn.Conv2d( + 16, + 16, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 16), nn.ReLU()) + self.conv3 = nn.Sequential( + nn.Conv2d( + 16, + 32, + kernel_size=3, + stride=2, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 32), nn.ReLU()) + self.conv4 = nn.Sequential( + nn.Conv2d( + 32, + 64, + kernel_size=3, + stride=1, + padding=0, + bias=False, + dilation=1), nn.GroupNorm(2, 64), nn.ReLU()) + self.pool = nn.AdaptiveAvgPool2d((1, 1)) + if out_dims > 0: + self.fc = nn.Linear(64, out_dims) + else: + self.fc = None + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + x = self.conv4(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + if self.fc is not None: + x = self.fc(x) + return x + + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer. 
+ """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0., + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = tuple(repeat(bias, 2)) + drop_probs = tuple(repeat(drop, 2)) + linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x From cbb0b2fc071e591ddeaebc459a51914b27d86496 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 00:29:56 +0800 Subject: [PATCH 10/70] add tests --- swift/tuners/side.py | 2 +- tests/tuners/test_swift_base.py | 40 +++++++- tests/tuners/test_swift_restuning.py | 135 +++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 tests/tuners/test_swift_restuning.py diff --git a/swift/tuners/side.py b/swift/tuners/side.py index f5aabb7a69..9e4f043dd7 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -43,7 +43,7 @@ class SideConfig(SwiftConfig): 'help': 'The target module to be replaced, in full match format' }) - side_module_name: float = field( + side_module_name: str = field( default=1., metadata={'help': 'The name of the additive side networks'}) hidden_pos: Union[str, int] = field( diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index ce1ccb3307..7676a2a283 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -11,7 +11,7 @@ SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub +from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig class TestSwift(unittest.TestCase): @@ -104,3 +104,41 @@ def test_swift_multiple_adapters(self): all( torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) + def test_swift_side(self): + from transformers import AutoModelForImageClassification + model = AutoModelForImageClassification.from_pretrained( + 'google/vit-base-patch16-224') + model2 = copy.deepcopy(model) + result_origin = model(torch.ones((1, 3, 224, 224))).logits + print( + f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' + ) + + side_config = SideConfig( + dim=768, + target_modules=r'vit', + side_module_name='fcn4', + hidden_pos='last_hidden_state') + + model = Swift.prepare_model(model, config=side_config) + result = model(torch.ones((1, 3, 224, 224))).logits + print( + f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' + ) + self.assertTrue(isinstance(model, SwiftModel)) + model.save_pretrained(self.tmp_dir) + self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + + model2 = Swift.from_pretrained(model2, self.tmp_dir) + state_dict = model.state_dict() + state_dict2 = model2.state_dict() + for key in state_dict: + self.assertTrue(key in state_dict2) + self.assertTrue( + all( + torch.isclose(state_dict[key], + 
state_dict2[key]).flatten().detach().cpu())) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py new file mode 100644 index 0000000000..421544d0df --- /dev/null +++ b/tests/tuners/test_swift_restuning.py @@ -0,0 +1,135 @@ +import copy +import os +import shutil +import tempfile +import unittest + +import torch + +from swift import ResTuningConfig +from swift import Swift, SwiftModel + + +class TestSwiftResTuning(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def set_random_seed(self, seed=123): + """Set random seed manually to get deterministic results""" + import random + import numpy as np + import torch + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + def model_comparison(self, model, model2): + model_key = list(model.state_dict().keys()) + model2_key = list(model2.state_dict().keys()) + self.assertTrue(model_key == model2_key) + model_val = torch.sum(torch.stack([torch.sum(val) for val in model.state_dict().values()])) + model2_val = torch.sum(torch.stack([torch.sum(val) for val in model2.state_dict().values()])) + self.assertTrue(torch.isclose(model_val, model2_val)) + + def test_swift_restuning_vit(self): + from transformers import AutoModelForImageClassification + model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + model_swift_1 = copy.deepcopy(model) + model_swift_2 = copy.deepcopy(model) + result_origin = model(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + + # load type - 1 + self.set_random_seed() + restuning_config_1 = ResTuningConfig( + dims=768, + root_modules=r'.*vit.encoder.layer.0$', + stem_modules=r'.*vit.encoder.layer\.\d+$', + target_modules=r'.*vit.layernorm', + target_modules_hook="input", + tuner_cfg="res_adapter", + ) + model_swift_1 = Swift.prepare_model(model_swift_1, config=restuning_config_1) + self.assertTrue(isinstance(model_swift_1, SwiftModel)) + print(model_swift_1.get_trainable_parameters()) + result_swift_1 = model_swift_1(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, result_swift_1 sum: {torch.sum(result_swift_1)}") + + # load type - 2 + self.set_random_seed() + restuning_config_2 = ResTuningConfig( + dims=768, + root_modules=r'.*vit.encoder.layer.0$', + stem_modules=r'.*vit.encoder.layer\.\d+$', + target_modules=r'.*vit.encoder', + target_modules_hook="output", + target_hidden_pos="last_hidden_state", + tuner_cfg="res_adapter", + ) + model_swift_2 = Swift.prepare_model(model_swift_2, config=restuning_config_2) + self.assertTrue(isinstance(model_swift_2, SwiftModel)) + print(model_swift_2.get_trainable_parameters()) + result_swift_2 = model_swift_2(torch.ones((1, 3, 224, 224))).logits + print( + f"test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, result_swift_2 sum: {torch.sum(result_swift_2)}") + + self.assertTrue(all(torch.isclose(result_swift_1, result_swift_2).flatten())) + + model_swift_1.save_pretrained(self.tmp_dir) + 
self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + model_loaded = Swift.from_pretrained(model, self.tmp_dir) + self.model_comparison(model_swift_1, model_loaded) + + def test_swift_restuning_diffusers_sd(self): + from diffusers import UNet2DConditionModel + model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + model.requires_grad_(False) + model2 = copy.deepcopy(model) + self.set_random_seed() + input_data = { + "sample": torch.ones((1, 4, 64, 64)), + "timestep": 10, + "encoder_hidden_states": torch.ones((1, 77, 768)) + } + result_origin = model(**input_data).sample + print( + f"test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + + self.set_random_seed() + restuning_config = ResTuningConfig( + dims=[1280, 1280, 1280, 640, 320], + root_modules='mid_block', + stem_modules=['mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', 'up_blocks.3'], + target_modules='conv_norm_out', + tuner_cfg="res_group_adapter", + use_upsample=True, + upsample_out_channels=[1280, 1280, 640, 320, None], + zero_init_last=True + ) + + model = Swift.prepare_model(model, config=restuning_config) + self.assertTrue(isinstance(model, SwiftModel)) + print(model.get_trainable_parameters()) + + result = model(**input_data).sample + print(f"test_swift_restuning_diffusers_sd result shape: {result.shape}, result sum: {torch.sum(result)}") + model.save_pretrained(self.tmp_dir) + self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + model2 = Swift.from_pretrained(model2, self.tmp_dir) + self.model_comparison(model, model2) + + +if __name__ == '__main__': + unittest.main() From f8d6b091511d9077cfbb54bc844ebe9d3d94dfcb Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 6 Sep 2023 10:19:23 +0800 Subject: [PATCH 11/70] temp --- .../llm/scripts/chatglm_6b/lora_ddp/sft.sh | 2 +- examples/pytorch/llm/src/llm_sft.py | 64 +++-- examples/pytorch/llm/src/utils/dataset.py | 26 +- examples/pytorch/llm/src/utils/preprocess.py | 6 + swift/trainers/trainers.py | 226 +++++++++++++++++- swift/tuners/adapter.py | 6 + swift/tuners/lora.py | 4 - swift/tuners/prompt.py | 5 +- 8 files changed, 314 insertions(+), 25 deletions(-) diff --git a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh index 3baa73288a..ff0c147200 100644 --- a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh @@ -10,7 +10,7 @@ torchrun \ --sft_type lora \ --output_dir runs \ --ddp_backend gloo \ - --dataset alpaca-en,alpaca-zh \ + --dataset advertise_gen \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 1024 \ diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 6da37e4ef4..e12cdacc0f 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -30,7 +30,7 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G sft_type: str = field( - default='lora', metadata={'choices': ['lora', 'full']}) + default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -83,7 +83,7 @@ class SftArguments: lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 - eval_steps: int = 50 + eval_steps: int = 10 save_steps: Optional[int] = None save_total_limit: 
int = 2 logging_steps: int = 5 @@ -123,12 +123,7 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - if self.sft_type == 'lora': - if self.learning_rate is None: - self.learning_rate = 1e-4 - if self.save_steps is None: - self.save_steps = self.eval_steps - elif self.sft_type == 'full': + if self.sft_type == 'full': assert self.quantization_bit is None, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: @@ -137,7 +132,11 @@ def __post_init__(self): # Saving the model takes a long time self.save_steps = self.eval_steps * 4 else: - raise ValueError(f'sft_type: {self.sft_type}') + if self.learning_rate is None: + self.learning_rate = 1e-4 + if self.save_steps is None: + self.save_steps = self.eval_steps + if self.template_type is None: self.template_type = MODEL_MAPPING[self.model_type].get( 'template', 'default') @@ -221,7 +220,7 @@ def llm_sft(args: SftArguments) -> None: elif sft_type == 'adapter': adapter_config = AdapterConfig( dim=model.config.hidden_size, - target_modules=MODEL_MAPPING[model.config.model_type].get( + target_modules=MODEL_MAPPING[args.model_type].get( 'adapter_TM', 'mlp'), method_name='forward', hidden_pos=0, @@ -239,10 +238,13 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) - train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) + if isinstance(dataset, tuple): + train_dataset, val_dataset = dataset + else: + train_dataset, val_dataset = process_dataset(dataset, + args.dataset_test_size, + args.dataset_sample, + args.dataset_seed) preprocess_func = get_preprocess( args.template_type, tokenizer, @@ -314,6 +316,39 @@ def llm_sft(args: SftArguments) -> None: trainer_args._frozen = True logger.info(f'trainer_args: {trainer_args}') + def compute_metrics(self, prediction): + preds, labels = prediction[0], prediction[1] + if isinstance(preds, tuple): + preds = preds[0] + + score_dict = { + 'rouge-1': [], + 'rouge-2': [], + 'rouge-l': [], + 'bleu-4': [] + } + for pred, label in zip(preds, labels): + hypothesis = list(jieba.cut(pred)) + if len(hypothesis) == 0 or ''.join(hypothesis) == '.': + hypothesis = [''] + reference = list(jieba.cut(label)) + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + + for k, v in score_dict.items(): + score_dict[k] = float(np.mean(v)) + return score_dict + trainer = Seq2SeqTrainer( model=model, args=trainer_args, @@ -321,6 +356,7 @@ def llm_sft(args: SftArguments) -> None: train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, + compute_metrics=compute_metrics, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index ee87496835..1ec7397261 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -42,6 +42,26 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: return _process_alpaca_dataset(dataset) +def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: + dataset_train: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', 
split='train').to_hf_dataset().rename_columns({ + "content": "query", + "summary": "response", + }) + dataset_val: HfDataset = MsDataset.load( + 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset().rename_columns({ + "content": "query", + "summary": "response", + }) + return dataset_train, dataset_val + + +def get_alpaca_gpt4_en_dataset() -> HfDataset: + dataset: HfDataset = MsDataset.load( + 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() + return _process_alpaca_dataset(dataset) + + def get_alpaca_gpt4_zh_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() @@ -304,6 +324,7 @@ def get_gpt4all_en_dataset() -> HfDataset: 'gpt4all-en': get_gpt4all_en_dataset, # multi-modal 'coco-en': get_coco_en_dataset, + 'advertise_gen': get_advertise_gen_dataset, } @@ -312,7 +333,10 @@ def get_dataset(dataset_name_list: List[str]) -> HfDataset: for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) - dataset = concatenate_datasets(dataset_list) + if not isinstance(dataset_list[0], tuple): + dataset = concatenate_datasets(dataset_list) + else: + dataset = dataset_list[0] return dataset diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index d3be77610d..417e0fc713 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -12,6 +12,12 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, + 'default_no_template': { + 'prefix': [], + 'prompt': ['{{query}}'], + 'chat_sep': [], + 'suffix': [['eos_token_id']], + }, 'chatml': { 'prefix': [['im_start_id'], 'system\n{{system}}', ['im_end_id'], '\n'], 'prompt': [['im_start_id'], 'user\n{{query}}', ['im_end_id'], '\n', diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index aef7f0b5b3..faceae77a1 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -3,6 +3,13 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.utils.data import Dataset +from transformers.trainer_utils import PredictionOutput +from transformers.utils import logging from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -13,7 +20,224 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - pass + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + **gen_kwargs + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. 
+ ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def predict( + self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. + + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). 
+ """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # XXX: adapt synced_gpus for fairscale as well + gen_kwargs = self._gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + gen_kwargs["synced_gpus"] = ( + gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus + ) + + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "position_ids" in inputs: + gen_kwargs["position_ids"] = inputs.get("position_ids", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] + + gen_kwargs["input_ids"] = generation_inputs + generated_tokens = self.model.generate(**gen_kwargs) + generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] + + # in case the batch is shorter than max length, the output should be padded + if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = 
self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + if has_labels: + labels = inputs["labels"] + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + else: + labels = None + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor # monkey patching diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index c6885a6050..50e2d49100 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -158,6 +158,7 @@ def __init__( self.activate = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() + self._prepared = False def init_weights(self): @@ -169,6 +170,11 @@ def _init_weights(m): self.apply(_init_weights) def forward(self, x, identity=None): + if not self._prepared: + self.ln1.to(x.device) + self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index e5e315385f..7a7fb711c9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -126,7 +126,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - modules = [] module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) if isinstance(replace_modules, str): @@ -234,9 +233,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) setattr(module, _key, lora_module) - modules.append(lora_module) - - return modules @staticmethod def unpatch_lora(model, config: LoRAConfig): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 1f5c4b1b14..927e7437b2 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -79,7 +79,6 @@ class Prompt: @staticmethod def prepare_model(model: nn.Module, config: PromptConfig): module_keys = [key for key, _ in model.named_modules()] - match_module_keys = [] for module_key in module_keys: if isinstance(config.target_modules, str): target_module_found = re.fullmatch(config.target_modules, @@ -144,7 +143,6 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, 'prompt', prompt_module) - match_module_keys.append(module_key) def state_dict_callback(state_dict): return { @@ -185,12 +183,11 @@ def __init__(self, self.prompt_length = 
prompt_length self.mask_values = mask_values self.attach_front = attach_front - self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): - prompt_token = self.prompt_token.expand(x.shape[0], -1, -1) + prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) if self.layer_num == 0: if self.attach_front: From ec27d414390a7e9341fe986be0b5370b66d3cfae Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 6 Sep 2023 11:27:57 +0800 Subject: [PATCH 12/70] add restuner test --- examples/pytorch/llm/src/llm_sft.py | 8 +++- examples/pytorch/llm/src/utils/dataset.py | 6 --- examples/pytorch/llm/src/utils/model.py | 57 +++++++++++++++++++++++ swift/trainers/trainers.py | 4 +- 4 files changed, 67 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index e12cdacc0f..131d02a570 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -14,7 +14,7 @@ select_bnb, select_dtype, show_layers) from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, SwiftConfig, get_logger) + Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -227,6 +227,12 @@ def llm_sft(args: SftArguments) -> None: adapter_length=args.adapter_length, ) swift_config['adapter'] = adapter_config + elif sft_type == 'restuner': + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + **MODEL_MAPPING[args.model_type]['restuner_TM'] + ) + swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) else: model = Swift.from_pretrained( diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 1ec7397261..af3ccb6d66 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -56,12 +56,6 @@ def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: return dataset_train, dataset_val -def get_alpaca_gpt4_en_dataset() -> HfDataset: - dataset: HfDataset = MsDataset.load( - 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() - return _process_alpaca_dataset(dataset) - - def get_alpaca_gpt4_zh_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index d16b76bfa0..b5b2af91f2 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -181,6 +181,45 @@ class AdapterTM(NamedTuple): polylm = ['mlp'] +class ResTunerTM(NamedTuple): + # default lora target modules. 
qkv + baichuan = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*model.norm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + chatglm2 = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*final_layernorm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + llama2 = { + "root_modules": r'.*layers.0$', + "stem_modules": r'.*layers\.\d+$', + "target_modules": r'.*model.norm', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + qwen = { + "root_modules": r'.*transformer.h.0$', + "stem_modules": r'.*transformer.h\.\d+$', + "target_modules": r'.*transformer.ln_f', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + polylm = { + "root_modules": r'.*transformer.h.0$', + "stem_modules": r'.*transformer.h\.\d+$', + "target_modules": r'.*transformer.ln_f', + "target_modules_hook": "input", + "tuner_cfg": "res_adapter", + } + + # Model Home: 'https://modelscope.cn/models/{model_id}/summary' # keys: 'model_id', 'revision', 'get_function', 'template', # 'ignore_file_pattern', 'lora_TM' @@ -191,6 +230,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_qwen, 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-7b-chat': { 'model_id': 'qwen/Qwen-7B-Chat', @@ -199,6 +239,7 @@ class AdapterTM(NamedTuple): 'template': 'chatml', 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-vl': { 'model_id': 'qwen/Qwen-VL', @@ -206,6 +247,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_qwen_vl, 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'qwen-vl-chat': { 'model_id': 'qwen/Qwen-VL-Chat', @@ -214,12 +256,14 @@ class AdapterTM(NamedTuple): 'template': 'chatml', 'lora_TM': LoRATM.qwen, 'adapter_TM': AdapterTM.qwen, + 'restuner_TM': ResTunerTM.qwen, }, 'baichuan-7b': { 'model_id': 'baichuan-inc/baichuan-7B', 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', @@ -227,6 +271,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_baichuan13b, 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'baichuan-13b-chat': { 'model_id': 'baichuan-inc/Baichuan-13B-Chat', @@ -234,6 +279,7 @@ class AdapterTM(NamedTuple): 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, 'adapter_TM': AdapterTM.baichuan, + 'restuner_TM': ResTunerTM.baichuan, }, 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', @@ -242,6 +288,7 @@ class AdapterTM(NamedTuple): 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, 'adapter_TM': AdapterTM.chatglm2, + 'restuner_TM': ResTunerTM.chatglm2, }, 'chatglm2-6b-32k': { 'model_id': 'ZhipuAI/chatglm2-6b-32k', @@ -249,6 +296,7 @@ class AdapterTM(NamedTuple): 'template': 'chatglm2', 'lora_TM': LoRATM.chatglm2, 'adapter_TM': AdapterTM.chatglm2, + 'restuner_TM': ResTunerTM.chatglm2, }, 'llama2-7b': { 'model_id': 'modelscope/Llama-2-7b-ms', @@ -256,6 +304,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-13b': { 'model_id': 'modelscope/Llama-2-13b-ms', @@ -264,6 +313,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 
'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-70b': { 'model_id': 'modelscope/Llama-2-70b-ms', @@ -271,6 +321,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-7b-chat': { 'model_id': 'modelscope/Llama-2-7b-chat-ms', @@ -279,6 +330,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], # use safetensors 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-13b-chat': { 'model_id': 'modelscope/Llama-2-13b-chat-ms', @@ -288,6 +340,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'llama2-70b-chat': { 'model_id': 'modelscope/Llama-2-70b-chat-ms', @@ -297,6 +350,7 @@ class AdapterTM(NamedTuple): 'ignore_file_pattern': [r'.+\.bin$'], 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'openbuddy-llama2-13b': { 'model_id': 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', @@ -304,6 +358,7 @@ class AdapterTM(NamedTuple): 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'openbuddy-llama-65b': { 'model_id': 'OpenBuddy/openbuddy-llama-65b-v8-bf16', @@ -311,6 +366,7 @@ class AdapterTM(NamedTuple): 'template': 'openbuddy_llama', 'lora_TM': LoRATM.llama2, 'adapter_TM': AdapterTM.llama2, + 'restuner_TM': ResTunerTM.llama2, }, 'polylm-13b': { 'model_id': 'damo/nlp_polylm_13b_text_generation', @@ -318,6 +374,7 @@ class AdapterTM(NamedTuple): 'get_function': get_model_tokenizer_polylm, 'lora_TM': LoRATM.polylm, 'adapter_TM': AdapterTM.polylm, + 'restuner_TM': ResTunerTM.polylm, }, } diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index faceae77a1..bc416b9bea 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,6 +8,7 @@ import torch from torch import nn from torch.utils.data import Dataset +from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer_utils import PredictionOutput from transformers.utils import logging @@ -219,7 +220,8 @@ def prediction_step( else: labels = None - return (loss, generated_tokens, labels) + # return (loss, generated_tokens, labels) + return (loss, None, None) def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): From 5c8b401597727dc8e131acbf00212fd8245a4abc Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 13:17:19 +0800 Subject: [PATCH 13/70] test --- examples/pytorch/llm/src/llm_sft.py | 48 ++++++++++++++++---- examples/pytorch/llm/src/utils/preprocess.py | 25 ++++++++-- swift/trainers/trainers.py | 11 +++-- swift/tuners/adapter.py | 6 ++- swift/tuners/restuning_components.py | 24 ++++++++-- swift/utils/llm_utils.py | 3 +- 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 131d02a570..8f2118fa02 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -2,16 +2,17 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List, Optional +from typing import Dict, List +from typing import Optional +import jieba +import 
numpy as np import torch import torch.distributed as dist +from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) +from rouge import Rouge +from rouge.rouge import Rouge from transformers import BitsAndBytesConfig -from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, find_all_linear_for_lora, get_dataset, - get_dist_setting, get_model_tokenizer, get_preprocess, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers) from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) @@ -19,6 +20,11 @@ from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset +from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, + broadcast_string, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_dist, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers) logger = get_logger() @@ -78,7 +84,7 @@ class SftArguments: optim: str = 'adamw_torch' learning_rate: Optional[float] = None weight_decay: float = 0.01 - gradient_accumulation_steps: int = 16 + gradient_accumulation_steps: int = 1 max_grad_norm: float = 1. lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 @@ -246,6 +252,8 @@ def llm_sft(args: SftArguments) -> None: dataset = get_dataset(args.dataset.split(',')) if isinstance(dataset, tuple): train_dataset, val_dataset = dataset + # train_dataset = train_dataset.select(range(100)) + # val_dataset = val_dataset.select(range(100)) else: train_dataset, val_dataset = process_dataset(dataset, args.dataset_test_size, @@ -258,6 +266,13 @@ def llm_sft(args: SftArguments) -> None: args.max_length, batched=True) train_dataset = train_dataset.map(preprocess_func, batched=True) + preprocess_func = get_preprocess( + args.template_type, + tokenizer, + args.system, + args.max_length, + batched=True, + train=False) val_dataset = val_dataset.map(preprocess_func, batched=True) del dataset # Data analysis @@ -279,7 +294,7 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.batch_size, + per_device_eval_batch_size=1, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -309,6 +324,7 @@ def llm_sft(args: SftArguments) -> None: resume_from_checkpoint=args.resume_from_ckpt, ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, + predict_with_generate=True, local_rank=local_rank) if args.gradient_checkpointing: @@ -322,7 +338,7 @@ def llm_sft(args: SftArguments) -> None: trainer_args._frozen = True logger.info(f'trainer_args: {trainer_args}') - def compute_metrics(self, prediction): + def compute_metrics(prediction): preds, labels = prediction[0], prediction[1] if isinstance(preds, tuple): preds = preds[0] @@ -333,7 +349,21 @@ def compute_metrics(self, prediction): 'rouge-l': [], 'bleu-4': [] } + + def _decode(tokens, ignore_pad_token_for_loss=False): + if ignore_pad_token_for_loss: + tokens = np.where(tokens != -100, tokens, + tokenizer.pad_token_id) + tokens = np.where(tokens < tokenizer.vocab_size, tokens, + tokenizer.pad_token_id) + return [ + t for t in tokenizer.batch_decode( + tokens, skip_special_tokens=True) if t != 
'' + ] + for pred, label in zip(preds, labels): + pred = ''.join(_decode(pred, False)) + label = ''.join(_decode(label, True)) hypothesis = list(jieba.cut(pred)) if len(hypothesis) == 0 or ''.join(hypothesis) == '.': hypothesis = [''] diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 417e0fc713..f52dc4bb82 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -40,6 +40,12 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, + 'chatglm2_no_template': { + 'prefix': [[64790, 64792]], + 'prompt': ['{{query}}'], + 'chat_sep': [], + 'suffix': [['eos_token_id']], + }, 'llama': { 'prefix': [['bos_token_id'], '[INST] <>\n{{system}}\n<>\n\n'], @@ -126,6 +132,7 @@ def _preprocess( history: Optional[History] = None, system: Optional[str] = None, max_length: Optional[int] = None, + train = True, ) -> Dict[str, List[int]]: if history is None: history = [] @@ -158,17 +165,24 @@ def _preprocess( labels = None if response is not None: - labels = [-100] * len(input_ids) tgt_input_ids = _encode(tokenizer, [response], []) tgt_input_ids += _encode(tokenizer, template_config['suffix'], []) - input_ids += tgt_input_ids - labels += tgt_input_ids + if train: + labels = [-100] * len(input_ids) + tgt_input_ids + input_ids += tgt_input_ids + else: + labels = tgt_input_ids if max_length is not None: input_ids = input_ids[-max_length:] if labels is not None: labels = labels[-max_length:] + if train: + pass + else: + input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids + return {'input_ids': input_ids, 'labels': labels} @@ -177,7 +191,8 @@ def get_preprocess( tokenizer: PreTrainedTokenizer, system: Optional[str] = None, max_length: Optional[int] = None, - batched: bool = False + batched: bool = False, + train=True, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -186,7 +201,7 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: response: str = example.get('response', None) custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, - custom_system, max_length) + custom_system, max_length, train) if batched: # Avoid tqdm printing too much logs when dataset.map(...) 
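Note (illustration, not part of the patch): the eval branch of `_preprocess` above left-pads each prompt to a fixed length of 64 so that prompts can be batched for generation, and the `data_collate_fn` change later in this patch derives the attention mask from the pad positions. A minimal sketch of how those two pieces fit together; the helper name and the fixed default length are assumptions for illustration only:

```python
# Left-pad prompts to a fixed length and build the attention mask from the
# pad positions, mirroring the hard-coded 64 in _preprocess and the
# torch.where(...) collator in swift/utils/llm_utils.py.
import torch

def left_pad_for_generation(batch_input_ids, pad_token_id, max_length=64):
    padded = []
    for ids in batch_input_ids:
        ids = ids[-max_length:]  # keep the most recent tokens if the prompt is too long
        padded.append([pad_token_id] * (max_length - len(ids)) + ids)
    input_ids = torch.tensor(padded, dtype=torch.int64)
    # Pad positions are masked out; this assumes the prompt itself contains
    # no pad tokens, which is the same assumption the collator change makes.
    attention_mask = torch.where(input_ids == pad_token_id, 0, 1)
    return input_ids, attention_mask
```

Left padding (rather than right padding) is used because decoder-only models continue from the last position, so every prompt has to end at the right edge of the batch for `generate` to work on padded inputs.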
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index bc416b9bea..440ec1633d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -166,7 +166,12 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self._gen_kwargs.copy() + gen_kwargs = { + 'do_sample': True, + 'top_p': 0.7, + 'max_length': 512, + 'temperature': 0.95 + } if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( @@ -220,8 +225,8 @@ def prediction_step( else: labels = None - # return (loss, generated_tokens, labels) - return (loss, None, None) + return (loss, generated_tokens, labels) + # return (loss, None, None) def _pad_tensors_to_max_len(self, tensor, max_length): if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 50e2d49100..bbd2dd880e 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -175,8 +175,12 @@ def forward(self, x, identity=None): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x + identity = identity.to(out.dtype) out = identity + out - return out + return out.to(x_dtype) diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index f6aefb0610..207f02ecd5 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -60,7 +60,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self.layer_num = layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 10 + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 17 self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None @@ -86,6 +86,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self._xavier_init_weights(self.ln2) self.scaling = init_weight_type(dim, self.adapter_weight) + self._prepared = False def _zero_init_weights(self, m): if isinstance(m, nn.Linear): @@ -103,6 +104,14 @@ def _xavier_init_weights(self, m): nn.init.normal_(m.bias, std=1e-6) def forward(self, x): + if not self._prepared: + self.ln1.to(x.device) + self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) x_shortcut = x if len(x_shortcut.size()) == 4: B, C, N1, N2 = x.size() @@ -117,7 +126,7 @@ def forward(self, x): x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], x_shortcut.size()[2], x_shortcut.size()[3]) x_out = x_shortcut + x_adapter - return x_out + return x_out.to(x_dtype) class ResGroupAdapter(nn.Module): @@ -148,6 +157,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", else: self._kaiming_init_weights(self.ln2) self.scaling = init_weight_type(dim, self.adapter_weight) + self._prepared = False def _zero_init_weights(self, m): if isinstance(m, nn.Linear): @@ -165,6 +175,14 @@ def _xavier_init_weights(self, m): nn.init.normal_(m.bias, std=1e-6) def forward(self, x): + if not self._prepared: + self.ln1.to(x.device) 
+ self.activate.to(x.device) + self.ln2.to(x.device) + self._prepared = True + + x_dtype = x.dtype + x = x.to(self.ln1.weight.dtype) x_shortcut = x batch, inner_dim, height, width = x.shape @@ -181,7 +199,7 @@ def forward(self, x): x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() x_out = x_shortcut + x_adapter - return x_out + return x_out.to(x_dtype) class Identity(nn.Module): diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 3ae6e3aca7..21ab38ae6f 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -32,7 +32,8 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: input_ids = [torch.tensor(b['input_ids']) for b in batch] labels = [torch.tensor(b['labels']) for b in batch] attention_mask = [ - torch.ones(len(input_ids[i]), dtype=torch.int64) + torch.where(input_ids[i]==tokenizer.pad_token_id, + 0, 1) for i in range(len(input_ids)) ] From 534d4c91be4cd3c65a3d8bb40d3452f80810fcfe Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 13:58:21 +0800 Subject: [PATCH 14/70] wip --- swift/tuners/restuning_components.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index 207f02ecd5..a3ab2dfe28 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -295,7 +295,7 @@ def detach_tensors(feats): elif isinstance(feats, dict): feats = {key: detach_tensors(val) for key, val in feats.items()} elif isinstance(feats, torch.Tensor): - feats = feats.detach().float() + feats = feats.detach() else: feats = feats.detach() return feats From 7734e98abc5f15f393545cb52178fda85b069519 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 17:39:03 +0800 Subject: [PATCH 15/70] wip --- examples/pytorch/llm/src/llm_sft.py | 103 ++++++++++++++++++- examples/pytorch/llm/src/utils/preprocess.py | 8 +- swift/utils/llm_utils.py | 9 ++ 3 files changed, 113 insertions(+), 7 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 8f2118fa02..83eddd817b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -259,13 +259,110 @@ def llm_sft(args: SftArguments) -> None: args.dataset_test_size, args.dataset_sample, args.dataset_seed) + + args.max_source_length = 64 + args.max_target_length = 64 + prompt_column = 'query' + response_column = 'response' + history_column = None + prefix = '' + max_target_length = 128 + def preprocess_function_eval(examples): + inputs, targets = [], [] + for i in range(len(examples[prompt_column])): + if examples[prompt_column][i] and examples[response_column][i]: + query = examples[prompt_column][i] + if history_column is None or len(examples[history_column][i]) == 0: + prompt = query + else: + prompt = '' + history = examples[history_column][i] + for turn_idx, (old_query, response) in enumerate(history): + prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + turn_idx, old_query, response) + prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + inputs.append(prompt) + targets.append(examples[response_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer( + inputs, + max_length=args.max_source_length, + truncation=True, + padding=True) + labels = tokenizer( + text_target=targets, max_length=max_target_length, truncation=True) + + if True: + labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) + for lb 
in label] + for label in labels['input_ids']] + model_inputs['labels'] = labels['input_ids'] + + return model_inputs + + + def preprocess_function_train(examples): + max_seq_length = args.max_source_length + args.max_target_length + + model_inputs = { + 'input_ids': [], + 'labels': [], + } + for i in range(len(examples[prompt_column])): + if examples[prompt_column][i] and examples[response_column][i]: + query, answer = examples[prompt_column][i], examples[ + response_column][i] + + if history_column is None: + prompt = query + else: + prompt = '' + history = examples[history_column][i] + for turn_idx, (old_query, response) in enumerate(history): + prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + turn_idx, old_query, response) + prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + + prompt = prefix + prompt + a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) + b_ids = tokenizer.encode(text=answer, add_special_tokens=False) + + if len(a_ids) > args.max_source_length - 1: + a_ids = a_ids[:args.max_source_length - 1] + + if len(b_ids) > args.max_target_length - 2: + b_ids = b_ids[:args.max_target_length - 2] + + input_ids = tokenizer.build_inputs_with_special_tokens( + a_ids, b_ids) + + if False: + context_length = input_ids.index(tokenizer.bos_token_id) + else: + context_length = len(a_ids) + 2 + mask_position = context_length - 1 + labels = [-100] * context_length + input_ids[mask_position + 1:] + + pad_len = max_seq_length - len(input_ids) + input_ids = input_ids + [tokenizer.pad_token_id] * pad_len + labels = labels + [tokenizer.pad_token_id] * pad_len + if True: + labels = [(lb if lb != tokenizer.pad_token_id else -100) + for lb in labels] + + model_inputs['input_ids'].append(input_ids) + model_inputs['labels'].append(labels) + + return model_inputs + preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, batched=True) - train_dataset = train_dataset.map(preprocess_func, batched=True) + train_dataset = train_dataset.map(preprocess_function_train, batched=True) preprocess_func = get_preprocess( args.template_type, tokenizer, @@ -273,7 +370,7 @@ def llm_sft(args: SftArguments) -> None: args.max_length, batched=True, train=False) - val_dataset = val_dataset.map(preprocess_func, batched=True) + val_dataset = val_dataset.map(preprocess_function_eval, batched=True) del dataset # Data analysis stat_dataset(train_dataset) @@ -329,7 +426,7 @@ def llm_sft(args: SftArguments) -> None: if args.gradient_checkpointing: # fix: gradients will be None - model.config.use_cache = False + model.config.use_cache = True model.enable_input_require_grads() if is_dist(): trainer_args._frozen = False # Compatible with transformers==4.32.0 diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index f52dc4bb82..13d4f267da 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -178,10 +178,10 @@ def _preprocess( if labels is not None: labels = labels[-max_length:] - if train: - pass - else: - input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids + # if train: + # pass + # else: + # input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids return {'input_ids': input_ids, 'labels': labels} diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 21ab38ae6f..2892099b20 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -42,6 +42,15 @@ def data_collate_fn(batch: List[Dict[str, Any]], 
tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) + + # if 'position_ids' in batch[0]: + # position_ids = [torch.tensor(b['position_ids']) for b in batch] + # return { + # 'input_ids': input_ids, + # 'attention_mask': attention_mask, + # 'labels': labels, + # 'position_ids': torch.stack(position_ids), + # } return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 0be303b1530edc97dc5d1afefaf1226f1b2f0e02 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 19:37:56 +0800 Subject: [PATCH 16/70] refine code --- examples/pytorch/llm/src/llm_sft.py | 239 ++++++++++--------- examples/pytorch/llm/src/utils/dataset.py | 20 +- examples/pytorch/llm/src/utils/preprocess.py | 18 +- swift/trainers/trainers.py | 233 +----------------- swift/utils/llm_utils.py | 13 +- 5 files changed, 148 insertions(+), 375 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 83eddd817b..2f730f76fa 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -74,22 +74,24 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0.1 - adapter_length: int = 128 + adapter_length: int = 32 gradient_checkpointing: bool = True batch_size: int = 1 + eval_batch_size: int = 1 num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 optim: str = 'adamw_torch' learning_rate: Optional[float] = None weight_decay: float = 0.01 - gradient_accumulation_steps: int = 1 + gradient_accumulation_steps: int = 16 max_grad_norm: float = 1. + predict_with_generate: bool = False lr_scheduler_type: str = 'cosine' warmup_ratio: float = 0.05 - eval_steps: int = 10 + eval_steps: int = 50 save_steps: Optional[int] = None save_total_limit: int = 2 logging_steps: int = 5 @@ -105,7 +107,7 @@ class SftArguments: default=None, metadata={ 'help': - 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' + 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' }) # other @@ -113,7 +115,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen-7b')" }) def __post_init__(self): @@ -232,12 +234,14 @@ def llm_sft(args: SftArguments) -> None: hidden_pos=0, adapter_length=args.adapter_length, ) + logger.info(f'adapter_config: {adapter_config}') swift_config['adapter'] = adapter_config elif sft_type == 'restuner': restuner_config = ResTuningConfig( dims=model.config.hidden_size, **MODEL_MAPPING[args.model_type]['restuner_TM'] ) + logger.info(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) else: @@ -252,125 +256,124 @@ def llm_sft(args: SftArguments) -> None: dataset = get_dataset(args.dataset.split(',')) if isinstance(dataset, tuple): train_dataset, val_dataset = dataset - # train_dataset = train_dataset.select(range(100)) - # val_dataset = val_dataset.select(range(100)) else: train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - - args.max_source_length = 64 - args.max_target_length = 64 - prompt_column = 'query' - response_column = 'response' - history_column = None - prefix = '' - max_target_length = 128 - def preprocess_function_eval(examples): - 
inputs, targets = [], [] - for i in range(len(examples[prompt_column])): - if examples[prompt_column][i] and examples[response_column][i]: - query = examples[prompt_column][i] - if history_column is None or len(examples[history_column][i]) == 0: - prompt = query - else: - prompt = '' - history = examples[history_column][i] - for turn_idx, (old_query, response) in enumerate(history): - prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - turn_idx, old_query, response) - prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - inputs.append(prompt) - targets.append(examples[response_column][i]) - - inputs = [prefix + inp for inp in inputs] - model_inputs = tokenizer( - inputs, - max_length=args.max_source_length, - truncation=True, - padding=True) - labels = tokenizer( - text_target=targets, max_length=max_target_length, truncation=True) - - if True: - labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) - for lb in label] - for label in labels['input_ids']] - model_inputs['labels'] = labels['input_ids'] - - return model_inputs - - - def preprocess_function_train(examples): - max_seq_length = args.max_source_length + args.max_target_length - - model_inputs = { - 'input_ids': [], - 'labels': [], - } - for i in range(len(examples[prompt_column])): - if examples[prompt_column][i] and examples[response_column][i]: - query, answer = examples[prompt_column][i], examples[ - response_column][i] - - if history_column is None: - prompt = query - else: - prompt = '' - history = examples[history_column][i] - for turn_idx, (old_query, response) in enumerate(history): - prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - turn_idx, old_query, response) - prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - - prompt = prefix + prompt - a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) - b_ids = tokenizer.encode(text=answer, add_special_tokens=False) - - if len(a_ids) > args.max_source_length - 1: - a_ids = a_ids[:args.max_source_length - 1] - - if len(b_ids) > args.max_target_length - 2: - b_ids = b_ids[:args.max_target_length - 2] - - input_ids = tokenizer.build_inputs_with_special_tokens( - a_ids, b_ids) - - if False: - context_length = input_ids.index(tokenizer.bos_token_id) - else: - context_length = len(a_ids) + 2 - mask_position = context_length - 1 - labels = [-100] * context_length + input_ids[mask_position + 1:] - - pad_len = max_seq_length - len(input_ids) - input_ids = input_ids + [tokenizer.pad_token_id] * pad_len - labels = labels + [tokenizer.pad_token_id] * pad_len - if True: - labels = [(lb if lb != tokenizer.pad_token_id else -100) - for lb in labels] - - model_inputs['input_ids'].append(input_ids) - model_inputs['labels'].append(labels) - - return model_inputs + args.dataset_test_size, + args.dataset_sample, + args.dataset_seed) + + # args.max_source_length = 64 + # args.max_target_length = 64 + # prompt_column = 'query' + # response_column = 'response' + # history_column = None + # prefix = '' + # max_target_length = 128 + # + # def preprocess_function_eval(examples): + # inputs, targets = [], [] + # for i in range(len(examples[prompt_column])): + # if examples[prompt_column][i] and examples[response_column][i]: + # query = examples[prompt_column][i] + # if history_column is None or len(examples[history_column][i]) == 0: + # prompt = query + # else: + # prompt = '' + # history = examples[history_column][i] + # for turn_idx, (old_query, response) in enumerate(history): + # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + # turn_idx, old_query, response) + 
# prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + # inputs.append(prompt) + # targets.append(examples[response_column][i]) + # + # inputs = [prefix + inp for inp in inputs] + # model_inputs = tokenizer( + # inputs, + # max_length=args.max_source_length, + # truncation=True, + # padding=True) + # labels = tokenizer( + # text_target=targets, max_length=max_target_length, truncation=True) + # + # if True: + # labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) + # for lb in label] + # for label in labels['input_ids']] + # model_inputs['labels'] = labels['input_ids'] + # + # return model_inputs + # + # def preprocess_function_train(examples): + # max_seq_length = args.max_source_length + args.max_target_length + # + # model_inputs = { + # 'input_ids': [], + # 'labels': [], + # } + # for i in range(len(examples[prompt_column])): + # if examples[prompt_column][i] and examples[response_column][i]: + # query, answer = examples[prompt_column][i], examples[ + # response_column][i] + # + # if history_column is None: + # prompt = query + # else: + # prompt = '' + # history = examples[history_column][i] + # for turn_idx, (old_query, response) in enumerate(history): + # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( + # turn_idx, old_query, response) + # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) + # + # prompt = prefix + prompt + # a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) + # b_ids = tokenizer.encode(text=answer, add_special_tokens=False) + # + # if len(a_ids) > args.max_source_length - 1: + # a_ids = a_ids[:args.max_source_length - 1] + # + # if len(b_ids) > args.max_target_length - 2: + # b_ids = b_ids[:args.max_target_length - 2] + # + # input_ids = tokenizer.build_inputs_with_special_tokens( + # a_ids, b_ids) + # + # if False: + # context_length = input_ids.index(tokenizer.bos_token_id) + # else: + # context_length = len(a_ids) + 2 + # mask_position = context_length - 1 + # labels = [-100] * context_length + input_ids[mask_position + 1:] + # + # pad_len = max_seq_length - len(input_ids) + # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len + # labels = labels + [tokenizer.pad_token_id] * pad_len + # if True: + # labels = [(lb if lb != tokenizer.pad_token_id else -100) + # for lb in labels] + # + # model_inputs['input_ids'].append(input_ids) + # model_inputs['labels'].append(labels) + # + # return model_inputs preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, - batched=True) - train_dataset = train_dataset.map(preprocess_function_train, batched=True) + batched=True, + validate_generation=False) + train_dataset = train_dataset.map(preprocess_func, batched=True) preprocess_func = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, batched=True, - train=False) - val_dataset = val_dataset.map(preprocess_function_eval, batched=True) + validate_generation=True) + val_dataset = val_dataset.map(preprocess_func, batched=True) del dataset # Data analysis stat_dataset(train_dataset) @@ -391,7 +394,7 @@ def preprocess_function_train(examples): do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=1, + per_device_eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -421,7 +424,7 @@ def preprocess_function_train(examples): resume_from_checkpoint=args.resume_from_ckpt, 
ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, - predict_with_generate=True, + predict_with_generate=args.predict_with_generate, local_rank=local_rank) if args.gradient_checkpointing: @@ -437,8 +440,6 @@ def preprocess_function_train(examples): def compute_metrics(prediction): preds, labels = prediction[0], prediction[1] - if isinstance(preds, tuple): - preds = preds[0] score_dict = { 'rouge-1': [], @@ -450,12 +451,12 @@ def compute_metrics(prediction): def _decode(tokens, ignore_pad_token_for_loss=False): if ignore_pad_token_for_loss: tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) + tokenizer.pad_token_id) tokens = np.where(tokens < tokenizer.vocab_size, tokens, - tokenizer.pad_token_id) + tokenizer.pad_token_id) return [ t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) if t != '' + tokens, skip_special_tokens=True) ] for pred, label in zip(preds, labels): @@ -463,7 +464,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): label = ''.join(_decode(label, True)) hypothesis = list(jieba.cut(pred)) if len(hypothesis) == 0 or ''.join(hypothesis) == '.': - hypothesis = [''] + hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] reference = list(jieba.cut(label)) rouge = Rouge() scores = rouge.get_scores(' '.join(hypothesis), @@ -489,7 +490,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics, + compute_metrics=compute_metrics if args.predict_with_generate else None, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index af3ccb6d66..2f0c73b819 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -2,7 +2,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import json import numpy as np @@ -322,15 +322,27 @@ def get_gpt4all_en_dataset() -> HfDataset: } -def get_dataset(dataset_name_list: List[str]) -> HfDataset: - dataset_list: List[HfDataset] = [] +def get_dataset(dataset_name_list: List[str]) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: + """Returns a dataset to be split or a train-val dataset tuple""" + dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) + + assert(all(isinstance(dataset, tuple) for dataset in dataset_list) + or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) if not isinstance(dataset_list[0], tuple): dataset = concatenate_datasets(dataset_list) else: - dataset = dataset_list[0] + train_datasets = [dataset[0] for dataset in dataset_list] + val_datasets = [dataset[1] for dataset in dataset_list] + if len(train_datasets) > 1: + train_dataset = concatenate_datasets(train_datasets) + val_dataset = concatenate_datasets(val_datasets) + else: + train_dataset = train_datasets[0] + val_dataset = val_datasets[0] + dataset = (train_dataset, val_dataset) return dataset diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 13d4f267da..05090b0862 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -12,7 +12,7 @@ 'chat_sep': ['\n\n'], 
'suffix': [['eos_token_id']], }, - 'default_no_template': { + 'default_generate': { 'prefix': [], 'prompt': ['{{query}}'], 'chat_sep': [], @@ -40,7 +40,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'chatglm2_no_template': { + 'chatglm2_generate': { 'prefix': [[64790, 64792]], 'prompt': ['{{query}}'], 'chat_sep': [], @@ -132,7 +132,7 @@ def _preprocess( history: Optional[History] = None, system: Optional[str] = None, max_length: Optional[int] = None, - train = True, + validate_generation=True, # do cross-validation with `model.generate()` ) -> Dict[str, List[int]]: if history is None: history = [] @@ -167,10 +167,12 @@ def _preprocess( if response is not None: tgt_input_ids = _encode(tokenizer, [response], []) tgt_input_ids += _encode(tokenizer, template_config['suffix'], []) - if train: + if not validate_generation: + # train, or validate with `loss` labels = [-100] * len(input_ids) + tgt_input_ids input_ids += tgt_input_ids else: + # validate with `model.generate()` labels = tgt_input_ids if max_length is not None: @@ -178,9 +180,7 @@ def _preprocess( if labels is not None: labels = labels[-max_length:] - # if train: - # pass - # else: + # if validate_generation: # input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids return {'input_ids': input_ids, 'labels': labels} @@ -192,7 +192,7 @@ def get_preprocess( system: Optional[str] = None, max_length: Optional[int] = None, batched: bool = False, - train=True, + validate_generation=False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -201,7 +201,7 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: response: str = example.get('response', None) custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, - custom_system, max_length, train) + custom_system, max_length, validate_generation) if batched: # Avoid tqdm printing too much logs when dataset.map(...) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 440ec1633d..aef7f0b5b3 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -3,14 +3,6 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -from torch import nn -from torch.utils.data import Dataset -from transformers.deepspeed import is_deepspeed_zero3_enabled -from transformers.trainer_utils import PredictionOutput -from transformers.utils import logging from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -21,230 +13,7 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - - def evaluate( - self, - eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - **gen_kwargs - ) -> Dict[str, float]: - """ - Run evaluation and returns metrics. - - The calling script will be responsible for providing a method to compute metrics, as they are task-dependent - (pass it to the init `compute_metrics` argument). - - You can also subclass and override this method to inject custom behavior. - - Args: - eval_dataset (`Dataset`, *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. 
If it is an [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` - method. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - - gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def predict( - self, - test_dataset: Dataset, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "test", - **gen_kwargs - ) -> PredictionOutput: - """ - Run prediction and returns predictions and potential metrics. - - Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method - will also return metrics, like in `evaluate()`. - - Args: - test_dataset (`Dataset`): - Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the - `model.forward()` method are automatically removed. Has to implement the method `__len__` - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is `"eval"` (default) - max_length (`int`, *optional*): - The maximum target length to use when predicting with the generate method. - num_beams (`int`, *optional*): - Number of beams for beam search that will be used when predicting with the generate method. 1 means no - beam search. - gen_kwargs: - Additional `generate` specific kwargs. - - - - If your predictions or labels have different sequence lengths (for instance because you're doing dynamic - padding in a token classification task) the predictions will be padded (on the right) to allow for - concatenation into one array. The padding index is -100. - - - - Returns: *NamedTuple* A namedtuple with the following keys: - - - predictions (`np.ndarray`): The predictions on `test_dataset`. - - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained - labels). 
- """ - - gen_kwargs = gen_kwargs.copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.args.generation_max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams - ) - self._gen_kwargs = gen_kwargs - - - return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) - - def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). - """ - - if not self.args.predict_with_generate or prediction_loss_only: - return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) - - has_labels = "labels" in inputs - inputs = self._prepare_inputs(inputs) - - # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': 512, - 'temperature': 0.95 - } - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - if "attention_mask" in inputs: - gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) - if "position_ids" in inputs: - gen_kwargs["position_ids"] = inputs.get("position_ids", None) - if "global_attention_mask" in inputs: - gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) - - # prepare generation inputs - # some encoder-decoder models can have varying encoder's and thus - # varying model input names - if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: - generation_inputs = inputs[self.model.encoder.main_input_name] - else: - generation_inputs = inputs[self.model.main_input_name] - - gen_kwargs["input_ids"] = generation_inputs - generated_tokens = self.model.generate(**gen_kwargs) - generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] - - # in case the batch is shorter than max length, the output should be padded - if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( - 
gen_kwargs["max_new_tokens"] + 1 - ): - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) - - loss = None - - if self.args.prediction_loss_only: - return (loss, None, None) - - if has_labels: - labels = inputs["labels"] - if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) - else: - labels = None - - return (loss, generated_tokens, labels) - # return (loss, None, None) - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id - else: - raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor + pass # monkey patching diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 2892099b20..6173b9292b 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -32,8 +32,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: input_ids = [torch.tensor(b['input_ids']) for b in batch] labels = [torch.tensor(b['labels']) for b in batch] attention_mask = [ - torch.where(input_ids[i]==tokenizer.pad_token_id, - 0, 1) + torch.ones(len(input_ids[i]), dtype=torch.int64) for i in range(len(input_ids)) ] @@ -42,15 +41,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) - - # if 'position_ids' in batch[0]: - # position_ids = [torch.tensor(b['position_ids']) for b in batch] - # return { - # 'input_ids': input_ids, - # 'attention_mask': attention_mask, - # 'labels': labels, - # 'position_ids': torch.stack(position_ids), - # } + return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 2ec90616cfb5c02527954b2a308dcf4343ed0a67 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 19:47:43 +0800 Subject: [PATCH 17/70] add generation config --- examples/pytorch/llm/src/llm_sft.py | 10 +++++++++- swift/utils/llm_utils.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 2f730f76fa..dc05bc2c81 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -12,7 +12,7 @@ from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) from rouge import Rouge from rouge.rouge import Rouge -from transformers import BitsAndBytesConfig +from transformers import BitsAndBytesConfig, GenerationConfig from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) @@ -262,6 
+262,13 @@ def llm_sft(args: SftArguments) -> None: args.dataset_sample, args.dataset_seed) + generation_config = { + 'do_sample': True, + 'top_p': 0.7, + 'max_length': args.max_length, + 'temperature': 0.95 + } + # args.max_source_length = 64 # args.max_target_length = 64 # prompt_column = 'query' @@ -425,6 +432,7 @@ def llm_sft(args: SftArguments) -> None: ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, + generation_config=GenerationConfig.from_dict(generation_config), local_rank=local_rank) if args.gradient_checkpointing: diff --git a/swift/utils/llm_utils.py b/swift/utils/llm_utils.py index 6173b9292b..b61ce173d7 100644 --- a/swift/utils/llm_utils.py +++ b/swift/utils/llm_utils.py @@ -41,7 +41,7 @@ def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: attention_mask = pad_sequence( attention_mask, batch_first=True, padding_value=0) labels = pad_sequence(labels, batch_first=True, padding_value=-100) - + return { 'input_ids': input_ids, 'attention_mask': attention_mask, From 953889039eb7fa70fc60c1c23211e68e8ca33f01 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:20:51 +0800 Subject: [PATCH 18/70] fix --- swift/trainers/trainers.py | 97 +++++++++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index aef7f0b5b3..0ba6d62aff 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -1,8 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer +from transformers.deepspeed import is_deepspeed_zero3_enabled from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew @@ -13,7 +18,97 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): - pass + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + **gen_kwargs, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + gen_kwargs: + Additional `generate` specific kwargs. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # XXX: adapt synced_gpus for fairscale as well + # Priority (handled in generate): + # gen_kwargs > model.generation_config > default GenerationConfig() + + if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): + gen_kwargs = self._gen_kwargs.copy() + + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + gen_kwargs["synced_gpus"] = ( + gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus + ) + + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ( + "labels" in inputs + and "decoder_input_ids" in inputs + and inputs["labels"].shape == inputs["decoder_input_ids"].shape + ): + inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} + generated_tokens = self.model.generate(**inputs, **gen_kwargs) + + # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop + # TODO: remove this hack when the legacy code that initializes generation_config from a model config is + # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 + if self.model.generation_config._from_model_config: + self.model.generation_config._from_model_config = False + + # Retrieves GenerationConfig from model.generation_config + gen_config = self.model.generation_config + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_config.max_length: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) + elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) + + if has_labels: + labels = inputs["labels"] + if labels.shape[-1] < gen_config.max_length: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) + elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) + else: + labels = None + + return None, generated_tokens, labels # monkey patching From 55ffd9e1532c54141ef03941f3e84467db261c98 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:28:30 +0800 Subject: [PATCH 19/70] fix --- swift/trainers/trainers.py | 88 +++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 34 deletions(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 0ba6d62aff..ee45ff5d15 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -25,7 +25,6 @@ def prediction_step( inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, - **gen_kwargs, 
) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. @@ -42,8 +41,6 @@ def prediction_step( argument `labels`. Check your model's documentation for all accepted arguments. prediction_loss_only (`bool`): Whether or not to return the loss only. - gen_kwargs: - Additional `generate` specific kwargs. Return: Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and @@ -59,12 +56,7 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - # Priority (handled in generate): - # gen_kwargs > model.generation_config > default GenerationConfig() - - if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): - gen_kwargs = self._gen_kwargs.copy() - + gen_kwargs = self._gen_kwargs.copy() if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( @@ -75,40 +67,68 @@ def prediction_step( gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus ) - # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate - # (otherwise, it would continue generating from the padded `decoder_input_ids`) - if ( - "labels" in inputs - and "decoder_input_ids" in inputs - and inputs["labels"].shape == inputs["decoder_input_ids"].shape - ): - inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} - generated_tokens = self.model.generate(**inputs, **gen_kwargs) + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "position_ids" in inputs: + gen_kwargs["position_ids"] = inputs.get("position_ids", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] - # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop - # TODO: remove this hack when the legacy code that initializes generation_config from a model config is - # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 - if self.model.generation_config._from_model_config: - self.model.generation_config._from_model_config = False + gen_kwargs["input_ids"] = generation_inputs + generated_tokens = self.model.generate(**gen_kwargs) + generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] - # Retrieves GenerationConfig from model.generation_config - gen_config = self.model.generation_config # in case the batch is shorter than max length, the output should be padded - if generated_tokens.shape[-1] < gen_config.max_length: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) - elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) + if gen_kwargs.get("max_length") is 
not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) if has_labels: labels = inputs["labels"] - if labels.shape[-1] < gen_config.max_length: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) - elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: - labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) else: labels = None - return None, generated_tokens, labels + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor # monkey patching From eaf9fc98fb8a70681b9edab0b30d365bb20dc924 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 7 Sep 2023 20:41:49 +0800 Subject: [PATCH 20/70] fix --- swift/trainers/trainers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index ee45ff5d15..e97aa7f179 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -56,7 +56,7 @@ def prediction_step( inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self._gen_kwargs.copy() + gen_kwargs = self.model.generation_config.to_dict().copy() if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: gen_kwargs["max_length"] = self.model.config.max_length gen_kwargs["num_beams"] = ( From 462d3ca64eb0ba3c3beccf71cbb9cb5d3807a5dd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 20:52:12 +0800 Subject: [PATCH 21/70] add perf --- swift/trainers/trainers.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index e97aa7f179..d738b6d817 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,6 +8,7 @@ from transformers import Trainer as HfTrainer from transformers import trainer from transformers.deepspeed import is_deepspeed_zero3_enabled +import time from .mixin import PushToMsHubMixin, SwiftMixin from .trainer_patch import DefaultFlowCallbackNew, 
ProgressCallbackNew @@ -19,6 +20,15 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): + def __init__(self, *args, **kwargs): + super.__init__(*args, **kwargs) + self.perf = { + 'gen_time': 0., + 'gen_len': 0, + 'eval_memory': 0., + 'train_memory': 0., + } + def prediction_step( self, model: nn.Module, @@ -83,8 +93,14 @@ def prediction_step( generation_inputs = inputs[self.model.main_input_name] gen_kwargs["input_ids"] = generation_inputs + gen_time = time.time() generated_tokens = self.model.generate(**gen_kwargs) + gen_time = time.time() - gen_time generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] + gen_len = len(generated_tokens[0]) + self.perf['gen_time'] = self.perf['gen_time'] + gen_time + self.perf['gen_len'] = self.perf['gen_len'] + gen_len + self.perf['eval_memory'] = torch.cuda.memory_allocated() # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: From f956f93585863f094333f4db70b15b9f04e7c662 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 7 Sep 2023 22:07:46 +0800 Subject: [PATCH 22/70] add perf info --- examples/pytorch/llm/src/llm_sft.py | 1 + swift/trainers/trainers.py | 17 ++++++++++++++--- swift/tuners/base.py | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index dc05bc2c81..7ee5a717d3 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -502,6 +502,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): ) trainer.train(trainer_args.resume_from_checkpoint) + logger.info(trainer.perf) # ### Visualization if is_master(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index d738b6d817..23e9a3d775 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -21,14 +21,25 @@ class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): - super.__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.perf = { 'gen_time': 0., 'gen_len': 0, 'eval_memory': 0., - 'train_memory': 0., + 'train_memory': None, + 'model': self.model.get_trainable_parameters(), } + def train( + self, + *args, + **kwargs, + ): + training_output = super().train(*args, **kwargs) + if self.perf['train_memory'] is None: + self.perf['train_memory'] = torch.cuda.memory_allocated() + return training_output + def prediction_step( self, model: nn.Module, @@ -100,7 +111,7 @@ def prediction_step( gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = torch.cuda.memory_allocated() + self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: diff --git a/swift/tuners/base.py b/swift/tuners/base.py index b6f4d1c3db..0eedaba028 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -397,7 +397,8 @@ def get_trainable_parameters(self): if param.requires_grad: trainable_params += num_params return f'trainable params: {trainable_params:,d} || all params: {all_param:,d} ' \ - f'|| 
trainable%: {100 * trainable_params / all_param}' + f'|| trainable%: {100 * trainable_params / all_param}' \ + f'|| cuda memory: {sum([torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())])}' class Swift: From df194a2274016a78083b5b24f801d242958ebd87 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 13:39:33 +0800 Subject: [PATCH 23/70] fix --- examples/pytorch/llm/src/llm_sft.py | 107 ++-------------------- examples/pytorch/llm/src/utils/dataset.py | 15 +++ 2 files changed, 21 insertions(+), 101 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 7ee5a717d3..38f2e177a5 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -229,7 +229,7 @@ def llm_sft(args: SftArguments) -> None: adapter_config = AdapterConfig( dim=model.config.hidden_size, target_modules=MODEL_MAPPING[args.model_type].get( - 'adapter_TM', 'mlp'), + 'adapter_TM', ['mlp']), method_name='forward', hidden_pos=0, adapter_length=args.adapter_length, @@ -251,6 +251,7 @@ def llm_sft(args: SftArguments) -> None: show_layers(model) print_model_info(model) logger.info(str(model)) + logger.info(model.get_trainable_parameters()) # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) @@ -263,108 +264,12 @@ def llm_sft(args: SftArguments) -> None: args.dataset_seed) generation_config = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': args.max_length, - 'temperature': 0.95 + 'do_sample': True, + 'top_p': 0.7, + 'max_length': args.max_length, + 'temperature': 0.95 } - # args.max_source_length = 64 - # args.max_target_length = 64 - # prompt_column = 'query' - # response_column = 'response' - # history_column = None - # prefix = '' - # max_target_length = 128 - # - # def preprocess_function_eval(examples): - # inputs, targets = [], [] - # for i in range(len(examples[prompt_column])): - # if examples[prompt_column][i] and examples[response_column][i]: - # query = examples[prompt_column][i] - # if history_column is None or len(examples[history_column][i]) == 0: - # prompt = query - # else: - # prompt = '' - # history = examples[history_column][i] - # for turn_idx, (old_query, response) in enumerate(history): - # prompt += '[Round {}]\n问:{}\n答:{}\n'.format( - # turn_idx, old_query, response) - # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - # inputs.append(prompt) - # targets.append(examples[response_column][i]) - # - # inputs = [prefix + inp for inp in inputs] - # model_inputs = tokenizer( - # inputs, - # max_length=args.max_source_length, - # truncation=True, - # padding=True) - # labels = tokenizer( - # text_target=targets, max_length=max_target_length, truncation=True) - # - # if True: - # labels['input_ids'] = [[(lb if lb != tokenizer.pad_token_id else -100) - # for lb in label] - # for label in labels['input_ids']] - # model_inputs['labels'] = labels['input_ids'] - # - # return model_inputs - # - # def preprocess_function_train(examples): - # max_seq_length = args.max_source_length + args.max_target_length - # - # model_inputs = { - # 'input_ids': [], - # 'labels': [], - # } - # for i in range(len(examples[prompt_column])): - # if examples[prompt_column][i] and examples[response_column][i]: - # query, answer = examples[prompt_column][i], examples[ - # response_column][i] - # - # if history_column is None: - # prompt = query - # else: - # prompt = '' - # history = examples[history_column][i] - # for turn_idx, (old_query, response) in enumerate(history): - # prompt += '[Round 
{}]\n问:{}\n答:{}\n'.format( - # turn_idx, old_query, response) - # prompt += '[Round {}]\n问:{}\n答:'.format(len(history), query) - # - # prompt = prefix + prompt - # a_ids = tokenizer.encode(text=prompt, add_special_tokens=False) - # b_ids = tokenizer.encode(text=answer, add_special_tokens=False) - # - # if len(a_ids) > args.max_source_length - 1: - # a_ids = a_ids[:args.max_source_length - 1] - # - # if len(b_ids) > args.max_target_length - 2: - # b_ids = b_ids[:args.max_target_length - 2] - # - # input_ids = tokenizer.build_inputs_with_special_tokens( - # a_ids, b_ids) - # - # if False: - # context_length = input_ids.index(tokenizer.bos_token_id) - # else: - # context_length = len(a_ids) + 2 - # mask_position = context_length - 1 - # labels = [-100] * context_length + input_ids[mask_position + 1:] - # - # pad_len = max_seq_length - len(input_ids) - # input_ids = input_ids + [tokenizer.pad_token_id] * pad_len - # labels = labels + [tokenizer.pad_token_id] * pad_len - # if True: - # labels = [(lb if lb != tokenizer.pad_token_id else -100) - # for lb in labels] - # - # model_inputs['input_ids'].append(input_ids) - # model_inputs['labels'].append(labels) - # - # return model_inputs - preprocess_func = get_preprocess( args.template_type, tokenizer, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 2f0c73b819..3b9cf1c241 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -134,6 +134,20 @@ def get_instinwild_en_dataset() -> HfDataset: return _process_alpaca_dataset(dataset) +def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: + dataset_train: HfDataset = MsDataset.load( + 'modelscope/DuReader_robust-QG', split='train').to_hf_dataset().rename_columns({ + "text1": "query", + "text2": "response", + }) + dataset_val: HfDataset = MsDataset.load( + 'modelscope/DuReader_robust-QG', split='validation').to_hf_dataset().rename_columns({ + "text1": "query", + "text2": "response", + }) + return dataset_train, dataset_val + + def get_cot_en_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'YorickHe/CoT', split='train').to_hf_dataset() @@ -319,6 +333,7 @@ def get_gpt4all_en_dataset() -> HfDataset: # multi-modal 'coco-en': get_coco_en_dataset, 'advertise_gen': get_advertise_gen_dataset, + 'du_reader': get_du_reader_dataset, } From e7cf7f724f47e6e26ba5c9d4ae1cc1d0d225ed55 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 8 Sep 2023 14:49:04 +0800 Subject: [PATCH 24/70] fix --- examples/pytorch/llm/src/llm_sft.py | 31 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 38f2e177a5..2593843215 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -324,8 +324,8 @@ def llm_sft(args: SftArguments) -> None: eval_steps=args.eval_steps, dataloader_num_workers=args.dataloader_num_workers, load_best_model_at_end=True, - metric_for_best_model='loss', - greater_is_better=False, + metric_for_best_model='rouge-l', + greater_is_better=True, sortish_sampler=True, optim=args.optim, hub_model_id=args.hub_model_id, @@ -379,18 +379,21 @@ def _decode(tokens, ignore_pad_token_for_loss=False): if len(hypothesis) == 0 or ''.join(hypothesis) == '.': hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] reference = list(jieba.cut(label)) - rouge = Rouge() - scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) - result = 
scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v['f'] * 100, 4)) - bleu_score = sentence_bleu( - [list(label)], - list(pred), - smoothing_function=SmoothingFunction().method3) - score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + try: + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + except: + logger.error(f'eval error {hypothesis}, {reference}') for k, v in score_dict.items(): score_dict[k] = float(np.mean(v)) From 20b077297b6134f52e039199a7f434ae3b115504 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Fri, 8 Sep 2023 17:04:27 +0800 Subject: [PATCH 25/70] revert code --- examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh index ff0c147200..3baa73288a 100644 --- a/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm_6b/lora_ddp/sft.sh @@ -10,7 +10,7 @@ torchrun \ --sft_type lora \ --output_dir runs \ --ddp_backend gloo \ - --dataset advertise_gen \ + --dataset alpaca-en,alpaca-zh \ --dataset_sample -1 \ --num_train_epochs 1 \ --max_length 1024 \ From cebdd11549ca8035405d93203a2798e1e1d95c3b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 20:21:33 +0800 Subject: [PATCH 26/70] support activate/deactivate adapter --- swift/tuners/adapter.py | 32 +++++++++---- swift/tuners/base.py | 25 ++++++++-- swift/tuners/lora.py | 85 ++++++++++++++++++++++----------- swift/tuners/prompt.py | 30 ++++++++---- swift/tuners/restuning.py | 40 +++++++++++----- swift/tuners/side.py | 27 ++++++++--- swift/tuners/utils.py | 4 +- swift/utils/torch_utils.py | 12 +++++ tests/utils/test_torch_utils.py | 14 ++++++ 9 files changed, 199 insertions(+), 70 deletions(-) create mode 100644 tests/utils/test_torch_utils.py diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 19233e60eb..b60ffb4ad0 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -3,12 +3,12 @@ import re import types from dataclasses import dataclass, field -from typing import Union +from typing import Union, List import torch from torch import nn from transformers.activations import ACT2CLS - +from swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -71,7 +71,7 @@ def __post_init__(self): class Adapter: @staticmethod - def prepare_model(model: nn.Module, config: AdapterConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: AdapterConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `AdapterConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -84,19 +84,21 @@ def _forward(self, *args, **kwargs): if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): return args[0:config.hidden_pos] + args[ - config.hidden_pos] + getattr(self, 'adapter')(args[config.hidden_pos]) \ + config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) \ + args[config.hidden_pos + 1:] # noqa else: kwargs[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, 'adapter')( + config.hidden_pos] + getattr(self, 
f'adapter_{adapter_name}')( args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = getattr(self, 'adapter')(args) + args = getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): return _forward(self, attention_output) + # TODO The `config.method_name` method should not be replaced twice. + module.forward_origin = getattr(module, config.method_name) num_args_in_forward_chunk_fn = len( inspect.signature(module.forward_origin).parameters) @@ -109,12 +111,12 @@ def _feed_forward_chunk(self, attention_output): adapter_module = AdapterModule(config.dim, config.adapter_length, ACT2CLS[config.act_layer]) - setattr(module, 'adapter', adapter_module) + setattr(module, f'adapter_{adapter_name}', adapter_module) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name: str): return { key: value - for key, value in state_dict.items() if 'adapter' in key + for key, value in state_dict.items() if f'adapter_{adapter_name}' in key } def mark_trainable_callback(model): @@ -123,6 +125,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class AdapterModule(nn.Module): """The implementation of adapter tuning method. @@ -150,6 +158,7 @@ def __init__( self.activate = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() + self._activate = True def init_weights(self): @@ -160,7 +169,12 @@ def _init_weights(m): self.apply(_init_weights) + def activate(self, activate=True): + self._activate = activate + def forward(self, x, identity=None): + if not self.activate: + return 0. 
out = self.ln2(self.activate(self.ln1(x))) if identity is None: identity = x diff --git a/swift/tuners/base.py b/swift/tuners/base.py index b6f4d1c3db..a513e67588 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -52,12 +52,12 @@ def __init__(self, self.adapters = {} if isinstance(config, SwiftConfig): - self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config) + self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) for adapter_name, config in config.items(): self.adapters[adapter_name] = self._prepare_model( - model, config) + model, config, adapter_name) self.model = model self.extra_state_keys = extra_state_keys or [] @@ -151,7 +151,7 @@ def state_dict(self, if kwargs.get('save_adapter', True): for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: - state_dicts.update(output.state_dict_callback(destination)) + state_dicts.update(output.state_dict_callback(destination, adapter_name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v @@ -260,10 +260,11 @@ def _prepare_model( cls, model: nn.Module, config: SwiftConfig, + adapter_name: str, ): assert (hasattr(config, SWIFT_TYPE_KEY)) from .mapping import SWIFT_MAPPING - return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config) + return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config, adapter_name) def create_or_update_model_card(self, output_dir: str): """ @@ -381,6 +382,22 @@ def save_pretrained(self, def base_model(self): return self.model + def activate_adapter(self, adapter_name): + if adapter_name not in self.adapters: + return + + from .mapping import SWIFT_MAPPING + SWIFT_MAPPING[self.adapters[adapter_name].config.swift_type][1]\ + .activate_adapter(self.base_model, adapter_name, True) + + def deactivate_adapter(self, adapter_name): + if adapter_name not in self.adapters: + return + + from .mapping import SWIFT_MAPPING + SWIFT_MAPPING[self.adapters[adapter_name].config.swift_type][1]\ + .activate_adapter(self.base_model, adapter_name, False) + def get_trainable_parameters(self): """ Get the content of trainable parameters in the model. 
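A minimal usage sketch of the per-adapter switches introduced in this commit, assuming a ModelScope structbert checkpoint and two illustrative adapter names ('my_lora', 'my_adapter'); only Swift.prepare_model with a config dict and the activate_adapter/deactivate_adapter methods added above are relied on.

from modelscope import Model
from swift import AdapterConfig, LoRAConfig, Swift

# illustrative checkpoint; any nn.Module accepted by Swift.prepare_model works
model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
config = {
    'my_lora': LoRAConfig(target_modules=['query', 'key', 'value']),
    'my_adapter': AdapterConfig(
        dim=model.config.hidden_size,
        target_modules=r'.*layer\.\d+$',
        method_name='feed_forward_chunk',
        hidden_pos=0),
}
model = Swift.prepare_model(model, config)

model.deactivate_adapter('my_lora')  # forward now only applies 'my_adapter'
model.activate_adapter('my_lora')    # both adapters are applied again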
diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 5cbb797970..4048d775a9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -15,6 +15,7 @@ from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module if is_bnb_available(): import bitsandbytes as bnb @@ -90,12 +91,13 @@ def __post_init__(self): class LoRA: @staticmethod - def prepare_model(model: nn.Module, config: LoRAConfig): + def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): """Prepare a model with `LoRAConfig`""" LoRA._dynamic_patch_lora( model, replace_modules=config.target_modules, r=config.r, + adapter_name=adapter_name, lora_alpha=config.lora_alpha, lora_dropout=config.lora_dropout, merge_weights=config.merge_weights, @@ -103,8 +105,8 @@ def prepare_model(model: nn.Module, config: LoRAConfig): enable_lora=config.enable_lora, fan_in_fan_out=config.fan_in_fan_out) - def state_dict_callback(state_dict): - return lora_state_dict(state_dict, config.bias) + def state_dict_callback(state_dict, adapter_name): + return lora_state_dict(state_dict, model.lora_module_map, adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, config.bias) @@ -113,7 +115,16 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + if isinstance(module, LoRALayer): + module.activate(activate) + else: + module.active_adapter = 'default' if activate else 'invalid' + + @staticmethod + def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, **kwargs): """Dynamic patch lora to model @@ -126,7 +137,9 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - modules = [] + if not hasattr(model, 'lora_module_map'): + model.lora_module_map = {} + modules = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(replace_modules, (str, list)) AutoGPTQQuantLinear = get_auto_gptq_quant_linear( @@ -209,9 +222,15 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, lora_module = Embedding( num_embeddings=sub_module.num_embeddings, embedding_dim=sub_module.embedding_dim, + padding_idx=sub_module.padding_idx, + max_norm=sub_module.max_norm, + norm_type=sub_module.norm_type, + scale_grad_by_freq=sub_module.scale_grad_by_freq, + sparse=sub_module.sparse, r=kwargs['r'], lora_alpha=kwargs['lora_alpha'], - merge_weights=kwargs['merge_weights']) + merge_weights=kwargs['merge_weights'], + ) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -231,10 +250,11 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) + lora_module.adapter_name = adapter_name setattr(module, _key, lora_module) - modules.append(lora_module) + modules[module_key] = adapter_name - return modules + model.lora_module_map.update(modules) @staticmethod def unpatch_lora(model, config: LoRAConfig): @@ -247,11 +267,9 @@ def unpatch_lora(model, config: LoRAConfig): Args: model: The model called with `tune` function. config: The `LoRAConfig` to use. 
- - Returns: - The lora modules. """ - modules = [] + if not hasattr(model, 'lora_module_map'): + model.lora_module_map = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(config.replace_modules, (str, list)) replace_modules = config.replace_modules @@ -274,7 +292,18 @@ def unpatch_lora(model, config: LoRAConfig): origin_module = torch.nn.Linear( sub_module.in_features, sub_module.out_features, - bias=sub_module.bias is not None) + bias=hasattr(sub_module, 'bias') and sub_module.bias is not None, + ) + elif isinstance(sub_module, Embedding): + origin_module = torch.nn.Embedding( + num_embeddings=sub_module.num_embeddings, + embedding_dim=sub_module.embedding_dim, + padding_idx=sub_module.padding_idx, + max_norm=sub_module.max_norm, + norm_type=sub_module.norm_type, + scale_grad_by_freq=sub_module.scale_grad_by_freq, + sparse=sub_module.sparse, + ) elif isinstance(sub_module, Conv2d): origin_module = torch.nn.Conv2d( sub_module.in_channels, @@ -289,19 +318,12 @@ def unpatch_lora(model, config: LoRAConfig): sub_module.merge_weights = True sub_module.eval() origin_module.weight = sub_module.weight - if sub_module.bias is not None: + if getattr(sub_module, 'bias', None) is not None: origin_module.bias = sub_module.bias origin_module.to(sub_module.weight.device).to( sub_module.weight.dtype) setattr(module, _key, origin_module) - modules.append(sub_module) - - model.state_dict_hook_handle.remove() - if hasattr(model, 'load_state_dict_hook_handle'): - model.load_state_dict_hook_handle.remove() - else: - model.load_state_dict = model.load_state_dict_origin - return modules + model.lora_module_map.pop(module_key, None) class LoRALayer: @@ -314,6 +336,7 @@ def __init__( merge_weights: bool, ): self.r = r + self.old_r = r self.lora_alpha = lora_alpha # Optional dropout if lora_dropout > 0.: @@ -324,6 +347,12 @@ def __init__( self.merged = False self.merge_weights = merge_weights + def activate(self, activate=True): + if activate: + self.r = self.old_r + else: + self.r = 0 + class Embedding(nn.Embedding, LoRALayer): # LoRA implemented in a dense layer @@ -694,7 +723,7 @@ def forward(self, x: torch.Tensor): return nn.Conv2d.forward(self, x) -def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: +def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'none') -> None: if bias == 'none': return elif bias == 'all': @@ -703,7 +732,7 @@ def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: p.requires_grad = True elif bias == 'lora_only': for m in model.modules(): - if isinstance(m, LoRALayer) and \ + if adapter_name == getattr(m, 'adapter_name', None) and \ hasattr(m, 'bias') and \ m.bias is not None: m.bias.requires_grad = True @@ -711,18 +740,18 @@ def mark_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: raise NotImplementedError -def lora_state_dict(state_dict, bias: str = 'none') -> Dict[str, torch.Tensor]: +def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k} + return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k, None) == adapter_name} elif bias == 'all': return { k: state_dict[k] - for k in state_dict if 'lora_' in k or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get(k, None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k: + if 'lora_' in 
k and module_map.get(k, None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f426a4dd83..2600d841e4 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -3,12 +3,13 @@ import re import types from dataclasses import dataclass, field -from typing import Union +from typing import Union, List import torch from torch import nn from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module @dataclass @@ -77,7 +78,7 @@ def __post_init__(self): class Prompt: @staticmethod - def prepare_model(model: nn.Module, config: PromptConfig): + def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: @@ -91,7 +92,7 @@ def _forward(self, *args, **kwargs): input_embedding = kwargs[config.embedding_pos] input_embedding = getattr( - self, 'prompt').forward(input_embedding) + self, f'prompt_{adapter_name}').forward(input_embedding) if isinstance(config.embedding_pos, int): args = type(args)( args[0:config.embedding_pos] + (input_embedding, ) @@ -109,7 +110,7 @@ def _forward(self, *args, **kwargs): if attention_mask is not None: attention_mask = getattr( self, - 'prompt').patch_attention_mask(attention_mask) + f'prompt_{adapter_name}').patch_attention_mask(attention_mask) if isinstance(config.attention_mask_pos, int): args = type(args)( args[0:config.attention_mask_pos] @@ -121,7 +122,7 @@ def _forward(self, *args, **kwargs): forward_output = self.forward_origin(*args, **kwargs) if config.extract_embedding: forward_output = getattr( - self, 'prompt').extract(forward_output) + self, f'prompt_{adapter_name}').extract(forward_output) return forward_output @@ -136,13 +137,13 @@ def _forward(self, *args, **kwargs): config.prompt_length, config.attention_mask_value, config.attach_front) - setattr(module, 'prompt', prompt_module) + setattr(module, f'prompt_{adapter_name}', prompt_module) match_module_keys.append(module_key) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'prompt' in key + for key, value in state_dict.items() if f'prompt_{adapter_name}' in key } def mark_trainable_callback(model): @@ -151,6 +152,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class PromptModule(nn.Module): """The implementation of vision prompt tuning method. 
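A toy illustration of the per-adapter state-dict filtering pattern used by the callbacks in this commit; the keys below are made up, and only the f'prompt_{adapter_name}' naming convention from the code above is assumed.

def state_dict_callback(state_dict, adapter_name):
    # keep only parameters namespaced with this adapter's prompt modules
    return {
        key: value
        for key, value in state_dict.items()
        if f'prompt_{adapter_name}' in key
    }

full_state = {
    'encoder.layer.0.prompt_default.prompt_token': 'p0',
    'encoder.layer.0.prompt_other.prompt_token': 'p1',
    'encoder.layer.0.attention.self.query.weight': 'w0',
}
# only the 'default' adapter's parameters survive the filter
assert set(state_dict_callback(full_state, 'default')) == {
    'encoder.layer.0.prompt_default.prompt_token'}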
@@ -178,11 +185,13 @@ def __init__(self, self.prompt_length = prompt_length self.mask_values = mask_values self.attach_front = attach_front - + self._activate = True self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): + if not self._activate: + return x prompt_token = self.prompt_token.expand(x.shape[0], -1, -1) if self.layer_num == 0: @@ -199,6 +208,9 @@ def forward(self, x): dim=1) return x + def activate(self, activate=True): + self._activate = activate + def patch_attention_mask(self, m): prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index b72e000bcb..1f5d7f3a2f 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -5,11 +5,13 @@ from dataclasses import dataclass, field from typing import Union, Dict, Optional, List +import torch import torch.nn as nn from swift.utils.logger import get_logger from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module logger = get_logger() @@ -112,7 +114,7 @@ def __post_init__(self): class ResTuning: @staticmethod - def prepare_model(model: nn.Module, config: ResTuningConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: ResTuningConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `ResTuningConfig`""" def _forward_seq(self, input, *args, **kwargs): @@ -123,17 +125,21 @@ def _forward_seq(self, input, *args, **kwargs): def _forward_target(self, *args, **kwargs): if self.target_modules_hook == "input": - args_main = _forward_restuning(self) - args_main = self.forward_origin(args_main, **kwargs) + args = list(args) + _arg = args[0 if self.target_hidden_pos is None else self.target_hidden_pos] + args_main = _forward_restuning(self, _arg) + args[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + args_main = self.forward_origin(*args, **kwargs) else: _args_main = self.forward_origin(*args, **kwargs) - args_main = _forward_restuning(self) + _arg = _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] if isinstance(_args_main, (tuple, list)) else _args_main + args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[self.target_hidden_pos] = args_main + _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main args_main = _args_main return args_main - def _forward_restuning(self): + def _forward_restuning(self, origin_arg): probe_results = [] root_module_ins = self.root_module_ins_list[0] stem_module_ins_list = self.stem_module_ins_list @@ -150,7 +156,7 @@ def _forward_restuning(self): probe_results.append(st_mod.probe_input_data) else: probe_results.append(st_mod.probe_output_data) - args_main = getattr(top_module, 'restuning')(probe_results) + args_main = getattr(top_module, f'restuning_{adapter_name}')(probe_results, origin_arg) return args_main # 1. Matching the root module @@ -208,7 +214,7 @@ def _forward_restuning(self): restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, config.upsample_out_channels, config.zero_init_last, config.tuner_cfg) - setattr(top_module, 'restuning', restuning_module) + setattr(top_module, f'restuning_{adapter_name}', restuning_module) # 4. 
Matching the target module target_module_ins = None @@ -235,10 +241,10 @@ def _forward_restuning(self): if target_module_ins is None: raise Exception(f"Cannot match target modules") - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'restuning' in key + for key, value in state_dict.items() if f'restuning_{adapter_name}' in key } def mark_trainable_callback(model): @@ -247,6 +253,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class ResTuningBypassModule(nn.Module): """The implementation of ResTuningBypass method. @@ -263,6 +275,7 @@ def __init__( ): super(ResTuningBypassModule, self).__init__() + self._activate = True self.bypass_blocks = nn.Sequential(*[ ResTunerBypassBlock( dim=dims[i] if isinstance(dims, list) else dims, @@ -276,7 +289,12 @@ def __init__( ) for i in range(depth)]) - def forward(self, x_list, **kwargs): + def activate(self, activate=True): + self._activate = activate + + def forward(self, x_list, origin_arg, **kwargs): + if not self._activate: + return origin_arg x_bypass = detach_tensors(x_list.pop(0)) x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass x_list = detach_tensors(x_list) diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 9e4f043dd7..0101e74ab4 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -5,7 +5,7 @@ import copy from dataclasses import dataclass, field from functools import partial -from typing import Union, Callable, Any +from typing import Union, Callable, Any, List from collections import OrderedDict from itertools import repeat @@ -15,6 +15,7 @@ from swift.utils.logger import get_logger from .utils import SwiftConfig, SwiftOutput +from ..utils.torch_utils import find_sub_module logger = get_logger() @@ -61,7 +62,7 @@ def __post_init__(self): class Side: @staticmethod - def prepare_model(model: nn.Module, config: SideConfig) -> SwiftOutput: + def prepare_model(model: nn.Module, config: SideConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `SideConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -77,9 +78,9 @@ def _forward(self, *args, **kwargs): args_main = self.forward_origin(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr(self, 'side')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) else: - args_main = getattr(self, 'side')(*args, args_main) + args_main = getattr(self, f'side_{adapter_name}')(*args, args_main) return args_main if isinstance(tgt_module, nn.Sequential): @@ -96,12 +97,12 @@ def forward_seq(self, input, *args, **kwargs): tgt_module.forward_origin = tgt_module.forward tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) - setattr(tgt_module, 'side', side_module) + setattr(tgt_module, f'side_{adapter_name}', side_module) - def state_dict_callback(state_dict): + def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if 'side' in key + for key, value 
in state_dict.items() if f'side_{adapter_name}' in key } def mark_trainable_callback(model): @@ -110,6 +111,12 @@ def mark_trainable_callback(model): return SwiftOutput(config, state_dict_callback, mark_trainable_callback) + @staticmethod + def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + for _module in modules: + module.activate(activate) + class SideModule(nn.Module): """The implementation of vision side-tuning method. @@ -147,8 +154,14 @@ def __init__( else: raise ValueError(f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) + self._activate = True + + def activate(self, activate=True): + self._activate = activate def forward(self, x, x_main): + if not self._activate: + return x_main alpha_squashed = torch.sigmoid(self.alpha) x_side = self.side_net(x) x_out = alpha_squashed * x_main + (1 - alpha_squashed) * x_side diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 42faa94e84..0e0c4bed4f 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -109,10 +109,10 @@ class SwiftOutput: which is used to get the tuner's state dict among the model's state dict. This callback should receive a state dict, and returns a created state dict. Examples: - >>> def state_dict_callback(state_dict): + >>> def state_dict_callback(state_dict, adapter_name): >>> return { >>> key: value - >>> for key, value in state_dict.items() if 'adapter' in key + >>> for key, value in state_dict.items() if adapter_name in key >>> } mark_trainable_callback (`FunctionType`): A callback returned by the tuner which is used to mark the tuner's adapter's parameters to trainable. diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 03db127012..f2f1903273 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -98,6 +98,18 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: logger.info(''.join(s)) +def find_sub_module(module: torch.nn.Module, module_name: str) -> List[torch.nn.Module]: + _modules = list() + for name, sub_module in module.named_modules(): + if not name: + continue + if module_name == name or getattr(sub_module, 'adapter_name', None) == module_name: + _modules.append(sub_module) + else: + _modules.extend(find_sub_module(sub_module, module_name)) + return _modules + + def get_seed(random_state: RandomState) -> int: seed_max = np.iinfo(np.int32).max seed = random_state.randint(0, seed_max) diff --git a/tests/utils/test_torch_utils.py b/tests/utils/test_torch_utils.py new file mode 100644 index 0000000000..3517d7f475 --- /dev/null +++ b/tests/utils/test_torch_utils.py @@ -0,0 +1,14 @@ +import unittest +from modelscope import Model +from swift.utils.torch_utils import find_sub_module + + +class TestTorchUtils(unittest.TestCase): + + def test_find_sub_module(self): + model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + self.assertTrue(find_sub_module(model, 'query') is not None) + + +if __name__ == '__main__': + unittest.main() From 851124706b5c3083c1280e89ddcba9bc9d280050 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Fri, 8 Sep 2023 20:49:10 +0800 Subject: [PATCH 27/70] fix indent --- swift/tuners/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 245bb3eeb2..37c4374760 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -167,7 +167,7 @@ def 
__init__( self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False - self._activate = True + self._activate = True def init_weights(self): From d27fe8d63dbb1e304ea9a0b3e94d42558be03787 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Sat, 9 Sep 2023 14:17:43 +0800 Subject: [PATCH 28/70] fix --- .../pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh | 2 +- examples/pytorch/llm/src/llm_infer.py | 2 +- swift/tuners/adapter.py | 6 +++--- swift/tuners/base.py | 8 ++++---- swift/tuners/lora.py | 6 +++--- swift/tuners/restuning_components.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh index b47ece0d8c..46ad5c849f 100644 --- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh +++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora/infer.sh @@ -1,6 +1,6 @@ CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_infer.py \ - --model_type llama2-7b-chat \ + --model_type llama2-70b-chat \ --sft_type lora \ --ckpt_dir "runs/llama2-70b-chat/vx_xxx/checkpoint-xxx" \ --eval_human true \ diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index da88454821..30ccd22c68 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -110,7 +110,7 @@ def llm_infer(args: InferArguments) -> None: # ### Preparing lora if args.sft_type == 'lora': - model = Swift.from_pretrained(model, args.ckpt_dir) + model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True) show_layers(model) print_model_info(model) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 37c4374760..03b16d13e6 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -163,7 +163,7 @@ def __init__( self.adapter_length = adapter_length # self.adapter_type = adapter_type self.ln1 = nn.Linear(dim, adapter_length) - self.activate = act_layer() + self.act = act_layer() self.ln2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False @@ -186,13 +186,13 @@ def forward(self, x, identity=None): return 0. if not self._prepared: self.ln1.to(x.device) - self.activate.to(x.device) + self.act.to(x.device) self.ln2.to(x.device) self._prepared = True x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) - out = self.ln2(self.activate(self.ln1(x))) + out = self.ln2(self.act(self.ln1(x))) if identity is None: identity = x identity = identity.to(out.dtype) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8243625dee..11a96ef9fa 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -318,14 +318,14 @@ def create_or_update_model_card(self, output_dir: str): def save_pretrained(self, save_directory: str, safe_serialization: bool = False, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, **kwargs): """Save the adapters to a local directory. Args: save_directory (`str`): The directory to use. safe_serialization (`bool`): Use safe tensors to save the weights, default False. - adapter_name(`Union[str, List[str]]`): The adapters to be saved, default is `default`. + adapter_name(`Union[str, List[str]]`): The adapters to be saved, default is `None` to save all. 
""" if os.path.isfile(save_directory): raise ValueError( @@ -335,9 +335,9 @@ def save_pretrained(self, self.create_or_update_model_card(save_directory) adapter_names = adapter_name if isinstance(adapter_name, - list) else [adapter_name] + list) or adapter_name is None else [adapter_name] for adapter_name, output in self.adapters.items(): - if adapter_name not in adapter_names: + if adapter_names is not None and adapter_name not in adapter_names: continue # save only the trainable weights diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 4048d775a9..5e649a9e93 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -742,16 +742,16 @@ def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'non def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k, None) == adapter_name} + return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name} elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get(k, None) == adapter_name) or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k, None) == adapter_name: + if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index a3ab2dfe28..db50a945a2 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -60,7 +60,7 @@ def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", self.layer_num = layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 17 + self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 32 self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None From fc2a1103d2b77f7d43975f2e42fd27c2d7d2ae42 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 16:15:12 +0800 Subject: [PATCH 29/70] fix bugs --- swift/tuners/adapter.py | 18 +++++++------ swift/tuners/lora.py | 6 ++--- swift/tuners/prompt.py | 4 +-- swift/tuners/restuning.py | 4 +-- swift/tuners/side.py | 9 ++++--- tests/tuners/test_swift_base.py | 45 +++++++++++++++++++++++++++++++-- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 03b16d13e6..1d070af1f4 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -91,15 +91,17 @@ def _forward(self, *args, **kwargs): args = self.forward_origin(*args, **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): - return args[0:config.hidden_pos] + args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) \ - + args[config.hidden_pos + 1:] # noqa + _type = type(args) + args = list(args) + args[config.hidden_pos] = args[ + config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) + return 
_type(args) else: - kwargs[config.hidden_pos] = args[ + args[config.hidden_pos] = args[ config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')( args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = getattr(self, f'adapter_{adapter_name}')(args) + args = args + getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): @@ -135,9 +137,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'adapter_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class AdapterModule(nn.Module): @@ -182,7 +184,7 @@ def activate(self, activate=True): self._activate = activate def forward(self, x, identity=None): - if not self.activate: + if not self._activate: return 0. if not self._prepared: self.ln1.to(x.device) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 5e649a9e93..8cffafa4b6 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -118,10 +118,10 @@ def mark_trainable_callback(model): def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: - if isinstance(module, LoRALayer): - module.activate(activate) + if isinstance(_module, LoRALayer): + _module.activate(activate) else: - module.active_adapter = 'default' if activate else 'invalid' + _module.active_adapter = 'default' if activate else 'invalid' @staticmethod def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index abf22455e1..a18e176fe7 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -159,9 +159,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'prompt_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class PromptModule(nn.Module): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 1f5d7f3a2f..cc385d700c 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -255,9 +255,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'restuning_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class ResTuningBypassModule(nn.Module): diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 0101e74ab4..2e509e9e11 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -80,7 +80,10 @@ def _forward(self, *args, **kwargs): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) else: - args_main = getattr(self, f'side_{adapter_name}')(*args, args_main) + _type = type(args_main) + args_main = list(args_main) + args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + 
args_main = _type(args_main) return args_main if isinstance(tgt_module, nn.Sequential): @@ -113,9 +116,9 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module(module, f'side_{adapter_name}') for _module in modules: - module.activate(activate) + _module.activate(activate) class SideModule(nn.Module): diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 7676a2a283..c515fab9f8 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -10,7 +10,8 @@ from modelscope.models.nlp.structbert import (SbertConfig, SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME - +from torch import nn +import math from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig @@ -27,15 +28,55 @@ def tearDown(self): super().tearDown() def test_swift_lora_forward(self): + + from swift.tuners.lora import Linear + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') preprocessor = Preprocessor.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) + outputs = model(**inputs) model = Swift.prepare_model(model, config=lora_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + + def test_swift_adapter_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') inputs = preprocessor('how are you') + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) outputs = model(**inputs) - self.assertTrue(hasattr(outputs, 'logits')) + model = Swift.prepare_model(model, config=adapter_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) From 8528a71aae11050a6fdb2954039363480242ddfd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 22:07:26 +0800 Subject: [PATCH 30/70] fix --- swift/tuners/prompt.py | 9 ++++- swift/tuners/restuning.py | 10 ++--- swift/tuners/restuning_components.py | 6 +-- 
tests/tuners/test_swift_base.py | 60 ++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a18e176fe7..ec21650c3a 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -80,6 +80,7 @@ class Prompt: @staticmethod def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): module_keys = [key for key, _ in model.named_modules()] + match_module_keys = [] for module_key in module_keys: if isinstance(config.target_modules, str): target_module_found = re.fullmatch(config.target_modules, @@ -144,6 +145,7 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, f'prompt_{adapter_name}', prompt_module) + match_module_keys.append(module_key) def state_dict_callback(state_dict, adapter_name): return { @@ -217,9 +219,14 @@ def activate(self, activate=True): self._activate = activate def patch_attention_mask(self, m): + if not self._activate: + return m prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) - return torch.cat((prefix_attention_mask, m), dim=-1) + if self.attach_front: + return torch.cat((prefix_attention_mask, m), dim=-1) + else: + return torch.cat((m, prefix_attention_mask), dim=-1) def extract(self, x): if self.attach_front: diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index cc385d700c..bc16ce40b9 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -171,9 +171,9 @@ def _forward_restuning(self, origin_arg): logger.warning( f"Type of {type(root_module)} may not be supported because of its customized forward") if config.root_modules_hook == "input": - root_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + root_module.register_forward_pre_hook(probe_input_pre_hook) else: - root_module.register_forward_hook(probe_output_hook, with_kwargs=True) + root_module.register_forward_hook(probe_output_hook) root_module.root_modules_hook = config.root_modules_hook root_module_ins_list.append(root_module) break @@ -194,11 +194,11 @@ def _forward_restuning(self, origin_arg): logger.warning( f"Type of {type(stem_module)} may not be supported because of its customized forward") if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: - stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + stem_module.register_forward_pre_hook(probe_input_pre_hook) if config.stem_modules_hook == "input": - stem_module.register_forward_pre_hook(probe_input_pre_hook, with_kwargs=True) + stem_module.register_forward_pre_hook(probe_input_pre_hook) else: - stem_module.register_forward_hook(probe_output_hook, with_kwargs=True) + stem_module.register_forward_hook(probe_output_hook) stem_module.stem_modules_hook = config.stem_modules_hook stem_module_ins_list.append(stem_module) if isinstance(config.stem_modules, list): diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index db50a945a2..9c99543c37 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -306,13 +306,13 @@ def probe_tensors(module, feats, name): setattr(module, name, feats) -def probe_input_pre_hook(self, args, kwargs): +def probe_input_pre_hook(self, args): input = args[0] probe_tensors(self, input, 'probe_input_data') - return args, kwargs + return args -def probe_output_hook(self, args, kwargs, result): +def probe_output_hook(self, args, result): output = result 
probe_tensors(self, output, 'probe_output_data') return output diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index c515fab9f8..1abfd0994f 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -12,7 +12,7 @@ from peft.utils import WEIGHTS_NAME from torch import nn import math -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig +from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig, PromptConfig, ResTuningConfig class TestSwift(unittest.TestCase): @@ -78,6 +78,53 @@ def test_swift_adapter_forward(self): self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + def test_swift_prompt_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + prompt_config = PromptConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + embedding_pos=0, + attention_mask_pos=1) + outputs = model(**inputs) + model = Swift.prepare_model(model, config=prompt_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + + def test_swift_restuner_forward(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + root_modules=r'.*layer.0$', + stem_modules=r'.*layer\.\d+$', + target_modules=r'.*pooler', + target_modules_hook='input', + tuner_cfg="res_adapter", + ) + outputs = model(**inputs) + model = Swift.prepare_model(model, config=restuner_config) + outputs_lora = model(**inputs) + model.deactivate_adapter('default') + outputs_deactivate = model(**inputs) + model.activate_adapter('default') + outputs_reactivate = model(**inputs) + self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) model2 = copy.deepcopy(model) @@ -154,7 +201,7 @@ def test_swift_side(self): print( f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' ) - + result = model(torch.ones((1, 3, 224, 224))).logits side_config = SideConfig( dim=768, target_modules=r'vit', @@ -162,7 +209,14 @@ def test_swift_side(self): hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) - result = model(torch.ones((1, 3, 224, 224))).logits + result_activate = model(torch.ones((1, 3, 224, 224))).logits + model.deactivate_adapter('default') + result_deactivate = model(torch.ones((1, 3, 224, 224))).logits + model.activate_adapter('default') + result_reactivate = 
model(torch.ones((1, 3, 224, 224))).logits + self.assertTrue(torch.allclose(result, result_deactivate)) + self.assertTrue(not torch.allclose(result, result_activate)) + self.assertTrue(torch.allclose(result_activate, result_reactivate)) print( f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' ) From 7b56a778826829f588b83a66744d782e3e12a3e9 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sat, 9 Sep 2023 22:13:54 +0800 Subject: [PATCH 31/70] fix inference --- swift/trainers/trainers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 144ca95807..298593c11d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -104,6 +104,8 @@ def prediction_step( generation_inputs = inputs[self.model.main_input_name] gen_kwargs["input_ids"] = generation_inputs + gen_kwargs["pad_token_id"] = self.tokenizer.pad_token_id + gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id gen_time = time.time() generated_tokens = self.model.generate(**gen_kwargs) gen_time = time.time() - gen_time From 95ffddf278f4c63314de0c3f568ea43571468109 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 08:08:17 +0800 Subject: [PATCH 32/70] update code --- tests/tuners/test_swift_base.py | 43 ++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 1abfd0994f..5992cddcbe 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -192,39 +192,48 @@ def test_swift_multiple_adapters(self): all( torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) - def test_swift_side(self): - from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained( - 'google/vit-base-patch16-224') + + def test_swift_side_bert(self): + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') model2 = copy.deepcopy(model) - result_origin = model(torch.ones((1, 3, 224, 224))).logits + result_origin = model(**inputs).logits print( - f'test_swift_side result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' + f'test_swift_side_bert result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' ) - result = model(torch.ones((1, 3, 224, 224))).logits + side_config = SideConfig( - dim=768, - target_modules=r'vit', - side_module_name='fcn4', - hidden_pos='last_hidden_state') + dim=model.config.hidden_size, + target_modules=r'.*encoder.encoder', + side_module_name='mlp', + hidden_pos='last_hidden_state' + ) model = Swift.prepare_model(model, config=side_config) - result_activate = model(torch.ones((1, 3, 224, 224))).logits + result_activate = model(**inputs).logits model.deactivate_adapter('default') - result_deactivate = model(torch.ones((1, 3, 224, 224))).logits + result_deactivate = model(**inputs).logits model.activate_adapter('default') - result_reactivate = model(torch.ones((1, 3, 224, 224))).logits - self.assertTrue(torch.allclose(result, result_deactivate)) - self.assertTrue(not torch.allclose(result, result_activate)) + result_reactivate = model(**inputs).logits + self.assertTrue(torch.allclose(result_origin, result_deactivate)) + self.assertTrue(not torch.allclose(result_origin, 
result_activate)) self.assertTrue(torch.allclose(result_activate, result_reactivate)) print( - f'test_swift_side result shape: {result.shape}, result sum: {torch.sum(result)}' + f'test_swift_side_bert result shape: {result_origin.shape}, result sum: {torch.sum(result_origin)}' ) + self.assertTrue(isinstance(model, SwiftModel)) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) + self.assertTrue( + os.path.exists( + os.path.join(self.tmp_dir, 'default', WEIGHTS_NAME))) model2 = Swift.from_pretrained(model2, self.tmp_dir) + state_dict = model.state_dict() state_dict2 = model2.state_dict() for key in state_dict: From 3193aae9653f4a53419b4e23fb5c8da346c8ff49 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 12:32:49 +0800 Subject: [PATCH 33/70] fix and pass pre-commit --- examples/pytorch/llm/src/llm_infer.py | 3 +- examples/pytorch/llm/src/llm_sft.py | 48 ++--- examples/pytorch/llm/src/utils/dataset.py | 38 ++-- examples/pytorch/llm/src/utils/model.py | 50 ++--- examples/pytorch/llm/src/utils/preprocess.py | 31 +-- swift/__init__.py | 23 +- swift/trainers/trainers.py | 188 ++++++++--------- swift/tuners/adapter.py | 31 ++- swift/tuners/base.py | 13 +- swift/tuners/lora.py | 39 ++-- swift/tuners/prompt.py | 28 ++- swift/tuners/restuning.py | 208 ++++++++++++------- swift/tuners/restuning_components.py | 201 ++++++++++++------ swift/tuners/side.py | 93 +++++---- swift/utils/torch_utils.py | 6 +- tests/tuners/test_swift_base.py | 52 +++-- tests/tuners/test_swift_restuning.py | 68 +++--- tests/utils/test_torch_utils.py | 5 +- 18 files changed, 668 insertions(+), 457 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 30ccd22c68..0d7730aa10 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -110,7 +110,8 @@ def llm_infer(args: InferArguments) -> None: # ### Preparing lora if args.sft_type == 'lora': - model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True) + model = Swift.from_pretrained( + model, args.ckpt_dir, inference_mode=True) show_layers(model) print_model_info(model) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 9f52ce9bb6..f0acda895c 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -4,30 +4,29 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List -from typing import Optional +from typing import Dict, List, Optional import jieba import numpy as np import torch import torch.distributed as dist -from nltk.translate.bleu_score import (SmoothingFunction, sentence_bleu) -from rouge import Rouge +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge.rouge import Rouge from transformers import BitsAndBytesConfig, GenerationConfig - -from swift import (AdapterConfig, HubStrategy, LoRAConfig, Seq2SeqTrainer, - Seq2SeqTrainingArguments, Swift, SwiftConfig, ResTuningConfig, get_logger) -from swift.hub import HubApi, ModelScopeConfig -from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, - seed_everything) -from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_dist, 
is_master, plot_images, process_dataset, select_bnb, select_dtype, show_layers) +from swift import (AdapterConfig, HubStrategy, LoRAConfig, ResTuningConfig, + Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, + SwiftConfig, get_logger) +from swift.hub import HubApi, ModelScopeConfig +from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, + seed_everything) +from swift.utils.llm_utils import data_collate_fn, print_example, stat_dataset + logger = get_logger() @@ -37,8 +36,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field( - default='lora') + sft_type: str = field(default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -110,7 +108,7 @@ class SftArguments: default=None, metadata={ 'help': - 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' + 'SDK token can be found in https://modelscope.cn/my/myaccesstoken' }) # other @@ -118,7 +116,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen-7b')" }) def __post_init__(self): @@ -241,8 +239,7 @@ def llm_sft(args: SftArguments) -> None: elif sft_type == 'restuner': restuner_config = ResTuningConfig( dims=model.config.hidden_size, - **MODEL_MAPPING[args.model_type]['restuner_TM'] - ) + **MODEL_MAPPING[args.model_type]['restuner_TM']) logger.info(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config model = Swift.prepare_model(model, swift_config) @@ -330,8 +327,9 @@ def llm_sft(args: SftArguments) -> None: eval_steps=args.eval_steps, dataloader_num_workers=args.dataloader_num_workers, load_best_model_at_end=True, - metric_for_best_model='rouge-l', - greater_is_better=True, + metric_for_best_model='rouge-l' + if args.predict_with_generate else 'loss', + greater_is_better=args.predict_with_generate, sortish_sampler=True, optim=args.optim, hub_model_id=args.hub_model_id, @@ -344,8 +342,8 @@ def llm_sft(args: SftArguments) -> None: gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, generation_config=GenerationConfig.from_dict(generation_config), - local_rank=local_rank, - **kwargs) + local_rank=local_rank, + **kwargs) if args.gradient_checkpointing: # fix: gradients will be None @@ -389,7 +387,7 @@ def _decode(tokens, ignore_pad_token_for_loss=False): try: rouge = Rouge() scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) + ' '.join(reference)) result = scores[0] for k, v in result.items(): @@ -399,7 +397,8 @@ def _decode(tokens, ignore_pad_token_for_loss=False): list(pred), smoothing_function=SmoothingFunction().method3) score_dict['bleu-4'].append(round(bleu_score * 100, 4)) - except: + except Exception as e: + logger.error(e) logger.error(f'eval error {hypothesis}, {reference}') for k, v in score_dict.items(): @@ -413,7 +412,8 @@ def _decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics if args.predict_with_generate else None, + compute_metrics=compute_metrics + if args.predict_with_generate else None, ) trainer.train(trainer_args.resume_from_checkpoint) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 
08adb9da4d..595dacf83a 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -45,14 +45,18 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='train').to_hf_dataset().rename_columns({ - "content": "query", - "summary": "response", + 'lvjianjin/AdvertiseGen', + split='train').to_hf_dataset().rename_columns({ + 'content': 'query', + 'summary': 'response', }) dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset().rename_columns({ - "content": "query", - "summary": "response", + 'lvjianjin/AdvertiseGen', + split='validation').to_hf_dataset().rename_columns({ + 'content': + 'query', + 'summary': + 'response', }) return dataset_train, dataset_val @@ -137,14 +141,16 @@ def get_instinwild_en_dataset() -> HfDataset: def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', split='train').to_hf_dataset().rename_columns({ - "text1": "query", - "text2": "response", + 'modelscope/DuReader_robust-QG', + split='train').to_hf_dataset().rename_columns({ + 'text1': 'query', + 'text2': 'response', }) dataset_val: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', split='validation').to_hf_dataset().rename_columns({ - "text1": "query", - "text2": "response", + 'modelscope/DuReader_robust-QG', + split='validation').to_hf_dataset().rename_columns({ + 'text1': 'query', + 'text2': 'response', }) return dataset_train, dataset_val @@ -368,15 +374,17 @@ def get_cmnli_zh_dataset() -> HfDataset: } -def get_dataset(dataset_name_list: List[str]) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: +def get_dataset( + dataset_name_list: List[str] +) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]: """Returns a dataset to be split or a train-val dataset tuple""" dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPING[dataset_name] dataset_list.append(get_function()) - assert(all(isinstance(dataset, tuple) for dataset in dataset_list) - or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) + assert (all(isinstance(dataset, tuple) for dataset in dataset_list) + or all(isinstance(dataset, HfDataset) for dataset in dataset_list)) if not isinstance(dataset_list[0], tuple): dataset = concatenate_datasets(dataset_list) else: diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index aac85025bf..10e1c7f8b9 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -187,39 +187,39 @@ class AdapterTM(NamedTuple): class ResTunerTM(NamedTuple): # default lora target modules. 
qkv baichuan = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*model.norm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*model.norm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } chatglm2 = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*final_layernorm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*final_layernorm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } llama2 = { - "root_modules": r'.*layers.0$', - "stem_modules": r'.*layers\.\d+$', - "target_modules": r'.*model.norm', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*layers.0$', + 'stem_modules': r'.*layers\.\d+$', + 'target_modules': r'.*model.norm', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } qwen = { - "root_modules": r'.*transformer.h.0$', - "stem_modules": r'.*transformer.h\.\d+$', - "target_modules": r'.*transformer.ln_f', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*transformer.h.0$', + 'stem_modules': r'.*transformer.h\.\d+$', + 'target_modules': r'.*transformer.ln_f', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } polylm = { - "root_modules": r'.*transformer.h.0$', - "stem_modules": r'.*transformer.h\.\d+$', - "target_modules": r'.*transformer.ln_f', - "target_modules_hook": "input", - "tuner_cfg": "res_adapter", + 'root_modules': r'.*transformer.h.0$', + 'stem_modules': r'.*transformer.h\.\d+$', + 'target_modules': r'.*transformer.ln_f', + 'target_modules_hook': 'input', + 'tuner_cfg': 'res_adapter', } diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index c4e44f5637..92decc5f1b 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -13,7 +13,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'default_generation': { + 'default-generation': { 'prefix': [], 'prompt': ['{{query}}'], 'suffix': [['eos_token_id']], @@ -37,7 +37,7 @@ 'chat_sep': ['\n\n'], 'suffix': [['eos_token_id']], }, - 'chatglm2_generation': { + 'chatglm2-generation': { 'prefix': [[64790, 64792]], 'prompt': ['{{query}}'], 'suffix': [['eos_token_id']], @@ -124,14 +124,14 @@ def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context], def _preprocess( - template_type: str, - tokenizer: PreTrainedTokenizer, - query: str, - response: Optional[str] = None, - history: Optional[History] = None, - system: Optional[str] = None, - max_length: Optional[int] = None, - validate_generation=True, # do cross-validation with `model.generate()` + template_type: str, + tokenizer: PreTrainedTokenizer, + query: str, + response: Optional[str] = None, + history: Optional[History] = None, + system: Optional[str] = None, + max_length: Optional[int] = None, + validate_generation=True, # do cross-validation with `model.generate()` ) -> Dict[str, List[int]]: if history is None: history = [] @@ -187,11 +187,11 @@ def _preprocess( def get_preprocess( - template_type: str, - tokenizer: PreTrainedTokenizer, - system: Optional[str] = None, - max_length: Optional[int] = None, - validate_generation=False, + template_type: str, + tokenizer: PreTrainedTokenizer, + system: 
Optional[str] = None, + max_length: Optional[int] = None, + validate_generation=False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: @@ -201,4 +201,5 @@ def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: custom_system = example.get('system', system) return _preprocess(template_type, tokenizer, query, response, history, custom_system, max_length, validate_generation) + return preprocess diff --git a/swift/__init__.py b/swift/__init__.py index e41615c414..6e866d6515 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -5,15 +5,16 @@ if TYPE_CHECKING: from .version import __version__, __release_datetime__ - from .tuners import ( - Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, - SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, - ResTuningConfig, SideConfig, - PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, - PeftModelForTokenClassification, PrefixTuningConfig, - PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, - get_peft_config, get_peft_model, get_peft_model_state_dict, Prompt, - PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift) + from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, + LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, + PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, PrefixTuningConfig, + PromptEncoderConfig, PromptLearningConfig, + PromptTuningConfig, get_peft_config, get_peft_model, + get_peft_model_state_dict, Prompt, PromptConfig, + PromptModule, SwiftConfig, SwiftOutput, Swift) from .hub import snapshot_download, push_to_hub, push_to_hub_async, push_to_hub_in_queue from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, @@ -30,8 +31,8 @@ 'tuners': [ 'Adapter', 'AdapterConfig', 'AdapterModule', 'SwiftModel', 'LoRA', 'LoRAConfig', 'SWIFT_MAPPING', 'LoraConfig', 'PeftConfig', - 'ResTuningConfig', 'SideConfig', - 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', + 'ResTuningConfig', 'SideConfig', 'PeftModel', + 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', 'PeftModelForTokenClassification', 'PrefixTuningConfig', 'PromptEncoderConfig', 'PromptLearningConfig', diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 298593c11d..b31da08f2d 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import time from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -8,7 +9,6 @@ from transformers import Trainer as HfTrainer from transformers import trainer from transformers.deepspeed import is_deepspeed_zero3_enabled -import time from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin @@ -22,12 +22,18 @@ class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.perf = { - 'gen_time': 0., - 'gen_len': 0, - 'eval_memory': 0., - 'train_memory': None, - 'model': self.model.get_trainable_parameters(), + self.perf: Dict[str, Any] = { + 'gen_time': + 0., + 'gen_len': + 0, + 'eval_memory': + 0., + 'train_memory': + 0., + 'model': + self.model.get_trainable_parameters() if hasattr( + self.model, 'get_trainable_parameters') else None, } def train( @@ -37,127 +43,107 @@ def train( ): training_output = super().train(*args, **kwargs) if self.perf['train_memory'] is None: - self.perf['train_memory'] = torch.cuda.memory_allocated() + self.perf['train_memory'] = sum([ + torch.cuda.memory_allocated(i) + for i in range(torch.cuda.device_count()) + ]) return training_output def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - - Return: - Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and - labels (each being optional). 
- """ - + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + **gen_kwargs, + ) -> Tuple[Optional[float], Optional[torch.Tensor], + Optional[torch.Tensor]]: if not self.args.predict_with_generate or prediction_loss_only: return super().prediction_step( - model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys - ) + model, + inputs, + prediction_loss_only=prediction_loss_only, + ignore_keys=ignore_keys) - has_labels = "labels" in inputs + has_labels = 'labels' in inputs inputs = self._prepare_inputs(inputs) # XXX: adapt synced_gpus for fairscale as well - gen_kwargs = self.model.generation_config.to_dict().copy() - if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: - gen_kwargs["max_length"] = self.model.config.max_length - gen_kwargs["num_beams"] = ( - gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams - ) + # Priority (handled in generate): + # gen_kwargs > model.generation_config > default GenerationConfig() + + if len(gen_kwargs) == 0 and hasattr(self, '_gen_kwargs'): + gen_kwargs = self._gen_kwargs.copy() + + if gen_kwargs.get('max_length') is None and gen_kwargs.get( + 'max_new_tokens') is None: + gen_kwargs['max_length'] = self.model.config.max_length + gen_kwargs['num_beams'] = ( + gen_kwargs['num_beams'] if gen_kwargs.get('num_beams') is not None + else self.model.config.num_beams) default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - gen_kwargs["synced_gpus"] = ( - gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus - ) - - if "attention_mask" in inputs: - gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) - if "position_ids" in inputs: - gen_kwargs["position_ids"] = inputs.get("position_ids", None) - if "global_attention_mask" in inputs: - gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) - - # prepare generation inputs - # some encoder-decoder models can have varying encoder's and thus - # varying model input names - if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + gen_kwargs['synced_gpus'] = ( + gen_kwargs['synced_gpus'] if gen_kwargs.get('synced_gpus') + is not None else default_synced_gpus) + + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ('labels' in inputs and 'decoder_input_ids' in inputs and + inputs['labels'].shape == inputs['decoder_input_ids'].shape): + inputs = { + k: v + for k, v in inputs.items() if k != 'decoder_input_ids' + } + + gen_kwargs['pad_token_id'] = self.tokenizer.pad_token_id + gen_kwargs['eos_token_id'] = self.tokenizer.eos_token_id + gen_time = time.time() + generated_tokens = self.model.generate(**inputs, **gen_kwargs) + gen_time = time.time() - gen_time + + if hasattr( + self.model, 'encoder' + ) and self.model.encoder.main_input_name != self.model.main_input_name: generation_inputs = inputs[self.model.encoder.main_input_name] else: generation_inputs = inputs[self.model.main_input_name] - gen_kwargs["input_ids"] = generation_inputs - gen_kwargs["pad_token_id"] = self.tokenizer.pad_token_id - gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id - gen_time = time.time() - generated_tokens = self.model.generate(**gen_kwargs) - gen_time 
= time.time() - gen_time generated_tokens = generated_tokens[:, generation_inputs.size()[-1]:] gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), self.perf['eval_memory']) + self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), + self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded - if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) - - loss = None + if gen_kwargs.get('max_length') is not None and generated_tokens.shape[ + -1] < gen_kwargs['max_length']: + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs['max_length']) + elif gen_kwargs.get('max_new_tokens' + ) is not None and generated_tokens.shape[-1] < ( + gen_kwargs['max_new_tokens'] + 1): + generated_tokens = self._pad_tensors_to_max_len( + generated_tokens, gen_kwargs['max_new_tokens'] + 1) if self.args.prediction_loss_only: - return (loss, None, None) + return None, None, None if has_labels: - labels = inputs["labels"] - if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: - labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) - elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( - gen_kwargs["max_new_tokens"] + 1 - ): - labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + labels = inputs['labels'] + if gen_kwargs.get('max_length') is not None and labels.shape[ + -1] < gen_kwargs['max_length']: + labels = self._pad_tensors_to_max_len(labels, + gen_kwargs['max_length']) + elif gen_kwargs.get( + 'max_new_tokens') is not None and labels.shape[-1] < ( + gen_kwargs['max_new_tokens'] + 1): + labels = self._pad_tensors_to_max_len( + labels, (gen_kwargs['max_new_tokens'] + 1)) else: labels = None - return (loss, generated_tokens, labels) - - def _pad_tensors_to_max_len(self, tensor, max_length): - if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): - # If PAD token is not defined at least EOS token has to be defined - pad_token_id = ( - self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - ) - else: - if self.model.config.pad_token_id is not None: - pad_token_id = self.model.config.pad_token_id - else: - raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") - - padded_tensor = pad_token_id * torch.ones( - (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device - ) - padded_tensor[:, : tensor.shape[-1]] = tensor - return padded_tensor + return None, generated_tokens, labels # monkey patching diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 1d070af1f4..12f3d30641 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -3,11 +3,12 @@ import re import types from dataclasses import dataclass, field -from typing import Union, List +from typing import List, Union import torch from torch import nn from transformers.activations import ACT2CLS + from 
swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -71,7 +72,8 @@ def __post_init__(self): class Adapter: @staticmethod - def prepare_model(model: nn.Module, config: AdapterConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: AdapterConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `AdapterConfig`""" module_keys = [key for key, _ in model.named_modules()] @@ -94,14 +96,18 @@ def _forward(self, *args, **kwargs): _type = type(args) args = list(args) args[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')(args[config.hidden_pos]) - return _type(args) + config.hidden_pos] + getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) + args = _type(args) else: args[config.hidden_pos] = args[ - config.hidden_pos] + getattr(self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + config.hidden_pos] + getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = args + getattr(self, f'adapter_{adapter_name}')(args) + args = args + getattr(self, f'adapter_{adapter_name}')( + args) return args def _feed_forward_chunk(self, attention_output): @@ -126,7 +132,8 @@ def _feed_forward_chunk(self, attention_output): def state_dict_callback(state_dict, adapter_name: str): return { key: value - for key, value in state_dict.items() if f'adapter_{adapter_name}' in key + for key, value in state_dict.items() + if f'adapter_{adapter_name}' in key } def mark_trainable_callback(model): @@ -136,8 +143,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'adapter_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'adapter_{adapter_name}') for _module in modules: _module.activate(activate) @@ -191,7 +200,7 @@ def forward(self, x, identity=None): self.act.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) out = self.ln2(self.act(self.ln1(x))) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 11a96ef9fa..dd3f984dc0 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -52,7 +52,8 @@ def __init__(self, self.adapters = {} if isinstance(config, SwiftConfig): - self.adapters[DEFAULT_ADAPTER] = self._prepare_model(model, config, DEFAULT_ADAPTER) + self.adapters[DEFAULT_ADAPTER] = self._prepare_model( + model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) for adapter_name, config in config.items(): @@ -151,7 +152,8 @@ def state_dict(self, if kwargs.get('save_adapter', True): for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: - state_dicts.update(output.state_dict_callback(destination, adapter_name)) + state_dicts.update( + output.state_dict_callback(destination, adapter_name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v @@ -264,7 +266,8 @@ def _prepare_model( ): assert (hasattr(config, SWIFT_TYPE_KEY)) from .mapping import SWIFT_MAPPING - return SWIFT_MAPPING[config.swift_type][1].prepare_model(model, config, adapter_name) + return SWIFT_MAPPING[config.swift_type][1].prepare_model( + model, config, adapter_name) def 
create_or_update_model_card(self, output_dir: str): """ @@ -334,8 +337,8 @@ def save_pretrained(self, os.makedirs(save_directory, exist_ok=True) self.create_or_update_model_card(save_directory) - adapter_names = adapter_name if isinstance(adapter_name, - list) or adapter_name is None else [adapter_name] + adapter_names = adapter_name if isinstance( + adapter_name, list) or adapter_name is None else [adapter_name] for adapter_name, output in self.adapters.items(): if adapter_names is not None and adapter_name not in adapter_names: continue diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 8cffafa4b6..1a52628bc9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -14,8 +14,8 @@ is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb @@ -106,7 +106,8 @@ def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict, adapter_name): - return lora_state_dict(state_dict, model.lora_module_map, adapter_name, config.bias) + return lora_state_dict(state_dict, model.lora_module_map, + adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, config.bias) @@ -115,7 +116,8 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: if isinstance(_module, LoRALayer): @@ -124,8 +126,8 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool) _module.active_adapter = 'default' if activate else 'invalid' @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, - **kwargs): + def _dynamic_patch_lora(model, replace_modules, use_merged_linear, + adapter_name, **kwargs): """Dynamic patch lora to model Args: @@ -230,7 +232,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, adapter_name, r=kwargs['r'], lora_alpha=kwargs['lora_alpha'], merge_weights=kwargs['merge_weights'], - ) + ) elif isinstance(sub_module, torch.nn.Conv2d): kwargs.pop('fan_in_fan_out', None) lora_module = Conv2d( @@ -292,7 +294,8 @@ def unpatch_lora(model, config: LoRAConfig): origin_module = torch.nn.Linear( sub_module.in_features, sub_module.out_features, - bias=hasattr(sub_module, 'bias') and sub_module.bias is not None, + bias=hasattr(sub_module, 'bias') + and sub_module.bias is not None, ) elif isinstance(sub_module, Embedding): origin_module = torch.nn.Embedding( @@ -723,7 +726,9 @@ def forward(self, x: torch.Tensor): return nn.Conv2d.forward(self, x) -def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'none') -> None: +def mark_lora_as_trainable(model: nn.Module, + adapter_name: str, + bias: str = 'none') -> None: if bias == 'none': return elif bias == 'all': @@ -740,18 +745,28 @@ def mark_lora_as_trainable(model: nn.Module, adapter_name: str, bias: str = 'non raise NotImplementedError -def lora_state_dict(state_dict, module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: +def lora_state_dict(state_dict, + module_map: Dict, + adapter_name: str, + bias: str 
= 'none') -> Dict[str, torch.Tensor]: if bias == 'none': - return {k: state_dict[k] for k in state_dict if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name} + return { + k: state_dict[k] + for k in state_dict + if 'lora_' in k and module_map.get(k[:k.find('lora_') + - 1], None) == adapter_name + } elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name) or 'bias' in k + for k in state_dict if ('lora_' in k and module_map.get( + k[:k.find('lora_') - 1], None) == adapter_name) or 'bias' in k } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k[:k.find('lora_')-1], None) == adapter_name: + if 'lora_' in k and module_map.get(k[:k.find('lora_') - 1], + None) == adapter_name: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index ec21650c3a..3c64479369 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -3,13 +3,13 @@ import re import types from dataclasses import dataclass, field -from typing import Union, List +from typing import List, Union import torch from torch import nn -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput @dataclass @@ -78,7 +78,8 @@ def __post_init__(self): class Prompt: @staticmethod - def prepare_model(model: nn.Module, config: PromptConfig, adapter_name: str): + def prepare_model(model: nn.Module, config: PromptConfig, + adapter_name: str): module_keys = [key for key, _ in model.named_modules()] match_module_keys = [] for module_key in module_keys: @@ -99,7 +100,8 @@ def _forward(self, *args, **kwargs): input_embedding = kwargs[config.embedding_pos] input_embedding = getattr( - self, f'prompt_{adapter_name}').forward(input_embedding) + self, + f'prompt_{adapter_name}').forward(input_embedding) if isinstance(config.embedding_pos, int): args = type(args)( args[0:config.embedding_pos] + (input_embedding, ) @@ -117,7 +119,8 @@ def _forward(self, *args, **kwargs): if attention_mask is not None: attention_mask = getattr( self, - f'prompt_{adapter_name}').patch_attention_mask(attention_mask) + f'prompt_{adapter_name}').patch_attention_mask( + attention_mask) if isinstance(config.attention_mask_pos, int): args = type(args)( args[0:config.attention_mask_pos] @@ -129,7 +132,8 @@ def _forward(self, *args, **kwargs): forward_output = self.forward_origin(*args, **kwargs) if config.extract_embedding: forward_output = getattr( - self, f'prompt_{adapter_name}').extract(forward_output) + self, + f'prompt_{adapter_name}').extract(forward_output) return forward_output @@ -150,7 +154,8 @@ def _forward(self, *args, **kwargs): def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'prompt_{adapter_name}' in key + for key, value in state_dict.items() + if f'prompt_{adapter_name}' in key } def mark_trainable_callback(model): @@ -160,8 +165,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'prompt_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'prompt_{adapter_name}') for 
_module in modules: _module.activate(activate) @@ -199,7 +206,8 @@ def __init__(self, def forward(self, x): if not self._activate: return x - prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) + prompt_token = self.prompt_token.expand(x.shape[0], -1, + -1).to(x.device) if self.layer_num == 0: if self.attach_front: diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index bc16ce40b9..d8ddbc5aab 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -3,15 +3,16 @@ import re import types from dataclasses import dataclass, field -from typing import Union, Dict, Optional, List +from typing import Dict, List, Optional, Union import torch import torch.nn as nn from swift.utils.logger import get_logger -from .restuning_components import probe_input_pre_hook, probe_output_hook, detach_tensors, ResTuner -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .restuning_components import (ResTuner, detach_tensors, + probe_input_pre_hook, probe_output_hook) +from .utils import SwiftConfig, SwiftOutput logger = get_logger() @@ -46,11 +47,12 @@ class ResTuningConfig(SwiftConfig): root_modules: str = field( default=None, metadata={ - 'help': 'The root module to be replaced, can a regex string (use the first matching module) or full match format' + 'help': + 'The root module to be replaced, can a regex string (use the first matching module) or full match format' }) root_modules_hook: str = field( - default="input", + default='input', metadata={ 'help': 'The hook type of root modules, can be "input" or "output"' }) @@ -58,11 +60,12 @@ class ResTuningConfig(SwiftConfig): stem_modules: Optional[Union[List[str], str]] = field( default=None, metadata={ - 'help': 'The stem modules to be replaced, can a regex string or name list of full match format' + 'help': + 'The stem modules to be replaced, can a regex string or name list of full match format' }) stem_modules_hook: str = field( - default="output", + default='output', metadata={ 'help': 'The hook type of stem modules, can be "input" or "output"' }) @@ -70,25 +73,30 @@ class ResTuningConfig(SwiftConfig): target_modules: str = field( default=None, metadata={ - 'help': 'The target module to be replaced, can a regex string (use the first matching module) or full match format' + 'help': + 'The target module to be replaced, can a regex string (use the first matching module) or full match format' }) target_modules_hook: str = field( - default="input", + default='input', metadata={ - 'help': 'The hook type of target modules, can be "input" or "output"' + 'help': + 'The hook type of target modules, can be "input" or "output"' }) target_hidden_pos: str = field( default=None, metadata={ 'help': - 'The position of the hidden state for target modules output' + 'The position of the hidden state for target modules output' }) tuner_cfg: Optional[Union[List[Dict], Dict, str]] = field( default=None, - metadata={'help': 'The configuration of the tuning module, can a string or customized config'}) + metadata={ + 'help': + 'The configuration of the tuning module, can a string or customized config' + }) use_upsample: bool = field( default=False, @@ -96,15 +104,16 @@ class ResTuningConfig(SwiftConfig): upsample_out_channels: List[int] = field( default=None, - metadata={'help': 'The number of output channels when "use_upsample" is set to "True"'}) + metadata={ + 'help': + 'The number of output channels when "use_upsample" is set to "True"' + }) zero_init_last: bool = field( - 
default=False, - metadata={'help': 'Zero init last weight'}) + default=False, metadata={'help': 'Zero init last weight'}) use_bypass: bool = field( - default=True, - metadata={'help': 'Whether to use bypass'}) + default=True, metadata={'help': 'Whether to use bypass'}) def __post_init__(self): from .mapping import SwiftTuners @@ -114,28 +123,36 @@ def __post_init__(self): class ResTuning: @staticmethod - def prepare_model(model: nn.Module, config: ResTuningConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: ResTuningConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `ResTuningConfig`""" def _forward_seq(self, input, *args, **kwargs): for idx, module in enumerate(self): - if idx >= len(self.origin_module_keys): continue + if idx >= len(self.origin_module_keys): + continue input = module(input) return input def _forward_target(self, *args, **kwargs): - if self.target_modules_hook == "input": + if self.target_modules_hook == 'input': args = list(args) - _arg = args[0 if self.target_hidden_pos is None else self.target_hidden_pos] + _arg = args[0 if self.target_hidden_pos is None else self. + target_hidden_pos] args_main = _forward_restuning(self, _arg) - args[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + args[0 if self.target_hidden_pos is None else self. + target_hidden_pos] = args_main args_main = self.forward_origin(*args, **kwargs) else: _args_main = self.forward_origin(*args, **kwargs) - _arg = _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] if isinstance(_args_main, (tuple, list)) else _args_main + _arg = _args_main[0 if self.target_hidden_pos is None else self + .target_hidden_pos] if isinstance( + _args_main, + (tuple, list)) else _args_main args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[0 if self.target_hidden_pos is None else self.target_hidden_pos] = args_main + _args_main[0 if self.target_hidden_pos is None else self. + target_hidden_pos] = args_main args_main = _args_main return args_main @@ -156,7 +173,9 @@ def _forward_restuning(self, origin_arg): probe_results.append(st_mod.probe_input_data) else: probe_results.append(st_mod.probe_output_data) - args_main = getattr(top_module, f'restuning_{adapter_name}')(probe_results, origin_arg) + args_main = getattr(top_module, + f'restuning_{adapter_name}')(probe_results, + origin_arg) return args_main # 1. 
Matching the root module @@ -166,19 +185,23 @@ def _forward_restuning(self, origin_arg): for module_key in module_keys: if re.fullmatch(config.root_modules, module_key): root_module = model.get_submodule(module_key) - logger.info(f"Matching root module [{module_key}] of type {type(root_module)}") + logger.info( + f'Matching root module [{module_key}] of type {type(root_module)}' + ) if isinstance(root_module, (nn.ModuleList, nn.ModuleDict)): logger.warning( - f"Type of {type(root_module)} may not be supported because of its customized forward") - if config.root_modules_hook == "input": - root_module.register_forward_pre_hook(probe_input_pre_hook) + f'Type of {type(root_module)} may not be supported because of its customized forward' + ) + if config.root_modules_hook == 'input': + root_module.register_forward_pre_hook( + probe_input_pre_hook) else: root_module.register_forward_hook(probe_output_hook) root_module.root_modules_hook = config.root_modules_hook root_module_ins_list.append(root_module) break if len(root_module_ins_list) == 0: - logger.error(f"Cannot match root modules") + logger.error('Cannot match root modules') # 2. Matching the stem module stem_module_ins_list = [] @@ -188,32 +211,40 @@ def _forward_restuning(self, origin_arg): (isinstance(config.stem_modules, list) and module_key in config.stem_modules): stem_module = model.get_submodule(module_key) if isinstance(config.stem_modules, list): - stem_module_ins_index.append(config.stem_modules.index(module_key)) - logger.info(f"Matching stem module [{module_key}] of type {type(stem_module)}") + stem_module_ins_index.append( + config.stem_modules.index(module_key)) + logger.info( + f'Matching stem module [{module_key}] of type {type(stem_module)}' + ) if isinstance(stem_module, (nn.ModuleList, nn.ModuleDict)): logger.warning( - f"Type of {type(stem_module)} may not be supported because of its customized forward") - if len(root_module_ins_list) == 0 and len(stem_module_ins_list) == 0: + f'Type of {type(stem_module)} may not be supported because of its customized forward' + ) + if len(root_module_ins_list) == 0 and len( + stem_module_ins_list) == 0: stem_module.register_forward_pre_hook(probe_input_pre_hook) - if config.stem_modules_hook == "input": + if config.stem_modules_hook == 'input': stem_module.register_forward_pre_hook(probe_input_pre_hook) else: stem_module.register_forward_hook(probe_output_hook) stem_module.stem_modules_hook = config.stem_modules_hook stem_module_ins_list.append(stem_module) if isinstance(config.stem_modules, list): - stem_module_ins_list = [stem_module_ins_list[stem_module_ins_index.index(i)] for i in - range(len(stem_module_ins_index))] + stem_module_ins_list = [ + stem_module_ins_list[stem_module_ins_index.index(i)] + for i in range(len(stem_module_ins_index)) + ] depth = len(stem_module_ins_list) if len(stem_module_ins_list) == 0: - raise Exception(f"Cannot match source modules") + raise Exception('Cannot match source modules') # 3. Init restuning module if len(stem_module_ins_list) != 0: top_module = model.get_submodule('') - restuning_module = ResTuningBypassModule(config.dims, depth, config.use_upsample, - config.upsample_out_channels, config.zero_init_last, - config.tuner_cfg) + restuning_module = ResTuningBypassModule( + config.dims, depth, config.use_upsample, + config.upsample_out_channels, config.zero_init_last, + config.tuner_cfg) setattr(top_module, f'restuning_{adapter_name}', restuning_module) # 4. 
Matching the target module @@ -221,10 +252,13 @@ def _forward_restuning(self, origin_arg): for module_key in module_keys: if re.fullmatch(config.target_modules, module_key): tgt_module = model.get_submodule(module_key) - logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + logger.info( + f'Matching target module [{module_key}] of type {type(tgt_module)}' + ) if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): raise Exception( - f"Type of {type(tgt_module)} may not be supported because of its customized forward") + f'Type of {type(tgt_module)} may not be supported because of its customized forward' + ) tgt_module.target_modules_hook = config.target_modules_hook tgt_module.target_hidden_pos = config.target_hidden_pos @@ -233,18 +267,22 @@ def _forward_restuning(self, origin_arg): target_module_ins = tgt_module if isinstance(tgt_module, nn.Sequential): - tgt_module.origin_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) - tgt_module.forward_origin = types.MethodType(_forward_seq, tgt_module) + tgt_module.origin_module_keys = copy.deepcopy( + list(tgt_module._modules.keys())) + tgt_module.forward_origin = types.MethodType( + _forward_seq, tgt_module) else: tgt_module.forward_origin = tgt_module.forward - tgt_module.forward = types.MethodType(_forward_target, tgt_module) + tgt_module.forward = types.MethodType(_forward_target, + tgt_module) if target_module_ins is None: - raise Exception(f"Cannot match target modules") + raise Exception('Cannot match target modules') def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'restuning_{adapter_name}' in key + for key, value in state_dict.items() + if f'restuning_{adapter_name}' in key } def mark_trainable_callback(model): @@ -254,8 +292,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'restuning_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'restuning_{adapter_name}') for _module in modules: _module.activate(activate) @@ -265,13 +305,13 @@ class ResTuningBypassModule(nn.Module): """ def __init__( - self, - dims, - depth, - use_upsample=False, - upsample_out_channels=None, - zero_init_last=False, - tuner_cfg=None, + self, + dims, + depth, + use_upsample=False, + upsample_out_channels=None, + zero_init_last=False, + tuner_cfg=None, ): super(ResTuningBypassModule, self).__init__() @@ -282,12 +322,13 @@ def __init__( layer_num=i, depth=depth, use_upsample=use_upsample, - upsample_out_channels=upsample_out_channels[i] if isinstance(upsample_out_channels, - list) else upsample_out_channels, + upsample_out_channels=upsample_out_channels[i] if isinstance( + upsample_out_channels, list) else upsample_out_channels, zero_init_last=zero_init_last, - tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list) else tuner_cfg - ) - for i in range(depth)]) + tuner_cfg=tuner_cfg[i] if isinstance(tuner_cfg, list + ) else tuner_cfg) + for i in range(depth) + ]) def activate(self, activate=True): self._activate = activate @@ -296,17 +337,29 @@ def forward(self, x_list, origin_arg, **kwargs): if not self._activate: return origin_arg x_bypass = detach_tensors(x_list.pop(0)) - x_bypass = x_bypass[0] if isinstance(x_bypass, (list, tuple)) else x_bypass + x_bypass = x_bypass[0] if 
isinstance(x_bypass, + (list, tuple)) else x_bypass x_list = detach_tensors(x_list) - x_list = [_x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list] + x_list = [ + _x[0] if isinstance(_x, (list, tuple)) else _x for _x in x_list + ] for i, (bp_blk, x_stem) in enumerate(zip(self.bypass_blocks, x_list)): - target_size = x_list[i + 1].shape[2:] if i < len(x_list) - 1 else None + target_size = x_list[ + i + 1].shape[2:] if i < len(x_list) - 1 else None x_bypass = bp_blk(x_stem, x_bypass, target_size, **kwargs) return x_bypass class ResTunerBypassBlock(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_last=False, tuner_cfg=None, **kwargs): + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + use_upsample=False, + zero_init_last=False, + tuner_cfg=None, + **kwargs): super().__init__() self.layer_num = layer_num self.depth = depth @@ -314,16 +367,21 @@ def __init__(self, dim, layer_num=-1, depth=-1, use_upsample=False, zero_init_la if isinstance(tuner_cfg, str): lateral_cfg = tuner_cfg vertical_cfg = tuner_cfg - aux_cfg = "upsample" if use_upsample and layer_num != depth - 1 else None + aux_cfg = 'upsample' if use_upsample and layer_num != depth - 1 else None elif isinstance(tuner_cfg, dict): - lateral_cfg = tuner_cfg['lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None - vertical_cfg = tuner_cfg['vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None + lateral_cfg = tuner_cfg[ + 'lateral_cfg'] if 'lateral_cfg' in tuner_cfg else None + vertical_cfg = tuner_cfg[ + 'vertical_cfg'] if 'vertical_cfg' in tuner_cfg else None aux_cfg = tuner_cfg['aux_cfg'] if 'aux_cfg' in tuner_cfg else None - self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "lateral", lateral_cfg, **kwargs) - self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "vertical", vertical_cfg, **kwargs) + self.lateral_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'lateral', lateral_cfg, **kwargs) + self.vertical_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'vertical', vertical_cfg, **kwargs) if aux_cfg and len(aux_cfg) != 0: - self.aux_tuner = ResTuner(dim, layer_num, depth, zero_init_last, "aux", aux_cfg, **kwargs) + self.aux_tuner = ResTuner(dim, layer_num, depth, zero_init_last, + 'aux', aux_cfg, **kwargs) def forward(self, x_stem, x_bypass, target_size=None, **kwargs): x_lateral = self.lateral_tuner(x_stem) @@ -332,10 +390,4 @@ def forward(self, x_stem, x_bypass, target_size=None, **kwargs): x_bypass_out = x_lateral + x_vertical if hasattr(self, 'aux_tuner'): x_bypass_out = self.aux_tuner(x_bypass_out, target_size) - - # logger.info(f"x_main:{x_stem.shape} / {torch.sum(x_stem)}, x_side:{x_bypass.shape} / {torch.sum(x_bypass)}") - # logger.info(f"x_lateral:{x_lateral.shape} / {torch.sum(x_lateral)}, x_vertical:{x_vertical.shape} / {torch.sum(x_vertical)}") - # logger.info(f"x_bypass_out: {x_bypass_out.shape} / {torch.sum(x_bypass_out)}") - return x_bypass_out - diff --git a/swift/tuners/restuning_components.py b/swift/tuners/restuning_components.py index 9c99543c37..e7f02aa5d8 100644 --- a/swift/tuners/restuning_components.py +++ b/swift/tuners/restuning_components.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import math + import torch import torch.nn as nn import torch.nn.functional as F @@ -11,8 +12,15 @@ class ResTuner(nn.Module): - def __init__( - self, dim=None, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg={}, **kwargs): + + def __init__(self, + dim=None, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg={}, + **kwargs): super().__init__() self.dim = dim self.layer_num = layer_num @@ -20,32 +28,51 @@ def __init__( self.stage = stage self.tuner_cfg = tuner_cfg - if (isinstance(tuner_cfg, str) and tuner_cfg == "res_adapter") or \ - (isinstance(tuner_cfg, dict) and "res_adapter" in tuner_cfg): - tuner_cfg = tuner_cfg['res_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg - self.tuner = ResAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, - stage=stage, tuner_cfg=tuner_cfg, **kwargs) - elif (isinstance(tuner_cfg, str) and tuner_cfg == "res_group_adapter") or \ - (isinstance(tuner_cfg, dict) and "res_group_adapter" in tuner_cfg): - tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance(tuner_cfg, dict) else tuner_cfg - self.tuner = ResGroupAdapter(dim=dim, layer_num=layer_num, depth=depth, zero_init_last=zero_init_last, - stage=stage, tuner_cfg=tuner_cfg, **kwargs) - elif (isinstance(tuner_cfg, str) and tuner_cfg == "upsample") or \ - (isinstance(tuner_cfg, dict) and "upsample" in tuner_cfg): - tuner_cfg = tuner_cfg['upsample'] if isinstance(tuner_cfg, dict) else tuner_cfg + if (isinstance(tuner_cfg, str) and tuner_cfg == 'res_adapter') or \ + (isinstance(tuner_cfg, dict) and 'res_adapter' in tuner_cfg): + tuner_cfg = tuner_cfg['res_adapter'] if isinstance( + tuner_cfg, dict) else tuner_cfg + self.tuner = ResAdapter( + dim=dim, + layer_num=layer_num, + depth=depth, + zero_init_last=zero_init_last, + stage=stage, + tuner_cfg=tuner_cfg, + **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == 'res_group_adapter') or \ + (isinstance(tuner_cfg, dict) and 'res_group_adapter' in tuner_cfg): + tuner_cfg = tuner_cfg['res_group_adapter'] if isinstance( + tuner_cfg, dict) else tuner_cfg + self.tuner = ResGroupAdapter( + dim=dim, + layer_num=layer_num, + depth=depth, + zero_init_last=zero_init_last, + stage=stage, + tuner_cfg=tuner_cfg, + **kwargs) + elif (isinstance(tuner_cfg, str) and tuner_cfg == 'upsample') or \ + (isinstance(tuner_cfg, dict) and 'upsample' in tuner_cfg): + tuner_cfg = tuner_cfg['upsample'] if isinstance( + tuner_cfg, dict) else tuner_cfg if 'upsample_out_channels' in kwargs: out_channels = kwargs['upsample_out_channels'] use_conv = True if out_channels else False else: out_channels = dim use_conv = False - self.tuner = Upsample(channels=dim, use_conv=use_conv, out_channels=out_channels, tuner_cfg=tuner_cfg, - **kwargs) + self.tuner = Upsample( + channels=dim, + use_conv=use_conv, + out_channels=out_channels, + tuner_cfg=tuner_cfg, + **kwargs) else: self.tuner = Identity() def forward(self, x, *args, **kwargs): - if self.tuner_cfg == "zero" or "zero" in self.tuner_cfg: + if self.tuner_cfg == 'zero' or 'zero' in self.tuner_cfg: x_out = 0.0 else: x_out = self.tuner(x, *args, **kwargs) @@ -53,30 +80,45 @@ def forward(self, x, *args, **kwargs): class ResAdapter(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg=None, + act_layer=nn.GELU, **kwargs): super(ResAdapter, self).__init__() self.dim = dim self.layer_num = 
layer_num self.depth = depth - self.adapter_length = tuner_cfg['adapter_length'] if 'adapter_length' in tuner_cfg else 32 - self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None - self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None - - self.adapter_length = self.adapter_length[self.layer_num] if isinstance(self.adapter_length, - list) else self.adapter_length - assert isinstance(self.adapter_length, int) or ( - isinstance(self.adapter_length, tuple) and len(self.adapter_length) == 3) + self.adapter_length = tuner_cfg[ + 'adapter_length'] if 'adapter_length' in tuner_cfg else 32 + self.adapter_type = tuner_cfg[ + 'adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg[ + 'adapter_weight'] if 'adapter_weight' in tuner_cfg else None + + self.adapter_length = self.adapter_length[ + self.layer_num] if isinstance(self.adapter_length, + list) else self.adapter_length + assert isinstance(self.adapter_length, + int) or (isinstance(self.adapter_length, tuple) + and len(self.adapter_length) == 3) if isinstance(self.adapter_length, int): self.ln1 = nn.Linear(dim, self.adapter_length) else: - self.ln1 = nn.Linear(self.adapter_length[0], self.adapter_length[1]) + self.ln1 = nn.Linear(self.adapter_length[0], + self.adapter_length[1]) self.activate = act_layer() if isinstance(self.adapter_length, int): self.ln2 = nn.Linear(self.adapter_length, dim) else: - self.ln2 = nn.Linear(self.adapter_length[1], self.adapter_length[2]) + self.ln2 = nn.Linear(self.adapter_length[1], + self.adapter_length[2]) dim = self.adapter_length[2] self._xavier_init_weights(self.ln1) @@ -109,46 +151,64 @@ def forward(self, x): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) x_shortcut = x if len(x_shortcut.size()) == 4: B, C, N1, N2 = x.size() - x = x.view(x_shortcut.size()[0], x_shortcut.size()[1], -1).permute(0, 2, 1) + x = x.view(x_shortcut.size()[0], + x_shortcut.size()[1], -1).permute(0, 2, 1) x_adapter = self.ln2(self.activate(self.ln1(x))) if self.adapter_weight: - x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + x_adapter = apply_data_weight(x_adapter, self.scaling, + self.adapter_weight) if len(x_shortcut.size()) == 4: - x_adapter = x_adapter.permute(0, 2, 1).view(x_shortcut.size()[0], x_adapter.size()[-1], - x_shortcut.size()[2], x_shortcut.size()[3]) + x_adapter = x_adapter.permute(0, 2, + 1).view(x_shortcut.size()[0], + x_adapter.size()[-1], + x_shortcut.size()[2], + x_shortcut.size()[3]) x_out = x_shortcut + x_adapter return x_out.to(x_dtype) class ResGroupAdapter(nn.Module): - def __init__(self, dim, layer_num=-1, depth=-1, zero_init_last=False, stage="", tuner_cfg=None, act_layer=nn.GELU, + + def __init__(self, + dim, + layer_num=-1, + depth=-1, + zero_init_last=False, + stage='', + tuner_cfg=None, + act_layer=nn.GELU, **kwargs): super(ResGroupAdapter, self).__init__() self.dim = dim self.layer_num = layer_num self.depth = depth - self.adapter_type = tuner_cfg['adapter_type'] if 'adapter_type' in tuner_cfg else None - self.adapter_weight = tuner_cfg['adapter_weight'] if 'adapter_weight' in tuner_cfg else None + self.adapter_type = tuner_cfg[ + 'adapter_type'] if 'adapter_type' in tuner_cfg else None + self.adapter_weight = tuner_cfg[ + 'adapter_weight'] if 'adapter_weight' in tuner_cfg else None self.adapter_dim = tuner_cfg['dim'] if 'dim' in tuner_cfg else dim self.adapter_head = tuner_cfg['head'] if 
'head' in tuner_cfg else 4 - self.adapter_scale_factor = tuner_cfg['scale_factor'] if 'scale_factor' in tuner_cfg else 2 + self.adapter_scale_factor = tuner_cfg[ + 'scale_factor'] if 'scale_factor' in tuner_cfg else 2 assert self.adapter_dim % self.adapter_head == 0, 'adapter dim should be divisible by adapter head' self.dim_mlp = self.adapter_dim // self.adapter_head - self.ln1 = nn.Linear(self.dim_mlp, self.dim_mlp * self.adapter_scale_factor) - self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, self.dim_mlp) + self.ln1 = nn.Linear(self.dim_mlp, + self.dim_mlp * self.adapter_scale_factor) + self.ln2 = nn.Linear(self.dim_mlp * self.adapter_scale_factor, + self.dim_mlp) self.activate = act_layer() self._kaiming_init_weights(self.ln1) @@ -180,29 +240,35 @@ def forward(self, x): self.activate.to(x.device) self.ln2.to(x.device) self._prepared = True - + x_dtype = x.dtype x = x.to(self.ln1.weight.dtype) x_shortcut = x batch, inner_dim, height, width = x.shape - x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + x_adapter = x.permute(0, 2, 3, 1).reshape(batch, height * width, + inner_dim) - x_adapter = rearrange(x_adapter, "b n (c h) -> (b h) n c", h=self.adapter_head) + x_adapter = rearrange( + x_adapter, 'b n (c h) -> (b h) n c', h=self.adapter_head) x_adapter = self.ln2(self.activate(self.ln1(x_adapter))) - x_adapter = rearrange(x_adapter, "(b h) n c -> b n (c h)", h=self.adapter_head) + x_adapter = rearrange( + x_adapter, '(b h) n c -> b n (c h)', h=self.adapter_head) if self.adapter_weight: - x_adapter = apply_data_weight(x_adapter, self.scaling, self.adapter_weight) + x_adapter = apply_data_weight(x_adapter, self.scaling, + self.adapter_weight) - x_adapter = x_adapter.reshape(batch, height, width, -1).permute(0, 3, 1, 2).contiguous() + x_adapter = x_adapter.reshape(batch, height, width, + -1).permute(0, 3, 1, 2).contiguous() x_out = x_shortcut + x_adapter return x_out.to(x_dtype) class Identity(nn.Module): + def __init__(self): super().__init__() @@ -219,16 +285,23 @@ class Upsample(nn.Module): upsampling occurs in the inner-two dimensions. 
""" - def __init__(self, channels, use_conv=False, out_channels=None, padding=1, **kwargs): + def __init__(self, + channels, + use_conv=False, + out_channels=None, + padding=1, + **kwargs): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv if use_conv: - self.conv = nn.Conv2d(self.channels, self.out_channels, 3, padding=padding) + self.conv = nn.Conv2d( + self.channels, self.out_channels, 3, padding=padding) self.init_weights() def init_weights(self): + def _init_weights(m): if isinstance(m, nn.Conv2d): nn.init.zeros_(m.weight) @@ -239,9 +312,11 @@ def _init_weights(m): def forward(self, x, target_size=None, *args, **kwargs): assert x.shape[1] == self.channels if target_size is None: - x = F.interpolate(x.float(), scale_factor=2, mode="nearest").type_as(x) + x = F.interpolate( + x.float(), scale_factor=2, mode='nearest').type_as(x) else: - x = F.interpolate(x.float(), target_size, mode="nearest").type_as(x) + x = F.interpolate( + x.float(), target_size, mode='nearest').type_as(x) if self.use_conv: x = self.conv(x) return x @@ -250,27 +325,27 @@ def forward(self, x, target_size=None, *args, **kwargs): def init_weight_type(dim, weight_type): if weight_type is None: scaling = None - elif weight_type == "gate": + elif weight_type == 'gate': scaling = nn.Linear(dim, 1) - elif weight_type == "scale": + elif weight_type == 'scale': scaling = nn.Parameter(torch.Tensor(1)) scaling.data.fill_(1) - elif weight_type == "scale_kv": + elif weight_type == 'scale_kv': scaling_k = nn.Parameter(torch.Tensor(1)) scaling_k.data.fill_(1) scaling_v = nn.Parameter(torch.Tensor(1)) scaling_v.data.fill_(1) scaling = (scaling_k, scaling_v) - elif weight_type == "scale_channel": + elif weight_type == 'scale_channel': scaling = nn.Parameter(torch.Tensor(dim)) scaling.data.fill_(1) - elif weight_type == "scale_kv_channel": + elif weight_type == 'scale_kv_channel': scaling_k = nn.Parameter(torch.Tensor(dim)) scaling_k.data.fill_(1) scaling_v = nn.Parameter(torch.Tensor(dim)) scaling_v.data.fill_(1) scaling = (scaling_k, scaling_v) - elif weight_type and weight_type.startswith("scalar"): + elif weight_type and weight_type.startswith('scalar'): scaling = float(weight_type.split('_')[-1]) else: scaling = None @@ -278,9 +353,11 @@ def init_weight_type(dim, weight_type): def apply_data_weight(data, scaling, weight_type): - if weight_type in ["gate"]: - scaling = torch.mean(torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) - elif weight_type in ["scale", "scale_channel"] or weight_type.startswith('scalar'): + if weight_type in ['gate']: + scaling = torch.mean( + torch.sigmoid(scaling(data)), dim=1).view(-1, 1, 1) + elif weight_type in ['scale', 'scale_channel' + ] or weight_type.startswith('scalar'): scaling = scaling else: scaling = None @@ -291,7 +368,10 @@ def apply_data_weight(data, scaling, weight_type): def detach_tensors(feats): if type(feats) in [list, tuple]: - feats = [detach_tensors(feat) if feat is not None else None for feat in feats] + feats = [ + detach_tensors(feat) if feat is not None else None + for feat in feats + ] elif isinstance(feats, dict): feats = {key: detach_tensors(val) for key, val in feats.items()} elif isinstance(feats, torch.Tensor): @@ -316,4 +396,3 @@ def probe_output_hook(self, args, result): output = result probe_tensors(self, output, 'probe_output_data') return output - diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 2e509e9e11..5f25f879a1 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -1,21 
+1,21 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import copy import inspect import re import types -import copy +from collections import OrderedDict from dataclasses import dataclass, field from functools import partial -from typing import Union, Callable, Any, List -from collections import OrderedDict from itertools import repeat +from typing import Any, Callable, List, Union import torch -from torch import nn import torchvision +from torch import nn from swift.utils.logger import get_logger -from .utils import SwiftConfig, SwiftOutput from ..utils.torch_utils import find_sub_module +from .utils import SwiftConfig, SwiftOutput logger = get_logger() @@ -45,13 +45,14 @@ class SideConfig(SwiftConfig): }) side_module_name: str = field( - default=1., metadata={'help': 'The name of the additive side networks'}) + default=1., + metadata={'help': 'The name of the additive side networks'}) hidden_pos: Union[str, int] = field( default=0, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' }) def __post_init__(self): @@ -62,40 +63,51 @@ def __post_init__(self): class Side: @staticmethod - def prepare_model(model: nn.Module, config: SideConfig, adapter_name: str) -> SwiftOutput: + def prepare_model(model: nn.Module, config: SideConfig, + adapter_name: str) -> SwiftOutput: """Prepare a model with `SideConfig`""" module_keys = [key for key, _ in model.named_modules()] for module_key in module_keys: if re.fullmatch(config.target_modules, module_key): # noqa tgt_module = model.get_submodule(module_key) - logger.info(f"Matching target module [{module_key}] of type {type(tgt_module)}") + logger.info( + f'Matching target module [{module_key}] of type {type(tgt_module)}' + ) if isinstance(tgt_module, (nn.ModuleList, nn.ModuleDict)): raise Exception( - f"Type of {type(tgt_module)} may not be supported because of its customized forward") + f'Type of {type(tgt_module)} may not be supported because of its customized forward' + ) def _forward(self, *args, **kwargs): args_main = self.forward_origin(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr( + self, f'side_{adapter_name}')( + *args, args_main[config.hidden_pos]) else: _type = type(args_main) args_main = list(args_main) - args_main[config.hidden_pos] = getattr(self, f'side_{adapter_name}')(*args, args_main[config.hidden_pos]) + args_main[config.hidden_pos] = getattr( + self, f'side_{adapter_name}')( + *args, args_main[config.hidden_pos]) args_main = _type(args_main) return args_main if isinstance(tgt_module, nn.Sequential): - tgt_module.tgt_module_keys = copy.deepcopy(list(tgt_module._modules.keys())) + tgt_module.tgt_module_keys = copy.deepcopy( + list(tgt_module._modules.keys())) def forward_seq(self, input, *args, **kwargs): for idx, module in enumerate(self): - if idx >= len(tgt_module.tgt_module_keys): continue + if idx >= len(tgt_module.tgt_module_keys): + continue input = module(input) return input - tgt_module.forward_origin = types.MethodType(forward_seq, tgt_module) + tgt_module.forward_origin = types.MethodType( + forward_seq, tgt_module) else: tgt_module.forward_origin = tgt_module.forward tgt_module.forward = types.MethodType(_forward, tgt_module) @@ -105,7 +117,8 @@ def 
forward_seq(self, input, *args, **kwargs): def state_dict_callback(state_dict, adapter_name): return { key: value - for key, value in state_dict.items() if f'side_{adapter_name}' in key + for key, value in state_dict.items() + if f'side_{adapter_name}' in key } def mark_trainable_callback(model): @@ -115,8 +128,10 @@ def mark_trainable_callback(model): mark_trainable_callback) @staticmethod - def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, f'side_{adapter_name}') + def activate_adapter(module: torch.nn.Module, adapter_name: str, + activate: bool): + modules: List[torch.nn.Module] = find_sub_module( + module, f'side_{adapter_name}') for _module in modules: _module.activate(activate) @@ -134,11 +149,7 @@ class SideModule(nn.Module): side_module_name: The name of the additive side networks. """ - def __init__( - self, - dim, - side_module_name='fcn4' - ): + def __init__(self, dim, side_module_name='fcn4'): super(SideModule, self).__init__() side_module_name = side_module_name.lower() @@ -149,13 +160,13 @@ def __init__( elif side_module_name == 'alexnet': mm = torchvision.models.alexnet(pretrained=True) self.side_net = nn.Sequential( - OrderedDict([ - ('features', mm.features), ('avgpool', mm.avgpool), - ('flatten', nn.Flatten()), - ('fc', nn.Linear(9216, dim, bias=False)) - ])) + OrderedDict([('features', mm.features), + ('avgpool', mm.avgpool), + ('flatten', nn.Flatten()), + ('fc', nn.Linear(9216, dim, bias=False))])) else: - raise ValueError(f'Unsupported side_module_name: {side_module_name}') + raise ValueError( + f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) self._activate = True @@ -237,27 +248,29 @@ class Mlp(nn.Module): """ def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - norm_layer=None, - bias=True, - drop=0., - use_conv=False, + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0., + use_conv=False, ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features bias = tuple(repeat(bias, 2)) drop_probs = tuple(repeat(drop, 2)) - linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear + linear_layer = partial( + nn.Conv2d, kernel_size=1) if use_conv else nn.Linear self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() self.drop1 = nn.Dropout(drop_probs[0]) - self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.norm = norm_layer( + hidden_features) if norm_layer is not None else nn.Identity() self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) self.drop2 = nn.Dropout(drop_probs[1]) diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index f2f1903273..a8c6153f0d 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -98,12 +98,14 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: logger.info(''.join(s)) -def find_sub_module(module: torch.nn.Module, module_name: str) -> List[torch.nn.Module]: +def find_sub_module(module: torch.nn.Module, + module_name: str) -> List[torch.nn.Module]: _modules = list() for name, sub_module in module.named_modules(): if not name: continue - if module_name == name or getattr(sub_module, 'adapter_name', None) == module_name: + if module_name == name or 
getattr(sub_module, 'adapter_name', + None) == module_name: _modules.append(sub_module) else: _modules.extend(find_sub_module(sub_module, module_name)) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 5992cddcbe..79082f5c92 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -1,4 +1,5 @@ import copy +import math import os import shutil import tempfile @@ -11,8 +12,9 @@ SbertForSequenceClassification) from peft.utils import WEIGHTS_NAME from torch import nn -import math -from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel, push_to_hub, SideConfig, PromptConfig, ResTuningConfig + +from swift import (AdapterConfig, LoRAConfig, PromptConfig, ResTuningConfig, + SideConfig, Swift, SwiftModel, push_to_hub) class TestSwift(unittest.TestCase): @@ -30,6 +32,7 @@ def tearDown(self): def test_swift_lora_forward(self): from swift.tuners.lora import Linear + def reset_parameters(self): nn.Linear.reset_parameters(self) if hasattr(self, 'lora_A'): @@ -52,9 +55,12 @@ def reset_parameters(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_adapter_forward(self): model = Model.from_pretrained( @@ -74,9 +80,12 @@ def test_swift_adapter_forward(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_prompt_forward(self): model = Model.from_pretrained( @@ -96,9 +105,12 @@ def test_swift_prompt_forward(self): outputs_deactivate = model(**inputs) model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_restuner_forward(self): model = Model.from_pretrained( @@ -112,7 +124,7 @@ def test_swift_restuner_forward(self): stem_modules=r'.*layer\.\d+$', target_modules=r'.*pooler', target_modules_hook='input', - tuner_cfg="res_adapter", + tuner_cfg='res_adapter', ) outputs = model(**inputs) model = Swift.prepare_model(model, config=restuner_config) @@ -121,9 +133,12 @@ def test_swift_restuner_forward(self): outputs_deactivate = model(**inputs) 
model.activate_adapter('default') outputs_reactivate = model(**inputs) - self.assertTrue(torch.allclose(outputs.logits, outputs_deactivate.logits)) - self.assertTrue(not torch.allclose(outputs.logits, outputs_lora.logits)) - self.assertTrue(torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) + self.assertTrue( + torch.allclose(outputs.logits, outputs_deactivate.logits)) + self.assertTrue( + not torch.allclose(outputs.logits, outputs_lora.logits)) + self.assertTrue( + torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): model = SbertForSequenceClassification(SbertConfig()) @@ -202,15 +217,14 @@ def test_swift_side_bert(self): model2 = copy.deepcopy(model) result_origin = model(**inputs).logits print( - f'test_swift_side_bert result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}' - ) + f'test_swift_side_bert result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') side_config = SideConfig( dim=model.config.hidden_size, target_modules=r'.*encoder.encoder', side_module_name='mlp', - hidden_pos='last_hidden_state' - ) + hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) result_activate = model(**inputs).logits diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py index 421544d0df..43522fbbe3 100644 --- a/tests/tuners/test_swift_restuning.py +++ b/tests/tuners/test_swift_restuning.py @@ -6,8 +6,7 @@ import torch -from swift import ResTuningConfig -from swift import Swift, SwiftModel +from swift import ResTuningConfig, Swift, SwiftModel class TestSwiftResTuning(unittest.TestCase): @@ -37,18 +36,24 @@ def model_comparison(self, model, model2): model_key = list(model.state_dict().keys()) model2_key = list(model2.state_dict().keys()) self.assertTrue(model_key == model2_key) - model_val = torch.sum(torch.stack([torch.sum(val) for val in model.state_dict().values()])) - model2_val = torch.sum(torch.stack([torch.sum(val) for val in model2.state_dict().values()])) + model_val = torch.sum( + torch.stack( + [torch.sum(val) for val in model.state_dict().values()])) + model2_val = torch.sum( + torch.stack( + [torch.sum(val) for val in model2.state_dict().values()])) self.assertTrue(torch.isclose(model_val, model2_val)) def test_swift_restuning_vit(self): from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224") + model = AutoModelForImageClassification.from_pretrained( + 'google/vit-base-patch16-224') model_swift_1 = copy.deepcopy(model) model_swift_2 = copy.deepcopy(model) result_origin = model(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + f'test_swift_restuning_vit result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') # load type - 1 self.set_random_seed() @@ -57,15 +62,17 @@ def test_swift_restuning_vit(self): root_modules=r'.*vit.encoder.layer.0$', stem_modules=r'.*vit.encoder.layer\.\d+$', target_modules=r'.*vit.layernorm', - target_modules_hook="input", - tuner_cfg="res_adapter", + target_modules_hook='input', + tuner_cfg='res_adapter', ) - model_swift_1 = Swift.prepare_model(model_swift_1, config=restuning_config_1) + model_swift_1 = Swift.prepare_model( + model_swift_1, config=restuning_config_1) self.assertTrue(isinstance(model_swift_1, 
SwiftModel)) print(model_swift_1.get_trainable_parameters()) result_swift_1 = model_swift_1(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, result_swift_1 sum: {torch.sum(result_swift_1)}") + f'test_swift_restuning_vit result_swift_1 shape: {result_swift_1.shape}, ' + f'result_swift_1 sum: {torch.sum(result_swift_1)}') # load type - 2 self.set_random_seed() @@ -74,18 +81,21 @@ def test_swift_restuning_vit(self): root_modules=r'.*vit.encoder.layer.0$', stem_modules=r'.*vit.encoder.layer\.\d+$', target_modules=r'.*vit.encoder', - target_modules_hook="output", - target_hidden_pos="last_hidden_state", - tuner_cfg="res_adapter", + target_modules_hook='output', + target_hidden_pos='last_hidden_state', + tuner_cfg='res_adapter', ) - model_swift_2 = Swift.prepare_model(model_swift_2, config=restuning_config_2) + model_swift_2 = Swift.prepare_model( + model_swift_2, config=restuning_config_2) self.assertTrue(isinstance(model_swift_2, SwiftModel)) print(model_swift_2.get_trainable_parameters()) result_swift_2 = model_swift_2(torch.ones((1, 3, 224, 224))).logits print( - f"test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, result_swift_2 sum: {torch.sum(result_swift_2)}") + f'test_swift_restuning_vit result_swift_2 shape: {result_swift_2.shape}, ' + f'result_swift_2 sum: {torch.sum(result_swift_2)}') - self.assertTrue(all(torch.isclose(result_swift_1, result_swift_2).flatten())) + self.assertTrue( + all(torch.isclose(result_swift_1, result_swift_2).flatten())) model_swift_1.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) @@ -94,37 +104,43 @@ def test_swift_restuning_vit(self): def test_swift_restuning_diffusers_sd(self): from diffusers import UNet2DConditionModel - model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + model = UNet2DConditionModel.from_pretrained( + 'runwayml/stable-diffusion-v1-5', subfolder='unet') model.requires_grad_(False) model2 = copy.deepcopy(model) self.set_random_seed() input_data = { - "sample": torch.ones((1, 4, 64, 64)), - "timestep": 10, - "encoder_hidden_states": torch.ones((1, 77, 768)) + 'sample': torch.ones((1, 4, 64, 64)), + 'timestep': 10, + 'encoder_hidden_states': torch.ones((1, 77, 768)) } result_origin = model(**input_data).sample print( - f"test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, result_origin sum: {torch.sum(result_origin)}") + f'test_swift_restuning_diffusers_sd result_origin shape: {result_origin.shape}, ' + f'result_origin sum: {torch.sum(result_origin)}') self.set_random_seed() restuning_config = ResTuningConfig( dims=[1280, 1280, 1280, 640, 320], root_modules='mid_block', - stem_modules=['mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', 'up_blocks.3'], + stem_modules=[ + 'mid_block', 'up_blocks.0', 'up_blocks.1', 'up_blocks.2', + 'up_blocks.3' + ], target_modules='conv_norm_out', - tuner_cfg="res_group_adapter", + tuner_cfg='res_group_adapter', use_upsample=True, upsample_out_channels=[1280, 1280, 640, 320, None], - zero_init_last=True - ) + zero_init_last=True) model = Swift.prepare_model(model, config=restuning_config) self.assertTrue(isinstance(model, SwiftModel)) print(model.get_trainable_parameters()) result = model(**input_data).sample - print(f"test_swift_restuning_diffusers_sd result shape: {result.shape}, result sum: {torch.sum(result)}") + print( + f'test_swift_restuning_diffusers_sd result shape: {result.shape}, 
result sum: {torch.sum(result)}' + ) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) model2 = Swift.from_pretrained(model2, self.tmp_dir) diff --git a/tests/utils/test_torch_utils.py b/tests/utils/test_torch_utils.py index 3517d7f475..106f5148eb 100644 --- a/tests/utils/test_torch_utils.py +++ b/tests/utils/test_torch_utils.py @@ -1,12 +1,15 @@ import unittest + from modelscope import Model + from swift.utils.torch_utils import find_sub_module class TestTorchUtils(unittest.TestCase): def test_find_sub_module(self): - model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') self.assertTrue(find_sub_module(model, 'query') is not None) From c13ea0e86ffe497b6fb424f1c456f4a7dd2fae9b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 17:21:53 +0800 Subject: [PATCH 34/70] fix --- swift/trainers/trainers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index b31da08f2d..7ed0774d13 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -74,6 +74,8 @@ def prediction_step( if len(gen_kwargs) == 0 and hasattr(self, '_gen_kwargs'): gen_kwargs = self._gen_kwargs.copy() + if hasattr(self.model, 'generation_config'): + gen_kwargs.update(self.model.generation_config.to_dict()) if gen_kwargs.get('max_length') is None and gen_kwargs.get( 'max_new_tokens') is None: From 00ad79ba5238bb6b8b6f41597c3f934360ba4f9c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 20:01:46 +0800 Subject: [PATCH 35/70] add logger --- swift/tuners/adapter.py | 6 ++++++ swift/tuners/lora.py | 5 +++-- swift/tuners/prompt.py | 6 ++++++ swift/tuners/restuning.py | 2 +- swift/tuners/side.py | 6 ++++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 12f3d30641..90abd3e4c1 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -9,9 +9,12 @@ from torch import nn from transformers.activations import ACT2CLS +from swift import get_logger from swift.utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput +logger = get_logger() + @dataclass class AdapterConfig(SwiftConfig): @@ -128,6 +131,9 @@ def _feed_forward_chunk(self, attention_output): config.adapter_length, ACT2CLS[config.act_layer]) setattr(module, f'adapter_{adapter_name}', adapter_module) + logger.info( + f'Adapter modules(module_key): {module_key}.adapter_{adapter_name}' + ) def state_dict_callback(state_dict, adapter_name: str): return { diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 1a52628bc9..13301134dc 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -1,7 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
-import logging import math import re from dataclasses import dataclass, field @@ -14,6 +13,7 @@ is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config +from swift import get_logger from ..utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput @@ -28,7 +28,7 @@ if is_auto_gptq_available(): from peft.tuners.lora import QuantLinear -logger = logging.getLogger() +logger = get_logger() @dataclass @@ -257,6 +257,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, modules[module_key] = adapter_name model.lora_module_map.update(modules) + logger.info(f'Lora modules(module_key -> adapter_name): {modules}') @staticmethod def unpatch_lora(model, config: LoRAConfig): diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 3c64479369..00e5c56863 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -8,9 +8,12 @@ import torch from torch import nn +from swift import get_logger from ..utils.torch_utils import find_sub_module from .utils import SwiftConfig, SwiftOutput +logger = get_logger() + @dataclass class PromptConfig(SwiftConfig): @@ -149,6 +152,9 @@ def _forward(self, *args, **kwargs): config.attention_mask_value, config.attach_front) setattr(module, f'prompt_{adapter_name}', prompt_module) + logger.info( + f'Prompt modules(module_key): {module_key}.prompt_{adapter_name}' + ) match_module_keys.append(module_key) def state_dict_callback(state_dict, adapter_name): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index d8ddbc5aab..38842a78ce 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from swift.utils.logger import get_logger +from swift import get_logger from ..utils.torch_utils import find_sub_module from .restuning_components import (ResTuner, detach_tensors, probe_input_pre_hook, probe_output_hook) diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 5f25f879a1..8d3c869730 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -1,13 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import copy -import inspect import re import types from collections import OrderedDict from dataclasses import dataclass, field from functools import partial from itertools import repeat -from typing import Any, Callable, List, Union +from typing import List, Union import torch import torchvision @@ -113,6 +112,9 @@ def forward_seq(self, input, *args, **kwargs): tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) + logger.info( + f'Side modules(module_key): {module_key}.side_{adapter_name}' + ) def state_dict_callback(state_dict, adapter_name): return { From f3a5126ece358157b7b7eaf59c5562109221de63 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 20:57:46 +0800 Subject: [PATCH 36/70] add perf item --- examples/pytorch/llm/src/llm_sft.py | 3 +++ swift/trainers/trainers.py | 23 +++++++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bec7ee7069..cf88dd3a11 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -420,6 +420,9 @@ def _decode(tokens, ignore_pad_token_for_loss=False): ) trainer.train(trainer_args.resume_from_checkpoint) + for i in range(torch.cuda.device_count()): + trainer.perf['memory'][f'device:{i}'] = torch.cuda.max_memory_reserved( + i) logger.info(trainer.perf) # ### Visualization diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 7ed0774d13..b1a2df0de7 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -27,26 +27,19 @@ def __init__(self, *args, **kwargs): 0., 'gen_len': 0, - 'eval_memory': - 0., - 'train_memory': + 'memory': {}, + 'train_time': 0., 'model': self.model.get_trainable_parameters() if hasattr( self.model, 'get_trainable_parameters') else None, } - def train( - self, - *args, - **kwargs, - ): - training_output = super().train(*args, **kwargs) - if self.perf['train_memory'] is None: - self.perf['train_memory'] = sum([ - torch.cuda.memory_allocated(i) - for i in range(torch.cuda.device_count()) - ]) + def training_step(self, *args, **kwargs) -> torch.Tensor: + train_time = time.time() + training_output = super().training_step(*args, **kwargs) + train_time = time.time() - train_time + self.perf['train_time'] = self.perf['train_time'] + train_time return training_output def prediction_step( @@ -114,8 +107,6 @@ def prediction_step( gen_len = len(generated_tokens[0]) self.perf['gen_time'] = self.perf['gen_time'] + gen_time self.perf['gen_len'] = self.perf['gen_len'] + gen_len - self.perf['eval_memory'] = max(torch.cuda.memory_allocated(), - self.perf['eval_memory']) # in case the batch is shorter than max length, the output should be padded if gen_kwargs.get('max_length') is not None and generated_tokens.shape[ From 5655f901f45c1b67fb51295acdcb9f28a47f4e56 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 22:10:17 +0800 Subject: [PATCH 37/70] fix comments --- examples/pytorch/llm/src/llm_sft.py | 75 ++++--------------- examples/pytorch/llm/src/utils/__init__.py | 1 + .../pytorch/llm/src/utils/metric_utils.py | 59 +++++++++++++++ examples/pytorch/llm/src/utils/preprocess.py | 2 +- swift/tuners/adapter.py | 4 +- swift/tuners/lora.py | 6 +- swift/tuners/prompt.py | 4 +- swift/tuners/restuning.py | 4 +- swift/tuners/side.py | 4 +- 9 files changed, 88 insertions(+), 71 deletions(-) create mode 100644 
examples/pytorch/llm/src/utils/metric_utils.py diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index cf88dd3a11..336f7391a7 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -6,12 +6,9 @@ from functools import partial from typing import Dict, List, Optional -import jieba -import numpy as np import torch import torch.distributed as dist -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from rouge.rouge import Rouge +from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, find_all_linear_for_lora, get_dataset, @@ -36,7 +33,8 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field(default='lora') + sft_type: str = field( + default='lora', metadata={'choices': ['lora', 'full']}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -119,6 +117,13 @@ class SftArguments: "This parameter is used only when model_type.startswith('qwen-7b')" }) + # generation config, only useful when `predict_with_generate=True` + do_sample: bool = True + top_p: float = 0.7 + max_new_tokens: int = None + temperature: float = 0.95 + top_k: int = 20 + def __post_init__(self): if is_dist(): rank, local_rank, _, _ = get_dist_setting() @@ -263,10 +268,11 @@ def llm_sft(args: SftArguments) -> None: args.dataset_seed) generation_config = { - 'do_sample': True, - 'top_p': 0.7, - 'max_length': args.max_length, - 'temperature': 0.95 + 'do_sample': args.do_sample, + 'top_p': args.top_p, + 'max_new_tokens': args.max_new_tokens, + 'temperature': args.temperature, + 'top_k': args.top_k, } preprocess_func = get_preprocess( @@ -359,55 +365,6 @@ def llm_sft(args: SftArguments) -> None: logger.info(f'trainer_args: {trainer_args}') - def compute_metrics(prediction): - preds, labels = prediction[0], prediction[1] - - score_dict = { - 'rouge-1': [], - 'rouge-2': [], - 'rouge-l': [], - 'bleu-4': [] - } - - def _decode(tokens, ignore_pad_token_for_loss=False): - if ignore_pad_token_for_loss: - tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) - tokens = np.where(tokens < tokenizer.vocab_size, tokens, - tokenizer.pad_token_id) - return [ - t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) - ] - - for pred, label in zip(preds, labels): - pred = ''.join(_decode(pred, False)) - label = ''.join(_decode(label, True)) - hypothesis = list(jieba.cut(pred)) - if len(hypothesis) == 0 or ''.join(hypothesis) == '.': - hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] - reference = list(jieba.cut(label)) - try: - rouge = Rouge() - scores = rouge.get_scores(' '.join(hypothesis), - ' '.join(reference)) - result = scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v['f'] * 100, 4)) - bleu_score = sentence_bleu( - [list(label)], - list(pred), - smoothing_function=SmoothingFunction().method3) - score_dict['bleu-4'].append(round(bleu_score * 100, 4)) - except Exception as e: - logger.error(e) - logger.error(f'eval error {hypothesis}, {reference}') - - for k, v in score_dict.items(): - score_dict[k] = float(np.mean(v)) - return score_dict - trainer = Seq2SeqTrainer( model=model, args=trainer_args, @@ -415,7 +372,7 @@ def 
_decode(tokens, ignore_pad_token_for_loss=False): train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, - compute_metrics=compute_metrics + compute_metrics=partial(compute_nlg_metrics, tokenizer=tokenizer) if args.predict_with_generate else None, ) diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 935cec0479..ef4909dab0 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -1,4 +1,5 @@ from .dataset import DATASET_MAPPING, get_dataset, process_dataset +from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess from .utils import (broadcast_string, download_dataset, diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py new file mode 100644 index 0000000000..0220128212 --- /dev/null +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import jieba +import numpy as np +from swift import get_logger +from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu +from rouge.rouge import Rouge + +logger = get_logger() + + +def compute_nlg_metrics(tokenizer, prediction): + preds, labels = prediction[0], prediction[1] + + score_dict = { + 'rouge-1': [], + 'rouge-2': [], + 'rouge-l': [], + 'bleu-4': [] + } + + def _decode(tokens, ignore_pad_token_for_loss=False): + if ignore_pad_token_for_loss: + tokens = np.where(tokens != -100, tokens, + tokenizer.pad_token_id) + tokens = np.where(tokens < tokenizer.vocab_size, tokens, + tokenizer.pad_token_id) + return [ + t for t in tokenizer.batch_decode( + tokens, skip_special_tokens=True) + ] + + for pred, label in zip(preds, labels): + pred = ''.join(_decode(pred, False)) + label = ''.join(_decode(label, True)) + hypothesis = list(jieba.cut(pred)) + if len(hypothesis) == 0 or ''.join(hypothesis) == '.': + hypothesis = [tokenizer.decode(tokenizer.eos_token_id)] + reference = list(jieba.cut(label)) + try: + rouge = Rouge() + scores = rouge.get_scores(' '.join(hypothesis), + ' '.join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v['f'] * 100, 4)) + bleu_score = sentence_bleu( + [list(label)], + list(pred), + smoothing_function=SmoothingFunction().method3) + score_dict['bleu-4'].append(round(bleu_score * 100, 4)) + except Exception as e: + logger.error(e) + logger.error(f'eval error {hypothesis}, {reference}') + + for k, v in score_dict.items(): + score_dict[k] = float(np.mean(v)) + return score_dict diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 92decc5f1b..d75d131c48 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -191,7 +191,7 @@ def get_preprocess( tokenizer: PreTrainedTokenizer, system: Optional[str] = None, max_length: Optional[int] = None, - validate_generation=False, + validate_generation: Optional[bool] = False, ) -> Callable[[Dict[str, Any]], Dict[str, List[int]]]: def preprocess(example: Dict[str, Any]) -> Dict[str, List[int]]: diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 90abd3e4c1..d458e75f6e 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -154,7 +154,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( 
module, f'adapter_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class AdapterModule(nn.Module): @@ -195,7 +195,7 @@ def _init_weights(m): self.apply(_init_weights) - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x, identity=None): diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 13301134dc..f4600d331c 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -110,7 +110,7 @@ def state_dict_callback(state_dict, adapter_name): adapter_name, config.bias) def mark_trainable_callback(model): - mark_lora_as_trainable(model, config.bias) + mark_lora_as_trainable(model, adapter_name, config.bias) return SwiftOutput(config, state_dict_callback, mark_trainable_callback) @@ -121,7 +121,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: if isinstance(_module, LoRALayer): - _module.activate(activate) + _module.set_activation(activate) else: _module.active_adapter = 'default' if activate else 'invalid' @@ -351,7 +351,7 @@ def __init__( self.merged = False self.merge_weights = merge_weights - def activate(self, activate=True): + def set_activation(self, activate=True): if activate: self.r = self.old_r else: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 00e5c56863..8d0bd6c796 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -176,7 +176,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'prompt_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class PromptModule(nn.Module): @@ -229,7 +229,7 @@ def forward(self, x): dim=1) return x - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def patch_attention_mask(self, m): diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 38842a78ce..7858561fa6 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -297,7 +297,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'restuning_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class ResTuningBypassModule(nn.Module): @@ -330,7 +330,7 @@ def __init__( for i in range(depth) ]) - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x_list, origin_arg, **kwargs): diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 8d3c869730..0bf2b548ad 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -135,7 +135,7 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'side_{adapter_name}') for _module in modules: - _module.activate(activate) + _module.set_activation(activate) class SideModule(nn.Module): @@ -172,7 +172,7 @@ def __init__(self, dim, side_module_name='fcn4'): self.alpha = nn.Parameter(torch.tensor(0.0)) self._activate = True - def activate(self, activate=True): + def set_activation(self, activate=True): self._activate = activate def forward(self, x, x_main): From 87427514b628cf85d1c716ad532e42d8ffea75d8 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 22:23:48 +0800 Subject: [PATCH 38/70] 
fix comments --- examples/pytorch/llm/src/llm_sft.py | 8 ++++---- examples/pytorch/llm/src/utils/metric_utils.py | 17 ++++++----------- examples/pytorch/llm/src/utils/model.py | 4 ++-- examples/pytorch/llm/src/utils/preprocess.py | 6 ++---- swift/tuners/adapter.py | 12 ++++++------ 5 files changed, 20 insertions(+), 27 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 336f7391a7..d5bc692e71 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -275,20 +275,20 @@ def llm_sft(args: SftArguments) -> None: 'top_k': args.top_k, } - preprocess_func = get_preprocess( + preprocess_func_train = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=False) - train_dataset = train_dataset.map(preprocess_func) - preprocess_func = get_preprocess( + train_dataset = train_dataset.map(preprocess_func_train) + preprocess_func_eval = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=True) - val_dataset = val_dataset.map(preprocess_func) + val_dataset = val_dataset.map(preprocess_func_eval) del dataset # Data analysis stat_dataset(train_dataset) diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index 0220128212..9d96c8a1e0 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -2,32 +2,27 @@ import jieba import numpy as np -from swift import get_logger from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge.rouge import Rouge +from swift import get_logger + logger = get_logger() def compute_nlg_metrics(tokenizer, prediction): preds, labels = prediction[0], prediction[1] - score_dict = { - 'rouge-1': [], - 'rouge-2': [], - 'rouge-l': [], - 'bleu-4': [] - } + score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} def _decode(tokens, ignore_pad_token_for_loss=False): if ignore_pad_token_for_loss: - tokens = np.where(tokens != -100, tokens, - tokenizer.pad_token_id) + tokens = np.where(tokens != -100, tokens, tokenizer.pad_token_id) tokens = np.where(tokens < tokenizer.vocab_size, tokens, tokenizer.pad_token_id) return [ - t for t in tokenizer.batch_decode( - tokens, skip_special_tokens=True) + t + for t in tokenizer.batch_decode(tokens, skip_special_tokens=True) ] for pred, label in zip(preds, labels): diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index 10e1c7f8b9..abfe1140e4 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -176,7 +176,7 @@ class LoRATM(NamedTuple): class AdapterTM(NamedTuple): - # default lora target modules. qkv + # default adapter target modules. baichuan = ['mlp'] chatglm2 = ['mlp'] llama2 = ['mlp'] @@ -185,7 +185,7 @@ class AdapterTM(NamedTuple): class ResTunerTM(NamedTuple): - # default lora target modules. qkv + # default res-tuning config. 
     baichuan = {
         'root_modules': r'.*layers.0$',
         'stem_modules': r'.*layers\.\d+$',
diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py
index d75d131c48..61158fe09b 100644
--- a/examples/pytorch/llm/src/utils/preprocess.py
+++ b/examples/pytorch/llm/src/utils/preprocess.py
@@ -131,7 +131,8 @@ def _preprocess(
     history: Optional[History] = None,
     system: Optional[str] = None,
     max_length: Optional[int] = None,
-    validate_generation=True,  # do cross-validation with `model.generate()`
+    validate_generation: Optional[
+        bool] = True,  # do cross-validation with `model.generate()`
 ) -> Dict[str, List[int]]:
     if history is None:
         history = []
@@ -180,9 +181,6 @@ def _preprocess(
     if labels is not None:
         labels = labels[-max_length:]

-    # if validate_generation:
-    #     input_ids = [tokenizer.pad_token_id] * (64-len(input_ids)) + input_ids
-
     return {'input_ids': input_ids, 'labels': labels}
diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py
index d458e75f6e..490a108d1c 100644
--- a/swift/tuners/adapter.py
+++ b/swift/tuners/adapter.py
@@ -179,9 +179,9 @@ def __init__(
         self.dim = dim
         self.adapter_length = adapter_length
         # self.adapter_type = adapter_type
-        self.ln1 = nn.Linear(dim, adapter_length)
+        self.linear1 = nn.Linear(dim, adapter_length)
         self.act = act_layer()
-        self.ln2 = nn.Linear(adapter_length, dim)
+        self.linear2 = nn.Linear(adapter_length, dim)
         self.init_weights()
         self._prepared = False
         self._activate = True
@@ -202,14 +202,14 @@ def forward(self, x, identity=None):
         if not self._activate:
             return 0.
         if not self._prepared:
-            self.ln1.to(x.device)
+            self.linear1.to(x.device)
             self.act.to(x.device)
-            self.ln2.to(x.device)
+            self.linear2.to(x.device)
             self._prepared = True

         x_dtype = x.dtype
-        x = x.to(self.ln1.weight.dtype)
-        out = self.ln2(self.act(self.ln1(x)))
+        x = x.to(self.linear1.weight.dtype)
+        out = self.linear2(self.act(self.linear1(x)))
         if identity is None:
             identity = x
         identity = identity.to(out.dtype)

From 8470274f46e71f6824b371aa91b250db7c5314cd Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Sun, 10 Sep 2023 22:39:31 +0800
Subject: [PATCH 39/70] update readme

---
 README.md    | 5 ++++-
 README_CN.md | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e5d72aa36c..bdc5d17eb6 100644
--- a/README.md
+++ b/README.md
@@ -21,12 +21,15 @@ Currently supported approches (and counting):
 1. LoRA: [LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685)
 2. Adapter: [Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751)
 3. Prompt Tuning: [Visual Prompt Tuning](https://arxiv.org/abs/2203.12119)
-4. All tuners offered on [Peft](https://github.com/huggingface/peft).
+4. Side: [Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503)
+5. ResTuning-Bypass
+6. All tuners offered on [Peft](https://github.com/huggingface/peft)

 Key features:
 1. By integrating the ModelScope library, models can be readily obatined via a model-id.
 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result.
+3. Support calling `activate_adapter` or `deactivate_adapter` to activate/deactivate a single tuner. Users can use one model with different tuners in different threads in a time-sharing manner.
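The activate/deactivate behaviour described in the README line above is exercised by `test_swift_multiple_adapters_switching`, added later in this series. A minimal sketch of that usage follows, assuming the same ModelScope checkpoint and tuner configs used in the tests; it is an illustrative aside, not part of the patch itself.

    from modelscope import Model, Preprocessor
    from swift import AdapterConfig, LoRAConfig, Swift

    # Illustrative checkpoint, borrowed from the test suite in this series.
    model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base'
    model = Model.from_pretrained(model_id)
    preprocessor = Preprocessor.from_pretrained(model_id)
    inputs = preprocessor('how are you')

    # Attach two tuners at once; each dict key becomes an adapter name.
    model = Swift.prepare_model(
        model,
        config={
            'lora': LoRAConfig(target_modules=['query', 'key', 'value']),
            'adapter': AdapterConfig(
                dim=model.config.hidden_size,
                target_modules=r'.*layer\.\d+$',
                method_name='feed_forward_chunk',
                hidden_pos=0),
        })

    # Tuners can be toggled independently, e.g. per thread in a
    # time-sharing setup.
    model.deactivate_adapter('adapter')  # only LoRA is applied here
    outputs_lora_only = model(**inputs)
    model.activate_adapter('adapter')
    model.deactivate_adapter('lora')  # only the adapter is applied here
    outputs_adapter_only = model(**inputs)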
## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 878c90ad31..1ebe678276 100644 --- a/README_CN.md +++ b/README_CN.md @@ -20,11 +20,14 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 1. LoRA:[LORA: LOW-RANK ADAPTATION OF LARGE LANGUAGE MODELS](https://arxiv.org/abs/2106.09685) 2. Adapter:[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751) 3. Prompt Tuning: [Visual Prompt Tuning](https://arxiv.org/abs/2203.12119) -4. 所有在[Peft](https://github.com/huggingface/peft)上提供的tuners。 +4. Side: [Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503) +5. ResTuning-Bypass +6. 所有在[Peft](https://github.com/huggingface/peft)上提供的tuners 关键特点: 1. 通过集成ModelScope库,可以通过model id轻松获取模型。 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 +3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以用一个模型在不同线程中分时使用不同的tuners。 ## 大模型微调的例子 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) From f1d6de31f8c112a214b55a855c125f5faa55153e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Sun, 10 Sep 2023 23:10:37 +0800 Subject: [PATCH 40/70] Fixbug --- swift/tuners/adapter.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 490a108d1c..cfbe13fdd4 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -98,19 +98,16 @@ def _forward(self, *args, **kwargs): if isinstance(config.hidden_pos, int): _type = type(args) args = list(args) - args[config.hidden_pos] = args[ - config.hidden_pos] + getattr( - self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + args[config.hidden_pos] = getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) args = _type(args) else: - args[config.hidden_pos] = args[ - config.hidden_pos] + getattr( - self, f'adapter_{adapter_name}')( - args[config.hidden_pos]) + args[config.hidden_pos] = getattr( + self, f'adapter_{adapter_name}')( + args[config.hidden_pos]) elif isinstance(args, torch.Tensor): - args = args + getattr(self, f'adapter_{adapter_name}')( - args) + args = getattr(self, f'adapter_{adapter_name}')(args) return args def _feed_forward_chunk(self, attention_output): From 564b7d72071c02eadd7e08478a485bd7dbb8e6c7 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 00:08:06 +0800 Subject: [PATCH 41/70] fix --- examples/pytorch/llm/src/utils/metric_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index 9d96c8a1e0..d4f964a5e6 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -10,7 +10,7 @@ logger = get_logger() -def compute_nlg_metrics(tokenizer, prediction): +def compute_nlg_metrics(prediction, tokenizer): preds, labels = prediction[0], prediction[1] score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} From a6cf6321764d6bf99ae634657631110188b062a6 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 17:07:04 +0800 Subject: [PATCH 42/70] fix comments --- examples/pytorch/llm/src/llm_sft.py | 16 +++++++++------- swift/trainers/trainers.py | 7 +++++++ swift/utils/torch_utils.py | 3 +-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py 
index d5bc692e71..95410ab487 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -33,8 +33,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) # qwen-7b: lora+4bitQ: 10G, lora+8bitQ: 14G, lora: 22G; full: 95G - sft_type: str = field( - default='lora', metadata={'choices': ['lora', 'full']}) + sft_type: str = field(default='lora') template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -76,7 +75,6 @@ class SftArguments: gradient_checkpointing: bool = False batch_size: int = 1 - eval_batch_size: int = 1 num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 @@ -137,6 +135,11 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) + from swift import SWIFT_MAPPING + all_types = [key.lower() for key in SWIFT_MAPPING.keys()] + ['full'] + sft_type = [_type.strip() for _type in self.sft_type.split(',')] + assert all([_type.lower() in all_types for _type in sft_type]), \ + f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 'full': assert self.quantization_bit is None, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' @@ -270,6 +273,7 @@ def llm_sft(args: SftArguments) -> None: generation_config = { 'do_sample': args.do_sample, 'top_p': args.top_p, + 'max_length': None, 'max_new_tokens': args.max_new_tokens, 'temperature': args.temperature, 'top_k': args.top_k, @@ -315,7 +319,8 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.eval_batch_size, + per_device_eval_batch_size=1 + if args.predict_with_generate else args.batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, @@ -377,9 +382,6 @@ def llm_sft(args: SftArguments) -> None: ) trainer.train(trainer_args.resume_from_checkpoint) - for i in range(torch.cuda.device_count()): - trainer.perf['memory'][f'device:{i}'] = torch.cuda.max_memory_reserved( - i) logger.info(trainer.perf) # ### Visualization diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index b1a2df0de7..5c2de223af 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -34,12 +34,19 @@ def __init__(self, *args, **kwargs): self.model.get_trainable_parameters() if hasattr( self.model, 'get_trainable_parameters') else None, } + self._iter_perf = 0 def training_step(self, *args, **kwargs) -> torch.Tensor: train_time = time.time() training_output = super().training_step(*args, **kwargs) train_time = time.time() - train_time self.perf['train_time'] = self.perf['train_time'] + train_time + self._iter_perf += 1 + if self._iter_perf > 20 and not self.perf[ + 'memory'] and torch.cuda.device_count() > 0: + for i in range(torch.cuda.device_count()): + self.perf['memory'][ + f'device:{i}'] = torch.cuda.memory_reserved(i) return training_output def prediction_step( diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index a8c6153f0d..7a177ce903 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -92,9 +92,8 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: s = [ f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', f'{n_buffers:.4f}M Buffers, ', - f'Trainable percentage: {100 * n_grads / 
n_params:.2f}%' + f'Trainable percentage: {100 * n_grads / n_params:.2f}%.' ] - s += '.' logger.info(''.join(s)) From 985f4a0a4f5f85b59424fbec23ac033e4741edf5 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 20:13:02 +0800 Subject: [PATCH 43/70] fix CI --- examples/pytorch/llm/src/llm_sft.py | 63 +++++------------ examples/pytorch/llm/src/utils/swift_utils.py | 54 +++++++++++++++ swift/__init__.py | 22 +++--- swift/tuners/__init__.py | 8 +-- swift/tuners/adapter.py | 2 +- swift/tuners/prompt.py | 2 +- tests/tuners/test_swift_base.py | 69 +++++++++++++++++++ 7 files changed, 156 insertions(+), 64 deletions(-) create mode 100644 examples/pytorch/llm/src/utils/swift_utils.py diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 95410ab487..c82e35ffdc 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -4,21 +4,21 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' from dataclasses import dataclass, field from functools import partial -from typing import Dict, List, Optional +from typing import List, Optional import torch import torch.distributed as dist from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics +from examples.pytorch.llm.src.utils.swift_utils import prepare_model from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, find_all_linear_for_lora, get_dataset, - get_dist_setting, get_model_tokenizer, get_preprocess, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers) - -from swift import (AdapterConfig, HubStrategy, LoRAConfig, ResTuningConfig, - Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, - SwiftConfig, get_logger) + broadcast_string, get_dataset, get_dist_setting, + get_model_tokenizer, get_preprocess, is_dist, is_master, + plot_images, process_dataset, select_bnb, select_dtype, + show_layers) + +from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, + Swift, get_logger) from swift.hub import HubApi, ModelScopeConfig from swift.utils import (add_version_to_work_dir, parse_args, print_model_info, seed_everything) @@ -135,8 +135,12 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - from swift import SWIFT_MAPPING - all_types = [key.lower() for key in SWIFT_MAPPING.keys()] + ['full'] + from swift import SwiftTuners + all_types = [ + SwiftTuners.LORA.lower(), + SwiftTuners.ADAPTER.lower(), + SwiftTuners.RESTUNING.lower() + ] + ['full'] sft_type = [_type.strip() for _type in self.sft_type.split(',')] assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' @@ -215,42 +219,7 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - swift_config: Dict[str, SwiftConfig] = dict() - for sft_type in args.sft_type.split(','): - if sft_type == 'lora': - if 'ALL' in args.lora_target_modules: - assert len(args.lora_target_modules) == 1 - args.lora_target_modules = find_all_linear_for_lora( - model, args.quantization_bit, args.model_type) - logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}' - ) - - lora_config = LoRAConfig( - r=args.lora_rank, - target_modules=args.lora_target_modules, - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') - 
swift_config['lora'] = lora_config - elif sft_type == 'adapter': - adapter_config = AdapterConfig( - dim=model.config.hidden_size, - target_modules=MODEL_MAPPING[args.model_type].get( - 'adapter_TM', ['mlp']), - method_name='forward', - hidden_pos=0, - adapter_length=args.adapter_length, - ) - logger.info(f'adapter_config: {adapter_config}') - swift_config['adapter'] = adapter_config - elif sft_type == 'restuner': - restuner_config = ResTuningConfig( - dims=model.config.hidden_size, - **MODEL_MAPPING[args.model_type]['restuner_TM']) - logger.info(f'restuner_config: {restuner_config}') - swift_config['restuner'] = restuner_config - model = Swift.prepare_model(model, swift_config) + model = prepare_model(model, args) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py new file mode 100644 index 0000000000..ee286a1b75 --- /dev/null +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict, Any + +import torch.nn + +from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, + Swift, + SwiftConfig, get_logger) +from .model import (MODEL_MAPPING) +from .utils import find_all_linear_for_lora +from swift import SwiftTuners + +logger = get_logger() + + +def prepare_model(model: torch.nn.Module, + args: Any, + ): + swift_config: Dict[str, SwiftConfig] = dict() + for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: + if sft_type.lower() == SwiftTuners.LORA.lower(): + if 'ALL' in args.lora_target_modules: + assert len(args.lora_target_modules) == 1 + args.lora_target_modules = find_all_linear_for_lora( + model, args.quantization_bit, args.model_type) + logger.info( + f'Setting lora_target_modules: {args.lora_target_modules}' + ) + + lora_config = LoRAConfig( + r=args.lora_rank, + target_modules=args.lora_target_modules, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout_p) + logger.info(f'lora_config: {lora_config}') + swift_config['lora'] = lora_config + elif sft_type.lower() == SwiftTuners.ADAPTER.lower(): + adapter_config = AdapterConfig( + dim=model.config.hidden_size, + target_modules=MODEL_MAPPING[args.model_type].get( + 'adapter_TM', ['mlp']), + method_name='forward', + hidden_pos=0, + adapter_length=args.adapter_length, + ) + logger.info(f'adapter_config: {adapter_config}') + swift_config['adapter'] = adapter_config + elif sft_type.lower() == SwiftTuners.RESTUNING.lower(): + restuner_config = ResTuningConfig( + dims=model.config.hidden_size, + **MODEL_MAPPING[args.model_type]['restuner_TM']) + logger.info(f'restuner_config: {restuner_config}') + swift_config['restuner'] = restuner_config + return Swift.prepare_model(model, swift_config) diff --git a/swift/__init__.py b/swift/__init__.py index 6e866d6515..9049f2e70d 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -5,16 +5,15 @@ if TYPE_CHECKING: from .version import __version__, __release_datetime__ - from .tuners import (Adapter, AdapterConfig, AdapterModule, SwiftModel, - LoRA, LoRAConfig, SWIFT_MAPPING, LoraConfig, - PeftConfig, PeftModel, PeftModelForCausalLM, - ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, - PeftModelForSequenceClassification, - PeftModelForTokenClassification, PrefixTuningConfig, - PromptEncoderConfig, PromptLearningConfig, - PromptTuningConfig, get_peft_config, get_peft_model, - get_peft_model_state_dict, Prompt, PromptConfig, - PromptModule, 
SwiftConfig, SwiftOutput, Swift) + from .tuners import ( + Adapter, AdapterConfig, AdapterModule, SwiftModel, LoRA, LoRAConfig, + SWIFT_MAPPING, LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, + ResTuningConfig, SideConfig, PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, PeftModelForTokenClassification, + PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, + PromptTuningConfig, get_peft_config, get_peft_model, + get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, + SwiftConfig, SwiftOutput, Swift, SwiftTuners) from .hub import snapshot_download, push_to_hub, push_to_hub_async, push_to_hub_in_queue from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, @@ -38,7 +37,8 @@ 'PromptEncoderConfig', 'PromptLearningConfig', 'PromptTuningConfig', 'get_peft_config', 'get_peft_model', 'get_peft_model_state_dict', 'Prompt', 'PromptConfig', - 'PromptModule', 'SwiftConfig', 'SwiftOutput', 'Swift' + 'PromptModule', 'SwiftConfig', 'SwiftOutput', 'Swift', + 'SwiftTuners' ], 'trainers': [ 'EvaluationStrategy', 'FSDPOption', 'HPSearchBackend', diff --git a/swift/tuners/__init__.py b/swift/tuners/__init__.py index 6ebb813e90..1ecc496850 100644 --- a/swift/tuners/__init__.py +++ b/swift/tuners/__init__.py @@ -7,9 +7,9 @@ from .adapter import Adapter, AdapterConfig, AdapterModule from .base import SwiftModel, Swift from .lora import LoRA, LoRAConfig - from .mapping import SWIFT_MAPPING + from .mapping import SWIFT_MAPPING, SwiftTuners from .side import Side, SideConfig, SideModule - from .restuning import ResTuning, ResTuningConfig, ResTuningModule + from .restuning import ResTuning, ResTuningConfig, ResTuningBypassModule from .peft import (LoraConfig, PeftConfig, PeftModel, PeftModelForCausalLM, PeftModelForSeq2SeqLM, PeftModelForSequenceClassification, @@ -24,9 +24,9 @@ 'adapter': ['Adapter', 'AdapterConfig', 'AdapterModule'], 'base': ['SwiftModel', 'Swift'], 'lora': ['LoRA', 'LoRAConfig'], - 'mapping': ['SWIFT_MAPPING'], + 'mapping': ['SWIFT_MAPPING', 'SwiftTuners'], 'side': ['Side', 'SideConfig', 'SideModule'], - 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningModule'], + 'restuning': ['ResTuning', 'ResTuningConfig', 'ResTuningBypassModule'], 'peft': [ 'LoraConfig', 'PeftConfig', 'PeftModel', 'PeftModelForCausalLM', 'PeftModelForSeq2SeqLM', 'PeftModelForSequenceClassification', diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index cfbe13fdd4..1da707b1a0 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -197,7 +197,7 @@ def set_activation(self, activate=True): def forward(self, x, identity=None): if not self._activate: - return 0. 
+ return x if not self._prepared: self.linear1.to(x.device) self.act.to(x.device) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 8d0bd6c796..a9d223aa20 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -38,7 +38,7 @@ class PromptConfig(SwiftConfig): extract_embedding: Whether the embedding is extracted at final stage to keep the same dims with inputs """ - dim: int = field( + dim: Union[int, List[int]] = field( default=None, metadata={'help': 'The dimension of the hidden states'}) target_modules: str = field( diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 79082f5c92..3f3c19ccc6 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -4,6 +4,7 @@ import shutil import tempfile import unittest +from concurrent.futures import ThreadPoolExecutor from time import time import torch @@ -208,6 +209,74 @@ def test_swift_multiple_adapters(self): torch.isclose(state_dict[key], state_dict2[key]).flatten().detach().cpu())) + def test_swift_multiple_adapters_switching(self): + from swift.tuners.lora import Linear + from swift.tuners.adapter import AdapterModule + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.ones_(self.lora_A) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + + def init_weights(self): + + def _init_weights(m): + if isinstance(m, nn.Linear): + nn.init.ones_(m.weight) + nn.init.ones_(m.bias) + + self.apply(_init_weights) + + AdapterModule.init_weights = init_weights + + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + inputs = preprocessor('how are you') + model1 = copy.deepcopy(model) + model2 = copy.deepcopy(model) + model1 = Swift.prepare_model( + model1, + config={ + 'lora': LoRAConfig(target_modules=['query', 'key', 'value']) + }) + model2 = Swift.prepare_model( + model2, + config={ + 'adapter': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) + }) + model = Swift.prepare_model( + model, + config={ + 'lora': + LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) + }) + model.deactivate_adapter('adapter') + outputs1 = model(**inputs) + outputs2 = model1(**inputs) + self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + model.activate_adapter('adapter') + model.deactivate_adapter('lora') + outputs1 = model(**inputs) + outputs2 = model2(**inputs) + self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + def test_swift_side_bert(self): model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') From 985780c4d6c468a8d3504dc4695a8eeb6af608ad Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 11 Sep 2023 23:51:47 +0800 Subject: [PATCH 44/70] support thread local --- examples/pytorch/llm/src/utils/swift_utils.py | 20 ++-- swift/tuners/adapter.py | 12 +-- swift/tuners/base.py | 15 +++ swift/tuners/lora.py | 101 ++++++++++++++---- swift/tuners/prompt.py | 14 ++- swift/tuners/restuning.py | 12 +-- swift/tuners/side.py | 12 +-- swift/tuners/utils.py | 19 ++++ tests/tuners/test_swift_base.py | 24 +++++ 9 
files changed, 169 insertions(+), 60 deletions(-) diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index ee286a1b75..3f11634f00 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -1,21 +1,20 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Dict, Any +from typing import Any, Dict import torch.nn -from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, - Swift, - SwiftConfig, get_logger) -from .model import (MODEL_MAPPING) +from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, Swift, + SwiftConfig, SwiftTuners, get_logger) +from .model import MODEL_MAPPING from .utils import find_all_linear_for_lora -from swift import SwiftTuners logger = get_logger() -def prepare_model(model: torch.nn.Module, - args: Any, - ): +def prepare_model( + model: torch.nn.Module, + args: Any, +): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): @@ -24,8 +23,7 @@ def prepare_model(model: torch.nn.Module, args.lora_target_modules = find_all_linear_for_lora( model, args.quantization_bit, args.model_type) logger.info( - f'Setting lora_target_modules: {args.lora_target_modules}' - ) + f'Setting lora_target_modules: {args.lora_target_modules}') lora_config = LoRAConfig( r=args.lora_rank, diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 1da707b1a0..26b6adabf8 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -11,7 +11,7 @@ from swift import get_logger from swift.utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -151,10 +151,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'adapter_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class AdapterModule(nn.Module): +class AdapterModule(nn.Module, ActivationMixin): """The implementation of adapter tuning method. Adapters project input tokens by an MLP layer. 
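
The `AdapterModule(nn.Module, ActivationMixin)` declaration above (and the matching changes to `PromptModule`, `SideModule` and `ResTuningBypassModule` later in this patch) relies on Python's method resolution order to initialize both bases: `super(AdapterModule, self).__init__()` runs `nn.Module.__init__`, while the extra `super(nn.Module, self).__init__()` starts the lookup after `nn.Module` and therefore reaches `ActivationMixin.__init__`. A minimal, self-contained sketch of the pattern; the class and attribute names below are illustrative rather than copied from the patch:

```python
import torch.nn as nn


class ActivationMixin:
    """Tracks whether a tuner module is currently switched on."""

    def __init__(self):
        self._activated = True

    def set_activation(self, activate: bool = True):
        self._activated = activate

    def is_activated(self) -> bool:
        return self._activated


class DemoAdapter(nn.Module, ActivationMixin):
    # MRO: DemoAdapter -> nn.Module -> ActivationMixin -> object
    def __init__(self, dim: int):
        super(DemoAdapter, self).__init__()  # initializes nn.Module
        super(nn.Module, self).__init__()    # continues past nn.Module to ActivationMixin
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        # A deactivated adapter behaves as an identity mapping.
        if not self.is_activated():
            return x
        return x + self.proj(x)
```

This mirrors the `return x` fix from patch 43: turning an adapter off must leave the host module's hidden states untouched rather than zeroing them.
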
@@ -173,6 +174,7 @@ def __init__( act_layer=nn.GELU, ): super(AdapterModule, self).__init__() + super(nn.Module, self).__init__() self.dim = dim self.adapter_length = adapter_length # self.adapter_type = adapter_type @@ -181,7 +183,6 @@ def __init__( self.linear2 = nn.Linear(adapter_length, dim) self.init_weights() self._prepared = False - self._activate = True def init_weights(self): @@ -192,11 +193,8 @@ def _init_weights(m): self.apply(_init_weights) - def set_activation(self, activate=True): - self._activate = activate - def forward(self, x, identity=None): - if not self._activate: + if not self.is_activated(): return x if not self._prepared: self.linear1.to(x.device) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index dd3f984dc0..fbae7f93bd 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -385,8 +385,21 @@ def save_pretrained(self, def base_model(self): return self.model + def set_active_adapters(self, adapter_names: List[str]): + if not adapter_names: + return + + adapter_names = set(adapter_names) + for adapter_name in (adapter_names & set(self.adapters.keys())): + self.activate_adapter(adapter_name) + + for adapter_name in (set(self.adapters.keys()) - adapter_names): + self.deactivate_adapter(adapter_name) + def activate_adapter(self, adapter_name): if adapter_name not in self.adapters: + logger.warning( + f'{adapter_name} not in adapters: {self.adapters.keys()}') return from .mapping import SWIFT_MAPPING @@ -395,6 +408,8 @@ def activate_adapter(self, adapter_name): def deactivate_adapter(self, adapter_name): if adapter_name not in self.adapters: + logger.warning( + f'{adapter_name} not in adapters: {self.adapters.keys()}') return from .mapping import SWIFT_MAPPING diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index f4600d331c..af78a831d9 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -15,19 +15,87 @@ from swift import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput if is_bnb_available(): import bitsandbytes as bnb from peft.tuners.lora import Linear8bitLt + class Linear8bitLtSwift(ActivationMixin, Linear8bitLt): + + def __init__( + self, + adapter_name, + in_features, + out_features, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, in_features, out_features, r, + lora_alpha, lora_dropout, **kwargs) + super(Linear8bitLtSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return bnb.nn.Linear8bitLt.forward(self, x) + return super().forward(x) + + if is_bnb_4bit_available(): from peft.tuners.lora import Linear4bit + class Linear4bitSwift(ActivationMixin, Linear4bit): + + def __init__( + self, + adapter_name, + in_features, + out_features, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, in_features, out_features, r, + lora_alpha, lora_dropout, **kwargs) + super(Linear4bitSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return bnb.nn.Linear4bit.forward(self, x) + return super().forward(x) + + if is_auto_gptq_available(): from peft.tuners.lora import QuantLinear + class QuantLinearSwift(ActivationMixin, QuantLinear): + + def __init__( + self, + adapter_name, + quant_linear_module, + r: int = 0, + lora_alpha: int = 1, + 
lora_dropout: float = 0.0, + **kwargs, + ): + super(ActivationMixin, + self).__init__(adapter_name, quant_linear_module, r, + lora_alpha, lora_dropout, **kwargs) + super(QuantLinearSwift, self).__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if not self.is_activated(): + return self.quant_linear_module(x) + return super().forward(x) + + logger = get_logger() @@ -120,10 +188,8 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) for _module in modules: - if isinstance(_module, LoRALayer): - _module.set_activation(activate) - else: - _module.active_adapter = 'default' if activate else 'invalid' + _module: ActivationMixin + _module.set_activation(activate) @staticmethod def _dynamic_patch_lora(model, replace_modules, use_merged_linear, @@ -174,7 +240,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, 'index': sub_module.index, }) - lora_module = Linear8bitLt( + lora_module = Linear8bitLtSwift( 'default', sub_module.in_features, sub_module.out_features, @@ -193,7 +259,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, 'quant_type': sub_module.weight.quant_type, }) - lora_module = Linear4bit( + lora_module = Linear4bitSwift( 'default', sub_module.in_features, sub_module.out_features, @@ -202,7 +268,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, **four_bit_kwargs) elif AutoGPTQQuantLinear is not None and isinstance( sub_module, AutoGPTQQuantLinear): - lora_module = QuantLinear('default', sub_module, **kwargs) + lora_module = QuantLinearSwift('default', sub_module, + **kwargs) sub_module.weight = sub_module.qweight elif isinstance(sub_module, torch.nn.Linear): if use_merged_linear: @@ -330,7 +397,7 @@ def unpatch_lora(model, config: LoRAConfig): model.lora_module_map.pop(module_key, None) -class LoRALayer: +class LoRALayer(ActivationMixin): def __init__( self, @@ -339,8 +406,8 @@ def __init__( lora_dropout: float, merge_weights: bool, ): + super().__init__() self.r = r - self.old_r = r self.lora_alpha = lora_alpha # Optional dropout if lora_dropout > 0.: @@ -351,12 +418,6 @@ def __init__( self.merged = False self.merge_weights = merge_weights - def set_activation(self, activate=True): - if activate: - self.r = self.old_r - else: - self.r = 0 - class Embedding(nn.Embedding, LoRALayer): # LoRA implemented in a dense layer @@ -420,7 +481,7 @@ def eval(self): self.merged = True def forward(self, x: torch.Tensor): - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): result = nn.Embedding.forward(self, x) if self.r > 0: after_A = F.embedding(x, self.lora_A.T, self.padding_idx, @@ -511,7 +572,7 @@ def forward(self, x: torch.Tensor): def T(w): return w.T if self.fan_in_fan_out else w - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): result = F.linear(x, T(self.weight), bias=self.bias) if self.r > 0: x_dtype = x.dtype @@ -631,7 +692,7 @@ def forward(self, x: torch.Tensor): def T(w): return w.T if self.fan_in_fan_out else w - if self.merged: + if self.merged or not self.is_activated(): return F.linear(x, T(self.weight), bias=self.bias) else: result = F.linear(x, T(self.weight), bias=self.bias) @@ -713,7 +774,7 @@ def eval(self): self.merged = True def forward(self, x: torch.Tensor): - if self.r > 0 and not self.merged: + if self.r > 0 and not self.merged and self.is_activated(): return F.conv2d( x, self.weight + # noqa diff 
--git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index a9d223aa20..661eb4dbbe 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -10,7 +10,7 @@ from swift import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -176,10 +176,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'prompt_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class PromptModule(nn.Module): +class PromptModule(nn.Module, ActivationMixin): """The implementation of vision prompt tuning method. Visual prompt tuning (VPT) is proposed to initialize tunable prompt tokens @@ -200,17 +201,17 @@ def __init__(self, mask_values=0., attach_front=True): super(PromptModule, self).__init__() + super(nn.Module, self).__init__() self.dim = dim self.layer_num = layer_num self.prompt_length = prompt_length self.mask_values = mask_values self.attach_front = attach_front - self._activate = True self.prompt_token = nn.Parameter(torch.zeros(1, prompt_length, dim)) nn.init.xavier_uniform_(self.prompt_token) def forward(self, x): - if not self._activate: + if not self.is_activated(): return x prompt_token = self.prompt_token.expand(x.shape[0], -1, -1).to(x.device) @@ -229,11 +230,8 @@ def forward(self, x): dim=1) return x - def set_activation(self, activate=True): - self._activate = activate - def patch_attention_mask(self, m): - if not self._activate: + if not self.is_activated(): return m prefix_attention_mask = torch.full((*m.shape[:-1], self.prompt_length), self.mask_values).to(m.device) diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 7858561fa6..e40290e06d 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -12,7 +12,7 @@ from ..utils.torch_utils import find_sub_module from .restuning_components import (ResTuner, detach_tensors, probe_input_pre_hook, probe_output_hook) -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -297,10 +297,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'restuning_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class ResTuningBypassModule(nn.Module): +class ResTuningBypassModule(nn.Module, ActivationMixin): """The implementation of ResTuningBypass method. 
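
Across the LoRA, prompt, res-tuning and side modules, the forward paths are now gated on `is_activated()`: a deactivated tuner falls back to the plain frozen computation instead of contributing its delta. A simplified sketch of that gate for a LoRA-style linear layer; the parameter names and the `lora_alpha / r` scaling follow the usual LoRA formulation and are not taken verbatim from the patch:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedLoRALinear(nn.Linear):
    """nn.Linear with a low-rank update that can be switched off per call."""

    def __init__(self, in_features, out_features, r=4, lora_alpha=16, **kwargs):
        super().__init__(in_features, out_features, **kwargs)
        self.lora_A = nn.Parameter(torch.zeros(r, in_features))
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        nn.init.kaiming_uniform_(self.lora_A, a=5 ** 0.5)
        self.scaling = lora_alpha / r
        self._activated = True

    def set_activation(self, activate=True):
        self._activated = activate

    def forward(self, x):
        result = F.linear(x, self.weight, self.bias)
        if not self._activated:
            return result  # behave exactly like the frozen base layer
        return result + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling


layer = GatedLoRALinear(8, 8)
x = torch.randn(2, 8)
layer.set_activation(False)
assert torch.allclose(layer(x), F.linear(x, layer.weight, layer.bias))
```

The bitsandbytes and auto-gptq wrappers introduced in this commit apply the same idea, delegating to the quantized base layer's own forward when the adapter is deactivated.
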
""" @@ -314,8 +315,8 @@ def __init__( tuner_cfg=None, ): super(ResTuningBypassModule, self).__init__() + super(nn.Module, self).__init__() - self._activate = True self.bypass_blocks = nn.Sequential(*[ ResTunerBypassBlock( dim=dims[i] if isinstance(dims, list) else dims, @@ -330,11 +331,8 @@ def __init__( for i in range(depth) ]) - def set_activation(self, activate=True): - self._activate = activate - def forward(self, x_list, origin_arg, **kwargs): - if not self._activate: + if not self.is_activated(): return origin_arg x_bypass = detach_tensors(x_list.pop(0)) x_bypass = x_bypass[0] if isinstance(x_bypass, diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 0bf2b548ad..5e766b72b0 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -14,7 +14,7 @@ from swift.utils.logger import get_logger from ..utils.torch_utils import find_sub_module -from .utils import SwiftConfig, SwiftOutput +from .utils import ActivationMixin, SwiftConfig, SwiftOutput logger = get_logger() @@ -135,10 +135,11 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, modules: List[torch.nn.Module] = find_sub_module( module, f'side_{adapter_name}') for _module in modules: + _module: ActivationMixin _module.set_activation(activate) -class SideModule(nn.Module): +class SideModule(nn.Module, ActivationMixin): """The implementation of vision side-tuning method. Side-Tuning only needs to train one side network and @@ -153,6 +154,7 @@ class SideModule(nn.Module): def __init__(self, dim, side_module_name='fcn4'): super(SideModule, self).__init__() + super(nn.Module, self).__init__() side_module_name = side_module_name.lower() if side_module_name == 'fcn4': @@ -170,13 +172,9 @@ def __init__(self, dim, side_module_name='fcn4'): raise ValueError( f'Unsupported side_module_name: {side_module_name}') self.alpha = nn.Parameter(torch.tensor(0.0)) - self._activate = True - - def set_activation(self, activate=True): - self._activate = activate def forward(self, x, x_main): - if not self._activate: + if not self.is_activated(): return x_main alpha_squashed = torch.sigmoid(self.alpha) x_side = self.side_net(x) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 0e0c4bed4f..8dee34b0b0 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -2,8 +2,10 @@ # Copyright 2023-present the HuggingFace Inc. team. 
import os +import threading from dataclasses import asdict, dataclass, field from types import FunctionType +from typing import Dict import json from peft.utils import CONFIG_NAME @@ -125,3 +127,20 @@ class SwiftOutput: config: SwiftConfig = None state_dict_callback: FunctionType = None mark_trainable_callback: FunctionType = None + + +class ActivationMixin: + + USE_UNIQUE_THREAD = 'USE_UNIQUE_THREAD' + + def __init__(self): + self._thread_inf: Dict[int, bool] = {} + self._unique_thread = os.environ.get(ActivationMixin.USE_UNIQUE_THREAD) + + def set_activation(self, activate=True): + tid = 0 if self._unique_thread else threading.get_ident() + self._thread_inf[tid] = activate + + def is_activated(self): + tid = 0 if self._unique_thread else threading.get_ident() + return self._thread_inf.get(tid, True) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 3f3c19ccc6..ba2f0f100e 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -277,6 +277,30 @@ def _init_weights(m): outputs2 = model2(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) + def thread_func1(): + model.set_active_adapters(['lora']) + outputs_single = model1(**inputs) + outputs_t1 = model(**inputs) + self.assertTrue( + torch.allclose(outputs_single.logits, outputs_t1.logits)) + + def thread_func2(): + model.set_active_adapters(['adapter']) + outputs_single = model2(**inputs) + outputs_t2 = model(**inputs) + self.assertTrue( + torch.allclose(outputs_single.logits, outputs_t2.logits)) + + with ThreadPoolExecutor(2) as executor: + f1 = executor.submit(thread_func1) + f2 = executor.submit(thread_func2) + e1 = f1.exception() + e2 = f2.exception() + if e1 is not None: + raise e1 + if e2 is not None: + raise e2 + def test_swift_side_bert(self): model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') From 9a2777c01a32daf410fa0517fb979850005a736e Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 11:18:05 +0800 Subject: [PATCH 45/70] fix CI --- tests/tuners/test_swift_restuning.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/tuners/test_swift_restuning.py b/tests/tuners/test_swift_restuning.py index 43522fbbe3..016c8d7361 100644 --- a/tests/tuners/test_swift_restuning.py +++ b/tests/tuners/test_swift_restuning.py @@ -6,7 +6,7 @@ import torch -from swift import ResTuningConfig, Swift, SwiftModel +from swift import ResTuningConfig, Swift, SwiftModel, snapshot_download class TestSwiftResTuning(unittest.TestCase): @@ -45,9 +45,9 @@ def model_comparison(self, model, model2): self.assertTrue(torch.isclose(model_val, model2_val)) def test_swift_restuning_vit(self): + model_dir = snapshot_download('AI-ModelScope/vit-base-patch16-224') from transformers import AutoModelForImageClassification - model = AutoModelForImageClassification.from_pretrained( - 'google/vit-base-patch16-224') + model = AutoModelForImageClassification.from_pretrained(model_dir) model_swift_1 = copy.deepcopy(model) model_swift_2 = copy.deepcopy(model) result_origin = model(torch.ones((1, 3, 224, 224))).logits @@ -103,9 +103,10 @@ def test_swift_restuning_vit(self): self.model_comparison(model_swift_1, model_loaded) def test_swift_restuning_diffusers_sd(self): + model_dir = snapshot_download('AI-ModelScope/stable-diffusion-v1-5') from diffusers import UNet2DConditionModel model = UNet2DConditionModel.from_pretrained( - 'runwayml/stable-diffusion-v1-5', subfolder='unet') + model_dir, subfolder='unet') 
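
The `ActivationMixin` added to `swift/tuners/utils.py` keys its activation flags by `threading.get_ident()`, so two threads can drive one wrapped model with different adapters active at the same time; the `ThreadPoolExecutor` test above exercises exactly that. Setting the `USE_UNIQUE_THREAD` environment variable collapses every thread onto a single key. A standalone sketch of the mechanism, independent of the SWIFT classes:

```python
import os
import threading
from concurrent.futures import ThreadPoolExecutor


class ThreadLocalSwitch:
    USE_UNIQUE_THREAD = 'USE_UNIQUE_THREAD'

    def __init__(self):
        self._state = {}  # thread id -> activated flag
        self._unique_thread = bool(
            int(os.environ.get(self.USE_UNIQUE_THREAD, '0')))

    def _key(self):
        return 0 if self._unique_thread else threading.get_ident()

    def set_activation(self, activate=True):
        self._state[self._key()] = activate

    def is_activated(self):
        return self._state.get(self._key(), True)  # active by default


switch = ThreadLocalSwitch()


def worker(flag):
    switch.set_activation(flag)
    return switch.is_activated()  # each thread reads only its own flag


with ThreadPoolExecutor(2) as pool:
    print(list(pool.map(worker, [True, False])))  # -> [True, False]
```
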
model.requires_grad_(False) model2 = copy.deepcopy(model) self.set_random_seed() From b5f46d256860f10a6d4bd9f711aaf5e058624d23 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 22:24:22 +0800 Subject: [PATCH 46/70] fix bug --- swift/tuners/base.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index fbae7f93bd..a4ee75ba8a 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -198,7 +198,7 @@ def load_state_file(path): def from_pretrained(cls, model: nn.Module, model_id: str = None, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, inference_mode: bool = False, revision: str = None, **kwargs): @@ -230,6 +230,12 @@ def from_pretrained(cls, ) if not os.path.exists(model_id): model_dir = snapshot_download(model_id, revision=revision) + if adapter_name is None: + adapter_name = [ + sub_dir for sub_dir in os.listdir(model_dir) + if os.path.isdir(os.path.join(model_dir, sub_dir)) and + os.path.isfile(os.path.join(model_dir, sub_dir, CONFIG_NAME)) + ] for _name in adapter_name if isinstance(adapter_name, list) else [adapter_name]: sub_folder = os.path.join(model_dir, _name) @@ -466,7 +472,7 @@ def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, @staticmethod def from_pretrained(model: nn.Module, model_id: str = None, - adapter_name: Union[str, List[str]] = 'default', + adapter_name: Union[str, List[str]] = None, revision: str = None, **kwargs): """Prepare a model by a model_id in the ModelScope hub or a local dir. @@ -489,8 +495,9 @@ def from_pretrained(model: nn.Module, _json = json.load(f) is_peft_model = PEFT_TYPE_KEY in _json - _name = adapter_name if isinstance(adapter_name, - str) else adapter_name[0] + _name = adapter_name if isinstance( + adapter_name, str) or adapter_name is None else adapter_name[0] + _name = _name or '' if os.path.exists(os.path.join(model_id, _name, CONFIG_NAME)): with open(os.path.join(model_id, _name, CONFIG_NAME), 'r') as f: _json = json.load(f) @@ -500,7 +507,7 @@ def from_pretrained(model: nn.Module, model, model_id, revision=revision, - adapter_name=adapter_name, + adapter_name=adapter_name or 'default', **kwargs) else: return SwiftModel.from_pretrained( From 3da517b7144fa57ed777dbac7ddfbc8ab31cf330 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 12 Sep 2023 22:34:45 +0800 Subject: [PATCH 47/70] fix bug --- swift/tuners/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 8dee34b0b0..b43c23b6fe 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -135,7 +135,7 @@ class ActivationMixin: def __init__(self): self._thread_inf: Dict[int, bool] = {} - self._unique_thread = os.environ.get(ActivationMixin.USE_UNIQUE_THREAD) + self._unique_thread = int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0')) def set_activation(self, activate=True): tid = 0 if self._unique_thread else threading.get_ident() From a9426f38c4c2d37f06c44c56144f298a263883b2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 13:09:49 +0800 Subject: [PATCH 48/70] support tuner on one module --- swift/tuners/adapter.py | 6 +++--- swift/tuners/base.py | 12 +++++++++--- swift/tuners/lora.py | 3 +++ swift/tuners/prompt.py | 4 ++-- swift/tuners/restuning.py | 13 +++++++------ swift/tuners/side.py | 10 +++++----- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py 
index 26b6adabf8..0a33011dc6 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -93,7 +93,7 @@ def prepare_model(model: nn.Module, config: AdapterConfig, module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): - args = self.forward_origin(*args, **kwargs) + args = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): _type = type(args) @@ -115,9 +115,9 @@ def _feed_forward_chunk(self, attention_output): # TODO The `config.method_name` method should not be replaced twice. - module.forward_origin = getattr(module, config.method_name) + setattr(module, f'forward_origin_{adapter_name}', getattr(module, config.method_name)) num_args_in_forward_chunk_fn = len( - inspect.signature(module.forward_origin).parameters) + inspect.signature(getattr(module, f'forward_origin_{adapter_name}')).parameters) if config.method_name == 'feed_forward_chunk' and num_args_in_forward_chunk_fn == 1: setattr(module, config.method_name, types.MethodType(_feed_forward_chunk, module)) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index a4ee75ba8a..c7c98cc06a 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -28,7 +28,7 @@ class SwiftModel(nn.Module): """The Swift wrapper model. Args: - model (`torch.nn.Module`) A module to be tuned by Swift. + model (`Union[nn.Module, 'SwiftModel']`) A module to be tuned by Swift. config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of adapter_name: SwiftConfig. If it's a config class, the adapter_name will be `default` extra_state_keys (`List[str]`, `optional`) A list of regex to match the extra state keys to be saved. @@ -36,12 +36,19 @@ class SwiftModel(nn.Module): """ def __init__(self, - model: nn.Module, + model: Union[nn.Module, 'SwiftModel'], config: Union[SwiftConfig, Dict[str, SwiftConfig]], extra_state_keys: List[str] = None, inference_mode: bool = False, **kwargs): super().__init__() + self.adapters = {} + if isinstance(model, SwiftModel): + self.adapters = model.adapters + extra_state_keys = extra_state_keys or [] + extra_state_keys.extend(model.extra_state_keys) + model = model.base_model + if (getattr(model, 'hf_device_map', None) is not None) and ( len(set(model.hf_device_map.values()) & {'cpu', 'disk'}) > 0): from accelerate.hooks import remove_hook_from_submodules @@ -50,7 +57,6 @@ def __init__(self, for _, p in model.named_parameters(): p.requires_grad = False - self.adapters = {} if isinstance(config, SwiftConfig): self.adapters[DEFAULT_ADAPTER] = self._prepare_model( model, config, DEFAULT_ADAPTER) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index af78a831d9..9bdef953d7 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -12,6 +12,7 @@ from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) from peft.utils import get_auto_gptq_quant_linear, get_quantization_config +from peft.tuners.lora import LoraLayer from swift import get_logger from ..utils.torch_utils import find_sub_module @@ -311,6 +312,8 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) + elif isinstance(sub_module, (LoRALayer, LoraLayer)): + if lora_module is not None: lora_module.weight = sub_module.weight diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 661eb4dbbe..35e0217e4f 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -132,7 +132,7 @@ def 
_forward(self, *args, **kwargs): else: kwargs[config.attention_mask_pos] = attention_mask - forward_output = self.forward_origin(*args, **kwargs) + forward_output = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if config.extract_embedding: forward_output = getattr( self, @@ -140,7 +140,7 @@ def _forward(self, *args, **kwargs): return forward_output - module.forward_origin = module.forward + setattr(module, f'forward_origin_{adapter_name}', module.forward) module.forward = types.MethodType(_forward, module) if isinstance(config.dim, list): input_dim = config.dim[len(match_module_keys)] diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index e40290e06d..cf06e20307 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -142,9 +142,9 @@ def _forward_target(self, *args, **kwargs): args_main = _forward_restuning(self, _arg) args[0 if self.target_hidden_pos is None else self. target_hidden_pos] = args_main - args_main = self.forward_origin(*args, **kwargs) + args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) else: - _args_main = self.forward_origin(*args, **kwargs) + _args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) _arg = _args_main[0 if self.target_hidden_pos is None else self .target_hidden_pos] if isinstance( _args_main, @@ -266,13 +266,14 @@ def _forward_restuning(self, origin_arg): tgt_module.stem_module_ins_list = stem_module_ins_list target_module_ins = tgt_module - if isinstance(tgt_module, nn.Sequential): + if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'origin_module_keys'): tgt_module.origin_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) - tgt_module.forward_origin = types.MethodType( - _forward_seq, tgt_module) + + setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( + _forward_seq, tgt_module)) else: - tgt_module.forward_origin = tgt_module.forward + setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) tgt_module.forward = types.MethodType(_forward_target, tgt_module) if target_module_ins is None: diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 5e766b72b0..d03db26fe5 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -79,7 +79,7 @@ def prepare_model(model: nn.Module, config: SideConfig, ) def _forward(self, *args, **kwargs): - args_main = self.forward_origin(*args, **kwargs) + args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr( @@ -94,7 +94,7 @@ def _forward(self, *args, **kwargs): args_main = _type(args_main) return args_main - if isinstance(tgt_module, nn.Sequential): + if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'tgt_module_keys'): tgt_module.tgt_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) @@ -105,10 +105,10 @@ def forward_seq(self, input, *args, **kwargs): input = module(input) return input - tgt_module.forward_origin = types.MethodType( - forward_seq, tgt_module) + setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( + forward_seq, tgt_module)) else: - tgt_module.forward_origin = tgt_module.forward + setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) From 
d40c6aa74c6b3c5f8b2f898c23b33bb6b939f94c Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:36:52 +0800 Subject: [PATCH 49/70] fix lora --- swift/tuners/adapter.py | 11 ++++++--- swift/tuners/lora.py | 50 +++++++++++++++++++------------------- swift/tuners/prompt.py | 7 ++++-- swift/tuners/restuning.py | 17 ++++++++----- swift/tuners/side.py | 14 +++++++---- swift/tuners/utils.py | 3 ++- swift/utils/torch_utils.py | 3 +-- 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 0a33011dc6..3beffcfca8 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -93,7 +93,9 @@ def prepare_model(model: nn.Module, config: AdapterConfig, module = model.get_submodule(module_key) def _forward(self, *args, **kwargs): - args = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args = getattr(self, + f'forward_origin_{adapter_name}')(*args, + **kwargs) if isinstance(args, (tuple, list, dict)): if isinstance(config.hidden_pos, int): _type = type(args) @@ -115,9 +117,12 @@ def _feed_forward_chunk(self, attention_output): # TODO The `config.method_name` method should not be replaced twice. - setattr(module, f'forward_origin_{adapter_name}', getattr(module, config.method_name)) + setattr(module, f'forward_origin_{adapter_name}', + getattr(module, config.method_name)) num_args_in_forward_chunk_fn = len( - inspect.signature(getattr(module, f'forward_origin_{adapter_name}')).parameters) + inspect.signature( + getattr(module, + f'forward_origin_{adapter_name}')).parameters) if config.method_name == 'feed_forward_chunk' and num_args_in_forward_chunk_fn == 1: setattr(module, config.method_name, types.MethodType(_feed_forward_chunk, module)) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 9bdef953d7..26ab294ef3 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -4,6 +4,7 @@ import math import re from dataclasses import dataclass, field +from types import MethodType from typing import Dict, List import torch @@ -11,8 +12,8 @@ import torch.nn.functional as F from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) -from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from peft.tuners.lora import LoraLayer +from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from swift import get_logger from ..utils.torch_utils import find_sub_module @@ -175,8 +176,7 @@ def prepare_model(model: nn.Module, config: LoRAConfig, adapter_name: str): fan_in_fan_out=config.fan_in_fan_out) def state_dict_callback(state_dict, adapter_name): - return lora_state_dict(state_dict, model.lora_module_map, - adapter_name, config.bias) + return lora_state_dict(state_dict, adapter_name, config.bias) def mark_trainable_callback(model): mark_lora_as_trainable(model, adapter_name, config.bias) @@ -187,7 +187,8 @@ def mark_trainable_callback(model): @staticmethod def activate_adapter(module: torch.nn.Module, adapter_name: str, activate: bool): - modules: List[torch.nn.Module] = find_sub_module(module, adapter_name) + modules: List[torch.nn.Module] = find_sub_module( + module, f'loramodule_{adapter_name}') for _module in modules: _module: ActivationMixin _module.set_activation(activate) @@ -206,8 +207,6 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, Returns: The lora modules """ - if not hasattr(model, 'lora_module_map'): - model.lora_module_map = {} modules = {} module_keys = [key for key, _ in 
model.named_modules()] assert isinstance(replace_modules, (str, list)) @@ -222,10 +221,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, module_key.endswith(target_key) for target_key in replace_modules) if target_module_found: # noqa - parts = module_key.split('.') - module = model.get_submodule('.'.join(parts[:-1])) sub_module = model.get_submodule(module_key) - _key = parts[-1] lora_module = None if getattr(model, 'is_loaded_in_8bit', False) and isinstance( @@ -312,8 +308,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, dilation=sub_module.dilation, groups=sub_module.groups, **kwargs) - elif isinstance(sub_module, (LoRALayer, LoraLayer)): + def _forward(self, *args, **kwargs): + for _name, _module in sub_module.named_modules(): + if f'loramodule_{adapter_name}' in _name and _module.is_activated( + ): + return _module.forward(*args, **kwargs) + return self.forward_origin(*args, **kwargs) if lora_module is not None: lora_module.weight = sub_module.weight @@ -322,11 +323,13 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, if getattr(sub_module, 'state', None) is not None: lora_module.state = sub_module.state lora_module.to(sub_module.weight.device) - lora_module.adapter_name = adapter_name - setattr(module, _key, lora_module) + setattr(sub_module, f'loramodule_{adapter_name}', + lora_module) + if not hasattr(sub_module, 'forward_origin'): + sub_module.forward_origin = sub_module.forward + sub_module.forward = MethodType(_forward, sub_module) modules[module_key] = adapter_name - model.lora_module_map.update(modules) logger.info(f'Lora modules(module_key -> adapter_name): {modules}') @staticmethod @@ -341,8 +344,6 @@ def unpatch_lora(model, config: LoRAConfig): model: The model called with `tune` function. config: The `LoRAConfig` to use. 
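
The reworked `_dynamic_patch_lora` above no longer replaces the target submodule. It attaches the tuner as a `loramodule_{adapter_name}` child and reroutes the module's `forward` through a dispatcher that calls whichever attached LoRA module is currently activated, falling back to the saved `forward_origin` otherwise. A reduced sketch of that dispatch pattern; the helper name and the duck-typed `is_activated` check are illustrative:

```python
from types import MethodType

import torch.nn as nn


def attach_tuner(sub_module: nn.Module, adapter_name: str, tuner: nn.Module):
    """Attach `tuner` to `sub_module` and reroute forward without replacing it."""
    # Registering via setattr makes the tuner a child module, so it is found
    # by named_modules() and included in state_dict()/parameters().
    setattr(sub_module, f'loramodule_{adapter_name}', tuner)

    if not hasattr(sub_module, 'forward_origin'):
        sub_module.forward_origin = sub_module.forward

        def _forward(self, *args, **kwargs):
            # Dispatch to the first activated tuner attached to this module.
            for name, module in self.named_modules():
                if 'loramodule_' in name and getattr(
                        module, 'is_activated', lambda: False)():
                    return module(*args, **kwargs)
            return self.forward_origin(*args, **kwargs)

        sub_module.forward = MethodType(_forward, sub_module)
```

Because the original module and its patched `forward` stay in place, several adapters can be attached to the same layer and toggled independently.
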
""" - if not hasattr(model, 'lora_module_map'): - model.lora_module_map = {} module_keys = [key for key, _ in model.named_modules()] assert isinstance(config.replace_modules, (str, list)) replace_modules = config.replace_modules @@ -397,7 +398,6 @@ def unpatch_lora(model, config: LoRAConfig): origin_module.to(sub_module.weight.device).to( sub_module.weight.dtype) setattr(module, _key, origin_module) - model.lora_module_map.pop(module_key, None) class LoRALayer(ActivationMixin): @@ -420,6 +420,8 @@ def __init__( # Mark the weight as unmerged self.merged = False self.merge_weights = merge_weights + if not self._unique_thread: + self.merge_weights = False class Embedding(nn.Embedding, LoRALayer): @@ -801,8 +803,8 @@ def mark_lora_as_trainable(model: nn.Module, if 'bias' in n: p.requires_grad = True elif bias == 'lora_only': - for m in model.modules(): - if adapter_name == getattr(m, 'adapter_name', None) and \ + for n, m in model.named_modules(): + if f'loramodule_{adapter_name}' in n and \ hasattr(m, 'bias') and \ m.bias is not None: m.bias.requires_grad = True @@ -811,27 +813,25 @@ def mark_lora_as_trainable(model: nn.Module, def lora_state_dict(state_dict, - module_map: Dict, adapter_name: str, bias: str = 'none') -> Dict[str, torch.Tensor]: if bias == 'none': return { k: state_dict[k] for k in state_dict - if 'lora_' in k and module_map.get(k[:k.find('lora_') - - 1], None) == adapter_name + if f'loramodule_{adapter_name}' in k and 'lora_' in k } elif bias == 'all': return { k: state_dict[k] - for k in state_dict if ('lora_' in k and module_map.get( - k[:k.find('lora_') - 1], None) == adapter_name) or 'bias' in k + for k in state_dict + if ('lora_' in k and f'loramodule_{adapter_name}' in k) or ( + 'bias' in k and f'loramodule_{adapter_name}' not in k) } elif bias == 'lora_only': to_return = {} for k in state_dict: - if 'lora_' in k and module_map.get(k[:k.find('lora_') - 1], - None) == adapter_name: + if f'loramodule_{adapter_name}' in k and 'lora_' in k: to_return[k] = state_dict[k] bias_name = k.split('lora_')[0] + 'bias' if bias_name in state_dict: diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 35e0217e4f..f306ea3d79 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -132,7 +132,9 @@ def _forward(self, *args, **kwargs): else: kwargs[config.attention_mask_pos] = attention_mask - forward_output = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + forward_output = getattr( + self, f'forward_origin_{adapter_name}')(*args, + **kwargs) if config.extract_embedding: forward_output = getattr( self, @@ -140,7 +142,8 @@ def _forward(self, *args, **kwargs): return forward_output - setattr(module, f'forward_origin_{adapter_name}', module.forward) + setattr(module, f'forward_origin_{adapter_name}', + module.forward) module.forward = types.MethodType(_forward, module) if isinstance(config.dim, list): input_dim = config.dim[len(match_module_keys)] diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index cf06e20307..4a77887ac9 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -142,9 +142,12 @@ def _forward_target(self, *args, **kwargs): args_main = _forward_restuning(self, _arg) args[0 if self.target_hidden_pos is None else self. 
target_hidden_pos] = args_main - args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args_main = getattr(self, + f'forward_origin_{adapter_name}')(*args, + **kwargs) else: - _args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + _args_main = getattr(self, f'forward_origin_{adapter_name}')( + *args, **kwargs) _arg = _args_main[0 if self.target_hidden_pos is None else self .target_hidden_pos] if isinstance( _args_main, @@ -266,14 +269,16 @@ def _forward_restuning(self, origin_arg): tgt_module.stem_module_ins_list = stem_module_ins_list target_module_ins = tgt_module - if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'origin_module_keys'): + if isinstance(tgt_module, nn.Sequential) and not hasattr( + tgt_module, 'origin_module_keys'): tgt_module.origin_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) - setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( - _forward_seq, tgt_module)) + setattr(tgt_module, f'forward_origin_{adapter_name}', + types.MethodType(_forward_seq, tgt_module)) else: - setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) + setattr(tgt_module, f'forward_origin_{adapter_name}', + tgt_module.forward) tgt_module.forward = types.MethodType(_forward_target, tgt_module) if target_module_ins is None: diff --git a/swift/tuners/side.py b/swift/tuners/side.py index d03db26fe5..3c40baede9 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -79,7 +79,9 @@ def prepare_model(model: nn.Module, config: SideConfig, ) def _forward(self, *args, **kwargs): - args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) + args_main = getattr( + self, f'forward_origin_{adapter_name}')(*args, + **kwargs) if isinstance(args_main, (tuple, list, dict)): if isinstance(config.hidden_pos, str): args_main[config.hidden_pos] = getattr( @@ -94,7 +96,8 @@ def _forward(self, *args, **kwargs): args_main = _type(args_main) return args_main - if isinstance(tgt_module, nn.Sequential) and not hasattr(tgt_module, 'tgt_module_keys'): + if isinstance(tgt_module, nn.Sequential) and not hasattr( + tgt_module, 'tgt_module_keys'): tgt_module.tgt_module_keys = copy.deepcopy( list(tgt_module._modules.keys())) @@ -105,10 +108,11 @@ def forward_seq(self, input, *args, **kwargs): input = module(input) return input - setattr(tgt_module, f'forward_origin_{adapter_name}', types.MethodType( - forward_seq, tgt_module)) + setattr(tgt_module, f'forward_origin_{adapter_name}', + types.MethodType(forward_seq, tgt_module)) else: - setattr(tgt_module, f'forward_origin_{adapter_name}', tgt_module.forward) + setattr(tgt_module, f'forward_origin_{adapter_name}', + tgt_module.forward) tgt_module.forward = types.MethodType(_forward, tgt_module) side_module = SideModule(config.dim, config.side_module_name) setattr(tgt_module, f'side_{adapter_name}', side_module) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index b43c23b6fe..7289773532 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -135,7 +135,8 @@ class ActivationMixin: def __init__(self): self._thread_inf: Dict[int, bool] = {} - self._unique_thread = int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0')) + self._unique_thread = bool( + int(os.environ.get(ActivationMixin.USE_UNIQUE_THREAD, '0'))) def set_activation(self, activate=True): tid = 0 if self._unique_thread else threading.get_ident() diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 7a177ce903..867c8d4513 100644 --- 
a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -103,8 +103,7 @@ def find_sub_module(module: torch.nn.Module, for name, sub_module in module.named_modules(): if not name: continue - if module_name == name or getattr(sub_module, 'adapter_name', - None) == module_name: + if module_name == name: _modules.append(sub_module) else: _modules.extend(find_sub_module(sub_module, module_name)) From e7fa13eebb826b05324bbeae00310938ccb3ed8f Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:40:48 +0800 Subject: [PATCH 50/70] fixbug --- swift/tuners/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index c7c98cc06a..01ceebcb46 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -159,7 +159,7 @@ def state_dict(self, for name, output in self.adapters.items(): if adapter_name == name or adapter_name is None: state_dicts.update( - output.state_dict_callback(destination, adapter_name)) + output.state_dict_callback(destination, name)) if kwargs.get('save_extra_states', True): state_dicts.update({ k: v From f317637471fc9e48176c12f74a59956f91ed7a28 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 14:52:58 +0800 Subject: [PATCH 51/70] update unittest --- tests/tuners/test_swift_base.py | 46 ++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index ba2f0f100e..dd7496f138 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -243,12 +243,19 @@ def _init_weights(m): model1 = Swift.prepare_model( model1, config={ - 'lora': LoRAConfig(target_modules=['query', 'key', 'value']) + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter1': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0) }) model2 = Swift.prepare_model( model2, config={ - 'adapter': + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter2': AdapterConfig( dim=model.config.hidden_size, target_modules=r'.*layer\.\d+$', @@ -258,34 +265,43 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora': - LoRAConfig(target_modules=['query', 'key', 'value']), - 'adapter': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0) + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'adapter1': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), + 'adapter2': + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), }) - model.deactivate_adapter('adapter') + model.deactivate_adapter('adapter2') + model.deactivate_adapter('lora2') outputs1 = model(**inputs) outputs2 = model1(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) - model.activate_adapter('adapter') - model.deactivate_adapter('lora') + model.activate_adapter('adapter2') + model.activate_adapter('lora2') + model.deactivate_adapter('adapter1') + model.deactivate_adapter('lora1') outputs1 = model(**inputs) outputs2 = model2(**inputs) self.assertTrue(torch.allclose(outputs1.logits, outputs2.logits)) def thread_func1(): - model.set_active_adapters(['lora']) + 
model.set_active_adapters(['lora1', 'adapter1']) outputs_single = model1(**inputs) outputs_t1 = model(**inputs) self.assertTrue( torch.allclose(outputs_single.logits, outputs_t1.logits)) def thread_func2(): - model.set_active_adapters(['adapter']) + model.set_active_adapters(['lora2', 'adapter2']) outputs_single = model2(**inputs) outputs_t2 = model(**inputs) self.assertTrue( From cf737ec818c9f1325a5916d3c81efe12487f5658 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 15:36:52 +0800 Subject: [PATCH 52/70] fix bug --- swift/tuners/base.py | 4 ++-- swift/tuners/lora.py | 3 +-- tests/tuners/test_swift_base.py | 32 ++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 01ceebcb46..8ce17d76ec 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -62,9 +62,9 @@ def __init__(self, model, config, DEFAULT_ADAPTER) elif isinstance(config, dict): assert (all(isinstance(c, SwiftConfig) for c in config.values())) - for adapter_name, config in config.items(): + for adapter_name, _config in config.items(): self.adapters[adapter_name] = self._prepare_model( - model, config, adapter_name) + model, _config, adapter_name) self.model = model self.extra_state_keys = extra_state_keys or [] diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 26ab294ef3..3819d1940a 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -311,8 +311,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, def _forward(self, *args, **kwargs): for _name, _module in sub_module.named_modules(): - if f'loramodule_{adapter_name}' in _name and _module.is_activated( - ): + if 'loramodule_' in _name and _module.is_activated(): return _module.forward(*args, **kwargs) return self.forward_origin(*args, **kwargs) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index dd7496f138..53e15cfa66 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -243,7 +243,8 @@ def _init_weights(m): model1 = Swift.prepare_model( model1, config={ - 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter1': AdapterConfig( dim=model.config.hidden_size, @@ -254,7 +255,8 @@ def _init_weights(m): model2 = Swift.prepare_model( model2, config={ - 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter2': AdapterConfig( dim=model.config.hidden_size, @@ -265,20 +267,22 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), - 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': + LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': + LoRAConfig(target_modules=['query', 'key', 'value']), 'adapter1': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0), + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), 'adapter2': - AdapterConfig( - dim=model.config.hidden_size, - target_modules=r'.*layer\.\d+$', - method_name='feed_forward_chunk', - hidden_pos=0), + AdapterConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + method_name='feed_forward_chunk', + hidden_pos=0), }) model.deactivate_adapter('adapter2') 
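
The updated tests exercise the multi-adapter workflow end to end: several tuners are installed on one model and toggled per adapter name, either individually or through `set_active_adapters`, which activates the listed adapters and deactivates all others. A condensed usage sketch, assuming the same ModelScope backbone as in the tests (the `Model` import path is an assumption):

```python
from modelscope import Model

from swift import AdapterConfig, LoRAConfig, Swift

model = Model.from_pretrained(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
model = Swift.prepare_model(
    model,
    config={
        'lora1': LoRAConfig(target_modules=['query', 'key', 'value']),
        'adapter1': AdapterConfig(
            dim=model.config.hidden_size,
            target_modules=r'.*layer\.\d+$',
            method_name='feed_forward_chunk',
            hidden_pos=0),
    })

model.deactivate_adapter('adapter1')  # run with LoRA only
model.activate_adapter('adapter1')    # bring the adapter back
model.set_active_adapters(['lora1'])  # activate these, deactivate the rest
```
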
model.deactivate_adapter('lora2') From e64e30256c697a5ba2646115571fd46eccdea137 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 15:39:48 +0800 Subject: [PATCH 53/70] update unittest --- tests/tuners/test_swift_base.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 53e15cfa66..b2a3a7e3a7 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -267,10 +267,13 @@ def _init_weights(m): model = Swift.prepare_model( model, config={ - 'lora1': - LoRAConfig(target_modules=['query', 'key', 'value']), - 'lora2': - LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora1': LoRAConfig(target_modules=['query', 'key', 'value']), + 'lora2': LoRAConfig(target_modules=['query', 'key', 'value']), + }) + + model = Swift.prepare_model( + model, + config={ 'adapter1': AdapterConfig( dim=model.config.hidden_size, @@ -284,6 +287,7 @@ def _init_weights(m): method_name='feed_forward_chunk', hidden_pos=0), }) + model.deactivate_adapter('adapter2') model.deactivate_adapter('lora2') outputs1 = model(**inputs) From 479c8661ec0edc760ebbc19bb1d02a349499a6f2 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:00:03 +0800 Subject: [PATCH 54/70] fix type claim --- swift/tuners/base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8ce17d76ec..8eaa43aec7 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -452,13 +452,13 @@ class Swift: """The Wrapper to use both Peft and Swift tuners.""" @staticmethod - def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, - Dict[str, SwiftConfig]], - **kwargs): + def prepare_model(model: Union[nn.Module, 'SwiftModel'], + config: Union[SwiftConfig, PeftConfig, + Dict[str, SwiftConfig]], **kwargs): """Prepare a model by the input config. Args: - model(`nn.Module`): The model to be tuned. + model(`Union[nn.Module, 'SwiftModel']`): The model to be tuned. config(`Union[SwiftConfig, PeftConfig, Dict[str, SwiftConfig]]`): The config or config dict, can be either SwiftConfigs or PeftConfigs **kwargs: @@ -476,7 +476,7 @@ def prepare_model(model: nn.Module, config: Union[SwiftConfig, PeftConfig, raise ValueError(f'Unsupported swift config type: {config.__class__}') @staticmethod - def from_pretrained(model: nn.Module, + def from_pretrained(model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, revision: str = None, @@ -484,7 +484,7 @@ def from_pretrained(model: nn.Module, """Prepare a model by a model_id in the ModelScope hub or a local dir. Args: - model(`nn.Module`): The model to be tuned. + model(`Union[nn.Module, 'SwiftModel']`): The model to be tuned. model_id(`str`): The model id of the modelhub or a local dir containing the configs/weights. adapter_name(`str`, `optional`): The adapter_name to use. revision(`str`, `optional`): The model revision if the model_id is a model id of the modelhub. 
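
The two-stage `Swift.prepare_model` calls in the test above work because `SwiftModel.__init__` now carries over the adapter registry when it receives an already-wrapped `SwiftModel`, and patch 54 updates the type annotations to reflect that. A hedged sketch of that staged preparation; `backbone` stands in for any model exposing `config.hidden_size`:

```python
from swift import AdapterConfig, LoRAConfig, Swift, SwiftModel

# Stage 1: install two LoRA adapters on a plain nn.Module.
model = Swift.prepare_model(
    backbone,
    config={
        'lora1': LoRAConfig(target_modules=['query', 'key', 'value']),
        'lora2': LoRAConfig(target_modules=['query', 'key', 'value']),
    })

# Stage 2: wrap the resulting SwiftModel again; the adapter registry is kept.
model = Swift.prepare_model(
    model,
    config={
        'adapter1': AdapterConfig(
            dim=backbone.config.hidden_size,
            target_modules=r'.*layer\.\d+$',
            method_name='feed_forward_chunk',
            hidden_pos=0),
    })

assert isinstance(model, SwiftModel)
assert set(model.adapters) == {'lora1', 'lora2', 'adapter1'}
```
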
From 9cf191735fcd2b60cdae66c8d3c4c360581fbbbd Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:10:16 +0800 Subject: [PATCH 55/70] add test --- tests/tuners/test_swift_base.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index b2a3a7e3a7..c81fc17b34 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -142,11 +142,16 @@ def test_swift_restuner_forward(self): torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): - model = SbertForSequenceClassification(SbertConfig()) + model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + preprocessor = Preprocessor.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') + input = preprocessor('this is a test') model2 = copy.deepcopy(model) lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, config=lora_config) self.assertTrue(isinstance(model, SwiftModel)) + output1 = model(**input) model.save_pretrained(self.tmp_dir) self.assertTrue(os.path.exists(os.path.join(self.tmp_dir, 'default'))) self.assertTrue( @@ -154,7 +159,8 @@ def test_swift_lora_injection(self): os.path.join(self.tmp_dir, 'default', WEIGHTS_NAME))) model2 = Swift.from_pretrained(model2, self.tmp_dir) - + output2 = model2(**input) + self.assertTrue(torch.allclose(output1.logits, output2.logits)) state_dict = model.state_dict() state_dict2 = model2.state_dict() for key in state_dict: From ddc815c2a59d8b6fa1f9738e0fd4806d72f8ff0f Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 16:17:46 +0800 Subject: [PATCH 56/70] add test --- tests/tuners/test_swift_base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index c81fc17b34..83dd5fa44a 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -142,6 +142,18 @@ def test_swift_restuner_forward(self): torch.allclose(outputs_lora.logits, outputs_reactivate.logits)) def test_swift_lora_injection(self): + + from swift.tuners.lora import Linear + + def reset_parameters(self): + nn.Linear.reset_parameters(self) + if hasattr(self, 'lora_A'): + # initialize A the same way as the default for nn.Linear and B to zero + nn.init.ones_(self.lora_A) + nn.init.ones_(self.lora_B) + + Linear.reset_parameters = reset_parameters + model = Model.from_pretrained( 'damo/nlp_structbert_sentence-similarity_chinese-base') preprocessor = Preprocessor.from_pretrained( From 0868f61378ac80b26e1b27c591f07472f02230f5 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 21:33:54 +0800 Subject: [PATCH 57/70] add docs --- README.md | 2 +- README_CN.md | 2 +- docs/Get Started/1.Introduction.md | 103 ++++++++++++++++ docs/Get Started/2.Installation.md | 25 ++++ docs/Get Started/3.Use in train and infer.md | 123 +++++++++++++++++++ docs/Get Started/4.examples.md | 4 + docs/Modules/1.Interface.md | 70 +++++++++++ docs/Modules/2.lora.md | 17 +++ docs/Modules/3.Restuning.md | 21 ++++ docs/Modules/4.adapter.md | 15 +++ docs/Modules/5.side.md | 13 ++ docs/Modules/6.prompt.md | 17 +++ swift/tuners/lora.py | 1 - swift/tuners/restuning.py | 26 ++-- 14 files changed, 424 insertions(+), 15 deletions(-) create mode 100644 docs/Get Started/1.Introduction.md create mode 100644 docs/Get Started/2.Installation.md create mode 100644 docs/Get Started/3.Use in train and 
infer.md create mode 100644 docs/Get Started/4.examples.md create mode 100644 docs/Modules/1.Interface.md create mode 100644 docs/Modules/2.lora.md create mode 100644 docs/Modules/3.Restuning.md create mode 100644 docs/Modules/4.adapter.md create mode 100644 docs/Modules/5.side.md create mode 100644 docs/Modules/6.prompt.md diff --git a/README.md b/README.md index bdc5d17eb6..722d0b9a17 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Key features: 1. By integrating the ModelScope library, models can be readily obatined via a model-id. 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result. -3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with different tuners in different threads in a time-sharing manner. +3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with multiple tuners in different threads. ## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 1ebe678276..4b6d7f4379 100644 --- a/README_CN.md +++ b/README_CN.md @@ -27,7 +27,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 关键特点: 1. 通过集成ModelScope库,可以通过model id轻松获取模型。 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 -3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以用一个模型在不同线程中分时使用不同的tuners。 +3. 支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以在推理时用一个模型在不同线程中使用多种tuners而互不干扰。 ## 大模型微调的例子 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/docs/Get Started/1.Introduction.md b/docs/Get Started/1.Introduction.md new file mode 100644 index 0000000000..36c4c32409 --- /dev/null +++ b/docs/Get Started/1.Introduction.md @@ -0,0 +1,103 @@ +# 介绍 + +Swift是一个提供LLM模型轻量级训练和推理的开源框架。Swift提供的主要能力是`efficient tuners`,tuners是运行时动态加载到模型上的额外结构,在训练时将原模型的参数冻结,只训练tuner部分,这样可以达到快速训练、降低显存使用的目的。比如,最常用的tuner是LoRA。 + +总之,在这个框架中提供了以下特性: + +- **具备SOTA特性的Efficient Tuners**:用于结合大模型实现轻量级(在商业级显卡上)训练和推理,并取得较好效果 +- **使用ModelScope Hub的Trainer**:基于`transformers trainer`提供,支持LLM模型的训练,并支持将训练后的模型上传到[ModelScope Hub](https://www.modelscope.cn/models)中 +- **可运行的模型Examples**:针对热门大模型提供的训练脚本和推理脚本,并针对热门开源数据集提供了预处理逻辑,可直接运行使用 + +# 快速开始 + +在本章节会介绍如何快速安装swift并设定好运行环境,并跑通一个用例。 + +安装swift的方式非常简单,用户只需要在python>=3.8环境中运行: + +```shell +pip install ms-swift +``` + +下面的代码使用LoRA在分类任务上训练了`bert-base-uncased`模型: + +**运行下面的代码前请额外安装modelscope: ** + +```shell +pip install modelscope>=1.9.0 +``` + +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from modelscope import AutoModelForSequenceClassification, AutoTokenizer, MsDataset +from transformers import default_data_collator + +from swift import Trainer, LoRAConfig, Swift, TrainingArguments + + +model = AutoModelForSequenceClassification.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +tokenizer = AutoTokenizer.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) +model = Swift.prepare_model(model, config=lora_config) + +train_dataset = MsDataset.load('clue', subset_name='afqmc', split='train').to_hf_dataset().select(range(100)) +val_dataset = MsDataset.load('clue', subset_name='afqmc', split='validation').to_hf_dataset().select(range(100)) + + +def tokenize_function(examples): + return tokenizer(examples["sentence1"], 
examples["sentence2"], + padding="max_length", truncation=True, max_length=128) + + +train_dataset = train_dataset.map(tokenize_function) +val_dataset = val_dataset.map(tokenize_function) + +arguments = TrainingArguments( + output_dir='./outputs', + per_device_train_batch_size=16, +) + +trainer = Trainer(model, arguments, train_dataset=train_dataset, + eval_dataset=val_dataset, + data_collator=default_data_collator,) + +trainer.train() +``` + +在上面的例子中,我们使用了`bert-base-uncased`作为基模型,将LoRA模块patch到了['query', 'key', 'value']三个Linear上,进行了一次训练。 + +训练结束后可以看到outputs文件夹,它的文件结构如下: + +> outputs +> +> ​ |-- checkpoint-xx +> +> ​ |-- configuration.json +> +> ​ |-- default +> +> ​ |-- adapter_config.json +> +> ​ |-- adapter_model.bin +> +> ​ |-- ... + +可以使用该文件夹执行推理: + +```python +from modelscope import AutoModelForSequenceClassification, AutoTokenizer +from swift import Trainer, LoRAConfig, Swift + + +model = AutoModelForSequenceClassification.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +tokenizer = AutoTokenizer.from_pretrained( + 'AI-ModelScope/bert-base-uncased', revision='v1.0.0') +lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) +model = Swift.from_pretrained(model, model_id='./outputs/checkpoint-21') + +print(model(**tokenizer('this is a test', return_tensors='pt'))) +``` \ No newline at end of file diff --git a/docs/Get Started/2.Installation.md b/docs/Get Started/2.Installation.md new file mode 100644 index 0000000000..740d67bcc8 --- /dev/null +++ b/docs/Get Started/2.Installation.md @@ -0,0 +1,25 @@ +# 安装和使用 + +## Wheel包安装 + +可以使用pip进行安装: + +```shell +pip install ms-swift +``` + +## 源代码安装 + +```shell +git clone https://github.com/modelscope/swift.git +cd swift +pip install -e . +``` + +## Notebook环境 + +Swift支持训练的绝大多数模型都可以在`A10`显卡上使用,用户可以使用ModelScope官方提供的免费显卡资源: + +1. 进入[ModelScope](https://www.modelscope.cn)官方网站并登录 +2. 点击左侧的`我的Notebook`并开启一个免费GPU实例 +3. 愉快地薅A10显卡羊毛 \ No newline at end of file diff --git a/docs/Get Started/3.Use in train and infer.md b/docs/Get Started/3.Use in train and infer.md new file mode 100644 index 0000000000..bcb68e3b15 --- /dev/null +++ b/docs/Get Started/3.Use in train and infer.md @@ -0,0 +1,123 @@ +# Swift API + +## 在训练中使用Swift + +调用`Swift.prepare_model()`来将tuners添加到模型上: + +```python +from modelscope import Model +from swift import Swift, LoRAConfig +import torch +model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto') +lora_config = LoRAConfig( + r=16, + target_modules=['query_key_value'], + lora_alpha=32, + lora_dropout=0.) +model = Swift.prepare_model(model, lora_config) +# use model to do other things +``` + +也可以同时使用多个tuners: + +```python +from modelscope import Model +from swift import Swift, LoRAConfig, AdapterConfig +import torch +model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto') +lora_config = LoRAConfig( + r=16, + target_modules=['query_key_value'], + lora_alpha=32, + lora_dropout=0.) 
+adapter_config = AdapterConfig(
+    dim=model.config.hidden_size,
+    target_modules=['mlp'],
+    method_name='forward',
+    hidden_pos=0,
+    adapter_length=32,
+    )
+model = Swift.prepare_model(model, {'first_tuner': lora_config, 'second_tuner': adapter_config})
+# use model to do other things
+```
+
+在使用多个tuners时,传入的第二个参数需要是Dict,key是tuner名字,value是tuner配置。
+
+训练后可以调用:
+
+```python
+model.save_pretrained(save_directory='./output')
+```
+
+来存储模型checkpoint。模型的checkpoint文件只会包括tuners的权重,不会包含模型本身的权重。存储后的结构如下:
+
+> outputs
+>
+> ​ |-- configuration.json
+>
+> ​ |-- first_tuner
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- second_tuner
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- ...
+
+如果只传入单独的config,则会使用默认的名称`default`:
+
+> outputs
+>
+> ​ |-- configuration.json
+>
+> ​ |-- default
+>
+> ​ |-- adapter_config.json
+>
+> ​ |-- adapter_model.bin
+>
+> ​ |-- ...
+
+## 在推理时使用Swift
+
+使用`Swift.from_pretrained()`来拉起训练后存储的checkpoint:
+
+```python
+from modelscope import Model
+from swift import Swift
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+model = Swift.from_pretrained(model, './output')
+```
+
+## 加载多个tuners并在不同线程中并行使用
+
+在模型提供服务时,很可能出现一个模型同时服务多个http线程的情况,其中每个线程代表了一类用户请求。Swift支持在不同线程中激活不同tuners:
+
+```python
+from modelscope import Model
+from swift import Swift
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+# 假设output中存在训练完成的a、b、c、d四个tuners
+model = Swift.from_pretrained(model, './output')
+
+# 假设两类请求,一类使用a、b、c三个tuner,另一类使用a、c、d三个tuner
+type_1 = ['a', 'b', 'c']
+type_2 = ['a', 'c', 'd']
+
+def request(_input, _type):
+    if _type == 'type_1':
+        model.set_active_adapters(type_1)
+    elif _type == 'type_2':
+        model.set_active_adapters(type_2)
+    return model(**_input)
+
+```
+
+在不同线程中使用同样一个tuner是安全的。
diff --git a/docs/Get Started/4.examples.md b/docs/Get Started/4.examples.md
new file mode 100644
index 0000000000..3c2e531aa1
--- /dev/null
+++ b/docs/Get Started/4.examples.md
@@ -0,0 +1,4 @@
+# LLM训练方案
+
+Swift提供了完整的LLM训练方案,可以查看[Examples的README](../../examples/pytorch/llm/README_CN.md).
+ diff --git a/docs/Modules/1.Interface.md b/docs/Modules/1.Interface.md new file mode 100644 index 0000000000..11d39c0379 --- /dev/null +++ b/docs/Modules/1.Interface.md @@ -0,0 +1,70 @@ +# 接口介绍 + +## Swift + +##### Swift.prepare_model(model: Union[nn.Module, 'SwiftModel'], config: Union[SwiftConfig, PeftConfig, Dict[str, SwiftConfig]], **kwargs) + +>该静态方法随机初始化指定类型的tuners +> +>model: 需要加载tuner的模型,可以是SwiftModel,后添加的tuners会和前面SwiftModel中的一起生效 +> +>config:加载的tuner的config,可以是SwiftConfig或PeftConfig,或者带有名称的config的dict。如果不传递名称则名称默认为`default` +> +>kwargs: +> +>​ extra_state_keys: List[str] 需要被额外存储到文件的原始模型weights的key +> +>​ inference_mode: bool 是否以推理模式启动 + +SwiftConfig的具体参数可以查看每个tuner的文档。 + +##### Swift.from_pretrained(model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, revision: str = None, **kwargs) + +> 该静态方法拉起之前存储过的tuners的checkpoint +> +> model: 需要加载tuner的模型,可以是SwiftModel,后添加的tuners会和前面SwiftModel中的一起生效 +> +> model_id:已存储的tuners的本地目录或modelscope hub id。 +> +> adapter_name:需要被拉起的adapter名称,默认为None代表全部拉起 +> +> kwargs: +> +> ​ inference_mode: bool 是否以推理模式启动 +> +> ​ revision: model_id的revision +> +> ​ extra_state_keys: 下次save_pretrained时额外存储的weights + +## SwiftModel + +在`Swift.prepare_model`或`Swift.from_pretrained`拉起后,都会返回一个`SwiftModel`类型的实例。该实例包装了实际传入的模型。 + +##### save_pretrained(self, save_directory: str, safe_serialization: bool = False, adapter_name: Union[str, List[str]] = None, **kwargs) + +> 实例方法,将模型存储到本地磁盘中,可直接被Swift.from_pretrained拉起 +> +> save_directory:存储的目录 +> +> safe_serialization: 是否存储safe_tensors +> +> adapter_name:待存储的adapter名称,默认为None代表全部存储 + +##### set_active_adapters(self, adapter_names: List[str]) + +> 实例方法,设置模型在当前线程中生效的所有adapter。如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_names:adapter名称列表 + +##### activate_adapter(self, adapter_name) + +> 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_name:adapter名称 + +##### deactivate_adapter(self, adapter_name) + +> 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 +> +> adapter_name:adapter名称 + diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md new file mode 100644 index 0000000000..49909e8e48 --- /dev/null +++ b/docs/Modules/2.lora.md @@ -0,0 +1,17 @@ +# LoRA + +LoRA是[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) 论文提供的轻量级训练组件。LoRA可以添加到Linear、Embedding、Conv2d等算子上生效。 + +>```python +>LoRAConfig ( +> r: int LoRA结构的秩 +> target_modules: Union[List[str], str] MLP结构的module_key,如果是str类型则进行full_match统配查找,如果是List,则进行末尾匹配 +> lora_alpha: int LoRA结构的权重比例,lora_alpha/r的值是lora结构的权重 +> lora_dropout: float LoRA结构的dropout比例 +> merge_weights: bool 在推理时是否将loRA权重合并到原始weights上 +> use_merged_linear: bool 是否是merged linear结构 +> enable_lora: List[bool]: 如果是use_merged_linear,哪些module需要添加LoRA结构 +> bias: str 偏置是否参与训练和存储,可以为`none`:所有偏置不参与训练, `all`:所有模块的偏置均参与训练, `lora_only`:仅loRA结构的偏置参与训练 +>) +>``` + diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md new file mode 100644 index 0000000000..c380771742 --- /dev/null +++ b/docs/Modules/3.Restuning.md @@ -0,0 +1,21 @@ +# Restuning + +Restuning是[Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding Tuner from Backbone]()论文提供的轻量级训练组件。Restuning工作在深度学习模型多层结构的layer上。 + +>```python +>ResTuningConfig ( +>dims: Union[List[int], int] layers输出的hidden_state的维度,可以传入List以适配上采样或下采样 +>root_modules: str 提供root hidden_state的模块的正则表达式 +>root_modules_hook: str 可以为`input`或`output`,表示hidden_state从root_module的输入或输出中取到 +>stem_modules: 
Union[List[str], str 提供root hidden_state的模块的正则表达式(str)或完整module路径(List) +>stem_modules_hook: str 可以为`input`或`output`,表示hidden_state从stem_module的输入或输出中取到 +>target_modules: str target module的正则表达式 +>target_modules_hook: str 可以为`input`或`output` hidden_state从target_module的输入或输出中取到 +>target_hidden_pos: Union[int, str] target_module forward输入或输出中hidden_state的index +>tuner_cfg: restuning模块中子tuner的配置,可以传入str或dict +>use_upsample: bool 是否加入上采样模块 +>upsample_out_channels: List[int] 如果进行上采样,上采样的通道数 +>zero_init_last: bool 是否对tuner的最后一层Linear进行全零初始化 +>) +>``` + diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md new file mode 100644 index 0000000000..e07af189cc --- /dev/null +++ b/docs/Modules/4.adapter.md @@ -0,0 +1,15 @@ +# Adapter + +Adapter是[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1902.00751) 论文提供的轻量级训练组件。一般添加到MLP结构之后生效。 + +>```python +>AdapterConfig ( +> dim: int MLP结构输出中hidden_state的dim,一般等于模型的hidden_size +> target_modules: Union[List[str], str] MLP结构的module_key,如果是str类型则进行full_match统配查找,如果是List,则进行末尾匹配 +> hidden_pos: Union[str, int] MLP输出结构中hidden_state的位置,如果是tuple/list则传入int,如果是dict则传入str类型的key +> method_name: str MLP结构的前向方法,Adapter默认会patch到该方法上,在forward调用后使用其hidden_state输入tuner,默认是forward。 +> adapter_length: int adapter结构中间层长度,默认为128 +> act_layer: str 激活算子,默认为gelu +>) +>``` + diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md new file mode 100644 index 0000000000..2ad0fe587a --- /dev/null +++ b/docs/Modules/5.side.md @@ -0,0 +1,13 @@ +# Side + +Side是[Side-Tuning: A Baseline for Network Adaptation via Additive Side Networks](https://arxiv.org/abs/1912.13503) 论文提供的轻量级训练组件。Side可以添加到MLP结构上。 + +>```python +>SideConfig ( +>dim: int hidden_state的维度 +>target_modules: str 需要嵌入的位置的正则表达式 +>side_module_name: str side module的名字,可以是fcn4,mlp,alexnet +>hidden_pos: Union[str, int] hidden_state在MLP结构中的位置,如果MLP输出为tuple/list,则hidden_pos需要是一个int,否则需要是一个str +>) +>``` + diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md new file mode 100644 index 0000000000..9d93121503 --- /dev/null +++ b/docs/Modules/6.prompt.md @@ -0,0 +1,17 @@ +# Prompt + +Prompt是[Visual Prompt Tuning](https://arxiv.org/abs/2106.09685) 论文提供的轻量级训练组件。Prompt可以添加到每个layer的输入上,为hidden_state添加prompt embedding。 + +>```python +>PromptConfig ( +> dim: int layer输入参数中hidden_state的维度 +> target_modules: Union[str, List[str]]:可以是需要嵌入prompt的layer的正则表达式(字符串类型),如果是List,则匹配这些layers名称的末尾 +> embedding_pos: Union[str, int] layer输入参数中hidden_state的位置,如果是tuple/list则是int类型,如果是dict则是str类型 +> attention_mask_pos: Union[str, int] layer输入参数中attention_mask的位置,如果是tuple/list则是int类型,如果是dict则是str类型 +> attention_mask_value: Union[float, int, bool] prompt部分的attention值,默认为0.0 +> prompt_length: int prompt的长度 +> attach_front: bool prompt和hidden_state组合的方式,True代表将prompt concat到hidden_state的前面,反之则concat到后面 +> extract_embedding: bool 是否在最后的layer结束后将hidden_state中的prompt部分移除 +>) +>``` + diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 3819d1940a..bb7882aee2 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -12,7 +12,6 @@ import torch.nn.functional as F from peft.import_utils import (is_auto_gptq_available, is_bnb_4bit_available, is_bnb_available) -from peft.tuners.lora import LoraLayer from peft.utils import get_auto_gptq_quant_linear, get_quantization_config from swift import get_logger diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 4a77887ac9..4744e55b38 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -84,7 +84,7 @@ class 
ResTuningConfig(SwiftConfig): 'The hook type of target modules, can be "input" or "output"' }) - target_hidden_pos: str = field( + target_hidden_pos: Union[int, str] = field( default=None, metadata={ 'help': @@ -118,6 +118,7 @@ class ResTuningConfig(SwiftConfig): def __post_init__(self): from .mapping import SwiftTuners self.swift_type = SwiftTuners.RESTUNING + self.target_hidden_pos = 0 if self.target_hidden_pos is None else self.target_hidden_pos class ResTuning: @@ -136,26 +137,27 @@ def _forward_seq(self, input, *args, **kwargs): def _forward_target(self, *args, **kwargs): if self.target_modules_hook == 'input': - args = list(args) - _arg = args[0 if self.target_hidden_pos is None else self. - target_hidden_pos] + if isinstance(self.target_hidden_pos, int): + args = list(args) + _arg = args[self.target_hidden_pos] + else: + _arg = kwargs[self.target_hidden_pos] args_main = _forward_restuning(self, _arg) - args[0 if self.target_hidden_pos is None else self. - target_hidden_pos] = args_main + if isinstance(self.target_hidden_pos, int): + args[self.target_hidden_pos] = args_main + else: + kwargs[self.target_hidden_pos] = args_main args_main = getattr(self, f'forward_origin_{adapter_name}')(*args, **kwargs) else: _args_main = getattr(self, f'forward_origin_{adapter_name}')( *args, **kwargs) - _arg = _args_main[0 if self.target_hidden_pos is None else self - .target_hidden_pos] if isinstance( - _args_main, - (tuple, list)) else _args_main + _arg = _args_main[self.target_hidden_pos] if isinstance( + _args_main, (tuple, list, dict)) else _args_main args_main = _forward_restuning(self, _arg) if type(_args_main) != type(args_main): - _args_main[0 if self.target_hidden_pos is None else self. - target_hidden_pos] = args_main + _args_main[self.target_hidden_pos] = args_main args_main = _args_main return args_main From 539cea805c2b0f5f0279480364a567e8de825601 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 23:27:52 +0800 Subject: [PATCH 58/70] update doc --- README.md | 2 ++ README_CN.md | 2 ++ docs/Get Started/3.Use in train and infer.md | 2 +- docs/Modules/{1.Interface.md => 1.swift.md} | 0 docs/Modules/2.lora.md | 16 +++++++++ docs/Modules/3.Restuning.md | 21 +++++++++++ docs/Modules/4.adapter.md | 17 +++++++++ docs/Modules/5.side.md | 18 ++++++++++ docs/Modules/6.prompt.md | 18 ++++++++++ docs/Modules/7.peft.md | 38 ++++++++++++++++++++ 10 files changed, 133 insertions(+), 1 deletion(-) rename docs/Modules/{1.Interface.md => 1.swift.md} (100%) create mode 100644 docs/Modules/7.peft.md diff --git a/README.md b/README.md index 722d0b9a17..17efba9455 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Key features: 2. Tuners provided by SWIFT be combined together to allow exploration of multiple tuners on a model for best result. 3. Support calling `activate_adapter`或`deactivate_adapter` to activate/deactivate a single tuner. User can use one model with multiple tuners in different threads. +Users can check the [documentation of Swift](./docs/Get Started/1.Introduction.md) to get detail tutorials. + ## LLM SFT Example [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm) diff --git a/README_CN.md b/README_CN.md index 4b6d7f4379..61ac291017 100644 --- a/README_CN.md +++ b/README_CN.md @@ -29,6 +29,8 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 2. SWIFT提供的tuners可以组合在一起,以便在模型上探索多个tuners,以获得最佳结果。 3. 
支持调用`activate_adapter`或`deactivate_adapter`来使tuner激活或失活,用户可以在推理时用一个模型在不同线程中使用多种tuners而互不干扰。
+用户可以查看 [Swift官方文档](./docs/Get Started/1.Introduction.md) 来了解详细信息。
+
 ## 大模型微调的例子
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
diff --git a/docs/Get Started/3.Use in train and infer.md b/docs/Get Started/3.Use in train and infer.md
index bcb68e3b15..2209cecfc6 100644
--- a/docs/Get Started/3.Use in train and infer.md
+++ b/docs/Get Started/3.Use in train and infer.md
@@ -120,4 +120,4 @@ def request(_input, _type):
 
 ```
 
-在不同线程中使用同样一个tuner是安全的。
+在不同线程中使用同一个tuner是安全的。
diff --git a/docs/Modules/1.Interface.md b/docs/Modules/1.swift.md
similarity index 100%
rename from docs/Modules/1.Interface.md
rename to docs/Modules/1.swift.md
diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md
index 49909e8e48..55d47831bf 100644
--- a/docs/Modules/2.lora.md
+++ b/docs/Modules/2.lora.md
@@ -15,3 +15,19 @@ LoRA是[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/ab
 >)
 >```
 
+一个使用LoRA的例子如下:
+
+```python
+from modelscope import Model
+from swift import Swift, LoRAConfig
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+lora_config = LoRAConfig(
+    r=16,
+    target_modules=['query_key_value'],
+    lora_alpha=32,
+    lora_dropout=0.)
+model = Swift.prepare_model(model, lora_config)
+# use model to do other things
+```
+
diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md
index c380771742..c2635b385f 100644
--- a/docs/Modules/3.Restuning.md
+++ b/docs/Modules/3.Restuning.md
@@ -19,3 +19,24 @@ Restuning是[Res-Tuning: A Flexible and Efficient Tuning Paradigm via Unbinding
 >)
 >```
 
+一个使用Restuning的例子如下:
+
+```python
+from swift import (ResTuningConfig, Swift, snapshot_download)
+
+model_dir = snapshot_download('AI-ModelScope/vit-base-patch16-224')
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained(model_dir)
+restuning_config_1 = ResTuningConfig(
+    dims=768,
+    root_modules=r'.*vit.encoder.layer.0$',
+    stem_modules=r'.*vit.encoder.layer\.\d+$',
+    target_modules=r'.*vit.layernorm',
+    target_modules_hook='input',
+    tuner_cfg='res_adapter',
+)
+model = Swift.prepare_model(model, config=restuning_config_1)
+# use model to do other things
+```
+
diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md
index e07af189cc..52b553b715 100644
--- a/docs/Modules/4.adapter.md
+++ b/docs/Modules/4.adapter.md
@@ -13,3 +13,20 @@ Adapter是[Parameter-Efficient Transfer Learning for NLP](http://arxiv.org/abs/1
 >)
 >```
 
+一个使用adapter的例子如下:
+
+```python
+from modelscope import Model
+from swift import Swift, AdapterConfig
+import torch
+model = Model.from_pretrained('ZhipuAI/chatglm2-6b', torch_dtype=torch.bfloat16, device_map='auto')
+adapter_config = AdapterConfig(
+    dim=model.config.hidden_size,
+    target_modules=['mlp'],
+    method_name='forward',
+    hidden_pos=0,
+    )
+model = Swift.prepare_model(model, adapter_config)
+# use model to do other things
+```
+
diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md
index 2ad0fe587a..a33b970513 100644
--- a/docs/Modules/5.side.md
+++ b/docs/Modules/5.side.md
@@ -11,3 +11,21 @@ Side是[Side-Tuning: A Baseline for Network Adaptation via Additive Side Network
 >)
 >```
 
+一个使用Side的例子如下:
+
+```python
+from modelscope import Model
+
+from swift import (SideConfig, Swift)
+
+model = Model.from_pretrained(
+    'damo/nlp_structbert_sentence-similarity_chinese-base')
+side_config = SideConfig(
+
dim=model.config.hidden_size, + target_modules=r'.*encoder.encoder', + side_module_name='mlp', + hidden_pos='last_hidden_state') +model = Swift.prepare_model(model, side_config) +# use model to do other things +``` + diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md index 9d93121503..54d521b8f4 100644 --- a/docs/Modules/6.prompt.md +++ b/docs/Modules/6.prompt.md @@ -15,3 +15,21 @@ Prompt是[Visual Prompt Tuning](https://arxiv.org/abs/2106.09685) 论文提供 >) >``` +一个使用Prompt的例子如下: + +```python +from modelscope import Model + +from swift import (PromptConfig, Swift) + +model = Model.from_pretrained( + 'damo/nlp_structbert_sentence-similarity_chinese-base') +prompt_config = PromptConfig( + dim=model.config.hidden_size, + target_modules=r'.*layer\.\d+$', + embedding_pos=0, + attention_mask_pos=1) +model = Swift.prepare_model(model, config=prompt_config) +# use model to do other things +``` + diff --git a/docs/Modules/7.peft.md b/docs/Modules/7.peft.md new file mode 100644 index 0000000000..c3cba7862c --- /dev/null +++ b/docs/Modules/7.peft.md @@ -0,0 +1,38 @@ +# 对Peft的兼容性 + +为了支持习惯Peft的用户,Swift提供了对于Peft的兼容性。用户可以从swift中import peft组件: + +>PeftModel +>PeftConfig +>PeftModelForSeq2SeqLM +>PeftModelForSequenceClassification +>PeftModelForTokenClassification +>PeftModelForCausalLM +>PromptEncoderConfig +>PromptTuningConfig +>PrefixTuningConfig +>PromptLearningConfig +>LoraConfig +>get_peft_config +>get_peft_model_state_dict +>get_peft_model + +以上组件均可以从swift中import: + +```python +from swift import PeftModel, PeftConfig +``` + +Swift类也支持初始化Peft的tuner: + +```python +from modelscope.models.nlp import SbertForSequenceClassification +from modelscope.models.nlp.structbert import SbertConfig + +from swift import LoraConfig, Swift +model = SbertForSequenceClassification(SbertConfig()) +lora_config = LoraConfig(target_modules=['query', 'key', 'value']) +model = Swift.prepare_model(model, lora_config) +``` + +Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 \ No newline at end of file From c10c44350bebf1cddba55ca51330543a0bca63e4 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Wed, 13 Sep 2023 23:28:30 +0800 Subject: [PATCH 59/70] pre-commit passed --- docs/Get Started/1.Introduction.md | 6 +++--- docs/Get Started/2.Installation.md | 2 +- docs/Get Started/4.examples.md | 1 - docs/Modules/1.swift.md | 1 - docs/Modules/2.lora.md | 1 - docs/Modules/3.Restuning.md | 1 - docs/Modules/4.adapter.md | 1 - docs/Modules/5.side.md | 1 - docs/Modules/6.prompt.md | 1 - docs/Modules/7.peft.md | 2 +- 10 files changed, 5 insertions(+), 12 deletions(-) diff --git a/docs/Get Started/1.Introduction.md b/docs/Get Started/1.Introduction.md index 36c4c32409..14f68b2d0c 100644 --- a/docs/Get Started/1.Introduction.md +++ b/docs/Get Started/1.Introduction.md @@ -48,7 +48,7 @@ val_dataset = MsDataset.load('clue', subset_name='afqmc', split='validation').to def tokenize_function(examples): - return tokenizer(examples["sentence1"], examples["sentence2"], + return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=128) @@ -60,7 +60,7 @@ arguments = TrainingArguments( per_device_train_batch_size=16, ) -trainer = Trainer(model, arguments, train_dataset=train_dataset, +trainer = Trainer(model, arguments, train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=default_data_collator,) @@ -100,4 +100,4 @@ lora_config = LoRAConfig(target_modules=['query', 'key', 'value']) model = Swift.from_pretrained(model, model_id='./outputs/checkpoint-21') 
print(model(**tokenizer('this is a test', return_tensors='pt'))) -``` \ No newline at end of file +``` diff --git a/docs/Get Started/2.Installation.md b/docs/Get Started/2.Installation.md index 740d67bcc8..7bc620c51d 100644 --- a/docs/Get Started/2.Installation.md +++ b/docs/Get Started/2.Installation.md @@ -22,4 +22,4 @@ Swift支持训练的绝大多数模型都可以在`A10`显卡上使用,用户 1. 进入[ModelScope](https://www.modelscope.cn)官方网站并登录 2. 点击左侧的`我的Notebook`并开启一个免费GPU实例 -3. 愉快地薅A10显卡羊毛 \ No newline at end of file +3. 愉快地薅A10显卡羊毛 diff --git a/docs/Get Started/4.examples.md b/docs/Get Started/4.examples.md index 3c2e531aa1..80240e2679 100644 --- a/docs/Get Started/4.examples.md +++ b/docs/Get Started/4.examples.md @@ -1,4 +1,3 @@ # LLM训练方案 Swift提供了完整的LLM训练方案,可以查看[Examples的README](../../examples/pytorch/llm/README_CN.md). - diff --git a/docs/Modules/1.swift.md b/docs/Modules/1.swift.md index 11d39c0379..0d5b35c9ab 100644 --- a/docs/Modules/1.swift.md +++ b/docs/Modules/1.swift.md @@ -67,4 +67,3 @@ SwiftConfig的具体参数可以查看每个tuner的文档。 > 实例方法,在当前线程中单独激活某个adapter,如果将环境变量`USE_UNIQUE_THREAD`设置为'0',则设置对所有线程同时生效。 > > adapter_name:adapter名称 - diff --git a/docs/Modules/2.lora.md b/docs/Modules/2.lora.md index 55d47831bf..013c4da7ee 100644 --- a/docs/Modules/2.lora.md +++ b/docs/Modules/2.lora.md @@ -30,4 +30,3 @@ lora_config = LoRAConfig( model = Swift.prepare_model(model, lora_config) # use model to do other things ``` - diff --git a/docs/Modules/3.Restuning.md b/docs/Modules/3.Restuning.md index c2635b385f..4beb11a022 100644 --- a/docs/Modules/3.Restuning.md +++ b/docs/Modules/3.Restuning.md @@ -39,4 +39,3 @@ restuning_config_1 = ResTuningConfig( model = Swift.prepare_model(model, config=restuning_config_1) # use model to do other things ``` - diff --git a/docs/Modules/4.adapter.md b/docs/Modules/4.adapter.md index 52b553b715..10ab21c665 100644 --- a/docs/Modules/4.adapter.md +++ b/docs/Modules/4.adapter.md @@ -29,4 +29,3 @@ adapter_config = AdapterConfig( model = Swift.prepare_model(model, adapter_config) # use model to do other things ``` - diff --git a/docs/Modules/5.side.md b/docs/Modules/5.side.md index a33b970513..6c49e2fad3 100644 --- a/docs/Modules/5.side.md +++ b/docs/Modules/5.side.md @@ -28,4 +28,3 @@ side_config = SideConfig( model = Swift.prepare_model(model, side_config) # use model to do other things ``` - diff --git a/docs/Modules/6.prompt.md b/docs/Modules/6.prompt.md index 54d521b8f4..a9578911d5 100644 --- a/docs/Modules/6.prompt.md +++ b/docs/Modules/6.prompt.md @@ -32,4 +32,3 @@ prompt_config = PromptConfig( model = Swift.prepare_model(model, config=prompt_config) # use model to do other things ``` - diff --git a/docs/Modules/7.peft.md b/docs/Modules/7.peft.md index c3cba7862c..aadfa08023 100644 --- a/docs/Modules/7.peft.md +++ b/docs/Modules/7.peft.md @@ -35,4 +35,4 @@ lora_config = LoraConfig(target_modules=['query', 'key', 'value']) model = Swift.prepare_model(model, lora_config) ``` -Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 \ No newline at end of file +Swift对Peft进行了浅封装,使Peft可以在from_pretrained时使用modelscope hub中的模型。 From 0faee0f553ef7f8cfc4cfbfb02158a3ede98bb9b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 10:50:32 +0800 Subject: [PATCH 60/70] fix --- swift/tuners/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index f306ea3d79..56605f5896 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -217,7 +217,7 @@ def forward(self, x): if not self.is_activated(): return x prompt_token = 
self.prompt_token.expand(x.shape[0], -1, - -1).to(x.device) + -1).to(x.device, x.dtype) if self.layer_num == 0: if self.attach_front: From caf83a5df21f93dab62f5f1618a3783c45187ed3 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 11:35:54 +0800 Subject: [PATCH 61/70] fix bug --- swift/tuners/lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index bb7882aee2..760db2d314 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -309,7 +309,7 @@ def _dynamic_patch_lora(model, replace_modules, use_merged_linear, **kwargs) def _forward(self, *args, **kwargs): - for _name, _module in sub_module.named_modules(): + for _name, _module in self.named_modules(): if 'loramodule_' in _name and _module.is_activated(): return _module.forward(*args, **kwargs) return self.forward_origin(*args, **kwargs) From aacecfeff56cdff741bcc8c7d5cfcb7c240d9839 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 14:54:36 +0800 Subject: [PATCH 62/70] fix bugs --- examples/pytorch/llm/src/llm_infer.py | 5 ++--- examples/pytorch/llm/src/llm_sft.py | 11 +++++------ examples/pytorch/llm/src/utils/__init__.py | 3 ++- examples/pytorch/llm/src/utils/swift_utils.py | 5 ++--- swift/trainers/trainers.py | 1 + 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index bd7e28868a..61c23ca3c1 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -37,13 +37,12 @@ class InferArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset + dataset_sample: int = 20000 # -1: all dataset dataset_test_size: float = 0.01 system: str = 'you are a helpful assistant!' 
max_length: Optional[int] = 2048 - quantization_bit: Optional[int] = field( - default=None, metadata={'choices': {4, 8}}) + quantization_bit: int = field(default=0, metadata={'choices': {0, 4, 8}}) bnb_4bit_comp_dtype: str = field( default=None, metadata={'choices': {'fp16', 'bf16', 'fp32'}}) bnb_4bit_quant_type: str = field( diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 401d7c3199..424fe3e89b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -9,15 +9,14 @@ import json import torch import torch.distributed as dist -from examples.pytorch.llm.src.utils.metric_utils import compute_nlg_metrics -from examples.pytorch.llm.src.utils.swift_utils import prepare_model from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, broadcast_string, check_json_format, find_all_linear_for_lora, get_dataset, get_dist_setting, get_model_tokenizer, get_preprocess, is_dist, is_master, plot_images, process_dataset, select_bnb, select_dtype, - show_layers, sort_by_max_length) + show_layers, sort_by_max_length, + compute_nlg_metrics, prepare_model) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -36,7 +35,7 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'adapter choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -54,7 +53,7 @@ class SftArguments: default='alpaca-en,alpaca-zh', metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 - dataset_sample: int = -1 # -1: all dataset + dataset_sample: int = 20000 # -1: all dataset dataset_test_size: float = 0.01 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -271,7 +270,7 @@ def llm_sft(args: SftArguments) -> None: tokenizer, args.system, args.max_length, - validate_generation=True) + validate_generation=args.predict_with_generate) val_dataset = val_dataset.map(preprocess_func_eval) del dataset if args.test_oom_error: diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 56b16ead20..ceb60765f1 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -1,7 +1,8 @@ -from .dataset import DATASET_MAPPING, get_dataset, process_dataset +from .dataset import DATASET_MAPPING, get_dataset from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess +from .swift_utils import prepare_model from .utils import (broadcast_string, check_json_format, download_dataset, find_all_linear_for_lora, get_dist_setting, inference, is_dist, is_local_master, is_master, plot_images, diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 3f11634f00..8d931017c6 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict -import torch.nn +from torch.nn import Module from swift import (AdapterConfig, LoRAConfig, ResTuningConfig, Swift, SwiftConfig, SwiftTuners, get_logger) @@ -12,8 +12,7 @@ def prepare_model( - model: torch.nn.Module, - args: Any, + model: Module, args ): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 5c2de223af..a659ec8747 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -22,6 +22,7 @@ class Seq2SeqTrainer(PushToMsHubMixin, SwiftMixin, HfSeq2SeqTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # performance self.perf: Dict[str, Any] = { 'gen_time': 0., From 38bd482706571efb9a60b241fabd3f53aa8aac58 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 16:11:54 +0800 Subject: [PATCH 63/70] update sh --- examples/pytorch/llm/README.md | 31 ++- examples/pytorch/llm/README_CN.md | 34 ++- .../baichuan2_7b_chat/lora_ddp/infer.sh | 5 +- .../scripts/baichuan2_7b_chat/lora_ddp/sft.sh | 7 +- .../llm/scripts/chatglm2_6b/lora_ddp/infer.sh | 5 +- .../llm/scripts/chatglm2_6b/lora_ddp/sft.sh | 7 +- .../full_mp}/infer.sh | 8 +- .../qwen_7b_chat/{full => full_mp}/sft.sh | 8 +- .../{full => full_mp_ddp}/infer.sh | 5 +- .../full_mp_ddp}/sft.sh | 27 ++- .../llm/scripts/qwen_7b_chat/lora/infer.sh | 6 +- .../llm/scripts/qwen_7b_chat/lora/sft.sh | 8 +- .../scripts/qwen_7b_chat/lora_ddp/infer.sh | 4 +- .../llm/scripts/qwen_7b_chat/lora_ddp/sft.sh | 10 +- .../llm/scripts/qwen_vl_chat/lora/infer.sh | 17 -- .../llm/scripts/qwen_vl_chat/lora/sft.sh | 31 --- .../llm/scripts/qwen_vl_chat/qlora/infer.sh | 19 -- .../llm/scripts/qwen_vl_chat/qlora/sft.sh | 33 --- .../scripts/qwen_vl_chat/qlora_ddp/infer.sh | 19 -- .../llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh | 39 ---- .../llm/scripts/seqgpt_560m/full/infer.sh | 4 +- .../llm/scripts/seqgpt_560m/full/sft.sh | 6 +- examples/pytorch/llm/src/llm_sft.py | 27 ++- examples/pytorch/llm/src/utils/__init__.py | 10 +- examples/pytorch/llm/src/utils/dataset.py | 204 ++++++++++++++++-- examples/pytorch/llm/src/utils/model.py | 2 +- examples/pytorch/llm/src/utils/utils.py | 136 +++++++++++- swift/trainers/trainers.py | 5 +- 28 files changed, 448 insertions(+), 269 deletions(-) rename examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => qwen_7b_chat/full_mp}/infer.sh (76%) rename examples/pytorch/llm/scripts/qwen_7b_chat/{full => full_mp}/sft.sh (87%) rename examples/pytorch/llm/scripts/qwen_7b_chat/{full => full_mp_ddp}/infer.sh (74%) rename examples/pytorch/llm/scripts/{qwen_agent/lora_ddp => qwen_7b_chat/full_mp_ddp}/sft.sh (59%) delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh delete mode 100644 examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 76bbbae4a0..20dcdedc55 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -25,15 +25,16 @@ 6. openbuddy-llama series: openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b 7. internlm series: internlm-7b, internlm-7b-chat, internlm-7b-chat-8k 8. 
other: polylm-13b, seqgpt-560m -3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ... +3. supported features: quantization, DDP, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, mutli-round chat, ... 4. supported datasets: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. multi-modal: coco-en + 4. other: cls-fudan-news-zh, ner-jave-zh 5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation ## Prepare the Environment -Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization) +Experimental environment: V100, A10, 3090, A100, ... (V100 does not support bf16, quantization) ```bash # Installing miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -63,38 +64,50 @@ pip install . ## Run SFT and Inference Performace: full(nice) > lora > qlora + Training GPU memory: qlora(low,3090) > lora > full(2*A100) ```bash # Clone the repository and enter the code directory. git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# sft lora and infer qwen-7b-chat, Requires 27GB GPU memory. +# sft lora and infer qwen-7b-chat, Requires 38GB GPU memory. # You can save GPU memory by setting `--gradient_checkpointing true`, but this will slightly decrease the training speed. # If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'. # Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*27GB GPU memory. +# sft(lora+ddp) and infer qwen-7b-chat, Requires 2*38GB GPU memory. +# Recommended experimental environment: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh +# sft(lora+mp+ddp) and infer qwen-7b-chat, Requires 4*16GB GPU memory. +# Recommended experimental environment: V100, A10, 3090 +bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh + # sft(qlora) and infer qwen-7b-chat, Requires 13GB GPU memory. # If you want to use quantification, you need to `pip install bitsandbytes -U` -# Recommended experimental environment: 3090 +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh # sft(qlora+ddp) and infer qwen-7b-chat, Requires 2*13GB GPU memory. +# Recommended experimental environment: A10, 3090 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# sft(full) and infer qwen-7b-chat, Requires 100GB GPU memory. +# sft(full+mp) and infer qwen-7b-chat, Requires 2*50GB GPU memory. 
# Recommended experimental environment: A100 -bash scripts/qwen_7b_chat/full/sft.sh -bash scripts/qwen_7b_chat/full/infer.sh +bash scripts/qwen_7b_chat/full_mp/sft.sh +bash scripts/qwen_7b_chat/full_mp/infer.sh +# sft(full+mp+ddp) and infer qwen-7b-chat, Requires 4*50GB GPU memory. +# Recommended experimental environment: A100 +bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh # For more scripts, please see `scripts/` folder. ``` diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index c66c2eab57..0e23d44619 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -28,13 +28,14 @@ 8. other: polylm-13b, seqgpt-560m 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ... 4. 支持的数据集: - 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh + 1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en, cmnli-zh, jd-zh, dureader-robust-zh, medical-en, medical-zh, medical-mini-zh, sharegpt-en, sharegpt-zh, code-python-zh 2. agent: [damo-agent-zh](https://modelscope.cn/datasets/damo/MSAgent-Bench/summary), damo-agent-mini-zh 3. 多模态: coco-en + 4. 其他: cls-fudan-news-zh, ner-jave-zh 5. 支持的对话模板: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation ## 准备实验环境 -实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化) +实验环境: V100, A10, 3090, A100均可. (V100不支持bf16, 量化) ```bash # 安装miniconda wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -65,37 +66,50 @@ pip install . ## 微调和推理 性能: full(优) > lora > qlora + 训练显存: qlora(低,3090) > lora > full(2*A100) ```bash # clone仓库并进入代码目录 git clone https://github.com/modelscope/swift.git cd swift/examples/pytorch/llm -# 微调(lora)+推理 qwen-7b-chat, 需要27GB显存. +# 微调(lora)+推理 qwen-7b-chat, 需要38GB显存. # 你可以通过设置`--gradient_checkpointing true`来节约显存, 但这会略微降低训练速度. # 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`. # 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora/sft.sh bash scripts/qwen_7b_chat/lora/infer.sh -# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*27GB显存. +# 微调(lora+ddp)+推理 qwen-7b-chat, 需要2卡*38GB显存. +# 推荐的实验环境: A100 bash scripts/qwen_7b_chat/lora_ddp/sft.sh bash scripts/qwen_7b_chat/lora_ddp/infer.sh -# 微调(qlora)+推理 qwen-7b-chat, 需要13GB显存. +# 微调(lora+mp+ddp)+推理 qwen-7b-chat, 需要4卡*15GB显存. +# 推荐的实验环境: V100, 3090, A10 +bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh + +# 微调(qlora)+推理 qwen-7b-chat, 需要9GB显存. # 如果你想要使用量化, 你需要`pip install bitsandbytes -U` -# 推荐的实验环境: 3090 +# 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora/sft.sh bash scripts/qwen_7b_chat/qlora/infer.sh -# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*13GB显存. +# 微调(qlora+ddp)+推理 qwen-7b-chat, 需要2卡*14GB显存. +# 推荐的实验环境: 3090, A10 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh bash scripts/qwen_7b_chat/qlora_ddp/infer.sh -# 微调(full)+推理 qwen-7b-chat, 需要100G显存. +# 微调(full+mp)+推理 qwen-7b-chat, 需要2卡*75G显存. +# 推荐的实验环境: A100 +bash scripts/qwen_7b_chat/full_mp/sft.sh +bash scripts/qwen_7b_chat/full_mp/infer.sh + +# 微调(full+mp+ddp)+推理 qwen-7b-chat, 需要4卡*75G显存. 
# 推荐的实验环境: A100 -bash scripts/qwen_7b_chat/full/sft.sh -bash scripts/qwen_7b_chat/full/infer.sh +bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh +bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh # 更多的scripts脚本, 可以看`scripts`文件夹. ``` diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh index e62aa4b203..ca53acdf99 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type baichuan \ --dtype bf16 \ --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ + --max_length 4096 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh index ea219e0759..c315d78850 100644 --- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh @@ -1,4 +1,5 @@ # Experimental environment: 2 * A100 +# 2 * 44GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -11,10 +12,10 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample 20000 \ + --dataset damo-agent-mini-zh \ + --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 4096 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh index 96aa910f23..85d856ad36 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatglm2 \ --dtype bf16 \ --ckpt_dir "runs/chatglm2-6b/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset code-python-zh \ + --dataset_sample -1 \ + --max_length 8192 \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh index 7ec0bb88d9..b85eac5572 100644 --- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 2 * 50GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -10,13 +12,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ + --dataset code-python-zh \ --dataset_sample -1 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. 
\ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh similarity index 76% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh index b6c221155d..17e53a8c82 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh @@ -1,14 +1,14 @@ CUDA_VISIBLE_DEVICES=0 \ python src/llm_infer.py \ --model_type qwen-7b-chat \ - --sft_type lora \ + --sft_type full \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ - --max_length 2048 \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh similarity index 87% rename from examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh index 182e287faf..2a961f7e72 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh @@ -1,5 +1,5 @@ # Experimental environment: 2 * A100 -# 100GB GPU memory +# 2 * 75GB GPU memory CUDA_VISIBLE_DEVICES=0,1 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -7,10 +7,10 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset damo-agent-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0.01 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh similarity index 74% rename from examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh index 9ef3c08124..f99464d035 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/full/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh @@ -5,7 +5,10 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ + --max_length 8192 \ --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh similarity index 59% rename from examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh rename to examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh index 7f4c9c37bd..de95dda252 100644 --- a/examples/pytorch/llm/scripts/qwen_agent/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh @@ -1,37 +1,34 @@ -# Experimental environment: 2 * A100 +# Experimental environment: 4 * A100 +# 4 * 75GB GPU memory nproc_per_node=2 -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ torchrun \ --nproc_per_node=$nproc_per_node \ --master_port 29500 \ src/llm_sft.py \ --model_type qwen-7b-chat \ - --sft_type lora \ + --sft_type full \ --template_type chatml \ --dtype bf16 \ 
--output_dir runs \ - --ddp_backend nccl \ - --dataset damo-agent-mini-zh \ - --dataset_sample -1 \ + --dataset medical-en,medical-zh \ + --dataset_sample 200000 \ --num_train_epochs 1 \ - --max_length 2048 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules ALL \ + --max_length 8192 \ --gradient_checkpointing false \ --batch_size 1 \ - --weight_decay 0. \ - --learning_rate 1e-4 \ + --weight_decay 0.01 \ + --learning_rate 2e-5 \ --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ - --max_grad_norm 0.5 \ + --max_grad_norm 1 \ --warmup_ratio 0.03 \ --eval_steps 100 \ --save_steps 100 \ + --only_save_model true \ --save_total_limit 2 \ --logging_steps 10 \ --use_flash_attn true \ --push_to_hub false \ - --hub_model_id qwen-7b-chat-qlora \ + --hub_model_id qwen-7b-chat-full \ --hub_private_repo true \ --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh index 5aae79e72d..6382b5d34f 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh @@ -5,7 +5,11 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ + --max_length 2048 \ + --use_flash_attn true \ --max_new_tokens 1024 \ --temperature 0.9 \ --top_k 50 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh index 025f728cb1..0d1d205a1a 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh @@ -1,3 +1,5 @@ +# Experimental environment: A100 +# 38GB GPU memory CUDA_VISIBLE_DEVICES=0 \ python src/llm_sft.py \ --model_type qwen-7b-chat \ @@ -5,14 +7,14 @@ python src/llm_sft.py \ --template_type chatml \ --dtype bf16 \ --output_dir runs \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset cot-en,cot-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. 
\ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh index 27d3c0cbb3..8d5674bef4 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh @@ -5,7 +5,9 @@ python src/llm_infer.py \ --template_type chatml \ --dtype bf16 \ --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \ - --eval_human true \ + --eval_human false \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --max_length 2048 \ --use_flash_attn true \ --max_new_tokens 1024 \ diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh index fd92b9a941..82f0838235 100644 --- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh +++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh @@ -1,6 +1,6 @@ # Experimental environment: 2 * A100 -# 2 * 27GB GPU memory -# use_flash_attn=false: 2 * 31GB GPU memory +# 2 * 38GB GPU memory +# use_flash_attn=false: 2 * 70GB GPU memory nproc_per_node=2 CUDA_VISIBLE_DEVICES=0,1 \ torchrun \ @@ -13,14 +13,14 @@ torchrun \ --dtype bf16 \ --output_dir runs \ --ddp_backend nccl \ - --dataset alpaca-en,alpaca-zh \ - --dataset_sample -1 \ + --dataset sharegpt-en,sharegpt-zh \ + --dataset_sample 50000 \ --num_train_epochs 1 \ --max_length 2048 \ --lora_rank 8 \ --lora_alpha 32 \ --lora_dropout_p 0. \ - --lora_target_modules c_attn c_proj \ + --lora_target_modules ALL \ --gradient_checkpointing false \ --batch_size 1 \ --weight_decay 0. \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh deleted file mode 100644 index 9c2299bb25..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh +++ /dev/null @@ -1,17 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --max_new_tokens 1024 \ - --use_flash_attn true \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh deleted file mode 100644 index 8eb51200b9..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh +++ /dev/null @@ -1,31 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing false \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps 16 \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn true \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-lora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh deleted file mode 100644 index e3c68d9770..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh +++ /dev/null @@ -1,19 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --max_new_tokens 1024 \ - --use_flash_attn false \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh deleted file mode 100644 index 8f23629c6c..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh +++ /dev/null @@ -1,33 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing true \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps 16 \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn false \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-qlora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh deleted file mode 100644 index e3c68d9770..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/infer.sh +++ /dev/null @@ -1,19 +0,0 @@ -CUDA_VISIBLE_DEVICES=0 \ -python src/llm_infer.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \ - --eval_human false \ - --dataset coco-en \ - --dataset_sample 20000 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --max_new_tokens 1024 \ - --use_flash_attn false \ - --temperature 0.9 \ - --top_k 50 \ - --top_p 0.9 \ - --do_sample true \ diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh deleted file mode 100644 index ff512f36ab..0000000000 --- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh +++ /dev/null @@ -1,39 +0,0 @@ -# Experimental environment: 3090 -nproc_per_node=2 -CUDA_VISIBLE_DEVICES=0,1 \ -torchrun \ - --nproc_per_node=$nproc_per_node \ - --master_port 29500 \ - src/llm_sft.py \ - --model_type qwen-vl-chat \ - --sft_type lora \ - --template_type chatml \ - --dtype bf16 \ - --output_dir runs \ - --ddp_backend nccl \ - --dataset coco-en \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ - --max_length 2048 \ - --quantization_bit 4 \ - --bnb_4bit_comp_dtype bf16 \ - --lora_rank 8 \ - --lora_alpha 32 \ - --lora_dropout_p 0. \ - --lora_target_modules c_attn attn.c_proj \ - --gradient_checkpointing false \ - --batch_size 1 \ - --weight_decay 0. 
\ - --learning_rate 1e-4 \ - --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \ - --max_grad_norm 0.5 \ - --warmup_ratio 0.03 \ - --eval_steps 100 \ - --save_steps 100 \ - --save_total_limit 2 \ - --logging_steps 10 \ - --use_flash_attn false \ - --push_to_hub false \ - --hub_model_id qwen-vl-chat-qlora \ - --hub_private_repo true \ - --hub_token 'your-sdk-token' \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh index a1f095bc58..cb3e4b7062 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh @@ -6,8 +6,8 @@ python src/llm_infer.py \ --dtype bf16 \ --ckpt_dir "runs/seqgpt-560m/vx_xxx/checkpoint-xxx" \ --eval_human false \ - --dataset cmnli-zh \ - --dataset_sample 20000 \ + --dataset ner-jave-zh \ + --dataset_sample -1 \ --max_length 1024 \ --max_new_tokens 1024 \ --temperature 0.9 \ diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh index 9c5e30b8e7..5d0ada5770 100644 --- a/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh +++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/sft.sh @@ -5,9 +5,9 @@ python src/llm_sft.py \ --template_type default-generation \ --dtype bf16 \ --output_dir runs \ - --dataset cmnli-zh \ - --dataset_sample 20000 \ - --num_train_epochs 1 \ + --dataset ner-jave-zh \ + --dataset_sample -1 \ + --num_train_epochs 3 \ --max_length 1024 \ --gradient_checkpointing false \ --batch_size 32 \ diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 424fe3e89b..bcd6bf7304 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,11 +11,11 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, + broadcast_string, check_json_format, dataset_map, find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_dist, is_master, - plot_images, process_dataset, select_bnb, select_dtype, - show_layers, sort_by_max_length, + get_model_tokenizer, get_preprocess, is_ddp_plus_mp, + is_dist, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers, sort_by_max_length, compute_nlg_metrics, prepare_model) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, @@ -39,7 +39,6 @@ class SftArguments: template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' - # DDP + MP(device_map) is not supported ddp_backend: Optional[str] = field( default=None, metadata={'choices': ['nccl', 'gloo', 'mpi', 'ccl']}) @@ -76,6 +75,7 @@ class SftArguments: gradient_checkpointing: bool = False batch_size: int = 1 + eval_batch_size: Optional[int] = None num_train_epochs: int = 1 # if max_steps >= 0, override num_train_epochs max_steps: int = -1 @@ -120,7 +120,7 @@ class SftArguments: default=None, metadata={ 'help': - "This parameter is used only when model_type.startswith('qwen-7b')" + "This parameter is used only when model_type.startswith('qwen')" }) # generation config, only useful when `predict_with_generate=True` @@ -153,7 +153,7 @@ def __post_init__(self): assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 
'full': - assert self.quantization_bit is None, 'not supported' + assert self.quantization_bit != 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: self.learning_rate = 2e-5 @@ -198,6 +198,11 @@ def __post_init__(self): if self.use_flash_attn is None: self.use_flash_attn = 'auto' self.train_sampler_random = not self.test_oom_error + if self.eval_batch_size is None: + if self.predict_with_generate: + self.eval_batch_size = 1 + else: + self.eval_batch_size = batch_size def llm_sft(args: SftArguments) -> None: @@ -209,7 +214,7 @@ def llm_sft(args: SftArguments) -> None: # ### Loading Model and Tokenizer kwargs = {'low_cpu_mem_usage': True} - if is_dist(): + if is_dist() and not is_ddp_plus_mp(): kwargs['device_map'] = {'': local_rank} else: kwargs['device_map'] = 'auto' @@ -274,7 +279,7 @@ def llm_sft(args: SftArguments) -> None: val_dataset = val_dataset.map(preprocess_func_eval) del dataset if args.test_oom_error: - train_dataset = sort_by_max_length(train_dataset) + train_dataset = sort_by_max_length(train_dataset, 20000) # Data analysis stat_dataset(train_dataset) stat_dataset(val_dataset) @@ -344,10 +349,10 @@ def llm_sft(args: SftArguments) -> None: **kwargs) if args.gradient_checkpointing: - # fix: gradients will be None - model.config.use_cache = True model.enable_input_require_grads() if is_dist(): + # Compatible with https://github.com/huggingface/transformers/pull/25903 + training_args._frozen = False if args.gradient_checkpointing: training_args.ddp_find_unused_parameters = False training_args.ddp_broadcast_buffers = False diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index ceb60765f1..07953bfc8a 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -3,8 +3,8 @@ from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess from .swift_utils import prepare_model -from .utils import (broadcast_string, check_json_format, download_dataset, - find_all_linear_for_lora, get_dist_setting, inference, - is_dist, is_local_master, is_master, plot_images, - process_dataset, select_bnb, select_dtype, show_layers, - sort_by_max_length) +from .utils import (broadcast_string, check_json_format, dataset_map, + download_dataset, find_all_linear_for_lora, + get_dist_setting, inference, is_ddp_plus_mp, is_dist, + is_local_master, is_master, plot_images, process_dataset, + select_bnb, select_dtype, show_layers, sort_by_max_length) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 3a2294a395..79f537857b 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,9 +3,10 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional import json +import numpy as np from datasets import Dataset as HfDataset from datasets import concatenate_datasets from modelscope import MsDataset @@ -373,7 +374,7 @@ def get_jd_zh_dataset() -> HfDataset: 'Sentiment Classification', False) -def _process_dureader_robust(dataset: HfDataset) -> HfDataset: +def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset: prompt = """Task: Question Generation Context: {context} Answer: {answer} @@ -396,34 +397,191 @@ def get_dureader_robust_qg_zh_dataset() -> HfDataset: 
dataset_dict['validation'].to_hf_dataset(), dataset_dict['test'].to_hf_dataset() ]) - return _process_dureader_robust(dataset) + return _preprocess_dureader_robust(dataset) + + +def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset: + query = [] + response = [] + for d in tqdm(dataset): + r = d['output'] + if r is None: + continue + if subset_name == 'zh': + q = d['instruction'] + else: + q = d['input'] + if q is None: + continue + query.append(q) + response.append(r) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_medical_dataset(subset_name: str, + dataset_sample: int = -1) -> HfDataset: + """ + mode: Literal['en', zh] + """ + dataset_dict = MsDataset.load( + 'huangjintao/medical_zh', subset_name=subset_name) + dataset: HfDataset = concatenate_datasets([ + dataset_dict['train'].to_hf_dataset(), + dataset_dict['val'].to_hf_dataset(), + dataset_dict['test'].to_hf_dataset(), + ]) + if dataset_sample != -1: + idxs = np.random.permutation(dataset_sample) + dataset = dataset.select(idxs) + return _preprocess_medical(dataset, subset_name) + + +def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset: + query = [] + response = [] + history: List[History] = [] + for d in tqdm(dataset): + conversation = ast.literal_eval(d['conversation']) + query.append(conversation[-1]['human']) + response.append(conversation[-1]['assistant']) + h = [] + for c in conversation[:-1]: + h.append((c['human'], c['assistant'])) + history.append(h) + return HfDataset.from_dict({ + 'query': query, + 'response': response, + 'history': history + }) + + +def get_sharegpt_dataset(subset_name_list: List[str]) -> HfDataset: + dataset_list = [] + for subset_name in subset_name_list: + dataset = MsDataset.load( + 'huangjintao/sharegpt', subset_name=subset_name, + split='train').to_hf_dataset() + dataset_list.append(dataset) + dataset = concatenate_datasets(dataset_list) + return _preprocess_sharegpt(dataset) + + +_sharegpt_zh_subset_list = ['common-zh', 'computer-zh', 'unknow-zh'] + +_sharegpt_en_subset_list = ['common-en', 'computer-en'] + + +def get_sharegpt_all_zh_dataset(): + """multi-round chat""" + return get_sharegpt_dataset(_sharegpt_zh_subset_list) + + +def get_sharegpt_all_en_dataset(): + """multi-round chat""" + return get_sharegpt_dataset(_sharegpt_en_subset_list) + + +def get_cls_fudan_news_zh() -> HfDataset: + """Sequence Classification """ + dataset = MsDataset.load('damo/zh_cls_fudan-news').to_hf_dataset() + return HfDataset.from_dict({ + 'query': dataset['prompt'], + 'response': dataset['answer'] + }) + + +def get_ner_jave_zh() -> HfDataset: + """Named Entity Recognition""" + dataset = MsDataset.load('damo/zh_ner-JAVE').to_hf_dataset() + return HfDataset.from_dict({ + 'query': dataset['prompt'], + 'response': dataset['answer'] + }) + + +def _preprocess_code_python_dataset(dataset: HfDataset) -> HfDataset: + query = [] + response = [] + for d in tqdm(dataset): + chat_rounds = ast.literal_eval(d['chat_rounds']) + assert len(chat_rounds) == 2 + query.append(chat_rounds[-2]['content']) + response.append(chat_rounds[-1]['content']) + return HfDataset.from_dict({'query': query, 'response': response}) + + +def get_code_python_zh_dataset() -> HfDataset: + dataset = MsDataset.load( + 'codefuse-ai/CodeExercise-Python-27k').to_hf_dataset() + return _preprocess_code_python_dataset(dataset) DATASET_MAPPING = { # nlp chat - 'alpaca-en': get_alpaca_gpt4_en_dataset, - 'alpaca-zh': get_alpaca_gpt4_zh_dataset, - 'finance-en': get_finance_en_dataset, - 'multi-alpaca-all': 
get_multi_alpaca_all, - 'code-en': get_code_alpaca_en_dataset, - 'instinwild-en': get_instinwild_en_dataset, - 'instinwild-zh': get_instinwild_zh_dataset, - 'cot-en': get_cot_en_dataset, - 'cot-zh': get_cot_zh_dataset, - 'damo-agent-mini-zh': partial(get_damo_agent_zh_dataset, use_mini=True), - 'damo-agent-zh': get_damo_agent_zh_dataset, # containing normal chat - 'firefly-all-zh': get_firefly_all_zh_dataset, - 'poetry-zh': get_poetry_zh_dataset, - 'instruct-en': get_instruct_en_dataset, - 'gpt4all-en': get_gpt4all_en_dataset, + 'alpaca-en': + get_alpaca_gpt4_en_dataset, + 'alpaca-zh': + get_alpaca_gpt4_zh_dataset, + 'finance-en': + get_finance_en_dataset, + 'multi-alpaca-all': + get_multi_alpaca_all, + 'code-en': + get_code_alpaca_en_dataset, + 'instinwild-en': + get_instinwild_en_dataset, + 'instinwild-zh': + get_instinwild_zh_dataset, + 'cot-en': + get_cot_en_dataset, + 'cot-zh': + get_cot_zh_dataset, + 'firefly-all-zh': + get_firefly_all_zh_dataset, + 'poetry-zh': + get_poetry_zh_dataset, + 'instruct-en': + get_instruct_en_dataset, + 'gpt4all-en': + get_gpt4all_en_dataset, + 'medical-en': + partial(get_medical_dataset, subset_name='en'), + 'medical-zh': + partial(get_medical_dataset, subset_name='zh'), + 'medical-mini-zh': + partial(get_medical_dataset, subset_name='zh', dataset_sample=100000), + 'code-python-zh': + get_code_python_zh_dataset, + + # multi-round chat + 'damo-agent-mini-zh': + partial(get_damo_agent_zh_dataset, use_mini=True), + 'damo-agent-zh': + get_damo_agent_zh_dataset, # containing normal chat + 'sharegpt-en': + get_sharegpt_all_en_dataset, + 'sharegpt-zh': + get_sharegpt_all_zh_dataset, + # nlp text-generation (please use model:base, template:default-generation) - 'cmnli-zh': get_cmnli_zh_dataset, - 'jd-zh': get_jd_zh_dataset, - 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - # multi-modal chat - 'coco-en': get_coco_en_dataset, + 'cmnli-zh': + get_cmnli_zh_dataset, + 'jd-zh': + get_jd_zh_dataset, + 'dureader-robust-zh': + get_dureader_robust_qg_zh_dataset, 'advertise_gen': get_advertise_gen_dataset, 'du_reader': get_du_reader_dataset, + + # multi-modal chat + 'coco-en': + get_coco_en_dataset, + + # other (e.g. example dataset for specific model) + 'cls-fudan-news-zh': + get_cls_fudan_news_zh, # seqgpt-560m + 'ner-jave-zh': + get_ner_jave_zh, # seqgpt-560m } diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index abfe1140e4..7d3741ebd8 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -392,7 +392,7 @@ class ResTunerTM(NamedTuple): }, 'baichuan2-7b-chat': { 'model_id': 'baichuan-inc/Baichuan2-7B-Chat', - 'revision': 'v1.0.0', + 'revision': 'v1.0.1', 'template': 'baichuan', 'lora_TM': LoRATM.baichuan, }, diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index 18f1e77e71..e6306d36f3 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -1,20 +1,29 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed from huggingface/transformers. 
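The utils.py changes that follow add the DDP + MP (device_map) support that llm_sft.py now gates on via is_ddp_plus_mp(): when every local rank can be given two or more GPUs, each DDP replica is mapped over its own disjoint slice of devices instead of a single GPU. A minimal sketch of that grouping rule, mirroring the device_ids computation in _infer_auto_device_map_patch below; the helper name gpu_slice_for_rank is illustrative and not part of the patch:

    import torch

    def gpu_slice_for_rank(local_rank: int, local_world_size: int) -> list:
        # Stride-based grouping, as in _infer_auto_device_map_patch:
        # with 4 GPUs and local_world_size=2, rank 0 -> [0, 2] and rank 1 -> [1, 3].
        n_gpu = torch.cuda.device_count()
        assert n_gpu % local_world_size == 0, 'GPU count must be a multiple of the local world size'
        return list(range(local_rank, n_gpu, local_world_size))

Each rank then builds its max_memory map only over its own slice, so the model replicas produced by accelerate's infer_auto_device_map never share a device.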
+import heapq import logging import os import shutil +from functools import wraps from tempfile import TemporaryDirectory -from typing import Any, List, Mapping, Optional, Sequence, Tuple +from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, + Tuple, Union) import matplotlib.pyplot as plt import numpy as np import requests import torch import torch.distributed as dist +from accelerate.utils.modeling import (get_balanced_memory, + infer_auto_device_map) from datasets import Dataset as HfDataset +from modelscope import MsDataset from modelscope.utils.config_ds import MS_CACHE_HOME from modelscope.utils.logger import get_logger as get_ms_logger +from torch import device as Device from torch import dtype as Dtype from torch.nn import Linear, Module +from torch.nn.parallel import DistributedDataParallel as DDP from tqdm.auto import tqdm from transformers import GenerationConfig, TextStreamer, trainer @@ -252,10 +261,10 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float, return dataset['train'], dataset['test'] -def sort_by_max_length(dataset: HfDataset) -> HfDataset: - dataset_len = [len(d['input_ids']) for d in dataset] - idx = sorted( - range(len(dataset)), key=lambda i: dataset_len[i], reverse=True) +def sort_by_max_length(dataset: HfDataset, num_dataset: int) -> HfDataset: + dataset_len = [len(d['input_ids']) for d in tqdm(dataset)] + idx = heapq.nlargest( + num_dataset, range(len(dataset_len)), key=lambda i: dataset_len[i]) input_ids = [] labels = [] for i in tqdm(idx): @@ -282,6 +291,107 @@ def check_json_format(obj: Any) -> Any: return res +_old_msdataset_load = MsDataset.load + + +@wraps(_old_msdataset_load) +def _msdataset_ddp_load(*args, **kwargs): + if is_dist() and not is_local_master(): + dist.barrier() + dataset = _old_msdataset_load(*args, **kwargs) + if is_dist() and is_local_master(): + dist.barrier() + + if is_dist(): + dist.barrier() + return dataset + + +def is_ddp_plus_mp() -> bool: + if not is_dist(): + return False + n_gpu = torch.cuda.device_count() + local_world_size = get_dist_setting()[3] + assert n_gpu % local_world_size == 0 + if n_gpu // local_world_size >= 2: + logger.info('Using DDP + MP(device_map)') + return True + return False + + +def _get_max_memory(device_ids: List[int]) -> Dict[Union[int, str], int]: + """add feat in accelerate to support DDP + MP""" + import psutil + # Make sure CUDA is initialized on each GPU to have the right memory info. 
+ for i in device_ids: + _ = torch.tensor([0], device=i) + + device_ids_set = set(device_ids) + max_memory = {} + for i in range(torch.cuda.device_count()): + max_memory[i] = 0 + if i in device_ids_set: + max_memory[i] = torch.cuda.mem_get_info(i)[0] + max_memory['cpu'] = psutil.virtual_memory().available + return max_memory + + +def _sync_max_memory( + max_memory: Dict[Union[int, str], int]) -> Dict[Union[int, str], int]: + """Make sure that the model structure of MP(device_map) is the same, when using DDP.""" + max_memory_list = [ + v for k, v in max_memory.items() if (v > 0 and k != 'cpu') + ] + _, local_rank, world_size, _ = get_dist_setting() + src_tensor = torch.tensor(max_memory_list).to(local_rank) + tgt_tensor_list = [torch.zeros_like(src_tensor) for _ in range(world_size)] + dist.all_gather(tgt_tensor_list, src_tensor) + tgt_tensor = torch.stack(tgt_tensor_list, dim=0) + new_max_memory_iter = iter(tgt_tensor.min(dim=0)[0].tolist()) + new_max_memory = {} + for k, v in max_memory.items(): + new_max_memory[k] = v + if v > 0 and k != 'cpu': + new_max_memory[k] = next(new_max_memory_iter) + return new_max_memory + + +@wraps(infer_auto_device_map) +def _infer_auto_device_map_patch( + model: Module, + max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None, + **kwargs) -> Dict[str, Union[int, str, Device]]: + """The auxiliary function for supports DDP+MP. Monkey Patching. + add feat in accelerate to support DDP + MP""" + verbose = kwargs.pop('verbose', False) + n_gpu = torch.cuda.device_count() + _, local_rank, _, local_world_size = get_dist_setting() + device_ids = list(range(local_rank, n_gpu, local_world_size)) + max_memory = _get_max_memory(device_ids) + max_memory = _sync_max_memory(max_memory) + max_memory = get_balanced_memory( + model, max_memory, low_zero=False, **kwargs) + max_memory = {k: v for k, v in max_memory.items() if v > 0} + return infer_auto_device_map(model, max_memory, verbose=verbose, **kwargs) + + +def dataset_map( + dataset: HfDataset, preprocess_func: Callable[[Dict[str, Any]], + Dict[str, + Optional[List[int]]]] +) -> HfDataset: + # faster than dataset.map + input_ids = [] + labels = [] + for d in tqdm(dataset): + d = preprocess_func(d) + if d['input_ids'] is None: + continue + input_ids.append(d['input_ids']) + labels.append(d['labels']) + return HfDataset.from_dict({'input_ids': input_ids, 'labels': labels}) + + logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s') logger.handlers[0].setFormatter(logger_format) @@ -296,3 +406,19 @@ def check_json_format(obj: Any) -> Any: # monkey patching trainer.DEFAULT_PROGRESS_CALLBACK = ProgressCallbackNew trainer.DEFAULT_CALLBACKS = [DefaultFlowCallbackNew] +MsDataset.load = _msdataset_ddp_load +if is_ddp_plus_mp(): + import transformers + import accelerate + _old_ddp_init = DDP.__init__ + accelerate.accelerator.torch.nn.parallel.DistributedDataParallel.__init__ = ( + lambda self, model, device_ids, output_device, *args, **kwargs: + _old_ddp_init(self, model, *args, **kwargs)) + transformers.modeling_utils.get_balanced_memory = lambda *args, **kwargs: None + transformers.modeling_utils.infer_auto_device_map = _infer_auto_device_map_patch + _old_accelerator_init = trainer.Accelerator.__init__ + trainer.Accelerator.__init__ = ( + lambda self, device_placement=False, *args, **kwargs: + _old_accelerator_init( + self, device_placement=device_placement, *args, **kwargs)) + trainer.Accelerator.verify_device_map = lambda *args, **kwargs: False diff --git a/swift/trainers/trainers.py 
b/swift/trainers/trainers.py index a659ec8747..1f4a4c2f46 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,7 +8,10 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -from transformers.deepspeed import is_deepspeed_zero3_enabled +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin From a522bbfd7494e4bf9675c3dec664608f0f473a02 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 16:35:00 +0800 Subject: [PATCH 64/70] update --- examples/pytorch/llm/src/llm_sft.py | 4 ++-- examples/pytorch/llm/src/utils/dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bcd6bf7304..3430c20ca4 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -153,7 +153,7 @@ def __post_init__(self): assert all([_type.lower() in all_types for _type in sft_type]), \ f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' if self.sft_type == 'full': - assert self.quantization_bit != 0, 'not supported' + assert self.quantization_bit == 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: self.learning_rate = 2e-5 @@ -202,7 +202,7 @@ def __post_init__(self): if self.predict_with_generate: self.eval_batch_size = 1 else: - self.eval_batch_size = batch_size + self.eval_batch_size = self.batch_size def llm_sft(args: SftArguments) -> None: diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 79f537857b..9366574a6a 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -3,7 +3,7 @@ import os import re from functools import partial -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import json import numpy as np @@ -570,7 +570,7 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise_gen': get_advertise_gen_dataset, + 'advertise-gen': get_advertise_gen_dataset, 'du_reader': get_du_reader_dataset, # multi-modal chat From c3cab0db0afb9c038edec570b465fd830ad39eba Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 16:43:02 +0800 Subject: [PATCH 65/70] fix bug --- examples/pytorch/llm/src/llm_sft.py | 17 ++++---- examples/pytorch/llm/src/utils/dataset.py | 6 ++- examples/pytorch/llm/src/utils/swift_utils.py | 4 +- swift/trainers/trainers.py | 9 +++-- swift/tuners/adapter.py | 24 +++++------ swift/tuners/base.py | 20 +++++++--- swift/tuners/lora.py | 40 ++++++++++--------- swift/tuners/prompt.py | 17 ++++---- swift/tuners/restuning.py | 23 ++++++----- swift/tuners/side.py | 34 ++++++++++------ 10 files changed, 111 insertions(+), 83 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 3430c20ca4..ef72981dfa 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,12 +11,12 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, 
MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, dataset_map, - find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_ddp_plus_mp, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length, - compute_nlg_metrics, prepare_model) + broadcast_string, check_json_format, compute_nlg_metrics, + dataset_map, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_ddp_plus_mp, is_dist, is_master, plot_images, + prepare_model, process_dataset, select_bnb, select_dtype, + show_layers, sort_by_max_length) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -35,7 +35,10 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={ + 'help': + f'tuner choices: {["lora", "full", "adapter", "restuning"]}' + }) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 9366574a6a..3b418d2d7f 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -570,8 +570,10 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise-gen': get_advertise_gen_dataset, - 'du_reader': get_du_reader_dataset, + 'advertise-gen': + get_advertise_gen_dataset, + 'du_reader': + get_du_reader_dataset, # multi-modal chat 'coco-en': diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 8d931017c6..0c56972aca 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -11,9 +11,7 @@ logger = get_logger() -def prepare_model( - model: Module, args -): +def prepare_model(model: Module, args): swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 1f4a4c2f46..c51eae8841 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,14 +8,15 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -try: - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -except ImportError: - from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled + class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): pass diff --git a/swift/tuners/adapter.py b/swift/tuners/adapter.py index 3beffcfca8..98f829525a 100644 --- a/swift/tuners/adapter.py +++ b/swift/tuners/adapter.py @@ -26,10 +26,12 @@ class AdapterConfig(SwiftConfig): See http://arxiv.org/abs/1902.00751 Args: - dim: The dimension of the hidden states - target_modules: The feedforward module to be replaced, in regex format - hidden_pos: The position of the hidden 
state to passed into the adapter, can be int (args) or str (kwargs) - method_name: The method to be replaced, default to replace the forward method + dim(`int`): The dimension of the hidden states + target_modules(`Union[str, List[str]]`): The feedforward module to be replaced. + in regex format if this argument is str, else will match with `end with` if List[str]. + hidden_pos(`Union[str, int]`): The position of the hidden state to be passed into the adapter, + can be int (args) or str (kwargs) + method_name(`str`): The method to be replaced, default is `forward` adapter_length: The length of the adapter length (intermediate length) act_layer: The activation layer of the adapter """ @@ -37,25 +39,24 @@ class AdapterConfig(SwiftConfig): dim: int = field( default=None, metadata={'help': 'The dimension of the hidden states'}) - target_modules: str = field( + target_modules: Union[str, List[str]] = field( default=None, metadata={ - 'help': 'The feedforward module to be replaced, in regex format' + 'help': + 'The feedforward module to be replaced. in regex format if this argument is str, ' + 'else will match with `end with` if List[str].' }) hidden_pos: Union[str, int] = field( default=None, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state to be passed into the adapter, can be int (args) or str (kwargs)' }) method_name: str = field( default='forward', - metadata={ - 'help': - 'The method to be replaced, default to replace the forward method' - }) + metadata={'help': 'The method to be replaced, default is `forward`'}) adapter_length: int = field( default=128, @@ -182,7 +183,6 @@ def __init__( super(nn.Module, self).__init__() self.dim = dim self.adapter_length = adapter_length - # self.adapter_type = adapter_type self.linear1 = nn.Linear(dim, adapter_length) self.act = act_layer() self.linear2 = nn.Linear(adapter_length, dim) diff --git a/swift/tuners/base.py b/swift/tuners/base.py index 8eaa43aec7..8ad9807e09 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -29,7 +29,7 @@ class SwiftModel(nn.Module): Args: model (`Union[nn.Module, 'SwiftModel']`) A module to be tuned by Swift. - config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of adapter_name: SwiftConfig. + config (`Union[SwiftConfig, Dict[str, SwiftConfig]]`) A config or a dict of {adapter_name: SwiftConfig}. If it's a config class, the adapter_name will be `default` extra_state_keys (`List[str]`, `optional`) A list of regex to match the extra state keys to be saved. inference_mode (bool, `optional`): Load model at inference mode, default False. @@ -202,7 +202,7 @@ def load_state_file(path): @classmethod def from_pretrained(cls, - model: nn.Module, + model: Union[nn.Module, 'SwiftModel'], model_id: str = None, adapter_name: Union[str, List[str]] = None, inference_mode: bool = False, @@ -211,9 +211,11 @@ def from_pretrained(cls, """Load a set of tuners and corresponding weights by a model_id. Args: - model (`torch.nn.Module`): The model to be tuned. - model_id (`str`): The model_id or a local model dir to use to tune the model. + model (`Union[torch.nn.Module, 'SwiftModel']`): The model to be tuned, + if the model is already a `SwiftModel` it will be un-wrapped and re-wrapped.. + model_id (`str`): The model_id or a local model dir of tuners to use to tune the model. adapter_name (`Union[str, List[str]]`): The adapter_names saved in the model repo to load. 
+ Default `None`, means load all tuners saved in the model_id inference_mode (`bool`): Use in the inference mode or not. revision (`str`): The model revision to use. **kwargs: @@ -247,6 +249,10 @@ def from_pretrained(cls, sub_folder = os.path.join(model_dir, _name) config_file = os.path.join(sub_folder, CONFIG_NAME) + if not os.path.isfile(config_file): + logger.warning(f'{_name} is not a valid tuner') + continue + with open(config_file, 'r') as file: json_object = json.load(file) @@ -315,7 +321,6 @@ def create_or_update_model_card(self, output_dir: str): lines.append( f'{training_procedure_heading}\n{training_config_text}') - # Adds peft version framework_block_heading = '### Framework versions\n' from swift.version import __version__ if framework_block_heading in lines: @@ -326,6 +331,11 @@ def create_or_update_model_card(self, output_dir: str): lines.append( f'{framework_block_heading}\n\n- SWIFT {__version__}\n') + base_model_heading = '### Base model information\n' + lines.append( + f'{base_model_heading}\n\n- BaseModel Class {self.base_model.__class__.__name__}\n' + ) + # write the lines back to README.md with open(os.path.join(output_dir, 'README.md'), 'w') as f: f.writelines(lines) diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 760db2d314..15a6594aa1 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -5,7 +5,7 @@ import re from dataclasses import dataclass, field from types import MethodType -from typing import Dict, List +from typing import Dict, List, Union import torch import torch.nn as nn @@ -106,19 +106,20 @@ class LoRAConfig(SwiftConfig): The configuration class for the loRA module. Args: - r: The rank of the LoRA module - target_modules: The modules to be replaced by LoRA, can be the end of the module name or a regex string - lora_alpha: The factor to add the lora weights - lora_dropout: The dropout rate of the lora module - merge_weights: Whether to merge weights when validating - use_merged_linear: Whether to replace with merged linear layer - enable_lora: The modules need to be turned on when using the merged linear layer - fan_in_fan_out: Set this to True if the layer to replace stores weight like (fan_in, fan_out) - bias: Bias type. Values ca be "none", "all" or "lora_only" + r(int): The rank of the LoRA module + target_modules(List[str]): The modules to be replaced by LoRA, + can be the end of the module name or a regex string + lora_alpha(float): The factor to add the lora weights + lora_dropout(float): The dropout rate of the lora module + merge_weights(bool): Whether to merge weights when validating + use_merged_linear(bool): Whether to replace with merged linear layer + enable_lora(List[bool]): The modules need to be turned on when using the merged linear layer + fan_in_fan_out(bool): Set this to True if the layer to replace stores weight like (fan_in, fan_out) + bias(str): Bias type. 
Values ca be "none", "all" or "lora_only" """ r: int = field(default=6, metadata={'help': 'The rank of the LoRA module'}) - target_modules: List = field( + target_modules: List[str] = field( default=None, metadata={ 'help': @@ -193,18 +194,19 @@ def activate_adapter(module: torch.nn.Module, adapter_name: str, _module.set_activation(activate) @staticmethod - def _dynamic_patch_lora(model, replace_modules, use_merged_linear, - adapter_name, **kwargs): + def _dynamic_patch_lora(model: torch.nn.Module, + replace_modules: Union[str, List[str]], + use_merged_linear: bool, adapter_name: str, + **kwargs): """Dynamic patch lora to model Args: - model: The torch.nn.Module containing the target module to be patched. - replace_modules: The module names to be replaced, the replacing strategy is `end with`. - use_merged_linear: Whether to replace with merged linear layer + model(`torch.nn.Module`): The torch.nn.Module containing the target module to be patched. + replace_modules(`Union[str, List[str]]`): The module names to be replaced, + the replacing strategy is `end with`. + use_merged_linear(bool): Whether to replace with merged linear layer. + adapter_name(str): The adapter name. **kwargs: The arguments passed from `tune` which are needed by lora. - - Returns: - The lora modules """ modules = {} module_keys = [key for key, _ in model.named_modules()] diff --git a/swift/tuners/prompt.py b/swift/tuners/prompt.py index 56605f5896..141c196fdb 100644 --- a/swift/tuners/prompt.py +++ b/swift/tuners/prompt.py @@ -28,14 +28,15 @@ class PromptConfig(SwiftConfig): Here we apply the VPT to other fields. Args: - dim: The dimension of the hidden states - target_modules: The layer module to be replaced, in regex format - embedding_pos: The position of the embedding tensor - attention_mask_pos: The position of the attention mask - attention_mask_value: The value to pad to the attention mask - prompt_length: The length of the prompt tokens - attach_front: When set to True, prompt is attached in front of the embedding - extract_embedding: Whether the embedding is extracted at final stage to keep the same dims with inputs + dim(`Union[int, List[int]]`): The dimension of the hidden states, use list if there are up-sample blocks + or down-sample blocks + target_modules(str): The layer module to be replaced, in regex format + embedding_pos(Union[str, int]): The position of the embedding tensor + attention_mask_pos(Union[str, int]): The position of the attention mask + attention_mask_value(Union[float, int, bool]): The value to pad to the attention mask + prompt_length(int): The length of the prompt tokens + attach_front(bool): When set to True, prompt is attached in front of the embedding + extract_embedding(bool): Whether the embedding is extracted at final stage to keep the same dims with inputs """ dim: Union[int, List[int]] = field( diff --git a/swift/tuners/restuning.py b/swift/tuners/restuning.py index 4744e55b38..d808551971 100644 --- a/swift/tuners/restuning.py +++ b/swift/tuners/restuning.py @@ -28,16 +28,19 @@ class ResTuningConfig(SwiftConfig): See Args: - dims: The dimensions of the hidden states - root_modules: The root module to be replaced, can a regex string - root_modules_hook: The hook type of root modules, can be "input" or "output" - stem_modules: The stem modules to be replaced, can a regex string or name list of full match format - stem_modules_hook: The hook type of stem modules, can be "input" or "output" - target_modules: The target module to be replaced, can a regex string - 
target_modules_hook: The hook type of target modules, can be "input" or "output" - tuner_cfg: The configuration of the tuning module, can a string or customized config - use_upsample: Whether to use auxiliary upsample module - use_bypass: Whether to use bypass + dims(`Union[List[int], int]`): The dimensions of the hidden states + root_modules(`str`): The root module to be replaced, can a regex string + root_modules_hook(`str`): The hook type of root modules, can be "input" or "output" + stem_modules(`Union[List[str], str]`): The stem modules to be replaced, + can a regex string or name list of full match format + stem_modules_hook(`Union[List[str], str]`): The hook type of stem modules, can be "input" or "output" + target_modules(`str`): The target module to be replaced, can a regex string + target_modules_hook(`str`): The hook type of target modules, can be "input" or "output" + tuner_cfg(`Union[List[Dict], Dict, str]`): The configuration of the tuning module, + can a string or customized config + use_upsample(bool): Whether to use auxiliary upsample module + upsample_out_channels(List[int]): The channels if `use_upsample` + zero_init_last(bool): Use zero to initialize the last Linear in every sub tuner. """ diff --git a/swift/tuners/side.py b/swift/tuners/side.py index 3c40baede9..168cc2bb2c 100644 --- a/swift/tuners/side.py +++ b/swift/tuners/side.py @@ -44,14 +44,21 @@ class SideConfig(SwiftConfig): }) side_module_name: str = field( - default=1., + default='fcn4', metadata={'help': 'The name of the additive side networks'}) - hidden_pos: Union[str, int] = field( + source_hidden_pos: Union[str, int] = field( default=0, metadata={ 'help': - 'The position of the hidden state to passed into the adapter, can be int (args) or str (kwargs)' + 'The position of the hidden state input to the target module, can be int (args) or str (kwargs)' + }) + + target_hidden_pos: Union[str, int] = field( + default=0, + metadata={ + 'help': + 'The position of the hidden state output from the target module, can be int (args) or str (kwargs)' }) def __post_init__(self): @@ -82,18 +89,19 @@ def _forward(self, *args, **kwargs): args_main = getattr( self, f'forward_origin_{adapter_name}')(*args, **kwargs) + + if isinstance(config.source_hidden_pos, int): + x = args[config.source_hidden_pos] + else: + x = kwargs[config.source_hidden_pos] + + x_main = args_main[config.target_modules] \ + if isinstance(args_main, (tuple, list, dict)) else args_main + out = getattr(self, f'side_{adapter_name}')(x, x_main) if isinstance(args_main, (tuple, list, dict)): - if isinstance(config.hidden_pos, str): - args_main[config.hidden_pos] = getattr( - self, f'side_{adapter_name}')( - *args, args_main[config.hidden_pos]) + args_main[config.target_modules] = out else: - _type = type(args_main) - args_main = list(args_main) - args_main[config.hidden_pos] = getattr( - self, f'side_{adapter_name}')( - *args, args_main[config.hidden_pos]) - args_main = _type(args_main) + args_main = out return args_main if isinstance(tgt_module, nn.Sequential) and not hasattr( From 14cbaac763db8b09993af9306082bb204ba315cc Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Thu, 14 Sep 2023 16:43:46 +0800 Subject: [PATCH 66/70] fix arg --- tests/tuners/test_swift_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tuners/test_swift_base.py b/tests/tuners/test_swift_base.py index 83dd5fa44a..f6deec9f86 100644 --- a/tests/tuners/test_swift_base.py +++ b/tests/tuners/test_swift_base.py @@ -359,7 +359,7 @@ def test_swift_side_bert(self): 
dim=model.config.hidden_size, target_modules=r'.*encoder.encoder', side_module_name='mlp', - hidden_pos='last_hidden_state') + target_hidden_pos='last_hidden_state') model = Swift.prepare_model(model, config=side_config) result_activate = model(**inputs).logits From 903cb34564836d0ba8e399f0a0b164bf4b895b0d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 14 Sep 2023 17:40:54 +0800 Subject: [PATCH 67/70] fix bugs --- examples/pytorch/llm/src/llm_sft.py | 26 ++++++++++--------- examples/pytorch/llm/src/utils/dataset.py | 6 +++-- examples/pytorch/llm/src/utils/swift_utils.py | 4 +-- swift/trainers/trainers.py | 9 ++++--- swift/utils/torch_utils.py | 11 ++++---- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 3430c20ca4..bbf9b7895f 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -11,12 +11,12 @@ import torch.distributed as dist from transformers import BitsAndBytesConfig, GenerationConfig from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING, - broadcast_string, check_json_format, dataset_map, - find_all_linear_for_lora, get_dataset, get_dist_setting, - get_model_tokenizer, get_preprocess, is_ddp_plus_mp, - is_dist, is_master, plot_images, process_dataset, - select_bnb, select_dtype, show_layers, sort_by_max_length, - compute_nlg_metrics, prepare_model) + broadcast_string, check_json_format, compute_nlg_metrics, + dataset_map, find_all_linear_for_lora, get_dataset, + get_dist_setting, get_model_tokenizer, get_preprocess, + is_ddp_plus_mp, is_dist, is_master, plot_images, + prepare_model, process_dataset, select_bnb, select_dtype, + show_layers, sort_by_max_length) from swift import (HubStrategy, Seq2SeqTrainer, Seq2SeqTrainingArguments, Swift, get_logger) @@ -35,7 +35,10 @@ class SftArguments: metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', - metadata={'help': f'tuner choices: {["lora", "full", "adapter", "restuning"]}'}) + metadata={ + 'help': + f'tuner choices: {["lora", "full", "adapter", "restuning"]}' + }) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -234,15 +237,15 @@ def llm_sft(args: SftArguments) -> None: args.model_type, torch_dtype=args.torch_dtype, **kwargs) if args.resume_from_ckpt is None: - model = prepare_model(model, args) + if args.sft_type != 'full': + model = prepare_model(model, args) else: model = Swift.from_pretrained( model, args.resume_from_ckpt, is_trainable=True) show_layers(model) print_model_info(model) - logger.info(str(model)) - logger.info(model.get_trainable_parameters()) + logger.info(model) # ### Loading Dataset dataset = get_dataset(args.dataset.split(',')) @@ -311,8 +314,7 @@ def llm_sft(args: SftArguments) -> None: do_eval=True, evaluation_strategy='steps', per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=1 - if args.predict_with_generate else args.batch_size, + per_device_eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=args.gradient_accumulation_steps, learning_rate=args.learning_rate, weight_decay=args.weight_decay, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 9366574a6a..3b418d2d7f 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -570,8 +570,10 @@ def get_code_python_zh_dataset() -> HfDataset: get_jd_zh_dataset, 
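For reference alongside the DATASET_MAPPING entries being reformatted here: each value is a zero-argument callable, with functools.partial used whenever a loader needs fixed arguments (as the medical and damo-agent-mini-zh entries show). A hedged sketch of registering a new entry under that convention; the dataset name and loader below are invented for illustration only:

    from functools import partial

    from datasets import Dataset as HfDataset

    def get_my_corpus_dataset(subset_name: str) -> HfDataset:
        # Illustrative loader: like the other getters in dataset.py, it should
        # return a dataset with 'query' and 'response' columns.
        data = {'query': [f'hello ({subset_name})'], 'response': ['world']}
        return HfDataset.from_dict(data)

    DATASET_MAPPING['my-corpus-zh'] = partial(get_my_corpus_dataset, subset_name='zh')

Keeping the mapping flat like this is what lets --dataset accept a comma-separated list of names in the sft/infer scripts.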
'dureader-robust-zh': get_dureader_robust_qg_zh_dataset, - 'advertise-gen': get_advertise_gen_dataset, - 'du_reader': get_du_reader_dataset, + 'advertise-gen': + get_advertise_gen_dataset, + 'du_reader': + get_du_reader_dataset, # multi-modal chat 'coco-en': diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 8d931017c6..63484a1e9f 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -11,9 +11,7 @@ logger = get_logger() -def prepare_model( - model: Module, args -): +def prepare_model(model: Module, args) -> Module: swift_config: Dict[str, SwiftConfig] = dict() for sft_type in [_type.strip() for _type in args.sft_type.split(',')]: if sft_type.lower() == SwiftTuners.LORA.lower(): diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py index 1f4a4c2f46..c51eae8841 100644 --- a/swift/trainers/trainers.py +++ b/swift/trainers/trainers.py @@ -8,14 +8,15 @@ from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer from transformers import Trainer as HfTrainer from transformers import trainer -try: - from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -except ImportError: - from transformers.deepspeed import is_deepspeed_zero3_enabled from .callback import DefaultFlowCallbackNew, ProgressCallbackNew from .mixin import PushToMsHubMixin, SwiftMixin +try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled + class Trainer(PushToMsHubMixin, SwiftMixin, HfTrainer): pass diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 867c8d4513..b51453df2e 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -89,12 +89,11 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: n_params /= 1e6 n_grads /= 1e6 n_buffers /= 1e6 - s = [ - f'{name}: ', f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', - f'{n_buffers:.4f}M Buffers, ', - f'Trainable percentage: {100 * n_grads / n_params:.2f}%.' 
- ] - logger.info(''.join(s)) + s = (f'{name}: ' + f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable ' + f'[{100 * n_grads / n_params:.4f}%]), ' + f'{n_buffers:.4f}M Buffers.') + logger.info(s) def find_sub_module(module: torch.nn.Module, From 2aa0182e718d04fc29996e88627c1357605246c2 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 11:51:20 +0800 Subject: [PATCH 68/70] temporary commit --- examples/pytorch/llm/src/llm_infer.py | 17 ++--- examples/pytorch/llm/src/llm_sft.py | 40 +++++----- examples/pytorch/llm/src/utils/dataset.py | 92 +++++++++++------------ examples/pytorch/llm/src/utils/utils.py | 4 +- 4 files changed, 71 insertions(+), 82 deletions(-) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index 61c23ca3c1..0783fd9858 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -26,7 +26,7 @@ class InferArguments: template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) ckpt_dir: str = '/path/to/your/vx_xxx/checkpoint-xxx' - eval_human: bool = False # False: eval test_dataset + eval_human: bool = False # False: eval val_dataset seed: int = 42 dtype: str = field( @@ -38,7 +38,7 @@ class InferArguments: metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 dataset_sample: int = 20000 # -1: all dataset - dataset_test_size: float = 0.01 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' max_length: Optional[int] = 2048 @@ -138,14 +138,13 @@ def llm_infer(args: InferArguments) -> None: inference(input_ids, model, tokenizer, streamer, generation_config, args.skip_prompt) else: - dataset = get_dataset(args.dataset.split(',')) - _, test_dataset = process_dataset(dataset, args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - mini_test_dataset = test_dataset.select( - range(min(10, test_dataset.shape[0]))) + _, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, + args.dataset_sample, args.dataset_seed) + mini_val_dataset = val_dataset.select( + range(min(10, val_dataset.shape[0]))) del dataset - for data in mini_test_dataset: + for data in mini_val_dataset: response = data['response'] data['response'] = None input_ids = preprocess_func(data)['input_ids'] diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index bbf9b7895f..1482e5f7ca 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -56,7 +56,7 @@ class SftArguments: metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'}) dataset_seed: int = 42 dataset_sample: int = 20000 # -1: all dataset - dataset_test_size: float = 0.01 + dataset_test_ratio: float = 0.01 system: str = 'you are a helpful assistant!' 
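A note on the dataset_test_size -> dataset_test_ratio rename above: in this commit get_dataset takes the split arguments directly and hands back a (train, val) pair, so llm_infer.py and llm_sft.py no longer call process_dataset themselves. A minimal sketch of the new calling convention as the call sites in this patch use it; the dataset names are only examples, and it assumes the utils package from examples/pytorch/llm/src is importable:

    from utils import get_dataset

    train_dataset, val_dataset = get_dataset(
        ['alpaca-en', 'alpaca-zh'],  # any keys of DATASET_MAPPING
        dataset_test_ratio=0.01,
        dataset_sample=20000,        # -1 keeps the full dataset
        dataset_seed=42)
    print(len(train_dataset), len(val_dataset))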
max_length: Optional[int] = 2048 @@ -127,11 +127,11 @@ class SftArguments: }) # generation config, only useful when `predict_with_generate=True` + max_new_tokens: int = 1024 do_sample: bool = True - top_p: float = 0.7 - max_new_tokens: int = None - temperature: float = 0.95 - top_k: int = 20 + temperature: float = 0.9 + top_k: int = 50 + top_p: float = 0.9 def __post_init__(self): if is_dist(): @@ -248,23 +248,17 @@ def llm_sft(args: SftArguments) -> None: logger.info(model) # ### Loading Dataset - dataset = get_dataset(args.dataset.split(',')) - if isinstance(dataset, tuple): - train_dataset, val_dataset = dataset - else: - train_dataset, val_dataset = process_dataset(dataset, - args.dataset_test_size, - args.dataset_sample, - args.dataset_seed) - - generation_config = { - 'do_sample': args.do_sample, - 'top_p': args.top_p, - 'max_length': None, - 'max_new_tokens': args.max_new_tokens, - 'temperature': args.temperature, - 'top_k': args.top_k, - } + train_dataset, val_dataset = get_dataset( + args.dataset.split(','), args.dataset_test_ratio, args.dataset_sample, + args.dataset_seed) + generation_config = GenerationConfig( + do_sample=args.do_sample, + max_length=None, + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + ) preprocess_func_train = get_preprocess( args.template_type, @@ -346,7 +340,7 @@ def llm_sft(args: SftArguments) -> None: ddp_backend=args.ddp_backend, gradient_checkpointing=args.gradient_checkpointing, predict_with_generate=args.predict_with_generate, - generation_config=GenerationConfig.from_dict(generation_config), + generation_config=generation_config, local_rank=local_rank, **kwargs) diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 3b418d2d7f..664b7981a9 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -13,7 +13,7 @@ from tqdm.auto import tqdm from .preprocess import History -from .utils import download_dataset +from .utils import download_dataset, process_dataset def _preprocess_alpaca_dataset( @@ -41,23 +41,17 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() return _preprocess_alpaca_dataset(dataset) +def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: + for d in dataset: + pass def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', - split='train').to_hf_dataset().rename_columns({ - 'content': 'query', - 'summary': 'response', - }) + 'lvjianjin/AdvertiseGen',split='train').to_hf_dataset() dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', - split='validation').to_hf_dataset().rename_columns({ - 'content': - 'query', - 'summary': - 'response', - }) - return dataset_train, dataset_val + 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset() + return (_preprocess_advertise_gen_dataset(dataset_train), + _preprocess_advertise_gen_dataset(dataset_val)) def get_alpaca_gpt4_zh_dataset() -> HfDataset: @@ -184,12 +178,12 @@ def _preprocess_mutimodal_dataset(dataset: HfDataset, prompt: str, def get_coco_en_dataset() -> HfDataset: dataset_dict = MsDataset.load('modelscope/coco_2014_caption') - dataset: HfDataset = concatenate_datasets([ - dataset_dict['train'].to_hf_dataset(), - dataset_dict['validation'].to_hf_dataset() - ]) - return _preprocess_mutimodal_dataset(dataset, 'please describe the image', - 'image', 
'caption')
+    train_dataset = dataset_dict['train'].to_hf_dataset()
+    val_dataset = dataset_dict['validation'].to_hf_dataset()
+    return tuple(
+        _preprocess_mutimodal_dataset(dataset, 'please describe the image',
+                                      'image', 'caption')
+        for dataset in (train_dataset, val_dataset))
 
 
 def _filter_agent_dataset(dataset: List[Dict[str, Any]],
@@ -392,12 +386,14 @@ def _preprocess_dureader_robust(dataset: HfDataset) -> HfDataset:
 def get_dureader_robust_qg_zh_dataset() -> HfDataset:
     """Question Generation"""
     dataset_dict = MsDataset.load('modelscope/DuReader_robust-QG')
-    dataset: HfDataset = concatenate_datasets([
+    train_dataset: HfDataset = concatenate_datasets([
         dataset_dict['train'].to_hf_dataset(),
         dataset_dict['validation'].to_hf_dataset(),
-        dataset_dict['test'].to_hf_dataset()
     ])
-    return _preprocess_dureader_robust(dataset)
+    val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
+    return tuple(
+        _preprocess_dureader_robust(dataset)
+        for dataset in (train_dataset, val_dataset))
 
 
 def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset:
@@ -419,21 +415,22 @@ def _preprocess_medical(dataset: HfDataset, subset_name: str) -> HfDataset:
 
 
 def get_medical_dataset(subset_name: str,
-                        dataset_sample: int = -1) -> HfDataset:
+                        train_dataset_sample: int = -1) -> HfDataset:
     """
    mode: Literal['en', zh]
     """
     dataset_dict = MsDataset.load(
         'huangjintao/medical_zh', subset_name=subset_name)
-    dataset: HfDataset = concatenate_datasets([
+    train_dataset: HfDataset = concatenate_datasets([
         dataset_dict['train'].to_hf_dataset(),
         dataset_dict['val'].to_hf_dataset(),
-        dataset_dict['test'].to_hf_dataset(),
     ])
+    val_dataset: HfDataset = dataset_dict['test'].to_hf_dataset()
-    if dataset_sample != -1:
-        idxs = np.random.permutation(dataset_sample)
-        dataset = dataset.select(idxs)
-    return _preprocess_medical(dataset, subset_name)
+    if train_dataset_sample != -1:
+        idxs = np.random.permutation(train_dataset_sample)
+        train_dataset = train_dataset.select(idxs)
+    return tuple(_preprocess_medical(dataset, subset_name)
+                 for dataset in (train_dataset, val_dataset))
 
 
 def _preprocess_sharegpt(dataset: HfDataset) -> HfDataset:
@@ -549,7 +546,7 @@ def get_code_python_zh_dataset() -> HfDataset:
     'medical-zh':
     partial(get_medical_dataset, subset_name='zh'),
     'medical-mini-zh':
-    partial(get_medical_dataset, subset_name='zh', dataset_sample=100000),
+    partial(get_medical_dataset, subset_name='zh', train_dataset_sample=100000),
 
     'code-python-zh':
     get_code_python_zh_dataset,
@@ -588,26 +585,25 @@ def get_code_python_zh_dataset() -> HfDataset:
 
 
 def get_dataset(
-    dataset_name_list: List[str]
-) -> Union[HfDataset, Tuple[HfDataset, HfDataset]]:
-    """Returns a dataset to be split or a train-val dataset tuple"""
-    dataset_list: List[Union[HfDataset, Tuple[HfDataset, HfDataset]]] = []
+    dataset_name_list: List[str],
+    dataset_test_ratio: float,
+    dataset_sample: int,
+    dataset_seed: int
+) -> Tuple[HfDataset, HfDataset]:
+    """Returns the concatenated train and validation datasets."""
+    train_dataset_list: List[HfDataset] = []
+    val_dataset_list: List[HfDataset] = []
     for dataset_name in dataset_name_list:
         get_function = DATASET_MAPPING[dataset_name]
-        dataset_list.append(get_function())
-
-    assert (all(isinstance(dataset, tuple) for dataset in dataset_list)
-            or all(isinstance(dataset, HfDataset) for dataset in dataset_list))
-    if not isinstance(dataset_list[0], tuple):
-        dataset = concatenate_datasets(dataset_list)
-    else:
-        train_datasets = [dataset[0] for dataset in dataset_list]
-        val_datasets = [dataset[1] for dataset in dataset_list]
-        if len(train_datasets) > 1:
-            train_dataset = concatenate_datasets(train_datasets)
-            val_dataset = concatenate_datasets(val_datasets)
+        dataset = get_function()
+        if isinstance(dataset, (list, tuple)):
+            train_dataset = dataset[0]
+            val_dataset = dataset[1]
         else:
-            train_dataset = train_datasets[0]
-            val_dataset = val_datasets[0]
-        dataset = (train_dataset, val_dataset)
-    return dataset
+            train_dataset, val_dataset = process_dataset(dataset, dataset_test_ratio, dataset_sample, dataset_seed)
+        train_dataset_list.append(train_dataset)
+        val_dataset_list.append(val_dataset)
+
+    train_dataset = concatenate_datasets(train_dataset_list)
+    val_dataset = concatenate_datasets(val_dataset_list)
+    return train_dataset, val_dataset
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py
index e6306d36f3..70167db4f8 100644
--- a/examples/pytorch/llm/src/utils/utils.py
+++ b/examples/pytorch/llm/src/utils/utils.py
@@ -249,7 +249,7 @@ def download_files(url: str, local_path: str, cookies) -> None:
             f.write(data)
 
 
-def process_dataset(dataset: HfDataset, dataset_test_size: float,
+def process_dataset(dataset: HfDataset, dataset_test_ratio: float,
                     dataset_sample: int,
                     dataset_seed: int) -> Tuple[HfDataset, HfDataset]:
     random_state = np.random.RandomState(dataset_seed)
@@ -257,7 +257,7 @@ def process_dataset(dataset: HfDataset, dataset_test_size: float,
         index = random_state.permutation(len(dataset))[:dataset_sample]
         dataset = dataset.select(index)
     dataset = dataset.train_test_split(
-        dataset_test_size, seed=get_seed(random_state))
+        dataset_test_ratio, seed=get_seed(random_state))
     return dataset['train'], dataset['test']
 

From 6a57b109ae15d60f7c0f5d3c79b1e8ce99a04bcc Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Fri, 15 Sep 2023 12:37:33 +0800
Subject: [PATCH 69/70] fix bugs

---
 .../baichuan2_7b_chat/lora_ddp/infer.sh       |  2 +-
 examples/pytorch/llm/src/llm_sft.py           | 24 ++++++++-------
 examples/pytorch/llm/src/utils/__init__.py    |  1 +
 examples/pytorch/llm/src/utils/dataset.py     | 30 +------------------
 .../pytorch/llm/src/utils/metric_utils.py     |  6 ++--
 5 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
index ce54c3ffaa..6988d4a37d 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -8,7 +8,7 @@ python src/llm_infer.py \
    --eval_human false \
    --dataset damo-agent-mini-zh \
    --max_length 4096 \
-    --max_new_tokens 1024 \
+    --max_new_tokens 2048 \
    --temperature 0.9 \
    --top_k 50 \
    --top_p 0.9 \
diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py
index 7d2d5fd483..517886d589 100644
--- a/examples/pytorch/llm/src/llm_sft.py
+++ b/examples/pytorch/llm/src/llm_sft.py
@@ -12,12 +12,12 @@ import torch.distributed as dist
 from transformers import BitsAndBytesConfig, GenerationConfig
 
 from utils import (DATASET_MAPPING, MODEL_MAPPING, TEMPLATE_MAPPING,
-                   broadcast_string, check_json_format, dataset_map,
-                   find_all_linear_for_lora, get_dataset, get_dist_setting,
-                   get_model_tokenizer, get_preprocess, is_ddp_plus_mp,
-                   is_dist, is_master, plot_images, select_bnb, select_dtype,
-                   compute_nlg_metrics, prepare_model,
-                   show_layers, sort_by_max_length)
+                   broadcast_string, check_json_format, compute_nlg_metrics,
+                   dataset_map, find_all_linear_for_lora, get_dataset,
+                   get_dist_setting, get_model_tokenizer, get_preprocess,
+                   is_ddp_plus_mp, is_dist, is_master, plot_images,
+                   prepare_model, select_bnb, select_dtype, show_layers,
+                   sort_by_max_length)
 
 from swift import (HubStrategy, Seq2SeqTrainer,
Seq2SeqTrainingArguments, Swift, get_logger) @@ -270,16 +270,20 @@ def llm_sft(args: SftArguments) -> None: val_dataset = val_dataset.select(val_idxs) logger.info(f'train_dataset: {train_dataset}') logger.info(f'val_dataset: {val_dataset}') - preprocess_func_train = get_preprocess(args.template_type, tokenizer, - args.system, args.max_length, validate_generation=False) + preprocess_func_train = get_preprocess( + args.template_type, + tokenizer, + args.system, + args.max_length, + validate_generation=False) preprocess_func_eval = get_preprocess( args.template_type, tokenizer, args.system, args.max_length, validate_generation=args.predict_with_generate) - train_dataset = dataset_map(train_dataset, preprocess_func) - val_dataset = dataset_map(val_dataset, preprocess_func) + train_dataset = dataset_map(train_dataset, preprocess_func_train) + val_dataset = dataset_map(val_dataset, preprocess_func_eval) if args.test_oom_error: train_dataset = sort_by_max_length(train_dataset, 20000) # Data analysis diff --git a/examples/pytorch/llm/src/utils/__init__.py b/examples/pytorch/llm/src/utils/__init__.py index 671d38b56d..341293902d 100644 --- a/examples/pytorch/llm/src/utils/__init__.py +++ b/examples/pytorch/llm/src/utils/__init__.py @@ -2,6 +2,7 @@ from .metric_utils import compute_nlg_metrics from .model import MODEL_MAPPING, get_model_tokenizer from .preprocess import TEMPLATE_MAPPING, get_preprocess +from .swift_utils import prepare_model from .utils import (broadcast_string, check_json_format, dataset_map, download_dataset, find_all_linear_for_lora, get_dist_setting, inference, is_ddp_plus_mp, is_dist, diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index ba70b76884..6804a9dca4 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -14,7 +14,7 @@ from swift.utils import get_seed from .preprocess import History -from .utils import download_dataset, process_dataset +from .utils import download_dataset def _preprocess_alpaca_dataset( @@ -42,18 +42,6 @@ def get_alpaca_gpt4_en_dataset() -> HfDataset: 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() return _preprocess_alpaca_dataset(dataset) -def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: - for d in dataset: - pass - -def get_advertise_gen_dataset() -> Tuple[HfDataset, HfDataset]: - dataset_train: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen',split='train').to_hf_dataset() - dataset_val: HfDataset = MsDataset.load( - 'lvjianjin/AdvertiseGen', split='validation').to_hf_dataset() - return (_preprocess_advertise_gen_dataset(dataset_train), - _preprocess_advertise_gen_dataset(dataset_val)) - def _preprocess_advertise_gen_dataset(dataset: HfDataset) -> HfDataset: prompt = """Task: Generating advertisements based on keywords. 
@@ -156,22 +144,6 @@ def get_instinwild_en_dataset() -> HfDataset: return _preprocess_alpaca_dataset(dataset) -def get_du_reader_dataset() -> Tuple[HfDataset, HfDataset]: - dataset_train: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', - split='train').to_hf_dataset().rename_columns({ - 'text1': 'query', - 'text2': 'response', - }) - dataset_val: HfDataset = MsDataset.load( - 'modelscope/DuReader_robust-QG', - split='validation').to_hf_dataset().rename_columns({ - 'text1': 'query', - 'text2': 'response', - }) - return dataset_train, dataset_val - - def get_cot_en_dataset() -> HfDataset: dataset: HfDataset = MsDataset.load( 'YorickHe/CoT', split='train').to_hf_dataset() diff --git a/examples/pytorch/llm/src/utils/metric_utils.py b/examples/pytorch/llm/src/utils/metric_utils.py index d4f964a5e6..2e8df7d53d 100644 --- a/examples/pytorch/llm/src/utils/metric_utils.py +++ b/examples/pytorch/llm/src/utils/metric_utils.py @@ -1,9 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import jieba import numpy as np -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu -from rouge.rouge import Rouge from swift import get_logger @@ -11,6 +8,9 @@ def compute_nlg_metrics(prediction, tokenizer): + import jieba + from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu + from rouge.rouge import Rouge preds, labels = prediction[0], prediction[1] score_dict = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []} From 166d3c038f5dd126c16e7a4c702a7706f82b356b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 15 Sep 2023 14:16:20 +0800 Subject: [PATCH 70/70] merge branch --- examples/pytorch/llm/src/llm_sft.py | 30 +++++++------------ examples/pytorch/llm/src/utils/swift_utils.py | 6 ++-- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 517886d589..5d484e423b 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -35,11 +35,7 @@ class SftArguments: default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( - default='lora', - metadata={ - 'help': - f'tuner choices: {["lora", "full", "adapter", "restuning"]}' - }) + default='lora', metadata={'choices': ['lora', 'full']}) template_type: str = field( default=None, metadata={'choices': list(TEMPLATE_MAPPING.keys())}) output_dir: str = 'runs' @@ -75,7 +71,6 @@ class SftArguments: lora_rank: int = 8 lora_alpha: int = 32 lora_dropout_p: float = 0. 
- adapter_length: int = 32 gradient_checkpointing: bool = False batch_size: int = 1 @@ -147,16 +142,12 @@ def __post_init__(self): # Initialize in advance dist.init_process_group(backend=self.ddp_backend) - from swift import SwiftTuners - all_types = [ - SwiftTuners.LORA.lower(), - SwiftTuners.ADAPTER.lower(), - SwiftTuners.RESTUNING.lower() - ] + ['full'] - sft_type = [_type.strip() for _type in self.sft_type.split(',')] - assert all([_type.lower() in all_types for _type in sft_type]), \ - f'Unsupported tuners: {self.sft_type}, supported tuners are: {all_types}' - if self.sft_type == 'full': + if self.sft_type == 'lora': + if self.learning_rate is None: + self.learning_rate = 1e-4 + if self.only_save_model is None: + self.only_save_model = False + elif self.sft_type == 'full': assert self.quantization_bit == 0, 'not supported' assert self.dtype != 'fp16', 'please use bf16 or fp32' if self.learning_rate is None: @@ -164,10 +155,8 @@ def __post_init__(self): if self.only_save_model is None: self.only_save_model = True else: - if self.learning_rate is None: - self.learning_rate = 1e-4 - if self.only_save_model is None: - self.only_save_model = False + raise ValueError(f'sft_type: {self.sft_type}') + if self.template_type is None: self.template_type = MODEL_MAPPING[self.model_type].get( 'template', 'default') @@ -239,6 +228,7 @@ def llm_sft(args: SftArguments) -> None: if args.resume_from_ckpt is None: if args.sft_type != 'full': + # lora model = prepare_model(model, args) else: model = Swift.from_pretrained( diff --git a/examples/pytorch/llm/src/utils/swift_utils.py b/examples/pytorch/llm/src/utils/swift_utils.py index 63484a1e9f..ee8ef3b489 100644 --- a/examples/pytorch/llm/src/utils/swift_utils.py +++ b/examples/pytorch/llm/src/utils/swift_utils.py @@ -27,7 +27,7 @@ def prepare_model(model: Module, args) -> Module: target_modules=args.lora_target_modules, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout_p) - logger.info(f'lora_config: {lora_config}') + logger.debug(f'lora_config: {lora_config}') swift_config['lora'] = lora_config elif sft_type.lower() == SwiftTuners.ADAPTER.lower(): adapter_config = AdapterConfig( @@ -38,12 +38,12 @@ def prepare_model(model: Module, args) -> Module: hidden_pos=0, adapter_length=args.adapter_length, ) - logger.info(f'adapter_config: {adapter_config}') + logger.debug(f'adapter_config: {adapter_config}') swift_config['adapter'] = adapter_config elif sft_type.lower() == SwiftTuners.RESTUNING.lower(): restuner_config = ResTuningConfig( dims=model.config.hidden_size, **MODEL_MAPPING[args.model_type]['restuner_TM']) - logger.info(f'restuner_config: {restuner_config}') + logger.debug(f'restuner_config: {restuner_config}') swift_config['restuner'] = restuner_config return Swift.prepare_model(model, swift_config)
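
Usage sketch (not part of the patch series above; a minimal illustration only).
Assuming the post-PATCH-70 state of the example package, get_dataset() always
returns a (train, val) pair and Swift attaches a LoRA tuner through a config
dict keyed 'lora'. The names get_dataset, Swift, LoRAConfig and that key come
from the diffs above; the dataset keys, hyper-parameter values and the `model`
object are illustrative placeholders.

    # Minimal sketch; run from examples/pytorch/llm/src so `utils` is importable.
    from swift import LoRAConfig, Swift
    from utils import get_dataset

    # Concatenated train/val splits built from one or more DATASET_MAPPING keys.
    train_dataset, val_dataset = get_dataset(
        ['medical-mini-zh', 'code-python-zh'],  # keys listed in dataset.py above
        dataset_test_ratio=0.01,
        dataset_sample=20000,
        dataset_seed=42)

    # Attach a LoRA tuner to a model created elsewhere (e.g. via get_model_tokenizer()).
    lora_config = LoRAConfig(
        r=8,                                            # illustrative rank
        target_modules=['q_proj', 'k_proj', 'v_proj'],  # illustrative modules
        lora_alpha=32,
        lora_dropout=0.)
    model = Swift.prepare_model(model, {'lora': lora_config})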