From 3b1fe0dfdda3bb6e198ed0fcc199af4cb87cacdf Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Sun, 7 Jan 2024 14:11:17 +0800
Subject: [PATCH 1/3] support additional_trainable_parameters

---
 ...344\273\244\350\241\214\345\217\202\346\225\260.md" |  1 +
 .../llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh    |  3 ++-
 swift/llm/tuner.py                                     |  6 +++++-
 swift/llm/utils/argument.py                            |  8 ++++++++
 swift/utils/__init__.py                                | 10 +++++-----
 swift/utils/torch_utils.py                             |  8 ++++++++
 6 files changed, 29 insertions(+), 7 deletions(-)

diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 7994dae5a8..4b1273c40a 100644
--- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -11,6 +11,7 @@
 - `--model_cache_dir`: Defaults to `None`. If the model is already cached locally and the cache path is not the default ModelScope cache path, you can set this parameter to load the model and tokenizer from that cache_dir.
 - `--sft_type`: The fine-tuning method, default `'lora'`. Available values: 'lora', 'full', 'longlora', 'qalora'. To use qlora, set `--sft_type lora --quantization_bit 4`.
 - `--freeze_parameters`: When sft_type is 'full', freezes the bottommost portion of the model's parameters. The range is 0. ~ 1., default `0.`. This parameter offers a compromise between lora and full-parameter fine-tuning.
+- `--additional_trainable_parameters`: A complement to freeze_parameters, only allowed when sft_type is 'full', default `[]`. For example, if you want to train 50% of the parameters and additionally train the embedding layer, you can set `--freeze_parameters 0.5 --additional_trainable_parameters transformer.wte`; all parameters whose names start with `transformer.wte` will be activated.
 - `--tuner_backend`: The backend for lora and qlora, default `'swift'`. Available values: 'swift', 'peft'.
 - `--template_type`: The type of chat template to use, default `'AUTO'`, i.e. the `template` is looked up in `MODEL_MAPPING` based on `model_type`. The available `template_type` values can be found in `TEMPLATE_MAPPING.keys()`.
 - `--output_dir`: The directory where checkpoints are stored, default `'output'`. The `model_type` and a fine-tuning version number are appended to this directory, which makes it easy to run comparison experiments across models without changing the `output_dir` argument. If you do not want this suffix, set `--add_output_dir_suffix false`.
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
index 2056a17567..6ba119d237 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
@@ -14,5 +14,6 @@ swift sft \
     --use_flash_attn true \
     --only_save_model true \
     --dataset codefuse-evol-instruction-zh \
-    --freeze_parameters 0.2 \
+    --freeze_parameters 0.25 \
+    --additional_trainable_parameters transformer.wte \
     --preprocess_num_proc 4 \
diff --git a/swift/llm/tuner.py b/swift/llm/tuner.py
index cac16edabd..7bd93beb0e 100644
--- a/swift/llm/tuner.py
+++ b/swift/llm/tuner.py
@@ -5,7 +5,8 @@
 from swift.trainers import TrainerCallback
 from swift.tuners import (LongLoRAConfig, LongLoRAModelType, LoraConfig,
                           LoRAConfig, NEFTuneConfig, Swift)
-from swift.utils import freeze_model_parameters, get_logger
+from swift.utils import (activate_model_parameters, freeze_model_parameters,
+                         get_logger)
 from .utils import SftArguments, find_all_linear_for_lora, is_lora

 logger = get_logger()
@@ -76,6 +77,9 @@ def prepare_model(model, args: SftArguments):
     elif args.sft_type == 'full':
         if args.freeze_parameters > 0:
             freeze_model_parameters(model, args.freeze_parameters)
+        if len(args.additional_trainable_parameters) > 0:
+            activate_model_parameters(model,
+                                       args.additional_trainable_parameters)
     else:
         raise ValueError(f'args.sft_type: {args.sft_type}')
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index ceb8aa4c92..029922795a 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -39,6 +39,7 @@ class SftArguments:
     sft_type: Literal['lora', 'full', 'longlora', 'qalora'] = 'lora'
     freeze_parameters: float = 0.  # 0 ~ 1
+    additional_trainable_parameters: List[str] = field(default_factory=list)
     tuner_backend: Literal['swift', 'peft'] = 'swift'
     template_type: str = field(
         default='AUTO',
@@ -211,6 +212,9 @@ def __post_init__(self) -> None:
             assert self.freeze_parameters == 0., (
                 'lora does not support `freeze_parameters`, please set `--sft_type full`'
             )
+            assert len(self.additional_trainable_parameters) == 0, (
+                'lora does not support `additional_trainable_parameters`, please set `--sft_type full`'
+            )
             if 'int4' in self.model_type or 'int8' in self.model_type:
                 assert self.quantization_bit == 0, 'int4 and int8 models do not need to be quantized again.'
             if self.learning_rate is None:
@@ -227,6 +231,10 @@ def __post_init__(self) -> None:
                 "Fine-tuning with dtype=='fp16' can lead to NaN issues. "
                 'Please use fp32+AMP or bf16 to perform full parameter fine-tuning.'
             )
+            if isinstance(self.additional_trainable_parameters, str):
+                self.additional_trainable_parameters = [
+                    self.additional_trainable_parameters
+                ]
             if self.learning_rate is None:
                 self.learning_rate = 2e-5
             if self.only_save_model is None:
diff --git a/swift/utils/__init__.py b/swift/utils/__init__.py
index 8d53c4417e..0b34684310 100644
--- a/swift/utils/__init__.py
+++ b/swift/utils/__init__.py
@@ -8,10 +8,10 @@
 from .run_utils import get_main
 from .tb_utils import (TB_COLOR, TB_COLOR_SMOOTH, plot_images,
                        read_tensorboard_file, tensorboard_smoothing)
-from .torch_utils import (broadcast_string, freeze_model_parameters,
-                          get_dist_setting, get_model_info, is_ddp_plus_mp,
-                          is_dist, is_local_master, is_master,
-                          is_on_same_device, seed_everything, show_layers,
-                          time_synchronize)
+from .torch_utils import (activate_model_parameters, broadcast_string,
+                          freeze_model_parameters, get_dist_setting,
+                          get_model_info, is_ddp_plus_mp, is_dist,
+                          is_local_master, is_master, is_on_same_device,
+                          seed_everything, show_layers, time_synchronize)
 from .utils import (add_version_to_work_dir, check_json_format, lower_bound,
                     parse_args, read_multi_line, test_time, upper_bound)
diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py
index 1e05b25a34..d82238e7f5 100644
--- a/swift/utils/torch_utils.py
+++ b/swift/utils/torch_utils.py
@@ -131,6 +131,14 @@ def freeze_model_parameters(model: Module, freeze_parameters: float) -> None:
             p.requires_grad = False


+def activate_model_parameters(
+        model: Module, additional_trainable_parameters: List[str]) -> None:
+    for n, p in model.named_parameters():
+        for additional_tp in additional_trainable_parameters:
+            if n.startswith(additional_tp):
+                p.requires_grad = True
+
+
 def broadcast_string(string: Optional[str], buffer_size: int = 1024) -> str:
     """String broadcasting in case of DDP
     string: main rank: str

From bc63f0ebbf90bf46912823b2f617d65f240a1bba Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Sun, 7 Jan 2024 14:18:48 +0800
Subject: [PATCH 2/3] update

---
 swift/llm/utils/argument.py | 2 +-
 swift/utils/torch_utils.py  | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 029922795a..8929ba86f1 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -225,7 +225,7 @@ def __post_init__(self) -> None:
             else:
                 self.only_save_model = True
         elif self.sft_type == 'full':
-            assert 0 <= self.freeze_parameters < 1
+            assert 0 <= self.freeze_parameters <= 1
             assert self.quantization_bit == 0, 'Full parameter fine-tuning does not support quantization.'
             assert self.dtype != 'fp16', (
                 "Fine-tuning with dtype=='fp16' can lead to NaN issues. "
diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py
index d82238e7f5..8265ab943d 100644
--- a/swift/utils/torch_utils.py
+++ b/swift/utils/torch_utils.py
@@ -133,10 +133,19 @@ def freeze_model_parameters(model: Module, freeze_parameters: float) -> None:

 def activate_model_parameters(
         model: Module, additional_trainable_parameters: List[str]) -> None:
+    if len(additional_trainable_parameters) == 0:
+        return
+    has_activate = False
     for n, p in model.named_parameters():
         for additional_tp in additional_trainable_parameters:
             if n.startswith(additional_tp):
                 p.requires_grad = True
+                has_activate = True
+    if not has_activate:
+        logger.warning(
+            'len(additional_trainable_parameters) > 0 but no parameters are activated. '
+            f'additional_trainable_parameters: {additional_trainable_parameters}'
+        )


 def broadcast_string(string: Optional[str], buffer_size: int = 1024) -> str:

From de2d28889f3c869ae32b6b6841287c386f26c181 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Sun, 7 Jan 2024 15:06:44 +0800
Subject: [PATCH 3/3] update sh

---
 .../pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
index 6ba119d237..c9bb2298d8 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh
@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 78GB GPU memory
+# 2 * 80GB GPU memory
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 swift sft \
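
For readers who want to check the new behaviour without launching a full training run, the sketch below mirrors how `--freeze_parameters` and the new `--additional_trainable_parameters` flag are meant to interact. It is a minimal stand-in: the toy module names (`wte`, `h`, `lm_head`) and the simplified bottom-up freezing loop are illustrative assumptions, not the real Qwen-7B layout or the exact `freeze_model_parameters` implementation; only the prefix matching follows `activate_model_parameters` from these patches.

# sketch.py -- illustrative only, not part of the patch series
import torch.nn as nn

model = nn.ModuleDict({
    'wte': nn.Embedding(100, 16),   # stand-in for transformer.wte
    'h': nn.Linear(16, 16),
    'lm_head': nn.Linear(16, 100),
})

# Roughly what `--freeze_parameters 0.25` does: freeze the bottom ~25% of the
# parameters by cumulative size (the real helper may count differently).
total = sum(p.numel() for p in model.parameters())
seen = 0
for p in model.parameters():
    if seen / total < 0.25:
        p.requires_grad = False
    seen += p.numel()

# What `--additional_trainable_parameters wte` does: re-enable every parameter
# whose name starts with one of the given prefixes.
additional_trainable_parameters = ['wte']
for name, p in model.named_parameters():
    if any(name.startswith(prefix) for prefix in additional_trainable_parameters):
        p.requires_grad = True

for name, p in model.named_parameters():
    print(f'{name}: requires_grad={p.requires_grad}')

With the updated sft.sh, the same idea is expressed on the command line as `--freeze_parameters 0.25 --additional_trainable_parameters transformer.wte`, where `transformer.wte` is the embedding-table prefix in Qwen-7B-Chat.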