1 change: 1 addition & 0 deletions docs/source/LLM/命令行参数.md
@@ -11,6 +11,7 @@
 - `--model_cache_dir`: Default is `None`. If the model is already cached locally and the cache path is not the default ModelScope cache path, you can specify this parameter to load the model and tokenizer from the cache_dir.
 - `--sft_type`: The fine-tuning method. Default is `'lora'`. Available values: 'lora', 'full', 'longlora', 'qalora'. To use qlora, set `--sft_type lora --quantization_bit 4`.
 - `--freeze_parameters`: When sft_type is 'full', freezes the bottommost parameters of the model. The value ranges from 0. to 1., default is `0.`. This provides a middle ground between lora and full-parameter fine-tuning.
+- `--additional_trainable_parameters`: A complement to freeze_parameters, only allowed when sft_type is 'full'. Default is `[]`. For example, if you train 50% of the parameters but also want to train the embedding layer, you can set `--freeze_parameters 0.5 --additional_trainable_parameters transformer.wte`; all parameters whose names start with `transformer.wte` will be activated (see the sketch after this diff).
 - `--tuner_backend`: The backend support for lora and qlora. Default is `'swift'`. Available values: 'swift', 'peft'.
 - `--template_type`: The type of chat template to use. Default is `'AUTO'`, which looks up the `template` in `MODEL_MAPPING` based on `model_type`. The available `template_type` values can be found in `TEMPLATE_MAPPING.keys()`.
 - `--output_dir`: The directory where checkpoints are stored. Default is `'output'`. The `model_type` and a fine-tuning version number are appended to this directory, so you can run multiple comparison experiments on different models without changing the `output_dir` argument. To disable this suffixing, additionally specify `--add_output_dir_suffix false`.
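The matching described for `--additional_trainable_parameters` is purely name-based: a parameter is activated when its fully qualified name starts with one of the given strings. A minimal standalone sketch; the GPT-2-style parameter names below are hypothetical, chosen only to illustrate which names would match `transformer.wte`:

```python
# Hypothetical parameter names in a GPT-2-style model.
names = [
    'transformer.wte.weight',              # token embedding -> matches
    'transformer.h.0.attn.c_attn.weight',  # block 0 attention -> no match
    'lm_head.weight',                      # output head -> no match
]
prefixes = ['transformer.wte']
for n in names:
    matched = any(n.startswith(p) for p in prefixes)
    print(f'{n}: {"activated" if matched else "unchanged"}')
```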
@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 78GB GPU memory
+# 2 * 80GB GPU memory
 NPROC_PER_NODE=2 \
 CUDA_VISIBLE_DEVICES=0,1 \
 swift sft \
@@ -14,5 +14,6 @@ swift sft \
     --use_flash_attn true \
     --only_save_model true \
     --dataset codefuse-evol-instruction-zh \
-    --freeze_parameters 0.2 \
+    --freeze_parameters 0.25 \
+    --additional_trainable_parameters transformer.wte \
     --preprocess_num_proc 4 \
5 changes: 2 additions & 3 deletions swift/llm/infer.py
@@ -15,7 +15,7 @@
                          read_multi_line, seed_everything, show_layers)
 from .utils import (InferArguments, Template, get_additional_saved_files,
                     get_dataset, get_model_tokenizer, get_template, inference,
-                    inference_stream, set_generation_config)
+                    inference_stream, is_lora, set_generation_config)

 logger = get_logger()

@@ -138,8 +138,7 @@ def prepare_model_template(
     logger.info(f'generation_config: {generation_config}')
     set_generation_config(model, generation_config)
     # Preparing LoRA
-    if args.sft_type in ('lora', 'qalora',
-                         'longlora') and args.ckpt_dir is not None:
+    if is_lora(args.sft_type) and args.ckpt_dir is not None:
         model = Swift.from_pretrained(
             model, args.ckpt_dir, inference_mode=True)

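The condition above now delegates to an `is_lora` helper imported from `.utils`. Its body is not shown in this diff; a sketch consistent with the inline tuple it replaces (the actual helper in `swift/llm/utils` may differ):

```python
def is_lora(sft_type: str) -> bool:
    # Assumed sketch: mirrors the inline check this refactor removes,
    # i.e. all lora-family tuners share the LoRA loading path.
    return sft_type in ('lora', 'qalora', 'longlora')
```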
6 changes: 5 additions & 1 deletion swift/llm/tuner.py
@@ -5,7 +5,8 @@
 from swift.trainers import TrainerCallback
 from swift.tuners import (LongLoRAConfig, LongLoRAModelType, LoraConfig,
                           LoRAConfig, NEFTuneConfig, Swift)
-from swift.utils import freeze_model_parameters, get_logger
+from swift.utils import (activate_model_parameters, freeze_model_parameters,
+                         get_logger)
 from .utils import SftArguments, find_all_linear_for_lora, is_lora

 logger = get_logger()
@@ -76,6 +77,9 @@ def prepare_model(model, args: SftArguments):
     elif args.sft_type == 'full':
         if args.freeze_parameters > 0:
             freeze_model_parameters(model, args.freeze_parameters)
+        if len(args.additional_trainable_parameters) > 0:
+            activate_model_parameters(model,
+                                      args.additional_trainable_parameters)
     else:
         raise ValueError(f'args.sft_type: {args.sft_type}')

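Note the ordering in `prepare_model`: the bottom fraction is frozen first, then the listed prefixes are re-activated, so a prefix that falls inside the frozen region still ends up trainable. A standalone toy sketch of that interplay in plain PyTorch (the two-layer model and the 50% split below are hypothetical stand-ins, not swift's `freeze_model_parameters` implementation):

```python
import torch.nn as nn

# Toy model: parameter names are '0.weight', '1.weight', '1.bias'.
model = nn.Sequential(nn.Embedding(10, 4), nn.Linear(4, 2))

# Step 1: freeze the bottom half of the parameter list (crude stand-in
# for freeze_model_parameters(model, 0.5)).
params = list(model.named_parameters())
for _, p in params[:len(params) // 2]:
    p.requires_grad = False

# Step 2: re-activate everything under the '0' (embedding) prefix,
# mirroring activate_model_parameters(model, ['0']).
for n, p in model.named_parameters():
    if n.startswith('0'):
        p.requires_grad = True

print([(n, p.requires_grad) for n, p in model.named_parameters()])
# The embedding is trainable again even though it was in the frozen half.
```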
10 changes: 9 additions & 1 deletion swift/llm/utils/argument.py
@@ -39,6 +39,7 @@ class SftArguments:

     sft_type: Literal['lora', 'full', 'longlora', 'qalora'] = 'lora'
     freeze_parameters: float = 0.  # 0 ~ 1
+    additional_trainable_parameters: List[str] = field(default_factory=list)
     tuner_backend: Literal['swift', 'peft'] = 'swift'
     template_type: str = field(
         default='AUTO',
@@ -211,6 +212,9 @@ def __post_init__(self) -> None:
             assert self.freeze_parameters == 0., (
                 'lora does not support `freeze_parameters`, please set `--sft_type full`'
             )
+            assert len(self.additional_trainable_parameters) == 0, (
+                'lora does not support `additional_trainable_parameters`, please set `--sft_type full`'
+            )
         if 'int4' in self.model_type or 'int8' in self.model_type:
             assert self.quantization_bit == 0, 'int4 and int8 models do not need to be quantized again.'
         if self.learning_rate is None:
@@ -221,12 +225,16 @@ def __post_init__(self) -> None:
             else:
                 self.only_save_model = True
         elif self.sft_type == 'full':
-            assert 0 <= self.freeze_parameters < 1
+            assert 0 <= self.freeze_parameters <= 1
             assert self.quantization_bit == 0, 'Full parameter fine-tuning does not support quantization.'
             assert self.dtype != 'fp16', (
                 "Fine-tuning with dtype=='fp16' can lead to NaN issues. "
                 'Please use fp32+AMP or bf16 to perform full parameter fine-tuning.'
             )
+            if isinstance(self.additional_trainable_parameters, str):
+                self.additional_trainable_parameters = [
+                    self.additional_trainable_parameters
+                ]
             if self.learning_rate is None:
                 self.learning_rate = 2e-5
             if self.only_save_model is None:
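The `isinstance` normalization above lets the flag arrive from the command line as a single string while downstream code always sees a `List[str]`. Together with the relaxed bound (`<= 1` instead of `< 1`), this plausibly allows freezing the entire model and re-activating only selected prefixes. A minimal standalone illustration of the normalization (not swift's actual argument parsing):

```python
# A single prefix parsed from the CLI arrives as a plain string...
additional_trainable_parameters = 'transformer.wte'

# ...and is wrapped so downstream code can always iterate a list.
if isinstance(additional_trainable_parameters, str):
    additional_trainable_parameters = [additional_trainable_parameters]

print(additional_trainable_parameters)  # ['transformer.wte']
```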
10 changes: 5 additions & 5 deletions swift/utils/__init__.py
@@ -8,10 +8,10 @@
 from .run_utils import get_main
 from .tb_utils import (TB_COLOR, TB_COLOR_SMOOTH, plot_images,
                        read_tensorboard_file, tensorboard_smoothing)
-from .torch_utils import (broadcast_string, freeze_model_parameters,
-                          get_dist_setting, get_model_info, is_ddp_plus_mp,
-                          is_dist, is_local_master, is_master,
-                          is_on_same_device, seed_everything, show_layers,
-                          time_synchronize)
+from .torch_utils import (activate_model_parameters, broadcast_string,
+                          freeze_model_parameters, get_dist_setting,
+                          get_model_info, is_ddp_plus_mp, is_dist,
+                          is_local_master, is_master, is_on_same_device,
+                          seed_everything, show_layers, time_synchronize)
 from .utils import (add_version_to_work_dir, check_json_format, lower_bound,
                     parse_args, read_multi_line, test_time, upper_bound)
17 changes: 17 additions & 0 deletions swift/utils/torch_utils.py
@@ -131,6 +131,23 @@ def freeze_model_parameters(model: Module, freeze_parameters: float) -> None:
         p.requires_grad = False
+
+
+def activate_model_parameters(
+        model: Module, additional_trainable_parameters: List[str]) -> None:
+    if len(additional_trainable_parameters) == 0:
+        return
+    has_activate = False
+    for n, p in model.named_parameters():
+        for additional_tp in additional_trainable_parameters:
+            if n.startswith(additional_tp):
+                p.requires_grad = True
+                has_activate = True
+    if not has_activate:
+        logger.warning(
+            'len(additional_trainable_parameters) > 0 but no parameters are activated. '
+            f'additional_trainable_parameters: {additional_trainable_parameters}'
+        )


 def broadcast_string(string: Optional[str], buffer_size: int = 1024) -> str:
     """String broadcasting in case of DDP
     string: main rank: str
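A quick usage sketch of the new helper on a toy module, including the warning path. The `nn.Linear` example is hypothetical; real usage passes prefixes like `transformer.wte`, and the import relies on the `swift.utils` re-export added above:

```python
import torch.nn as nn

from swift.utils import activate_model_parameters

model = nn.Linear(4, 2)  # parameters are named 'weight' and 'bias'
for p in model.parameters():
    p.requires_grad = False  # start fully frozen

# No parameter name starts with this prefix: nothing is activated and
# the helper logs its "no parameters are activated" warning.
activate_model_parameters(model, ['transformer.wte'])

# 'weight' matches, so only the weight becomes trainable again.
activate_model_parameters(model, ['weight'])
print([(n, p.requires_grad) for n, p in model.named_parameters()])
# [('weight', True), ('bias', False)]
```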