diff --git a/.gitignore b/.gitignore
index 6cc2df63a4..90e5d4a6cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,3 +131,5 @@
 result.mp4
 # ast template
 ast_index_file.py
+
+runs/
\ No newline at end of file
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index cd002cce62..759a6b5252 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -4,7 +4,7 @@
@@ -16,10 +16,10 @@
 
 ## Features
 1. supported sft method: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine tuning), ...
-2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ...
+2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
 3. supported feature: quantization, ddp, model parallelism(device map), gradient checkpoint, gradient accumulation steps, push to modelscope hub, custom datasets, ...
-4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ...
-5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ...
+4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh
+5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default
 
 ## Prepare the Environment
 Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization)
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index a47849d222..35632dd624 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -4,7 +4,7 @@
@@ -17,10 +17,10 @@
 
 ## 特性
 1. [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调, ...
-2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ...
+2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpoint, 梯度累加, 支持推送modelscope hub, 支持自定义数据集, ...
-4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ...
-5. 支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ...
+4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh
+5. 支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default
 
 ## 准备实验环境
 实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化)
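The `cot-en` / `cot-zh` names added to both READMEs map to loader functions registered in `DATASET_MAPPING` (see the `src/utils/dataset.py` hunk further down). A minimal lookup sketch, assuming the module is importable as `utils.dataset` when running from `examples/pytorch/llm/src`:

```python
# Hypothetical sketch: resolve a dataset name listed in the README to its loader.
# Assumes the working directory is examples/pytorch/llm/src so that `utils` is importable.
from utils.dataset import DATASET_MAPPING

load_fn = DATASET_MAPPING['cot-en']  # -> get_cot_en_dataset, added in the hunk below
dataset = load_fn()                  # MsDataset.load('YorickHe/CoT') converted to an HF Dataset
print(dataset[0])
```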
diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py
index d9d0ead270..1a0ecbd6b4 100644
--- a/examples/pytorch/llm/src/llm_infer.py
+++ b/examples/pytorch/llm/src/llm_infer.py
@@ -18,7 +18,8 @@
 @dataclass
 class InferArguments:
     model_type: str = field(
-        default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())})
+        default='qwen-7b-chat',
+        metadata={'choices': list(MODEL_MAPPING.keys())})
     sft_type: str = field(
         default='lora', metadata={'choices': ['lora', 'full']})
     template_type: str = field(
diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py
index e3f4a73e22..96b824bad7 100644
--- a/examples/pytorch/llm/src/llm_sft.py
+++ b/examples/pytorch/llm/src/llm_sft.py
@@ -69,7 +69,7 @@ class SftArguments:
     lora_alpha: int = 32
     lora_dropout_p: float = 0.1
 
-    gradient_checkpoint: bool = True
+    gradient_checkpointing: bool = True
     batch_size: int = 1
     num_train_epochs: int = 1
     optim: str = 'adamw_torch'
@@ -84,6 +84,7 @@ class SftArguments:
     save_steps: Optional[int] = None
     save_total_limit: int = 2
     logging_steps: int = 5
+    dataloader_num_workers: int = 1
 
     push_to_hub: bool = False
     # 'user_name/repo_name' or 'repo_name'
@@ -263,7 +264,7 @@ def llm_sft(args: SftArguments) -> None:
         bf16=args.bf16,
         fp16=args.fp16,
         eval_steps=args.eval_steps,
-        dataloader_num_workers=1,
+        dataloader_num_workers=args.dataloader_num_workers,
         load_best_model_at_end=True,
         metric_for_best_model='loss',
         greater_is_better=False,
@@ -276,11 +277,12 @@ def llm_sft(args: SftArguments) -> None:
         push_to_hub=args.push_to_hub,
         resume_from_checkpoint=args.resume_from_ckpt,
         ddp_backend=args.ddp_backend,
-        gradient_checkpointing=args.gradient_checkpoint,
+        gradient_checkpointing=args.gradient_checkpointing,
         local_rank=local_rank)
 
-    if args.gradient_checkpoint:
+    if args.gradient_checkpointing:
         # fix: gradients will be None
+        model.config.use_cache = False
         model.enable_input_require_grads()
     if is_dist():
         trainer_args.ddp_find_unused_parameters = False
diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py
index 164878a270..2f2a9a7d2c 100644
--- a/examples/pytorch/llm/src/utils/dataset.py
+++ b/examples/pytorch/llm/src/utils/dataset.py
@@ -110,6 +110,18 @@ def get_instinwild_en_dataset():
     return _processing_alpaca(dataset)
 
 
+def get_cot_en_dataset() -> HfDataset:
+    dataset: HfDataset = MsDataset.load(
+        'YorickHe/CoT', split='train').to_hf_dataset()
+    return _processing_alpaca(dataset)
+
+
+def get_cot_zh_dataset() -> HfDataset:
+    dataset: HfDataset = MsDataset.load(
+        'YorickHe/CoT_zh', split='train').to_hf_dataset()
+    return _processing_alpaca(dataset)
+
+
 DATASET_MAPPING = {
     'alpaca-en': get_alpaca_gpt4_en_dataset,
     'alpaca-zh': get_alpaca_gpt4_zh_dataset,
@@ -120,8 +132,10 @@ def get_instinwild_en_dataset():
         for k in _multi_alpaca_language_list
     },
     'code-en': get_code_alpaca_en_dataset,
-    'instinwild-zh': get_instinwild_zh_dataset,
     'instinwild-en': get_instinwild_en_dataset,
+    'instinwild-zh': get_instinwild_zh_dataset,
+    'cot-en': get_cot_en_dataset,
+    'cot-zh': get_cot_zh_dataset,
 }
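Regarding the `gradient_checkpointing` rename and the added `model.config.use_cache = False` in `llm_sft.py` above: activation recomputation is incompatible with the KV cache, and embedding outputs must be forced to require grad, otherwise the recomputed graph yields None gradients. A minimal sketch of the same pattern on a plain `transformers` model (the model name is illustrative, not the repo's code path):

```python
# Illustrative sketch of the gradient-checkpointing pattern used above.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('gpt2')  # any causal LM; 'gpt2' is just an example
model.gradient_checkpointing_enable()   # recompute activations in the backward pass
model.config.use_cache = False          # the KV cache conflicts with recomputation
model.enable_input_require_grads()      # keep the graph connected so grads are not None
```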
diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py
index ec165bf247..7a3b162ab0 100644
--- a/examples/pytorch/llm/src/utils/model.py
+++ b/examples/pytorch/llm/src/utils/model.py
@@ -4,8 +4,8 @@
 from typing import NamedTuple, Optional
 
 import torch
-from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, Model,
-                        read_config, snapshot_download)
+from modelscope import (AutoConfig, AutoModel, AutoModelForCausalLM,
+                        AutoTokenizer, Model, read_config, snapshot_download)
 from torch import dtype as Dtype
 
 from swift import get_logger
@@ -18,6 +18,7 @@ def get_model_tokenizer_from_repo(model_dir: str,
                                   load_model: bool = True,
                                   model_config=None,
                                   tokenizer=None,
+                                  automodel_class=AutoModelForCausalLM,
                                   **model_kwargs):
     """load from an independent repository"""
     if model_config is None:
@@ -30,7 +31,7 @@
             model_dir, trust_remote_code=True)
     model = None
     if load_model:
-        model = AutoModelForCausalLM.from_pretrained(
+        model = automodel_class.from_pretrained(
             model_dir,
             config=model_config,
             torch_dtype=torch_dtype,
@@ -88,8 +89,12 @@ def get_model_tokenizer_chatglm2(model_dir: str,
         model_kwargs['quantization_config'].llm_int8_skip_modules = [
             'output_layer'
         ]
-    return get_model_tokenizer_from_repo(model_dir, torch_dtype, load_model,
-                                         **model_kwargs)
+    return get_model_tokenizer_from_repo(
+        model_dir,
+        torch_dtype,
+        load_model,
+        automodel_class=AutoModel,
+        **model_kwargs)
 
 
 def get_model_tokenizer_llama2(model_dir: str,
diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py
index 19403122c5..9851dbcdc5 100644
--- a/examples/pytorch/llm/src/utils/preprocess.py
+++ b/examples/pytorch/llm/src/utils/preprocess.py
@@ -51,8 +51,8 @@
 
 
 def simplify_context_list(context_list: List[Context]) -> List[Context]:
-    res = []
-    temp = []
+    res: List[Context] = []
+    temp: List[str] = []
     for c in context_list:
         if isinstance(c, str):
             temp.append(c)
@@ -89,7 +89,7 @@ def concat_context_list(
 
 def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context],
             placeholder_list: List[str]) -> List[int]:
-    input_ids = []
+    input_ids: List[int] = []
     placeholder_it = iter(placeholder_list)
     for context in context_list:
         if isinstance(context, list):
@@ -126,8 +126,8 @@ def _preprocess(
     template_config = TEMPLATE_MAPPING[template_type]
     if system is None:
         system = DEFAULT_SYSTEM
-    total_context_list = []
-    placeholder_list = []
+    total_context_list: List[Context] = []
+    placeholder_list: List[str] = []
     concat_context_list(
         template_config['prefix'],
         total_context_list,
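The new `automodel_class` hook above lets chatglm2 load through `AutoModel` (its remote code appears to register under `AutoModel` rather than `AutoModelForCausalLM`), while every other model keeps the default. A rough usage sketch, assuming `get_model_tokenizer_from_repo` (defined in `src/utils/model.py`) returns a `(model, tokenizer)` pair; the checkpoint directory is hypothetical:

```python
# Rough sketch of calling the repo helper with the new automodel_class argument.
import torch
from modelscope import AutoModel

model_dir = '/path/to/chatglm2-6b'  # hypothetical local checkpoint directory
model, tokenizer = get_model_tokenizer_from_repo(
    model_dir,
    torch_dtype=torch.bfloat16,
    automodel_class=AutoModel)      # chatglm2 goes through AutoModel instead of AutoModelForCausalLM
```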
diff --git a/examples/pytorch/llm/src/utils/trainer_patch.py b/examples/pytorch/llm/src/utils/trainer_patch.py
new file mode 100644
index 0000000000..4b666af7aa
--- /dev/null
+++ b/examples/pytorch/llm/src/utils/trainer_patch.py
@@ -0,0 +1,62 @@
+import os
+
+import json
+from tqdm import tqdm
+from transformers.trainer_callback import (DefaultFlowCallback,
+                                            ProgressCallback, TrainerControl,
+                                            TrainerState)
+from transformers.trainer_utils import has_length
+
+from swift.trainers import TrainingArguments
+
+
+class ProgressCallbackNew(ProgressCallback):
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        if state.is_local_process_zero:
+            self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True)
+            self.current_step = 0
+
+    def on_prediction_step(self,
+                           args,
+                           state: TrainerState,
+                           control,
+                           eval_dataloader=None,
+                           **kwargs):
+        if state.is_local_process_zero and has_length(eval_dataloader):
+            if self.prediction_bar is None:
+                self.training_bar.refresh()
+                self.training_bar.fp.write('\n')
+                self.prediction_bar = tqdm(
+                    total=len(eval_dataloader),
+                    leave=True,
+                    dynamic_ncols=True,
+                    position=0)
+            self.prediction_bar.update()
+
+    def on_log(self,
+               args: TrainingArguments,
+               state: TrainerState,
+               control,
+               logs=None,
+               **kwargs):
+        logs['global_step'] = state.global_step
+        if 'learning_rate' in logs:
+            logs['learning_rate'] = round(logs['learning_rate'], 8)
+        if state.is_local_process_zero and self.training_bar is not None:
+            jsonl_path = os.path.join(args.output_dir, 'logging.jsonl')
+            with open(jsonl_path, 'a') as f:
+                f.write(json.dumps(logs) + '\n')
+        super().on_log(args, state, control, logs, **kwargs)
+
+
+class DefaultFlowCallbackNew(DefaultFlowCallback):
+
+    def on_step_end(self, args: TrainingArguments, state: TrainerState,
+                    control: TrainerControl, **kwargs):
+        control = super().on_step_end(args, state, control, **kwargs)
+        # save the last ckpt
+        if state.global_step == state.max_steps:
+            control.should_evaluate = True
+            control.should_save = True
+        return control
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py
index b279ac81f2..84383b2177 100644
--- a/examples/pytorch/llm/src/utils/utils.py
+++ b/examples/pytorch/llm/src/utils/utils.py
@@ -1,19 +1,39 @@
+import logging
 import os
 from typing import List, Optional, Tuple
 
 import matplotlib.pyplot as plt
 import torch
 import torch.distributed as dist
+from modelscope.utils.logger import get_logger as get_ms_logger
 from torch import dtype as Dtype
 from torch.nn import Linear, Module
-from transformers import GenerationConfig, TextStreamer
+from transformers import GenerationConfig, TextStreamer, trainer
 
 from swift import get_logger
+from swift.utils import is_master
 from swift.utils.tb_utils import (TB_COLOR, TB_COLOR_SMOOTH,
                                   read_tensorboard_file, tensorboard_smoothing)
+from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew
+
+# monkey patch
+trainer.DEFAULT_PROGRESS_CALLBACK = ProgressCallbackNew
+trainer.DEFAULT_CALLBACKS = [DefaultFlowCallbackNew]
 
-os.environ['TOKENIZERS_PARALLELISM'] = 'true'
 logger = get_logger()
+ms_logger = get_ms_logger()
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s')
+
+logger.handlers[0].setFormatter(logger_format)
+ms_logger.handlers[0].setFormatter(logger_format)
+if is_master():
+    logger.setLevel(logging.INFO)
+    ms_logger.setLevel(logging.INFO)
+else:
+    logger.setLevel(logging.ERROR)
+    ms_logger.setLevel(logging.ERROR)
 
 DTYPE_MAPPING = {
     'fp16': torch.float16,
diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py
index 1c8085fead..c90d4b38ab 100644
--- a/swift/utils/torch_utils.py
+++ b/swift/utils/torch_utils.py
@@ -97,15 +97,6 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
     logger.info(''.join(s))
 
 
-def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None:
-    named_p = list(model.named_parameters())
-    for i, (n, p) in enumerate(named_p):
-        if max_lines is not None and i >= max_lines:
-            logger.info('...')
-            break
-        logger.info(f'{n}: requires_grad={p.requires_grad}')
-
-
 def get_seed(random_state: RandomState) -> int:
     seed_max = np.iinfo(np.int32).max
     seed = random_state.randint(0, seed_max)
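With the monkey patch in `utils.py`, transformers picks up `ProgressCallbackNew` and `DefaultFlowCallbackNew` by default, so each logging event is also appended as one JSON object to `<output_dir>/logging.jsonl`. A small sketch of reading that file after a run; the output directory name is hypothetical:

```python
# Sketch: consume the per-step metrics written by ProgressCallbackNew.on_log.
# Keys follow the transformers log dict plus the 'global_step' field added above.
import json
import os

output_dir = 'runs/qwen-7b-chat'  # hypothetical output_dir (runs/ is also git-ignored above)
with open(os.path.join(output_dir, 'logging.jsonl')) as f:
    for line in f:
        record = json.loads(line)
        print(record['global_step'], record.get('loss'), record.get('learning_rate'))
```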