From 9d6322d05eb19efe7c6be0853459b6700b7dd44b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 23 Aug 2023 21:54:59 +0800 Subject: [PATCH 1/9] update cot dataset --- .gitignore | 2 ++ examples/pytorch/llm/README.md | 6 +++--- examples/pytorch/llm/README_CN.md | 6 +++--- examples/pytorch/llm/src/llm_infer.py | 3 ++- examples/pytorch/llm/src/utils/dataset.py | 16 +++++++++++++++- 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 6cc2df63a4..90e5d4a6cc 100644 --- a/.gitignore +++ b/.gitignore @@ -131,3 +131,5 @@ result.mp4 # ast template ast_index_file.py + +runs/ \ No newline at end of file diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index cd002cce62..8f438febcb 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -16,10 +16,10 @@ ## Features 1. supported sft method: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine tuning), ... -2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ... +2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b 3. supported feature: quantization, ddp, model parallelism(device map), gradient checkpoint, gradient accumulation steps, push to modelscope hub, custom datasets, ... -4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ... -5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ... +4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh +5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default ## Prepare the Environment Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization) diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index a47849d222..407c9b9ab5 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -17,10 +17,10 @@ ## 特性 1. [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调, ... -2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ... +2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpoint, 梯度累加, 支持推送modelscope hub, 支持自定义数据集, ... -4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ... -5. 
支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ... +4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh +5. 支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default ## 准备实验环境 实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化) diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py index d9d0ead270..1a0ecbd6b4 100644 --- a/examples/pytorch/llm/src/llm_infer.py +++ b/examples/pytorch/llm/src/llm_infer.py @@ -18,7 +18,8 @@ @dataclass class InferArguments: model_type: str = field( - default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())}) + default='qwen-7b-chat', + metadata={'choices': list(MODEL_MAPPING.keys())}) sft_type: str = field( default='lora', metadata={'choices': ['lora', 'full']}) template_type: str = field( diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py index 164878a270..2f2a9a7d2c 100644 --- a/examples/pytorch/llm/src/utils/dataset.py +++ b/examples/pytorch/llm/src/utils/dataset.py @@ -110,6 +110,18 @@ def get_instinwild_en_dataset(): return _processing_alpaca(dataset) +def get_cot_en_dataset() -> HfDataset: + dataset: HfDataset = MsDataset.load( + 'YorickHe/CoT', split='train').to_hf_dataset() + return _processing_alpaca(dataset) + + +def get_cot_zh_dataset() -> HfDataset: + dataset: HfDataset = MsDataset.load( + 'YorickHe/CoT_zh', split='train').to_hf_dataset() + return _processing_alpaca(dataset) + + DATASET_MAPPING = { 'alpaca-en': get_alpaca_gpt4_en_dataset, 'alpaca-zh': get_alpaca_gpt4_zh_dataset, @@ -120,8 +132,10 @@ def get_instinwild_en_dataset(): for k in _multi_alpaca_language_list }, 'code-en': get_code_alpaca_en_dataset, - 'instinwild-zh': get_instinwild_zh_dataset, 'instinwild-en': get_instinwild_en_dataset, + 'instinwild-zh': get_instinwild_zh_dataset, + 'cot-en': get_cot_en_dataset, + 'cot-zh': get_cot_zh_dataset, } From 572847a9495715f891b13dc837ac33d7a6fd50f1 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 23 Aug 2023 21:56:28 +0800 Subject: [PATCH 2/9] remove show_freeze_layers --- swift/utils/torch_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py index 1c8085fead..c90d4b38ab 100644 --- a/swift/utils/torch_utils.py +++ b/swift/utils/torch_utils.py @@ -97,15 +97,6 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: logger.info(''.join(s)) -def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None: - named_p = list(model.named_parameters()) - for i, (n, p) in enumerate(named_p): - if max_lines is not None and i >= max_lines: - logger.info('...') - break - logger.info(f'{n}: requires_grad={p.requires_grad}') - - def get_seed(random_state: RandomState) -> int: seed_max = np.iinfo(np.int32).max seed = random_state.randint(0, seed_max) From 320b7dfebbb56a9c0025074e379d25337098b7c6 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 23 Aug 2023 21:57:02 +0800 Subject: [PATCH 3/9] fix chatglm2 bug --- examples/pytorch/llm/src/utils/model.py | 15 ++++++++++----- examples/pytorch/llm/src/utils/preprocess.py | 10 +++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py index ec165bf247..7a3b162ab0 100644 --- a/examples/pytorch/llm/src/utils/model.py +++ b/examples/pytorch/llm/src/utils/model.py @@ -4,8 +4,8 @@ from typing 
import NamedTuple, Optional import torch -from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, Model, - read_config, snapshot_download) +from modelscope import (AutoConfig, AutoModel, AutoModelForCausalLM, + AutoTokenizer, Model, read_config, snapshot_download) from torch import dtype as Dtype from swift import get_logger @@ -18,6 +18,7 @@ def get_model_tokenizer_from_repo(model_dir: str, load_model: bool = True, model_config=None, tokenizer=None, + automodel_class=AutoModelForCausalLM, **model_kwargs): """load from an independent repository""" if model_config is None: @@ -30,7 +31,7 @@ def get_model_tokenizer_from_repo(model_dir: str, model_dir, trust_remote_code=True) model = None if load_model: - model = AutoModelForCausalLM.from_pretrained( + model = automodel_class.from_pretrained( model_dir, config=model_config, torch_dtype=torch_dtype, @@ -88,8 +89,12 @@ def get_model_tokenizer_chatglm2(model_dir: str, model_kwargs['quantization_config'].llm_int8_skip_modules = [ 'output_layer' ] - return get_model_tokenizer_from_repo(model_dir, torch_dtype, load_model, - **model_kwargs) + return get_model_tokenizer_from_repo( + model_dir, + torch_dtype, + load_model, + automodel_class=AutoModel, + **model_kwargs) def get_model_tokenizer_llama2(model_dir: str, diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py index 19403122c5..9851dbcdc5 100644 --- a/examples/pytorch/llm/src/utils/preprocess.py +++ b/examples/pytorch/llm/src/utils/preprocess.py @@ -51,8 +51,8 @@ def simplify_context_list(context_list: List[Context]) -> List[Context]: - res = [] - temp = [] + res: List[Context] = [] + temp: List[str] = [] for c in context_list: if isinstance(c, str): temp.append(c) @@ -89,7 +89,7 @@ def concat_context_list( def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context], placeholder_list: List[str]) -> List[int]: - input_ids = [] + input_ids: List[int] = [] placeholder_it = iter(placeholder_list) for context in context_list: if isinstance(context, list): @@ -126,8 +126,8 @@ def _preprocess( template_config = TEMPLATE_MAPPING[template_type] if system is None: system = DEFAULT_SYSTEM - total_context_list = [] - placeholder_list = [] + total_context_list: List[Context] = [] + placeholder_list: List[str] = [] concat_context_list( template_config['prefix'], total_context_list, From 15fba8a0633893fbaa821644705f020a17f3aa4b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Wed, 23 Aug 2023 21:57:37 +0800 Subject: [PATCH 4/9] fix OS:windows bug (need num_workers=0) --- examples/pytorch/llm/src/llm_sft.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index e3f4a73e22..00e54e3ec3 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -84,6 +84,7 @@ class SftArguments: save_steps: Optional[int] = None save_total_limit: int = 2 logging_steps: int = 5 + dataloader_num_workers: int = 1 push_to_hub: bool = False # 'user_name/repo_name' or 'repo_name' @@ -263,7 +264,7 @@ def llm_sft(args: SftArguments) -> None: bf16=args.bf16, fp16=args.fp16, eval_steps=args.eval_steps, - dataloader_num_workers=1, + dataloader_num_workers=args.dataloader_num_workers, load_best_model_at_end=True, metric_for_best_model='loss', greater_is_better=False, From 45c44cbad3f93576afbf53afde94916d9d67878f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 24 Aug 2023 02:00:37 +0800 Subject: [PATCH 5/9] update 
ProgressCallbackNew DefaultFlowCallbackNew --- .../pytorch/llm/src/utils/trainer_patch.py | 59 +++++++++++++++++++ examples/pytorch/llm/src/utils/utils.py | 24 +++++++- 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 examples/pytorch/llm/src/utils/trainer_patch.py diff --git a/examples/pytorch/llm/src/utils/trainer_patch.py b/examples/pytorch/llm/src/utils/trainer_patch.py new file mode 100644 index 0000000000..5befb70527 --- /dev/null +++ b/examples/pytorch/llm/src/utils/trainer_patch.py @@ -0,0 +1,59 @@ +import os + +import json +from tqdm import tqdm +from transformers.trainer_callback import (DefaultFlowCallback, + ProgressCallback, TrainerControl, + TrainerState) +from transformers.trainer_utils import has_length + +from swift.trainers import TrainingArguments + + +class ProgressCallbackNew(ProgressCallback): + + def on_train_begin(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True) + self.current_step = 0 + + def on_prediction_step(self, + args, + state: TrainerState, + control, + eval_dataloader=None, + **kwargs): + if state.is_local_process_zero and has_length(eval_dataloader): + if self.prediction_bar is None: + self.training_bar.refresh() + self.training_bar.fp.write('\n') + self.prediction_bar = tqdm( + total=len(eval_dataloader), + leave=True, + dynamic_ncols=True, + position=0) + self.prediction_bar.update() + + def on_log(self, + args: TrainingArguments, + state: TrainerState, + control, + logs=None, + **kwargs): + if state.is_local_process_zero and self.training_bar is not None: + jsonl_path = os.path.join(args.output_dir, 'logging.jsonl') + with open(jsonl_path, 'a') as f: + f.write(json.dumps(logs) + '\n') + super().on_log(args, state, control, logs, **kwargs) + + +class DefaultFlowCallbackNew(DefaultFlowCallback): + + def on_step_end(self, args: TrainingArguments, state: TrainerState, + control: TrainerControl, **kwargs): + control = super().on_step_end(args, state, control, **kwargs) + # save the last ckpt + if state.global_step == state.max_steps: + control.should_evaluate = True + control.should_save = True + return control diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py index b279ac81f2..84383b2177 100644 --- a/examples/pytorch/llm/src/utils/utils.py +++ b/examples/pytorch/llm/src/utils/utils.py @@ -1,19 +1,39 @@ +import logging import os from typing import List, Optional, Tuple import matplotlib.pyplot as plt import torch import torch.distributed as dist +from modelscope.utils.logger import get_logger as get_ms_logger from torch import dtype as Dtype from torch.nn import Linear, Module -from transformers import GenerationConfig, TextStreamer +from transformers import GenerationConfig, TextStreamer, trainer from swift import get_logger +from swift.utils import is_master from swift.utils.tb_utils import (TB_COLOR, TB_COLOR_SMOOTH, read_tensorboard_file, tensorboard_smoothing) +from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew + +# monkey patch +trainer.DEFAULT_PROGRESS_CALLBACK = ProgressCallbackNew +trainer.DEFAULT_CALLBACKS = [DefaultFlowCallbackNew] -os.environ['TOKENIZERS_PARALLELISM'] = 'true' logger = get_logger() +ms_logger = get_ms_logger() + +os.environ['TOKENIZERS_PARALLELISM'] = 'true' +logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s') + +logger.handlers[0].setFormatter(logger_format) +ms_logger.handlers[0].setFormatter(logger_format) +if is_master(): + 
logger.setLevel(logging.INFO) + ms_logger.setLevel(logging.INFO) +else: + logger.setLevel(logging.ERROR) + ms_logger.setLevel(logging.ERROR) DTYPE_MAPPING = { 'fp16': torch.float16, From 8e0fe4bd1458f01f96b6edb401aa6ea22730e3c0 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 24 Aug 2023 02:16:50 +0800 Subject: [PATCH 6/9] update on_log --- examples/pytorch/llm/src/utils/trainer_patch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/pytorch/llm/src/utils/trainer_patch.py b/examples/pytorch/llm/src/utils/trainer_patch.py index 5befb70527..8d88c62a7b 100644 --- a/examples/pytorch/llm/src/utils/trainer_patch.py +++ b/examples/pytorch/llm/src/utils/trainer_patch.py @@ -40,6 +40,8 @@ def on_log(self, control, logs=None, **kwargs): + logs['global_step'] = state.global_step + logs['learning_rate'] = round(logs['learning_rate'], 8) if state.is_local_process_zero and self.training_bar is not None: jsonl_path = os.path.join(args.output_dir, 'logging.jsonl') with open(jsonl_path, 'a') as f: From 89b03fb126df6b28a40286153592db46679664f0 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 24 Aug 2023 02:22:06 +0800 Subject: [PATCH 7/9] fix bug --- examples/pytorch/llm/src/utils/trainer_patch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/llm/src/utils/trainer_patch.py b/examples/pytorch/llm/src/utils/trainer_patch.py index 8d88c62a7b..4b666af7aa 100644 --- a/examples/pytorch/llm/src/utils/trainer_patch.py +++ b/examples/pytorch/llm/src/utils/trainer_patch.py @@ -41,7 +41,8 @@ def on_log(self, logs=None, **kwargs): logs['global_step'] = state.global_step - logs['learning_rate'] = round(logs['learning_rate'], 8) + if 'learning_rate' in logs: + logs['learning_rate'] = round(logs['learning_rate'], 8) if state.is_local_process_zero and self.training_bar is not None: jsonl_path = os.path.join(args.output_dir, 'logging.jsonl') with open(jsonl_path, 'a') as f: From cc60566ab9b7091ea6a27517d16a7ba15b021fd5 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 24 Aug 2023 14:23:34 +0800 Subject: [PATCH 8/9] fix gradient_checkpointing warning --- examples/pytorch/llm/src/llm_sft.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py index 00e54e3ec3..96b824bad7 100644 --- a/examples/pytorch/llm/src/llm_sft.py +++ b/examples/pytorch/llm/src/llm_sft.py @@ -69,7 +69,7 @@ class SftArguments: lora_alpha: int = 32 lora_dropout_p: float = 0.1 - gradient_checkpoint: bool = True + gradient_checkpointing: bool = True batch_size: int = 1 num_train_epochs: int = 1 optim: str = 'adamw_torch' @@ -277,11 +277,12 @@ def llm_sft(args: SftArguments) -> None: push_to_hub=args.push_to_hub, resume_from_checkpoint=args.resume_from_ckpt, ddp_backend=args.ddp_backend, - gradient_checkpointing=args.gradient_checkpoint, + gradient_checkpointing=args.gradient_checkpointing, local_rank=local_rank) - if args.gradient_checkpoint: + if args.gradient_checkpointing: # fix: gradients will be None + model.config.use_cache = False model.enable_input_require_grads() if is_dist(): trainer_args.ddp_find_unused_parameters = False From f4074007a063429c20c0535b7ab7e1770546e3d4 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 24 Aug 2023 14:25:44 +0800 Subject: [PATCH 9/9] update readme --- examples/pytorch/llm/README.md | 2 +- examples/pytorch/llm/README_CN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md 
index 8f438febcb..759a6b5252 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -4,7 +4,7 @@
[hunk body not recoverable: only the -/+ line markers survived extraction]
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index 407c9b9ab5..35632dd624 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -4,7 +4,7 @@
[hunk body not recoverable: only the -/+ line markers survived extraction]
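
Note on patch 1/9: the two CoT loaders follow the file's convention of one zero-argument function per dataset, registered under a string key in DATASET_MAPPING. A minimal sketch of registering a further dataset the same way — the dataset id and mapping key are hypothetical, and the pass-through comment stands in for _processing_alpaca:

from datasets import Dataset as HfDataset
from modelscope import MsDataset


def get_my_sft_dataset() -> HfDataset:
    # 'my-org/my-sft-data' is a hypothetical ModelScope dataset id; the
    # patch itself loads 'YorickHe/CoT' and 'YorickHe/CoT_zh'.
    dataset: HfDataset = MsDataset.load(
        'my-org/my-sft-data', split='train').to_hf_dataset()
    # The real loaders then pass the dataset through _processing_alpaca,
    # which maps instruction/input/output columns into the prompt format.
    return dataset


# Registration mirrors the entries the patch adds to DATASET_MAPPING:
DATASET_MAPPING = {'my-sft': get_my_sft_dataset}

Keeping loaders as plain callables keeps the mapping declarative: a new dataset needs only a function and a key, with no changes to the training entry point.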
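Note on patch 3/9: the chatglm2 bug exists because the model's remote code registers its class under AutoModel rather than AutoModelForCausalLM, which is why get_model_tokenizer_from_repo gains a pluggable automodel_class parameter. A standalone sketch of the corrected load path, assuming 'ZhipuAI/chatglm2-6b' as the ModelScope model id:

import torch
from modelscope import AutoModel, AutoTokenizer, snapshot_download

# Assumed model id; substitute whatever MODEL_MAPPING resolves to.
model_dir = snapshot_download('ZhipuAI/chatglm2-6b')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# AutoModelForCausalLM.from_pretrained would fail here; the remote code
# only wires the model class into AutoModel.
model = AutoModel.from_pretrained(
    model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True)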
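Note on patches 5/9–7/9: ProgressCallbackNew appends one JSON object per logging event to <output_dir>/logging.jsonl, injecting global_step and rounding learning_rate (the patch-7 guard covers entries, such as eval logs, that carry no learning rate), while DefaultFlowCallbackNew forces an evaluation and a save on the final step. A sketch of reading the log file back afterwards — the path is an assumption; the runs/ directory ignored in patch 1/9 is one plausible output_dir:

import json

jsonl_path = 'runs/logging.jsonl'  # hypothetical; actually <output_dir>/logging.jsonl

with open(jsonl_path) as f:
    records = [json.loads(line) for line in f]

for rec in records:
    if 'loss' in rec:  # training entries; eval entries carry eval_loss instead
        print(rec['global_step'], rec['loss'], rec.get('learning_rate'))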
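Note on patch 8/9: besides renaming gradient_checkpoint to gradient_checkpointing, the fix sets model.config.use_cache = False and calls model.enable_input_require_grads(). With only LoRA adapters trainable, the inputs reaching the first checkpointed block otherwise carry no grad_fn, so the adapter gradients come back as None. A minimal sketch of the same recipe applied directly to any transformers model:

from transformers import PreTrainedModel


def prepare_gradient_checkpointing(model: PreTrainedModel) -> None:
    # The KV cache is unused during training and conflicts with
    # checkpointing, so it is disabled up front.
    model.config.use_cache = False
    # Make embedding outputs require grad; otherwise the recomputed
    # blocks see inputs without a grad_fn and LoRA gradients are None.
    model.enable_input_require_grads()
    # The same switch that gradient_checkpointing=True flips in the trainer.
    model.gradient_checkpointing_enable()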