diff --git a/.gitignore b/.gitignore
index 6cc2df63a4..90e5d4a6cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -131,3 +131,5 @@
 result.mp4
 # ast template
 ast_index_file.py
+
+runs/
\ No newline at end of file
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index cd002cce62..759a6b5252 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -4,7 +4,7 @@
@@ -16,10 +16,10 @@
 
 ## Features
 1. supported sft method: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine tuning), ...
-2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ...
+2. supported models: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
 3. supported feature: quantization, ddp, model parallelism(device map), gradient checkpoint, gradient accumulation steps, push to modelscope hub, custom datasets, ...
-4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ...
-5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ...
+4. supported datasets: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh
+5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default
 
 ## Prepare the Environment
 Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization)
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index a47849d222..35632dd624 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -4,7 +4,7 @@
@@ -17,10 +17,10 @@
 
 ## 特性
 1. [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调, ...
-2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b, ...
+2. 支持的模型: [**qwen-7b**](https://github.com/QwenLM/Qwen-7B), qwen-7b-chat, baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, polylm-13b
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpoint, 梯度累加, 支持推送modelscope hub, 支持自定义数据集, ...
-4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, ...
-5. 支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default, ...
+4. 支持的数据集: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh
+5. 支持的template: chatml(qwen), baichuan, chatglm2, llama, openbuddy_llama, default
 
 ## 准备实验环境
 实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化)
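The `cot-en` / `cot-zh` names added to both READMEs map to loader functions registered in `DATASET_MAPPING` (see the `src/utils/dataset.py` hunk further down). A minimal lookup sketch, assuming the module is importable as `utils.dataset` when running from `examples/pytorch/llm/src`:

```python
# Hypothetical sketch: resolve a dataset name listed in the README to its loader.
# Assumes the working directory is examples/pytorch/llm/src so that `utils` is importable.
from utils.dataset import DATASET_MAPPING

load_fn = DATASET_MAPPING['cot-en']  # -> get_cot_en_dataset, added in the hunk below
dataset = load_fn()                  # MsDataset.load('YorickHe/CoT') converted to an HF Dataset
print(dataset[0])
```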
diff --git a/examples/pytorch/llm/src/llm_infer.py b/examples/pytorch/llm/src/llm_infer.py
index d9d0ead270..1a0ecbd6b4 100644
--- a/examples/pytorch/llm/src/llm_infer.py
+++ b/examples/pytorch/llm/src/llm_infer.py
@@ -18,7 +18,8 @@
 @dataclass
 class InferArguments:
     model_type: str = field(
-        default='qwen-7b-chat', metadata={'choices': list(MODEL_MAPPING.keys())})
+        default='qwen-7b-chat',
+        metadata={'choices': list(MODEL_MAPPING.keys())})
     sft_type: str = field(
         default='lora', metadata={'choices': ['lora', 'full']})
     template_type: str = field(
diff --git a/examples/pytorch/llm/src/llm_sft.py b/examples/pytorch/llm/src/llm_sft.py
index e3f4a73e22..96b824bad7 100644
--- a/examples/pytorch/llm/src/llm_sft.py
+++ b/examples/pytorch/llm/src/llm_sft.py
@@ -69,7 +69,7 @@ class SftArguments:
     lora_alpha: int = 32
     lora_dropout_p: float = 0.1
 
-    gradient_checkpoint: bool = True
+    gradient_checkpointing: bool = True
     batch_size: int = 1
     num_train_epochs: int = 1
     optim: str = 'adamw_torch'
@@ -84,6 +84,7 @@ class SftArguments:
     save_steps: Optional[int] = None
     save_total_limit: int = 2
     logging_steps: int = 5
+    dataloader_num_workers: int = 1
 
     push_to_hub: bool = False
     # 'user_name/repo_name' or 'repo_name'
@@ -263,7 +264,7 @@ def llm_sft(args: SftArguments) -> None:
         bf16=args.bf16,
         fp16=args.fp16,
         eval_steps=args.eval_steps,
-        dataloader_num_workers=1,
+        dataloader_num_workers=args.dataloader_num_workers,
         load_best_model_at_end=True,
         metric_for_best_model='loss',
         greater_is_better=False,
@@ -276,11 +277,12 @@ def llm_sft(args: SftArguments) -> None:
         push_to_hub=args.push_to_hub,
         resume_from_checkpoint=args.resume_from_ckpt,
         ddp_backend=args.ddp_backend,
-        gradient_checkpointing=args.gradient_checkpoint,
+        gradient_checkpointing=args.gradient_checkpointing,
         local_rank=local_rank)
 
-    if args.gradient_checkpoint:
+    if args.gradient_checkpointing:
         # fix: gradients will be None
+        model.config.use_cache = False
         model.enable_input_require_grads()
     if is_dist():
         trainer_args.ddp_find_unused_parameters = False
diff --git a/examples/pytorch/llm/src/utils/dataset.py b/examples/pytorch/llm/src/utils/dataset.py
index 164878a270..2f2a9a7d2c 100644
--- a/examples/pytorch/llm/src/utils/dataset.py
+++ b/examples/pytorch/llm/src/utils/dataset.py
@@ -110,6 +110,18 @@ def get_instinwild_en_dataset():
     return _processing_alpaca(dataset)
 
 
+def get_cot_en_dataset() -> HfDataset:
+    dataset: HfDataset = MsDataset.load(
+        'YorickHe/CoT', split='train').to_hf_dataset()
+    return _processing_alpaca(dataset)
+
+
+def get_cot_zh_dataset() -> HfDataset:
+    dataset: HfDataset = MsDataset.load(
+        'YorickHe/CoT_zh', split='train').to_hf_dataset()
+    return _processing_alpaca(dataset)
+
+
 DATASET_MAPPING = {
     'alpaca-en': get_alpaca_gpt4_en_dataset,
     'alpaca-zh': get_alpaca_gpt4_zh_dataset,
@@ -120,8 +132,10 @@ def get_instinwild_en_dataset():
         for k in _multi_alpaca_language_list
     },
     'code-en': get_code_alpaca_en_dataset,
-    'instinwild-zh': get_instinwild_zh_dataset,
     'instinwild-en': get_instinwild_en_dataset,
+    'instinwild-zh': get_instinwild_zh_dataset,
+    'cot-en': get_cot_en_dataset,
+    'cot-zh': get_cot_zh_dataset,
 }
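Regarding the `gradient_checkpointing` rename and the added `model.config.use_cache = False` in `llm_sft.py` above: activation recomputation is incompatible with the KV cache, and embedding outputs must be forced to require grad, otherwise the recomputed graph yields None gradients. A minimal sketch of the same pattern on a plain `transformers` model (the model name is illustrative, not the repo's code path):

```python
# Illustrative sketch of the gradient-checkpointing pattern used above.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('gpt2')  # any causal LM; 'gpt2' is just an example
model.gradient_checkpointing_enable()   # recompute activations in the backward pass
model.config.use_cache = False          # the KV cache conflicts with recomputation
model.enable_input_require_grads()      # keep the graph connected so grads are not None
```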
diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py
index ec165bf247..7a3b162ab0 100644
--- a/examples/pytorch/llm/src/utils/model.py
+++ b/examples/pytorch/llm/src/utils/model.py
@@ -4,8 +4,8 @@
 from typing import NamedTuple, Optional
 
 import torch
-from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, Model,
-                        read_config, snapshot_download)
+from modelscope import (AutoConfig, AutoModel, AutoModelForCausalLM,
+                        AutoTokenizer, Model, read_config, snapshot_download)
 from torch import dtype as Dtype
 
 from swift import get_logger
@@ -18,6 +18,7 @@ def get_model_tokenizer_from_repo(model_dir: str,
                                   load_model: bool = True,
                                   model_config=None,
                                   tokenizer=None,
+                                  automodel_class=AutoModelForCausalLM,
                                   **model_kwargs):
     """load from an independent repository"""
     if model_config is None:
@@ -30,7 +31,7 @@
             model_dir, trust_remote_code=True)
     model = None
     if load_model:
-        model = AutoModelForCausalLM.from_pretrained(
+        model = automodel_class.from_pretrained(
             model_dir,
             config=model_config,
             torch_dtype=torch_dtype,
@@ -88,8 +89,12 @@ def get_model_tokenizer_chatglm2(model_dir: str,
         model_kwargs['quantization_config'].llm_int8_skip_modules = [
             'output_layer'
         ]
-    return get_model_tokenizer_from_repo(model_dir, torch_dtype, load_model,
-                                         **model_kwargs)
+    return get_model_tokenizer_from_repo(
+        model_dir,
+        torch_dtype,
+        load_model,
+        automodel_class=AutoModel,
+        **model_kwargs)
 
 
 def get_model_tokenizer_llama2(model_dir: str,
diff --git a/examples/pytorch/llm/src/utils/preprocess.py b/examples/pytorch/llm/src/utils/preprocess.py
index 19403122c5..9851dbcdc5 100644
--- a/examples/pytorch/llm/src/utils/preprocess.py
+++ b/examples/pytorch/llm/src/utils/preprocess.py
@@ -51,8 +51,8 @@
 
 
 def simplify_context_list(context_list: List[Context]) -> List[Context]:
-    res = []
-    temp = []
+    res: List[Context] = []
+    temp: List[str] = []
     for c in context_list:
         if isinstance(c, str):
             temp.append(c)
@@ -89,7 +89,7 @@ def concat_context_list(
 
 def _encode(tokenizer: PreTrainedTokenizer, context_list: List[Context],
             placeholder_list: List[str]) -> List[int]:
-    input_ids = []
+    input_ids: List[int] = []
     placeholder_it = iter(placeholder_list)
     for context in context_list:
         if isinstance(context, list):
@@ -126,8 +126,8 @@ def _preprocess(
     template_config = TEMPLATE_MAPPING[template_type]
     if system is None:
         system = DEFAULT_SYSTEM
-    total_context_list = []
-    placeholder_list = []
+    total_context_list: List[Context] = []
+    placeholder_list: List[str] = []
     concat_context_list(
         template_config['prefix'],
         total_context_list,
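The new `automodel_class` hook above lets chatglm2 load through `AutoModel` (its remote code appears to register under `AutoModel` rather than `AutoModelForCausalLM`), while every other model keeps the default. A rough usage sketch, assuming `get_model_tokenizer_from_repo` (defined in `src/utils/model.py`) returns a `(model, tokenizer)` pair; the checkpoint directory is hypothetical:

```python
# Rough sketch of calling the repo helper with the new automodel_class argument.
import torch
from modelscope import AutoModel

model_dir = '/path/to/chatglm2-6b'  # hypothetical local checkpoint directory
model, tokenizer = get_model_tokenizer_from_repo(
    model_dir,
    torch_dtype=torch.bfloat16,
    automodel_class=AutoModel)      # chatglm2 goes through AutoModel instead of AutoModelForCausalLM
```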
diff --git a/examples/pytorch/llm/src/utils/trainer_patch.py b/examples/pytorch/llm/src/utils/trainer_patch.py
new file mode 100644
index 0000000000..4b666af7aa
--- /dev/null
+++ b/examples/pytorch/llm/src/utils/trainer_patch.py
@@ -0,0 +1,62 @@
+import os
+
+import json
+from tqdm import tqdm
+from transformers.trainer_callback import (DefaultFlowCallback,
+                                            ProgressCallback, TrainerControl,
+                                            TrainerState)
+from transformers.trainer_utils import has_length
+
+from swift.trainers import TrainingArguments
+
+
+class ProgressCallbackNew(ProgressCallback):
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        if state.is_local_process_zero:
+            self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True)
+            self.current_step = 0
+
+    def on_prediction_step(self,
+                           args,
+                           state: TrainerState,
+                           control,
+                           eval_dataloader=None,
+                           **kwargs):
+        if state.is_local_process_zero and has_length(eval_dataloader):
+            if self.prediction_bar is None:
+                self.training_bar.refresh()
+                self.training_bar.fp.write('\n')
+                self.prediction_bar = tqdm(
+                    total=len(eval_dataloader),
+                    leave=True,
+                    dynamic_ncols=True,
+                    position=0)
+            self.prediction_bar.update()
+
+    def on_log(self,
+               args: TrainingArguments,
+               state: TrainerState,
+               control,
+               logs=None,
+               **kwargs):
+        logs['global_step'] = state.global_step
+        if 'learning_rate' in logs:
+            logs['learning_rate'] = round(logs['learning_rate'], 8)
+        if state.is_local_process_zero and self.training_bar is not None:
+            jsonl_path = os.path.join(args.output_dir, 'logging.jsonl')
+            with open(jsonl_path, 'a') as f:
+                f.write(json.dumps(logs) + '\n')
+        super().on_log(args, state, control, logs, **kwargs)
+
+
+class DefaultFlowCallbackNew(DefaultFlowCallback):
+
+    def on_step_end(self, args: TrainingArguments, state: TrainerState,
+                    control: TrainerControl, **kwargs):
+        control = super().on_step_end(args, state, control, **kwargs)
+        # save the last ckpt
+        if state.global_step == state.max_steps:
+            control.should_evaluate = True
+            control.should_save = True
+        return control
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py
index b279ac81f2..84383b2177 100644
--- a/examples/pytorch/llm/src/utils/utils.py
+++ b/examples/pytorch/llm/src/utils/utils.py
@@ -1,19 +1,39 @@
+import logging
 import os
 from typing import List, Optional, Tuple
 
 import matplotlib.pyplot as plt
 import torch
 import torch.distributed as dist
+from modelscope.utils.logger import get_logger as get_ms_logger
 from torch import dtype as Dtype
 from torch.nn import Linear, Module
-from transformers import GenerationConfig, TextStreamer
+from transformers import GenerationConfig, TextStreamer, trainer
 
 from swift import get_logger
+from swift.utils import is_master
 from swift.utils.tb_utils import (TB_COLOR, TB_COLOR_SMOOTH,
                                   read_tensorboard_file, tensorboard_smoothing)
+from .trainer_patch import DefaultFlowCallbackNew, ProgressCallbackNew
+
+# monkey patch
+trainer.DEFAULT_PROGRESS_CALLBACK = ProgressCallbackNew
+trainer.DEFAULT_CALLBACKS = [DefaultFlowCallbackNew]
 
-os.environ['TOKENIZERS_PARALLELISM'] = 'true'
 logger = get_logger()
+ms_logger = get_ms_logger()
+
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+logger_format = logging.Formatter('[%(levelname)s:%(name)s] %(message)s')
+
+logger.handlers[0].setFormatter(logger_format)
+ms_logger.handlers[0].setFormatter(logger_format)
+if is_master():
+    logger.setLevel(logging.INFO)
+    ms_logger.setLevel(logging.INFO)
+else:
+    logger.setLevel(logging.ERROR)
+    ms_logger.setLevel(logging.ERROR)
 
 DTYPE_MAPPING = {
     'fp16': torch.float16,
diff --git a/swift/utils/torch_utils.py b/swift/utils/torch_utils.py
index 1c8085fead..c90d4b38ab 100644
--- a/swift/utils/torch_utils.py
+++ b/swift/utils/torch_utils.py
@@ -97,15 +97,6 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
     logger.info(''.join(s))
 
 
-def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None:
-    named_p = list(model.named_parameters())
-    for i, (n, p) in enumerate(named_p):
-        if max_lines is not None and i >= max_lines:
-            logger.info('...')
-            break
-        logger.info(f'{n}: requires_grad={p.requires_grad}')
-
-
 def get_seed(random_state: RandomState) -> int:
     seed_max = np.iinfo(np.int32).max
     seed = random_state.randint(0, seed_max)
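With the monkey patch in `utils.py`, transformers picks up `ProgressCallbackNew` and `DefaultFlowCallbackNew` by default, so each logging event is also appended as one JSON object to `<output_dir>/logging.jsonl`. A small sketch of reading that file after a run; the output directory name is hypothetical:

```python
# Sketch: consume the per-step metrics written by ProgressCallbackNew.on_log.
# Keys follow the transformers log dict plus the 'global_step' field added above.
import json
import os

output_dir = 'runs/qwen-7b-chat'  # hypothetical output_dir (runs/ is also git-ignored above)
with open(os.path.join(output_dir, 'logging.jsonl')) as f:
    for line in f:
        record = json.loads(line)
        print(record['global_step'], record.get('loss'), record.get('learning_rate'))
```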