diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index f9e08a2fbf..e1bcf8da00 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -22,6 +22,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
 
     pip install -r requirements/framework.txt -U -i https://mirrors.aliyun.com/pypi/simple/
     pip install diffusers decord einops -U -i https://mirrors.aliyun.com/pypi/simple/
+    pip install autoawq!=0.2.7.post3 -U --no-deps
 
     # test with install
     pip install .
diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
index 2b03562ead..283912e133 100644
--- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
+++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md"
@@ -107,9 +107,9 @@
 - 🔥gradient_accumulation_steps: 梯度累加，默认为1
 - 🔥save_strategy: 保存模型的策略, 可选为'no', 'steps', 'epoch', 默认为'steps'
 - 🔥save_steps: 默认为500
-- 🔥save_total_limit: 默认为None, 保存所有的checkpoint
-- 🔥eval_strategy: 评估策略, 跟随`save_strategy`策略
+- 🔥eval_strategy: 默认为None, 评估策略, 跟随`save_strategy`的策略
 - 🔥eval_steps: 默认为None, 如果存在评估数据集, 则跟随`save_steps`的策略
+- 🔥save_total_limit: 默认为None, 保存所有的checkpoint
 - max_steps: 默认为-1，最大训练的steps数，在数据集为流式时需要设置
 - 🔥warmup_ratio: 默认为0.
 - save_on_each_node: 默认为False. 在多机训练时需要被考虑
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index db6ff7967e..fc959e9779 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -107,9 +107,9 @@ Other important parameters:
 - 🔥gradient_accumulation_steps: Gradient accumulation, default is 1.
 - 🔥save_strategy: Strategy for saving the model, options are 'no', 'steps', 'epoch', default is 'steps'.
 - 🔥save_steps: Default is 500.
-- 🔥save_total_limit: Default is None, saving all checkpoints.
-- 🔥eval_strategy: Evaluation strategy, follows `save_strategy`.
+- 🔥eval_strategy: Evaluation strategy. Default is None, in which case it follows `save_strategy`.
 - 🔥eval_steps: Default is None. If evaluation dataset exists, follows `save_steps`.
+- 🔥save_total_limit: Default is None, saving all checkpoints.
 - max_steps: Default is -1, maximum number of training steps. Must be set when the dataset is streaming.
 - 🔥warmup_ratio: Default is 0.
 - save_on_each_node: Default is False. To be considered in multi-machine training.
diff --git a/requirements/framework.txt b/requirements/framework.txt
index 4a3aac71bb..541d01414e 100644
--- a/requirements/framework.txt
+++ b/requirements/framework.txt
@@ -28,7 +28,7 @@ sentencepiece
 tensorboard
 tiktoken
 tqdm
-transformers>=4.33,<4.47
+transformers>=4.33,<4.48
 transformers_stream_generator
 trl>=0.11,<0.12
 uvicorn
diff --git a/swift/llm/argument/train_args.py b/swift/llm/argument/train_args.py
index 865fd9a466..165abe231d 100644
--- a/swift/llm/argument/train_args.py
+++ b/swift/llm/argument/train_args.py
@@ -34,12 +34,24 @@ class Seq2SeqTrainingOverrideArguments(Seq2SeqTrainingArguments):
     report_to: List[str] = field(default_factory=lambda: ['tensorboard'])
     remove_unused_columns: bool = False
     logging_first_step: bool = True
+    eval_strategy: Optional[str] = None  # steps, epoch
 
     def _init_output_dir(self):
         if self.output_dir is not None:
             return
         self.output_dir = f'output/{self.model_suffix}'
 
+    def _init_eval_strategy(self):
+        """Resolve eval_strategy and eval_steps, falling back to the save_* settings when unset."""
+        strategy = self.save_strategy if self.eval_strategy is None else self.eval_strategy
+        self.eval_strategy = self.evaluation_strategy = strategy
+        if strategy == 'no':
+            # Evaluation disabled: clear eval_steps and zero split_dataset_ratio.
+            self.eval_steps, self.split_dataset_ratio = None, 0.
+            logger.info(f'Setting args.split_dataset_ratio: {self.split_dataset_ratio}')
+        elif strategy == 'steps' and self.eval_steps is None:
+            self.eval_steps = self.save_steps
+
     def __post_init__(self):
         self._init_output_dir()
         if self.metric_for_best_model is None:
@@ -56,16 +68,7 @@ def __post_init__(self):
             self.lr_scheduler_kwargs = self.parse_to_dict(self.lr_scheduler_kwargs)
         if getattr(self, 'gradient_checkpointing_kwargs', None):
             self.gradient_checkpointing_kwargs = self.parse_to_dict(self.gradient_checkpointing_kwargs)
-
-        if len(self.val_dataset) == 0 and self.split_dataset_ratio == 0:
-            self.evaluation_strategy = IntervalStrategy.NO
-            self.eval_strategy = IntervalStrategy.NO
-            self.eval_steps = None
-        else:
-            self.evaluation_strategy = self.save_strategy
-            self.eval_strategy = self.save_strategy
-            if self.eval_steps is None:
-                self.eval_steps = self.save_steps
+        self._init_eval_strategy()
 
 
 @dataclass
diff --git a/tests/train/test_sft.py b/tests/train/test_sft.py
index 4c580926ba..b160e0788d 100644
--- a/tests/train/test_sft.py
+++ b/tests/train/test_sft.py
@@ -304,6 +304,36 @@ def test_emu3_gen():
     infer_main(args)
 
 
+def test_eval_strategy():
+    """Run sft_main with eval_strategy='no', then run inference from the last checkpoint."""
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+    from swift.llm import sft_main, TrainArguments, infer_main, InferArguments
+    train_args = TrainArguments(
+        model='qwen/Qwen2-7B-Instruct',
+        eval_strategy='no',
+        dataset=['AI-ModelScope/alpaca-gpt4-data-zh#100', 'AI-ModelScope/alpaca-gpt4-data-en#100'],
+        **kwargs)
+    ckpt_dir = sft_main(train_args)['last_model_checkpoint']
+    infer_main(InferArguments(adapters=ckpt_dir, load_data_args=True))
+
+
+def test_epoch():
+    """Run sft_main for 3 epochs with save_strategy='epoch', then infer from the last checkpoint."""
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+    from swift.llm import sft_main, TrainArguments, infer_main, InferArguments
+    train_kwargs = kwargs.copy()
+    train_kwargs['num_train_epochs'] = 3
+    # Keep the return value: its 'last_model_checkpoint' entry is read below for inference.
+    result = sft_main(
+        TrainArguments(
+            model='qwen/Qwen2-7B-Instruct',
+            dataset=['AI-ModelScope/alpaca-gpt4-data-zh#50', 'AI-ModelScope/alpaca-gpt4-data-en#50'],
+            save_strategy='epoch',
+            **train_kwargs))
+    last_model_checkpoint = result['last_model_checkpoint']
+    infer_main(InferArguments(adapters=last_model_checkpoint, load_data_args=True))
+
+
 if __name__ == '__main__':
     # test_llm_ddp()
     # test_mllm_mp()
@@ -325,5 +355,7 @@ def test_emu3_gen():
     # test_template()
     # test_qwen_vl()
     # test_qwen2_audio()
-    test_emu3_gen()
+    # test_emu3_gen()
     # test_unsloth()
+    # test_eval_strategy()
+    test_epoch()