From d592f8e1ad8dccd5e731ca2f8fe8794ab574f1e0 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 10:55:01 +0800
Subject: [PATCH 1/7] merge main return

---
 README.md                         |  2 +-
 README_CN.md                      |  2 +-
 examples/pytorch/llm/README.md    |  2 +-
 examples/pytorch/llm/README_CN.md |  2 +-
 examples/pytorch/llm/llm_infer.py |  3 ++-
 examples/pytorch/llm/llm_sft.py   |  4 ++--
 swift/llm/infer.py                | 12 ++++++++++++
 swift/llm/sft.py                  |  9 +++++++--
 tests/llm/test_run.py             |  9 ++++++---
 9 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 4e8f2f9dcf..638b258fa2 100644
--- a/README.md
+++ b/README.md
@@ -141,7 +141,7 @@ sft_args = SftArguments(
     dataset=[DatasetName.blossom_math_zh],
     output_dir='output',
     gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
 print(f'best_ckpt_dir: {best_ckpt_dir}')
 torch.cuda.empty_cache()
 infer_args = InferArguments(
diff --git a/README_CN.md b/README_CN.md
index e61258079c..34a392c425 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -138,7 +138,7 @@ sft_args = SftArguments(
     dataset=[DatasetName.blossom_math_zh],
     output_dir='output',
     gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
 print(f'best_ckpt_dir: {best_ckpt_dir}')
 torch.cuda.empty_cache()
 infer_args = InferArguments(
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 51b1f457fc..7303f30aaa 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -104,7 +104,7 @@ sft_args = SftArguments(
     dataset=[DatasetName.blossom_math_zh],
     output_dir='output',
     gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
 print(f'best_ckpt_dir: {best_ckpt_dir}')
 torch.cuda.empty_cache()
 infer_args = InferArguments(
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index def15e6c95..6124b933cd 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -103,7 +103,7 @@ sft_args = SftArguments(
     dataset=[DatasetName.blossom_math_zh],
     output_dir='output',
     gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
 print(f'best_ckpt_dir: {best_ckpt_dir}')
 torch.cuda.empty_cache()
 infer_args = InferArguments(
diff --git a/examples/pytorch/llm/llm_infer.py b/examples/pytorch/llm/llm_infer.py
index 3685d63241..2fb5551f37 100644
--- a/examples/pytorch/llm/llm_infer.py
+++ b/examples/pytorch/llm/llm_infer.py
@@ -4,4 +4,5 @@
 from swift.llm.run import infer_main
 
 if __name__ == '__main__':
-    infer_main()
+    result = infer_main()
+    print(f'infer_main result: {result}')
diff --git a/examples/pytorch/llm/llm_sft.py b/examples/pytorch/llm/llm_sft.py
index 7473cf41ff..d50a532436 100644
--- a/examples/pytorch/llm/llm_sft.py
+++ b/examples/pytorch/llm/llm_sft.py
@@ -4,5 +4,5 @@
 from swift.llm.run import sft_main
 
 if __name__ == '__main__':
-    best_ckpt_dir = sft_main()
-    print(f'best_ckpt_dir: {best_ckpt_dir}')
+    output = sft_main()
+    print(f'sft_main output: {output}')
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index ad398fe662..43b07b509c 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -132,6 +132,7 @@ def llm_infer(args: InferArguments) -> None:
         assert args.ckpt_dir is not None
         model.generation_config.save_pretrained(args.ckpt_dir)
     # Inference
+    result = []
     jsonl_path = None
     if args.save_result:
         time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
@@ -143,6 +144,11 @@ def llm_infer(args: InferArguments) -> None:
             if jsonl_path is not None:
                 item = history[0]
                 save_result_to_jsonl(jsonl_path, item[0], item[1])
+            result.append({
+                'query': item[0],
+                'response': item[1],
+                'label': None
+            })
     else:
         _, val_dataset = get_dataset(args.dataset, args.dataset_test_ratio,
                                      args.dataset_seed)
@@ -163,9 +169,15 @@ def llm_infer(args: InferArguments) -> None:
             if jsonl_path is not None:
                 item = history[0]
                 save_result_to_jsonl(jsonl_path, item[0], item[1], label)
+            result.append({
+                'query': item[0],
+                'response': item[1],
+                'label': label
+            })
             print()
             print(f'[LABELS]{label}')
             print('-' * 80)
             # input('next[ENTER]')
     if args.save_result:
         logger.info(f'save_result_path: {jsonl_path}')
+    return {'result': result}
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index b9ecd258a6..5952404fa5 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -267,7 +267,7 @@ def llm_sft(args: SftArguments) -> str:
                     f,
                     ensure_ascii=False,
                     indent=2)
-    trainer.train(training_args.resume_from_checkpoint)
+    res = trainer.train(training_args.resume_from_checkpoint)
     logger.info(
         f'best_model_checkpoint: {trainer.state.best_model_checkpoint}')
 
@@ -280,4 +280,9 @@ def llm_sft(args: SftArguments) -> str:
         if args.push_to_hub:
             trainer._add_patterns_to_gitignores(['images/'])
             trainer.push_to_hub()
-    return trainer.state.best_model_checkpoint
+    return {
+        'best_model_checkpoint': trainer.state.best_model_checkpoint,
+        'best_metric': trainer.state.best_metric,
+        'global_step': res.global_step,
+        'log_history': trainer.state.log_history,
+    }
diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py
index 2b1cb87c3b..d3d4de40c3 100644
--- a/tests/llm/test_run.py
+++ b/tests/llm/test_run.py
@@ -39,7 +39,9 @@ def test_run_1(self):
             dataset=[DatasetName.jd_sentiment_zh],
             output_dir=output_dir,
             gradient_checkpointing=True)
-        best_ckpt_dir = sft_main(sft_args)
+        output = sft_main(sft_args)
+        print(output)
+        best_ckpt_dir = output['best_model_checkpoint']
         print(f'best_ckpt_dir: {best_ckpt_dir}')
         torch.cuda.empty_cache()
         if __name__ == '__main__':
@@ -48,7 +50,8 @@ def test_run_1(self):
                 stream=False,
                 show_dataset_sample=5,
                 merge_lora_and_save=True)
-            infer_main(infer_args)
+            result = infer_main(infer_args)
+            print(result)
             torch.cuda.empty_cache()
         # if __name__ == '__main__':
         #     web_ui_main(infer_args)
@@ -80,7 +83,7 @@ def test_run_2(self):
             'true',
             '--max_new_tokens',
             '100',
-        ])
+        ])['best_model_checkpoint']
         print(f'best_ckpt_dir: {best_ckpt_dir}')
         torch.cuda.empty_cache()
         infer_main([

From fd5f9197f1319e81682f4608d3b5acad19cf87bf Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:14:02 +0800
Subject: [PATCH 2/7] update temperature sh

---
 .../scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh    |  4 +---
 .../scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh    |  4 +---
 .../scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh   |  4 +---
 .../baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh      |  4 +---
 .../pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh    |  4 +---
 .../llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh    |  4 +---
 .../llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh |  4 +---
 .../scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh    |  4 +---
 .../baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh       |  4 +---
 .../scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh    |  4 +---
 .../pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh   |  4 +---
 .../llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh       |  4 +---
 examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh |  4 +---
 .../llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh       |  4 +---
 .../pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh |  4 +---
 .../pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh |  4 +---
 .../llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh  |  4 +---
 .../custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh     |  4 +---
 .../scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh    |  4 +---
 .../pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh |  4 +---
 .../pytorch/llm/scripts/internlm_20b/qlora/infer.sh    |  4 +---
 .../llm/scripts/internlm_20b_chat/lora_ddp/infer.sh    |  4 +---
 .../llm/scripts/internlm_20b_chat/qlora/infer.sh       |  4 +---
 .../llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh   |  4 +---
 .../llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh  |  4 +---
 .../llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh  |  4 +---
 .../llm/scripts/llama2_70b_chat/qlora_mp/infer.sh      |  4 +---
 .../llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh   |  4 +---
 .../llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh   |  4 +---
 .../openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh    |  4 +---
 .../openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh    |  4 +---
 .../openbuddy_llama2_70b_chat/qlora_mp/infer.sh        |  4 +---
 .../openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh     |  4 +---
 .../openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh     |  4 +---
 .../llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh       |  4 +---
 .../pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh  |  4 +---
 examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh   |  4 +---
 .../pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh |  4 +---
 .../llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh     |  4 +---
 .../pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh   |  4 +---
 .../llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh    |  4 +---
 .../llm/scripts/qwen_14b_chat_int4/qlora/infer.sh      |  4 +---
 .../scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh   |  4 +---
 .../llm/scripts/qwen_14b_chat_int8/qlora/infer.sh      |  4 +---
 .../scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh   |  4 +---
 .../pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh   |  4 +---
 .../pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh     |  4 +---
 .../pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh  |  4 +---
 .../llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh      |  4 +---
 .../pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh     |  4 +---
 .../pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh |  4 +---
 .../llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh      |  4 +---
 .../llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh      |  4 +---
 .../pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh    |  4 +---
 .../llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh        |  4 +---
 .../llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh     |  4 +---
 .../llm/scripts/qwen_7b_chat_int4/qlora/infer.sh       |  4 +---
 .../scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh    |  4 +---
 .../llm/scripts/qwen_7b_chat_int8/qlora/infer.sh       |  4 +---
 .../scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh    |  4 +---
 .../pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh   |  4 +---
 .../llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh      |  4 +---
 .../pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh    |  4 +---
 .../llm/scripts/qwen_vl_chat_int4/qlora/infer.sh       |  4 +---
 .../scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh    |  4 +---
 examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh |  4 +---
 .../pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh  |  4 +---
 .../pytorch/llm/scripts/skywork_13b/qlora/infer.sh     |  4 +---
 .../tongyi_finance_14b_chat_int4/qlora/infer.sh        |  4 +---
 examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh |  4 +---
 .../pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh   |  4 +---
 .../pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh    |  4 +---
 examples/pytorch/llm/scripts/yi_6b/lora/infer.sh       |  4 +---
 .../pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh  |  4 +---
 .../llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh   |  4 +---
 swift/llm/infer.py                                     |  6 +++---
 swift/llm/utils/argument.py                            | 10 +++++-----
 77 files changed, 83 insertions(+), 233 deletions(-)

diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
index 6760c2364c..8b1e838f60 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
index 9627c8b84d..c41a44566e 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
index c3a7622c8f..2b7f125caa 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
index 24aa5f670a..a1784975e9 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
index 0d6d352ab6..41a169e4c2 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
index bcab229e2f..def08b768d 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
index a83052ac99..f82997b55e 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
index a83052ac99..f82997b55e 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
index de565f58a2..ab2967d46d 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
index 9b86a969b2..27944dbe4a 100644
--- a/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
index 4055c4e1e0..f98bb1a502 100644
--- a/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
index 253db589ef..6daca4b5e7 100644
--- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
index bfd10b75d3..e1f8f5239d 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
index e557ff7e18..61819a9333 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
index 67f4aef1c6..c5cdce1c2f 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
index 039773d331..0d00b9b74b 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
index c55164ef6d..9a85ff57f3 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
index d0e916ce42..7470ac92f3 100644
--- a/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
index 7f87800282..c3a5b0555e 100644
--- a/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
index f052c7e44e..940ab0ada8 100644
--- a/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh b/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
index e186723f97..2dc5933123 100644
--- a/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
index 5e85d10514..8aa84aab31 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
index a2370ccc27..5caa8fac22 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
index a2370ccc27..5caa8fac22 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
index 734781e2f2..5039d3ae65 100644
--- a/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
index a3a6d3da80..d7eb2e12c7 100644
--- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
index 13898a94c3..94dc8b7d89 100644
--- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
index a74b1acbd1..84479922dd 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
index 4a2cfb4701..6d1f97cc4b 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
index 2aa957fc83..48e862181b 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
index be363770d3..21d7935052 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
index 7c0c8a8675..14afadc0c8 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
index c73c117aff..2698b59160 100644
--- a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
index de551d54b1..620775a01e 100644
--- a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 4096 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
index 6bccac7005..e4f571db80 100644
--- a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
index 62bb4e058b..1c9aaa8b9f 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn true \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
index 486a96ab06..a5eaafa8eb 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
index 486a96ab06..a5eaafa8eb 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
index d59a4833c8..0a5cb80521 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn true \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
index ede3d0cdb8..f19c14dc24 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
index 9f7905a3a1..44e1278cfc 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
index b53ae8f5f9..f60f9e18c8 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
index b53ae8f5f9..f60f9e18c8 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
index 933fb2cb30..ac1a7a90cb 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
index 933fb2cb30..73e64fd235 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
index bc0ee980bf..6ec7b9ff0f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
index 3e23f52f45..c2813f6c4d 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
index 6748277f70..60fe98d23f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
@@ -8,8 +8,6 @@ python llm_infer.py \
     --max_length 6144 \
     --use_flash_attn true \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
index 6748277f70..0c3b83b2df 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -8,8 +8,6 @@ python llm_infer.py \
     --max_length 6144 \
     --use_flash_attn true \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
index b618573ba7..480a89db35 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
index 6436ad0e87..65bf889d90 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
index 6436ad0e87..65bf889d90 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
index 9540587f4e..398e4f4ded 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
index 02ccf806c9..82fd16d225 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
index 0e4b2b80f3..2959370e59 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
index 0e4b2b80f3..2959370e59 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
index e130e0a910..510ba1be0e 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
index e130e0a910..510ba1be0e 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
index abe378d8d8..c58840c899 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
index abe378d8d8..c58840c899 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.1 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
index 70f443eebc..b3ca5a9436 100644
--- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
index 946aa74c6e..a1df4916ea 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
@@ -9,9 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
index 08b340b91e..0bf729f351 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
index 77ea922f86..915897101d 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
index 77ea922f86..915897101d 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
index 544d0359b5..89a0c40e42 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
@@ -7,8 +7,6 @@ python llm_infer.py \
     --eval_human false \
     --max_length 1024 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
index 544d0359b5..89a0c40e42 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
@@ -7,8 +7,6 @@ python llm_infer.py \
     --eval_human false \
     --max_length 1024 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh b/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
index 9ff67e0256..6caa3c6788 100644
--- a/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
index 4093b4f4bf..091bf4d63e 100644
--- a/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
@@ -8,9 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --use_flash_attn false \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.3 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh b/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
index afd8b5fa4a..90abe69d58 100644
--- a/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
index 2611eb66b9..3b4d9f5c02 100644
--- a/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
index bed2f2620b..9bb3c1762b 100644
--- a/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh b/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
index 696c04c810..25c177f5ab 100644
--- a/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
index b8f6ad1858..236d6840de 100644
--- a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
index b8f6ad1858..236d6840de 100644
--- a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
@@ -7,9 +7,7 @@ python llm_infer.py \
     --eval_human false \
     --max_length 2048 \
     --max_new_tokens 2048 \
-    --temperature 0.9 \
-    --top_k 20 \
-    --top_p 0.9 \
+    --temperature 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 43b07b509c..85725b562d 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -134,7 +134,7 @@ def llm_infer(args: InferArguments) -> None:
     # Inference
     result = []
     jsonl_path = None
-    if args.save_result:
+    if args.save_result and args.ckpt_dir is not None:
         time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
         jsonl_path = os.path.join(args.ckpt_dir, f'infer_result_{time}.jsonl')
     if args.eval_human:
@@ -166,8 +166,8 @@ def llm_infer(args: InferArguments) -> None:
                 data.get('system'),
                 stream=args.stream)
             label = data.get('response')
+            item = history[0]
             if jsonl_path is not None:
-                item = history[0]
                 save_result_to_jsonl(jsonl_path, item[0], item[1], label)
             result.append({
                 'query': item[0],
@@ -178,6 +178,6 @@ def llm_infer(args: InferArguments) -> None:
             print(f'[LABELS]{label}')
             print('-' * 80)
             # input('next[ENTER]')
-    if args.save_result:
+    if args.save_result and args.ckpt_dir is not None:
         logger.info(f'save_result_path: {jsonl_path}')
     return {'result': result}
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index b4faf686ab..7852e5e13d 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -98,7 +98,7 @@ class SftArguments:
     learning_rate: Optional[float] = None
     weight_decay: float = 0.01
     gradient_accumulation_steps: int = 16
-    max_grad_norm: float = 1.
+    max_grad_norm: float = 0.5
     predict_with_generate: bool = False
     lr_scheduler_type: str = 'cosine'
     warmup_ratio: float = 0.05
@@ -145,9 +145,9 @@ class SftArguments:
     # generation config
     max_new_tokens: int = 2048
     do_sample: bool = True
-    temperature: float = 0.9
+    temperature: float = 0.3
     top_k: int = 20
-    top_p: float = 0.9
+    top_p: float = 0.7
     repetition_penalty: float = 1.05
 
     def __post_init__(self) -> None:
@@ -291,9 +291,9 @@ class InferArguments:
 
     max_new_tokens: int = 2048
     do_sample: bool = True
-    temperature: float = 0.9
+    temperature: float = 0.3
     top_k: int = 20
-    top_p: float = 0.9
+    top_p: float = 0.7
     repetition_penalty: float = 1.05
 
     # other

From 0fe4fb74998e178037c0426adc84441b78ca5d14 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:23:07 +0800
Subject: [PATCH 3/7] update readme

---
 README.md                         | 21 +++++++++++++++++++--
 README_CN.md                      | 21 +++++++++++++++++++--
 examples/pytorch/llm/README.md    | 21 +++++++++++++++++++--
 examples/pytorch/llm/README_CN.md | 21 +++++++++++++++++++--
 4 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 638b258fa2..90476403b9 100644
--- a/README.md
+++ b/README.md
@@ -159,7 +159,11 @@ web_ui_main(infer_args)
 ```bash
 # Experimental environment: A10, 3090, A100, ...
 # 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --dataset blossom-math-zh \
+    --output_dir output \
 
 # Using DDP
 # Experimental environment: 2 * 3090
@@ -169,18 +173,31 @@ NPROC_PER_NODE=2 \
 swift sft \
     --model_id_or_path qwen/Qwen-7B-Chat \
     --dataset blossom-math-zh \
+    --output_dir output \
 
 # Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --custom_train_dataset_path chatml.jsonl \
+    --output_dir output \
 ```
 
 **Inference**:
 ```bash
+# Original Model
+CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+
+# Fine-tuned Model
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
 **Web-UI**:
 ```bash
+# Original Model
+CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
+
+# Fine-tuned Model
 CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
diff --git a/README_CN.md b/README_CN.md
index 34a392c425..aa2d69c7a5 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -156,7 +156,11 @@ web_ui_main(infer_args)
 ```bash
 # Experimental environment: A10, 3090, A100, ...
 # 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --dataset blossom-math-zh \
+    --output_dir output \
 
 # 使用DDP
 # Experimental environment: 2 * 3090
@@ -166,18 +170,31 @@ NPROC_PER_NODE=2 \
 swift sft \
     --model_id_or_path qwen/Qwen-7B-Chat \
     --dataset blossom-math-zh \
+    --output_dir output \
 
 # 使用自己的数据集
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --custom_train_dataset_path chatml.jsonl \
+    --output_dir output \
 ```
 
 **推理**:
 ```bash
+# 原始模型
+CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+
+# 微调后的模型
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
 **Web-UI**
 ```bash
+# 原始模型
+CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
+
+# 微调后的模型
 CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 7303f30aaa..7b4c50465e 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -122,7 +122,11 @@ web_ui_main(infer_args)
 ```bash
 # Experimental environment: A10, 3090, A100, ...
 # 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --dataset blossom-math-zh \
+    --output_dir output \
 
 # Using DDP
 # Experimental environment: 2 * 3090
@@ -132,18 +136,31 @@ NPROC_PER_NODE=2 \
 swift sft \
     --model_id_or_path qwen/Qwen-7B-Chat \
     --dataset blossom-math-zh \
+    --output_dir output \
 
 # Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --custom_train_dataset_path chatml.jsonl \
+    --output_dir output
 ```
 
 **Inference**:
 ```bash
+# Original Model
+CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+
+# Fine-tuned Model
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
 **Web-UI**:
 ```bash
+# Original Model
+CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
+
+# Fine-tuned Model
 CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index 6124b933cd..5a278804ed 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -121,7 +121,11 @@ web_ui_main(infer_args)
 ```bash
 # Experimental environment: A10, 3090, A100, ...
 # 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --dataset blossom-math-zh \
+    --output_dir output \
 
 # 使用DDP
 # Experimental environment: 2 * 3090
@@ -131,18 +135,31 @@ NPROC_PER_NODE=2 \
 swift sft \
     --model_id_or_path qwen/Qwen-7B-Chat \
     --dataset blossom-math-zh \
+    --output_dir output \
 
 # 使用自己的数据集
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+    --model_id_or_path qwen/Qwen-7B-Chat \
+    --custom_train_dataset_path chatml.jsonl \
+    --output_dir output
 ```
 
 **推理**:
 ```bash
+# 原始模型
+CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+
+# 微调后的模型
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
 **Web-UI**
 ```bash
+# 原始模型
+CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
+
+# 微调后的模型
 CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 

From 897916a9f425cb6beaa700b832353c3c043a0847 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:31:18 +0800
Subject: [PATCH 4/7] update sh

---
 .../pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh  | 1 +
 .../pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh  | 1 +
 .../pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh | 1 +
 .../llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh    | 1 +
 examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh         | 1 +
 examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh | 1 +
 .../pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh   | 1 +
 .../pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh  | 1 +
 .../llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh     | 1 +
 .../pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh        | 1 +
 examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh    | 1 +
 examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh           | 1 +
 examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh    | 1 +
 examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh      | 1 +
 examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh      | 1 +
 .../pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh    | 1 +
 .../llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh   | 1 +
 .../pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh      | 1 +
 examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh         | 1 +
 examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh | 1 +
 examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh    | 1 +
 .../pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh     | 1 +
 .../pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh    | 1 +
 .../pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh    | 1 +
 examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh   | 1 +
 .../pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh     | 1 +
 .../pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh     | 1 +
 .../llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh  | 1 +
 .../llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh  | 1 +
 .../llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh      | 1 +
 .../llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh   | 1 +
 .../llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh   | 1 +
 examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh    | 1 +
 examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh       | 1 +
 examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh             | 1 +
 examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh      | 1 +
 examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh        | 1 +
 examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh | 1 +
 examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh   | 1 +
 .../pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh | 1 +
 examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh   | 1 +
 .../pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh | 1 +
 examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh        | 1 +
 examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh          | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh       | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh   | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh          | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh      | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh   | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh   | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh         | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh     | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh    | 1 +
 .../pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh    | 1 +
 .../pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh        | 1 +
 examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh   | 1 +
 examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh         | 1 +
 examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh    | 1 +
 .../pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh  | 1 +
 examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh           | 1 +
 examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh       | 1 +
 examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh          | 1 +
 .../llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh      | 1 +
 examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh           | 1 +
 examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh        | 1 +
 examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh         | 1 +
 examples/pytorch/llm/scripts/yi_6b/lora/infer.sh                 | 1 +
 examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh       | 1 +
 .../pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh     | 1 +
 75 files changed, 75 insertions(+)

diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
index 8b1e838f60..9a32b38709 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
index c41a44566e..9b24990791 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/lora_mp_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
index 2b7f125caa..1797bc7214 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
index a1784975e9..0f8eebaa56 100644
--- a/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_13b_chat_int4/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
index 41a169e4c2..b690dc12c3 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
index def08b768d..0968f0eed8 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
index f82997b55e..6efd57bd5c 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
index f82997b55e..6efd57bd5c 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
index ab2967d46d..0f456007bb 100644
--- a/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan2_7b_chat_int4/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
index 27944dbe4a..6473a4fa45 100644
--- a/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/baichuan_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
index f98bb1a502..b84e978a2e 100644
--- a/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/bluelm_7b_chat/lora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
index 6daca4b5e7..e4b53c266a 100644
--- a/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm2_6b/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
index e1f8f5239d..00de92e0a2 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b/lora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
index 61819a9333..fe39d7f29d 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
index c5cdce1c2f..67ad6dba56 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_32k/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
index 0d00b9b74b..cf131c9def 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
index 9a85ff57f3..d9636ba550 100644
--- a/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/chatglm3_6b_base/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
index 7470ac92f3..a41555fe1d 100644
--- a/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/custom/tigerbot_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
index c3a5b0555e..b501ea05d1 100644
--- a/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/custom/tigerbot_7b/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
index 940ab0ada8..b32417c153 100644
--- a/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b/lora_ddp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh b/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
index 2dc5933123..3ab6697104 100644
--- a/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
index 8aa84aab31..caaed57ba7 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/lora_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
index 5caa8fac22..0a40d44469 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
index 5caa8fac22..0a40d44469 100644
--- a/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/internlm_20b_chat/qlora_ddp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
index 5039d3ae65..be3ae4deb7 100644
--- a/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
index d7eb2e12c7..4a0a4ef9e7 100644
--- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
index 94dc8b7d89..0ca5b03df7 100644
--- a/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/llama2_70b_chat/qlora_mp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
index 84479922dd..bb3ac923e9 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
index 6d1f97cc4b..d8bfb357ad 100644
--- a/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/mistral_7b_chat/lora_mp_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
index 48e862181b..20c493aab3 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
index 21d7935052..8999fc745a 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
index 14afadc0c8..44444d3eb5 100644
--- a/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_llama2_70b_chat/qlora_mp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
index 2698b59160..30869c1bc2 100644
--- a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
index 620775a01e..700ebd75a6 100644
--- a/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/openbuddy_mistral_7b_chat/lora_mp_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --max_length 4096 \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
index e4f571db80..c0193382f3 100644
--- a/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/polylm_13b/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
index 1c9aaa8b9f..99b61de7e3 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/lora_ddp_ds/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn true \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
index a5eaafa8eb..dbb067ea48 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
index a5eaafa8eb..dbb067ea48 100644
--- a/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
index 0a5cb80521..b6d05041ed 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/lora_ddp_ds/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn true \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
index f19c14dc24..e11a80ff11 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
index 44e1278cfc..d76cf78e17 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
index f60f9e18c8..628f9697f8 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
index f60f9e18c8..628f9697f8 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int4/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
index ac1a7a90cb..0d43d832ba 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
index 73e64fd235..4ec4fefc9d 100644
--- a/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_14b_chat_int8/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
index 6ec7b9ff0f..d0b537dc0a 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp_ds/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
index c2813f6c4d..3365a602e5 100644
--- a/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b/qlora_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
index 60fe98d23f..5c82fcb62a 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/infer.sh
@@ -9,5 +9,6 @@ python llm_infer.py \
     --use_flash_attn true \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
index 0c3b83b2df..f4ab5eeaa4 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -9,5 +9,6 @@ python llm_infer.py \
     --use_flash_attn true \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
index 480a89db35..08a5fe0802 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
index 65bf889d90..1b5d3f240c 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
index 65bf889d90..1b5d3f240c 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
index 398e4f4ded..d1789609cd 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
index 82fd16d225..a3aeb3351b 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
index 2959370e59..5597835d05 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
index 2959370e59..5597835d05 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
index 510ba1be0e..d4cbfd093f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
index 510ba1be0e..d4cbfd093f 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int4/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
index c58840c899..b8cbdd5942 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
index c58840c899..b8cbdd5942 100644
--- a/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_7b_chat_int8/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.1 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
index b3ca5a9436..69e4805fcf 100644
--- a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
index a1df4916ea..0ceb964ab4 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp_ds/infer.sh
@@ -10,6 +10,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
index 0bf729f351..59e0a2ef40 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
index 915897101d..bbf0347226 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
index 915897101d..bbf0347226 100644
--- a/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/qwen_vl_chat_int4/qlora_ddp_ds/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
index 89a0c40e42..e370a3f513 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full/infer.sh
@@ -8,5 +8,6 @@ python llm_infer.py \
     --max_length 1024 \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh b/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
index 89a0c40e42..e370a3f513 100644
--- a/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
+++ b/examples/pytorch/llm/scripts/seqgpt_560m/full_ddp/infer.sh
@@ -8,5 +8,6 @@ python llm_infer.py \
     --max_length 1024 \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
diff --git a/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh b/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
index 6caa3c6788..411304a68e 100644
--- a/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/skywork_13b/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh b/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
index 091bf4d63e..e94e5165a9 100644
--- a/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/tongyi_finance_14b_chat_int4/qlora/infer.sh
@@ -9,6 +9,7 @@ python llm_infer.py \
     --use_flash_attn false \
     --max_new_tokens 2048 \
     --temperature 0.3 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh b/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
index 90abe69d58..264354ad90 100644
--- a/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/xverse_13b/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh b/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
index 3b4d9f5c02..534abb20d5 100644
--- a/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
+++ b/examples/pytorch/llm/scripts/xverse_65b/qlora_mp/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
index 9bb3c1762b..362667e29e 100644
--- a/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/yi_34b/lora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh b/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
index 25c177f5ab..59279380fa 100644
--- a/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
+++ b/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
index 236d6840de..2e51f2d5ee 100644
--- a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
+++ b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
index 236d6840de..2e51f2d5ee 100644
--- a/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
+++ b/examples/pytorch/llm/scripts/ziya2_13b_chat/qlora_ddp_ds/infer.sh
@@ -8,6 +8,7 @@ python llm_infer.py \
     --max_length 2048 \
     --max_new_tokens 2048 \
     --temperature 0.7 \
+    --top_p 0.7 \
     --repetition_penalty 1.05 \
     --do_sample true \
     --merge_lora_and_save false \

From 8fac91bded31f9363ac0a78fb8e74bf21c7d19d1 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:38:18 +0800
Subject: [PATCH 5/7] update readme

---
 README_CN.md                      |  2 +-
 examples/pytorch/llm/README.md    |  8 ++++----
 examples/pytorch/llm/README_CN.md | 10 +++++-----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index aa2d69c7a5..8bce5b3c60 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -189,7 +189,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --datase
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
-**Web-UI**
+**Web-UI**:
 ```bash
 # 原始模型
 CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 7b4c50465e..a951d9a314 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -591,9 +591,9 @@ The template initialization function retrieves the complete chat template based
 -- `check_model_is_latest`: Check if the model is the latest, default is `True`. If you need to train without internet connection, please set this parameter to `False`.
 - `--max_new_tokens`: The maximum number of new tokens to generate. The default value is `2048`. This parameter only takes effect when `predict_with_generate` is set to True.
 - `--do_sample`: Whether to use sampling during generation. The default value is `True`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--temperature`: The temperature value for sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--temperature`: The temperature value for sampling during generation. The default value is `0.3`. This parameter only takes effect when `predict_with_generate` is set to True.
 - `--top_k`: The value of k for top-k sampling during generation. The default value is `20`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--top_p`: The cumulative probability threshold for top-p sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--top_p`: The cumulative probability threshold for top-p sampling during generation. The default value is `0.7`. This parameter only takes effect when `predict_with_generate` is set to True.
 - `--repetition_penalty`: The repetition penalty applied during generation. The default value is `1.05`. This parameter only takes effect when `predict_with_generate` is set to True.
 
 
@@ -623,9 +623,9 @@ The template initialization function retrieves the complete chat template based
 - `--bnb_4bit_use_double_quant`: Default value is `True`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter is not effective if `quantization_bit` is set to 0.
 - `--max_new_tokens`: Maximum number of new tokens to generate. Default value is `2048`.
 - `--do_sample`: Whether to use greedy decoding or sampling for generation. Default value is `True`.
-- `--temperature`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--temperature`: Default value is `0.3`. This parameter only takes effect when `do_sample` is set to True.
 - `--top_k`: Default value is `20`. This parameter only takes effect when `do_sample` is set to True.
-- `--top_p`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--top_p`: Default value is `0.7`. This parameter only takes effect when `do_sample` is set to True.
 - `--repetition_penalty`: Default value is `1.05`.
 - `--use_flash_attn`: Default value is `None`, which means 'auto'. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. The models that support 'flash_attn' include: qwen series, qwen-vl series, llama series, openbuddy series, mistral series, yi series, ziya series.
 - `--ignore_args_error`: Default value is `False`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index 5a278804ed..7846c3e5c6 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -154,7 +154,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --datase
 CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 ```
 
-**Web-UI**
+**Web-UI**:
 ```bash
 # 原始模型
 CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat
@@ -594,9 +594,9 @@ if __name__ == '__main__':
 - `--check_model_is_latest`: 检查模型是否是最新, 默认为`True`. 如果你需要断网进行训练, 请将该参数设置为`False`.
 - `--max_new_tokens`: 默认为`2048`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 - `--do_sample`: 默认为`True`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
-- `--temperature`: 默认为`0.9`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
+- `--temperature`: 默认为`0.3`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 - `--top_k`: 默认为`20`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
-- `--top_p`: 默认为`0.9`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
+- `--top_p`: 默认为`0.7`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 - `--repetition_penalty`: 默认为`1.05`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
 
 
@@ -626,9 +626,9 @@ if __name__ == '__main__':
 - `--bnb_4bit_use_double_quant`: 默认值为`True`.  具体的参数介绍可以在`sft.sh命令行参数`中查看. 若`quantization_bit`设置为0, 则该参数失效.
 - `--max_new_tokens`: 生成新token的最大数量, 默认值为`2048`.
 - `--do_sample`: 是使用贪婪生成的方式还是采样生成的方式, 默认值为`True`.
-- `--temperature`: 默认值为`0.9`. 该参数只有在`do_sample`设置为True时才生效.
+- `--temperature`: 默认值为`0.3`. 该参数只有在`do_sample`设置为True时才生效.
 - `--top_k`: 默认值为`20`. 该参数只有在`do_sample`设置为True时才生效.
-- `--top_p`: 默认值为`0.9`. 该参数只有在`do_sample`设置为True时才生效.
+- `--top_p`: 默认值为`0.7`. 该参数只有在`do_sample`设置为True时才生效.
 - `--repetition_penalty`: 默认值为`1.05`.
 - `--use_flash_attn`: 默认值为`None`, 即为'auto'. 具体的参数介绍可以在`sft.sh命令行参数`中查看.
 - `--ignore_args_error`: 默认值为`False`, 具体的参数介绍可以在`sft.sh命令行参数`中查看.

From c6a6785d1e54d1b708af36a341c7392e39afc22b Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:39:37 +0800
Subject: [PATCH 6/7] update infer.py

---
 swift/llm/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/swift/llm/infer.py b/swift/llm/infer.py
index 85725b562d..198e4bf36e 100644
--- a/swift/llm/infer.py
+++ b/swift/llm/infer.py
@@ -141,8 +141,8 @@ def llm_infer(args: InferArguments) -> None:
         while True:
             query = input('<<< ')
             _, history = inference(model, template, query, stream=args.stream)
+            item = history[0]
             if jsonl_path is not None:
-                item = history[0]
                 save_result_to_jsonl(jsonl_path, item[0], item[1])
             result.append({
                 'query': item[0],

From 3f676333804dbd4c54d1956c3f92874d59236ffd Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 21 Nov 2023 15:45:20 +0800
Subject: [PATCH 7/7] update app.py

---
 examples/pytorch/llm/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/llm/app.py b/examples/pytorch/llm/app.py
index bdbc6da10c..5ba72e0e1f 100644
--- a/examples/pytorch/llm/app.py
+++ b/examples/pytorch/llm/app.py
@@ -12,5 +12,5 @@
     # or chat
     args = InferArguments(model_type=ModelType.qwen_7b_chat_int4)
     # or load from ckpt dir
-    # args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx', load_args_from_ckpt_dir=True)
+    # args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx')
     web_ui_main(args)