23 changes: 20 additions & 3 deletions README.md
@@ -141,7 +141,7 @@ sft_args = SftArguments(
dataset=[DatasetName.blossom_math_zh],
output_dir='output',
gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
print(f'best_ckpt_dir: {best_ckpt_dir}')
torch.cuda.empty_cache()
infer_args = InferArguments(
@@ -159,7 +159,11 @@ web_ui_main(infer_args)
```bash
# Experimental environment: A10, 3090, A100, ...
# 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--dataset blossom-math-zh \
+--output_dir output

# Using DDP
# Experimental environment: 2 * 3090
@@ -169,18 +173,31 @@ NPROC_PER_NODE=2 \
swift sft \
--model_id_or_path qwen/Qwen-7B-Chat \
--dataset blossom-math-zh \
+--output_dir output

# Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--custom_train_dataset_path chatml.jsonl \
+--output_dir output
```

**Inference**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

**Web-UI**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

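Taken together, the README change above means `sft_main` now returns a dict whose `'best_model_checkpoint'` entry holds the checkpoint path, and the CLI snippets gain an explicit `--output_dir`. A minimal sketch of the resulting quick-start flow, assuming the `swift.llm` import paths used elsewhere in this PR (with `model_type` standing in for the lines the hunk truncates):

```python
# Sketch only; the import paths and model_type value are assumptions
# based on other files in this PR, not confirmed by the hunk above.
import torch
from swift.llm import DatasetName, InferArguments, ModelType, SftArguments
from swift.llm.run import infer_main, sft_main

sft_args = SftArguments(
    model_type=ModelType.qwen_7b_chat,  # assumed; truncated in the hunk
    dataset=[DatasetName.blossom_math_zh],
    output_dir='output',
    gradient_checkpointing=True)
# sft_main now returns a dict; the best checkpoint path sits under the
# 'best_model_checkpoint' key instead of being the return value itself.
result = sft_main(sft_args)
best_ckpt_dir = result['best_model_checkpoint']
print(f'best_ckpt_dir: {best_ckpt_dir}')
torch.cuda.empty_cache()

infer_args = InferArguments(ckpt_dir=best_ckpt_dir)
infer_main(infer_args)
```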
25 changes: 21 additions & 4 deletions README_CN.md
@@ -138,7 +138,7 @@ sft_args = SftArguments(
dataset=[DatasetName.blossom_math_zh],
output_dir='output',
gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
print(f'best_ckpt_dir: {best_ckpt_dir}')
torch.cuda.empty_cache()
infer_args = InferArguments(
@@ -156,7 +156,11 @@ web_ui_main(infer_args)
```bash
# Experimental environment: A10, 3090, A100, ...
# 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--dataset blossom-math-zh \
+--output_dir output

# Using DDP
# Experimental environment: 2 * 3090
@@ -166,18 +170,31 @@ NPROC_PER_NODE=2 \
swift sft \
--model_id_or_path qwen/Qwen-7B-Chat \
--dataset blossom-math-zh \
+--output_dir output

# Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--custom_train_dataset_path chatml.jsonl \
+--output_dir output
```

**Inference**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

-**Web-UI**
+**Web-UI**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

31 changes: 24 additions & 7 deletions examples/pytorch/llm/README.md
@@ -104,7 +104,7 @@ sft_args = SftArguments(
dataset=[DatasetName.blossom_math_zh],
output_dir='output',
gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
print(f'best_ckpt_dir: {best_ckpt_dir}')
torch.cuda.empty_cache()
infer_args = InferArguments(
@@ -122,7 +122,11 @@ web_ui_main(infer_args)
```bash
# Experimental environment: A10, 3090, A100, ...
# 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--dataset blossom-math-zh \
+--output_dir output

# Using DDP
# Experimental environment: 2 * 3090
@@ -132,18 +136,31 @@ NPROC_PER_NODE=2 \
swift sft \
--model_id_or_path qwen/Qwen-7B-Chat \
--dataset blossom-math-zh \
+--output_dir output

# Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--custom_train_dataset_path chatml.jsonl \
+--output_dir output
```

**Inference**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

**Web-UI**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

@@ -574,9 +591,9 @@ The template initialization function retrieves the complete chat template based
- `--check_model_is_latest`: Check whether the model is the latest version; default is `True`. If you need to train without an internet connection, set this parameter to `False`.
- `--max_new_tokens`: The maximum number of new tokens to generate. The default value is `2048`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--do_sample`: Whether to use sampling during generation. The default value is `True`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--temperature`: The temperature value for sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--temperature`: The temperature value for sampling during generation. The default value is `0.3`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--top_k`: The value of k for top-k sampling during generation. The default value is `20`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--top_p`: The cumulative probability threshold for top-p sampling during generation. The default value is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--top_p`: The cumulative probability threshold for top-p sampling during generation. The default value is `0.7`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--repetition_penalty`: The repetition penalty applied during generation. The default value is `1.05`. This parameter only takes effect when `predict_with_generate` is set to True.

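For comparison, the revised sampling defaults above map onto a Hugging Face `transformers` generation config roughly as follows; this is a sketch for orientation, and swift's internal wiring may differ:

```python
# Illustrative only: the post-change generation defaults from the list
# above, expressed as a transformers GenerationConfig.
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.3,         # lowered from 0.9 in this PR
    top_k=20,
    top_p=0.7,               # lowered from 0.9 in this PR
    repetition_penalty=1.05)
```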

@@ -606,9 +623,9 @@ The template initialization function retrieves the complete chat template based
- `--bnb_4bit_use_double_quant`: Default value is `True`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter is not effective if `quantization_bit` is set to 0.
- `--max_new_tokens`: Maximum number of new tokens to generate. Default value is `2048`.
- `--do_sample`: Whether to use greedy decoding or sampling for generation. Default value is `True`.
-- `--temperature`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--temperature`: Default value is `0.3`. This parameter only takes effect when `do_sample` is set to True.
- `--top_k`: Default value is `20`. This parameter only takes effect when `do_sample` is set to True.
-- `--top_p`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--top_p`: Default value is `0.7`. This parameter only takes effect when `do_sample` is set to True.
- `--repetition_penalty`: Default value is `1.05`.
- `--use_flash_attn`: Default value is `None`, which means 'auto'. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. The models that support 'flash_attn' include: qwen series, qwen-vl series, llama series, openbuddy series, mistral series, yi series, ziya series.
- `--ignore_args_error`: Default value is `False`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
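
Assuming these CLI flags map one-to-one onto `InferArguments` fields (an assumption; this diff only shows `model_type` and `ckpt_dir` being passed), a programmatic equivalent of the new inference defaults might look like:

```python
# Hypothetical sketch: field names are inferred from the CLI flags above;
# verify them against the InferArguments definition before relying on this.
from swift.llm import InferArguments, ModelType

args = InferArguments(
    model_type=ModelType.qwen_7b_chat,
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.3,
    top_k=20,
    top_p=0.7,
    repetition_penalty=1.05)
```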
33 changes: 25 additions & 8 deletions examples/pytorch/llm/README_CN.md
@@ -103,7 +103,7 @@ sft_args = SftArguments(
dataset=[DatasetName.blossom_math_zh],
output_dir='output',
gradient_checkpointing=True)
-best_ckpt_dir = sft_main(sft_args)
+best_ckpt_dir = sft_main(sft_args)['best_model_checkpoint']
print(f'best_ckpt_dir: {best_ckpt_dir}')
torch.cuda.empty_cache()
infer_args = InferArguments(
@@ -121,7 +121,11 @@ web_ui_main(infer_args)
```bash
# Experimental environment: A10, 3090, A100, ...
# 20GB GPU memory
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--dataset blossom-math-zh \
+--output_dir output

# Using DDP
# Experimental environment: 2 * 3090
@@ -131,18 +135,31 @@ NPROC_PER_NODE=2 \
swift sft \
--model_id_or_path qwen/Qwen-7B-Chat \
--dataset blossom-math-zh \
+--output_dir output

# Using custom dataset
-CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl
+CUDA_VISIBLE_DEVICES=0 \
+swift sft \
+--model_id_or_path qwen/Qwen-7B-Chat \
+--custom_train_dataset_path chatml.jsonl \
+--output_dir output
```

**Inference**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

-**Web-UI**
+**Web-UI**:
```bash
# Original Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --model_id_or_path qwen/Qwen-7B-Chat

# Fine-tuned Model
CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
```

@@ -577,9 +594,9 @@ if __name__ == '__main__':
- `--check_model_is_latest`: Check whether the model is the latest version; default is `True`. If you need to train without an internet connection, set this parameter to `False`.
- `--max_new_tokens`: Default is `2048`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--do_sample`: Default is `True`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--temperature`: Default is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--temperature`: Default is `0.3`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--top_k`: Default is `20`. This parameter only takes effect when `predict_with_generate` is set to True.
-- `--top_p`: Default is `0.9`. This parameter only takes effect when `predict_with_generate` is set to True.
+- `--top_p`: Default is `0.7`. This parameter only takes effect when `predict_with_generate` is set to True.
- `--repetition_penalty`: Default is `1.05`. This parameter only takes effect when `predict_with_generate` is set to True.


@@ -609,9 +626,9 @@ if __name__ == '__main__':
- `--bnb_4bit_use_double_quant`: Default is `True`. See `sft.sh Command Line Arguments` for details. This parameter has no effect if `quantization_bit` is set to 0.
- `--max_new_tokens`: Maximum number of new tokens to generate; default is `2048`.
- `--do_sample`: Whether to use greedy decoding or sampling for generation; default is `True`.
-- `--temperature`: Default is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--temperature`: Default is `0.3`. This parameter only takes effect when `do_sample` is set to True.
- `--top_k`: Default is `20`. This parameter only takes effect when `do_sample` is set to True.
-- `--top_p`: Default is `0.9`. This parameter only takes effect when `do_sample` is set to True.
+- `--top_p`: Default is `0.7`. This parameter only takes effect when `do_sample` is set to True.
- `--repetition_penalty`: Default is `1.05`.
- `--use_flash_attn`: Default is `None`, i.e. 'auto'. See `sft.sh Command Line Arguments` for details.
- `--ignore_args_error`: Default is `False`. See `sft.sh Command Line Arguments` for details.
2 changes: 1 addition & 1 deletion examples/pytorch/llm/app.py
@@ -12,5 +12,5 @@
# or chat
args = InferArguments(model_type=ModelType.qwen_7b_chat_int4)
# or load from ckpt dir
-# args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx', load_args_from_ckpt_dir=True)
+# args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx')
web_ui_main(args)
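The dropped `load_args_from_ckpt_dir=True` suggests that loading arguments from the checkpoint directory is now the default whenever `ckpt_dir` is supplied; that is an inference from this hunk, not something the diff states. Under that assumption, launching the web UI from a fine-tuned checkpoint reduces to:

```python
# Assumption: with this PR, passing ckpt_dir alone is enough and the
# training-time arguments are recovered from the checkpoint directory.
from swift.llm import InferArguments
from swift.llm.run import web_ui_main  # import path assumed, mirroring llm_infer.py

args = InferArguments(ckpt_dir='xxx/vx_xxx/checkpoint-xxx')
web_ui_main(args)
```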
3 changes: 2 additions & 1 deletion examples/pytorch/llm/llm_infer.py
@@ -4,4 +4,5 @@
from swift.llm.run import infer_main

if __name__ == '__main__':
-    infer_main()
+    result = infer_main()
+    print(f'infer_main result: {result}')
4 changes: 2 additions & 2 deletions examples/pytorch/llm/llm_sft.py
@@ -4,5 +4,5 @@
from swift.llm.run import sft_main

if __name__ == '__main__':
-    best_ckpt_dir = sft_main()
-    print(f'best_ckpt_dir: {best_ckpt_dir}')
+    output = sft_main()
+    print(f'sft_main output: {output}')
@@ -8,9 +8,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -8,9 +8,8 @@ python llm_infer.py \
--eval_human false \
--max_length 2048 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
5 changes: 2 additions & 3 deletions examples/pytorch/llm/scripts/baichuan2_7b/qlora/infer.sh
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 2048 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.7 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \
@@ -7,9 +7,8 @@ python llm_infer.py \
--eval_human false \
--max_length 4096 \
--max_new_tokens 2048 \
---temperature 0.9 \
---top_k 20 \
---top_p 0.9 \
+--temperature 0.1 \
+--top_p 0.7 \
--repetition_penalty 1.05 \
--do_sample true \
--merge_lora_and_save false \