From 61d93266dc18cce358631f3332f7e58fea646ba9 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Fri, 17 Nov 2023 15:20:04 +0800
Subject: [PATCH 1/4] fix register model bug

---
 examples/pytorch/llm/README.md    | 10 +++++-----
 examples/pytorch/llm/README_CN.md |  9 ++++-----
 swift/llm/infer.py                |  4 ++--
 swift/llm/utils/argument.py       |  4 ++--
 swift/llm/utils/dataset.py        |  3 ++-
 swift/llm/utils/model.py          |  4 ++--
 swift/llm/utils/utils.py          | 19 ++++++++++++-------
 swift/llm/web_ui.py               |  9 +++------
 swift/utils/np_utils.py           | 11 +++++++++--
 tests/llm/test_run.py             |  7 +++----
 tests/llm/test_template.py        | 19 +++++++++++++++++++
 11 files changed, 63 insertions(+), 36 deletions(-)

diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 8914c4f574..7b48201e98 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -152,13 +152,13 @@ Training GPU memory: qlora(low,3090) < lora < full(high,2*A100)
 **Tips**:
 - You can set `--gradient_checkpointing true` during training to **save GPU memory**, but this will slightly decrease the training speed. This is useful if you need to train LLM on **consumer-grade GPU**, e.g. 3090.
 - If you want to use quantization based on **auto_gptq**, you need to install auto_gptq first: `pip install auto_gptq -U`.
-  The models available with auto_gptq are: `qwen-7b-chat-int4`, `qwen-14b-chat-int4`, `qwen-7b-chat-int8`, `qwen-14b-chat-int8`.
+  The models available with auto_gptq are: 'qwen-7b-chat-int4', 'qwen-14b-chat-int4', 'qwen-7b-chat-int8', 'qwen-14b-chat-int8', 'qwen-vl-chat-int4'.
   If the script provides multiple versions of qlora SFT, including both non-quantized models and int4/int8 models, it is **recommended to use the script for the int4/int8 model versions**.
 - If you want to use the quantization parameter `quantization_bit`, you need to install `bitsandbytes` first: `pip install bitsandbytes -U`.
 - If you want to use deepspeed, you need to `pip install deepspeed -U`. Using deepspeed can **save GPU memory**, but this may slightly decrease the training speed.
 - If you are using older GPUs like **V100**, you need to set `--dtype fp16`, because they do not support bf16.
 - qwen recommends installing [**flash-attn**](https://github.com/Dao-AILab/flash-attention), which will accelerate the training and inference speed and reduce GPU memory usage (A10, 3090, V100 machines do not support flash-attn).
-- If you want to perform **second pre-training** instead of SFT, you can refer to the `DatasetName.tigerbot_law_zh` dataset and its corresponding sh file: `scripts/qwen_7b/qlora_ddp`.
+- If you want to conduct **secondary pre-training** instead of SFT, you only need to set 'response' during dataset registration, without setting 'query'. You can refer to the dataset `'tigerbot-law-zh'` and its corresponding sh file: `scripts/qwen_7b/qlora_ddp`.
 - If you want to push weights to the ModelScope Hub during training, you need to set `--push_to_hub true`.
 - If you want to merge LoRA weights and save them during inference, you need to set `--merge_lora_and_save true`. It is **not recommended to merge quantized models**, as this can result in performance degradation, specifically in the case of qlora.
 - Below is a shell script for running `qwen_7b_chat` directly (you just need to specify `ckpt_dir` during inference to execute it smoothly). For more model scripts, you can check the `scripts` folder. If you want to **customize a shell script**, it is recommended to refer to the script in `scripts/qwen_7b_chat`.
@@ -340,8 +340,8 @@ The `register_dataset` function registers the dataset in the `DATASET_MAPPING`. - `train_subset_split_list`: Default value is `None`. The meaning of this parameter is similar to `train_subset_split_list`. - `preprocess_func`: Default value is `None`. Represents the method for preprocessing the function. - `get_function`: Default value is `None`. The function used to retrieve the dataset. If None is passed, the decorator scheme is used for dataset registration, where the `register_dataset` function returns `Callable[[GetDatasetFunction], GetDatasetFunction]`. This scheme requires users with some python knowledge. If a function is passed, the normal registration scheme is used. If importing datasets from the ModelScope Hub, the `get_dataset_from_repo` function is commonly used. - The `get_function` function has no restrictions, you just need to return either `HfDataset` or `Tuple[HfDataset, Optional[HfDataset]]`. In the case where only the `train_dataset` is returned, the data processing function will split a portion of the dataset as the validation dataset (based on the command line hyperparameter `dataset_test_ratio`). If two datasets are returned, they will be used as the training and validation datasets respectively. We support fine-tuning with multiple datasets. The training and validation portions of each sub-dataset will be concatenated separately, and the final merged training and validation datasets will be returned. - The returned `HfDataset` needs to adhere to certain specifications. If it is for instruction fine-tuning (single-turn dialogue), it should include the `query` and `response` fields, representing the user's query for instruction fine-tuning and the AI assistant's response, respectively. You can refer to the `alpaca-zh` dataset for more details. If it is for multi-turn dialogue, it needs to include an additional `history` field, representing the history of the conversation. You can refer to the `damo-agent-mini-zh` dataset for more details. If each example in the dataset has a different `system`, an additional `system` field is required. You can also refer to the `damo-agent-mini-zh` dataset for more details. + The `get_function` function has no restrictions, you just need to return `HfDataset` or `Tuple[HfDataset, Optional[HfDataset]]`. If only the train_dataset is returned, the data processing function will split a portion of the dataset as the validation set (based on the command line hyperparameter `dataset_test_ratio`). If two datasets are returned, they will be used as the training set and validation set respectively. We support fine-tuning with multiple datasets. The training and validation parts of each sub-dataset will be concatenated and the merged training set and validation set will be returned. + The returned `HfDataset` needs to follow certain specifications. If you are doing **pre-training**, it only needs to include the `response` field, as seen in the `'tigerbot-law-zh'` dataset. If it is **instruction-based fine-tuning (single-turn dialogue)**, it needs to include the `query` and `response` fields, representing the user's query for instruction-based fine-tuning and the AI assistant's response, as seen in the `'alpaca-zh'` dataset. If it is **multi-turn dialogue**, the `history` field needs to be added to represent the dialogue history, as seen in the `'damo-agent-mini-zh'` dataset. If each example in the dataset has a different `system`, the `system` field needs to be added as well, as seen in the `'damo-agent-mini-zh'` dataset. 
- `task`: The task for which the dataset is intended. This parameter is generally not required to be set.
- `function_kwargs`: Default is `{}`, used to pass arguments to `get_function` to support the `partial` functionality in the decorator scheme. This parameter is generally not required to be set.
- `**kwargs`: Other parameters used for annotating the dataset. This parameter is generally not required to be set.
@@ -580,7 +580,7 @@ The template initialization function retrieves the complete chat template based
 - `--top_k`: Default value is `20`. This parameter only takes effect when `do_sample` is set to True.
 - `--top_p`: Default value is `0.9`. This parameter only takes effect when `do_sample` is set to True.
 - `--repetition_penalty`: Default value is `1.05`.
-- `--use_flash_attn`: Default value is `None`, which means 'auto'. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
+- `--use_flash_attn`: Default value is `None`, which means 'auto'. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. The models that support 'flash_attn' include: qwen series, qwen-vl series, llama series, openbuddy series, mistral series, yi series, ziya series.
 - `--ignore_args_error`: Default value is `False`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
 - `--stream`: Whether to use streaming output. Default value is `True`.
 - `--merge_lora_and_save`: Whether to merge the lora weights into the base model and save the complete weights. Default value is `False`. The weights will be saved in a directory named `checkpoint-xxx-merged` at the same level as `ckpt_dir`, e.g., `'/path/to/your/vx_xxx/checkpoint-xxx-merged'`.
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index a6cf87119c..f185c8f554 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -66,7 +66,6 @@ cd examples/pytorch/llm
 pip install deepspeed -U
 # 如果你想要使用基于auto_gptq的qlora训练. (推荐, 效果优于bnb)
-# 使用auto_gptq的模型: qwen-7b-chat-int4, qwen-14b-chat-int4, qwen-7b-chat-int8, qwen-14b-chat-int8
 # auto_gptq和cuda版本有对应关系,请按照https://github.com/PanQiWei/AutoGPTQ#quick-installation选择版本
 pip install auto_gptq
@@ -153,12 +152,12 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 - 你可以在训练时设置`--gradient_checkpointing true`来**节约显存**, 但这会略微降低训练速度. 如果你需要在**消费级显卡**中训练大模型, 这很有用, 例如: 3090.
 - 如果你想要使用量化参数`quantization_bit`, 你需要先安装bnb: `pip install bitsandbytes -U`.
 - 如果你想要使用基于**auto_gptq**的量化, 你需要先安装auto_gptq: `pip install auto_gptq -U`.
-  使用auto_gptq的模型包含: `qwen-7b-chat-int4`, `qwen-14b-chat-int4`, `qwen-7b-chat-int8`, `qwen-14b-chat-int8`.
+  使用auto_gptq的模型包含: 'qwen-7b-chat-int4', 'qwen-14b-chat-int4', 'qwen-7b-chat-int8', 'qwen-14b-chat-int8', 'qwen-vl-chat-int4'.
   如果脚本提供了非量化模型和int4/int8模型的多个版本的qlora SFT版本, **推荐使用int4/int8模型版本的脚本**.
 - 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. 使用deepspeed可以**节约显存**, 但可能会略微降低训练速度.
 - 如果你使用的是**V100**等较老的GPU, 你需要设置`--dtype fp16`, 因为其不支持bf16.
 - 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练).
-- 如果你要进行**二次预训练**而不是SFT, 你可以参考`DatasetName.tigerbot_law_zh`数据集和其对于的sh文件: `scripts/qwen_7b/qlora_ddp`.
+- 如果你要进行**二次预训练**而不是SFT, 你需要在注册数据集时只设置'response'而不设置'query', 你可以参考`'tigerbot-law-zh'`数据集和其对应的sh文件: `scripts/qwen_7b/qlora_ddp`.
 - 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`.
 - 如果你想要在推理时, 合并LoRA权重并保存, 你需要设置`--merge_lora_and_save true`. **不推荐对量化的模型进行merge**, 这会存在精度损失, 即qlora.
 - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看`scripts`文件夹. 如果你想要**自定义sh脚本**, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写.
@@ -344,7 +343,7 @@ if __name__ == '__main__':
 - `preprocess_func`: 默认为`None`. 表示对函数进行预处理的方法.
 - `get_function`: 默认值为`None`. 获取数据集的函数. 如果传入None, 则使用修饰器方案进行数据集注册, `register_dataset`函数将返回`Callable[[GetDatasetFunction], GetDatasetFunction]`, 该方案需要有一定python基础的用户使用. 如果传入一个函数, 则使用正常方案进行注册. 如果从ModelScope Hub导入数据集, 一般使用`get_dataset_from_repo`函数.
   `get_function`函数没有任何限制, 你只需要返回`HfDataset`或`Tuple[HfDataset, Optional[HfDataset]]`即可. 只返回train_dataset的情况下, 数据集处理函数会切分一部分的数据集作为验证集 (根据命令行超参数`dataset_test_ratio`); 如果返回两个数据集, 则分别作为其训练集和验证集. 我们支持使用多个数据集进行微调. 我们会将各个子数据集的训练集和验证集部分分别进行拼接, 最终返回合并后的训练集和验证集.
-  函数返回的`HfDataset`需要符合一定的规范. 如果是指令微调(单轮对话)的情况下, 需包含`query`, `response`字段, 分别代表指令微调的用户询问和AI助手的回答, 具体可以参考`alpaca-zh`数据集. 如果是多轮对话, 则需要额外加上`history`字段, 代表对话的历史信息, 具体可以参考`damo-agent-mini-zh`数据集. 如果每个数据集样例具有不同的`system`, 则需要额外加上system字段, 具体你也可以参考`damo-agent-mini-zh`数据集.
+  函数返回的`HfDataset`需要符合一定的规范. 如果你要进行**预训练**, 那么只需要包含`response`字段, 具体可以参考`'tigerbot-law-zh'`数据集. 如果是**指令微调(单轮对话)**的情况下, 需包含`query`, `response`字段, 分别代表指令微调的用户询问和AI助手的回答, 具体可以参考`'alpaca-zh'`数据集. 如果是**多轮对话**, 则需要额外加上`history`字段, 代表对话的历史信息, 具体可以参考`'damo-agent-mini-zh'`数据集. 如果每个数据集样例具有不同的`system`, 则需要额外加上system字段, 具体你也可以参考`'damo-agent-mini-zh'`数据集.
 - `task`: 注释数据集用作的任务. 该参数一般不需要设置.
 - `function_kwargs`: 默认为`{}`, 用于传递给`get_function`, 用于支持修饰器情况下的`partial`功能. 该参数一般不需要设置.
 - `**kwargs`: 其他用于注释数据集的参数. 该参数一般不需要设置.
@@ -543,7 +542,7 @@ if __name__ == '__main__':
 - `--push_hub_strategy`: 推送策略, 默认为`'push_best'`. 可选择的值包括: 'end', 'push_best', 'push_last', 'checkpoint', 'all_checkpoints'. 'push_best'表示在每次保存权重时, 将最好的模型进行推送并覆盖之前的权重, 'push_last'表示在每次保存权重时, 将最后的权重进行推送并覆盖之前的权重. 该参数只有在`push_to_hub`设置为True时才生效.
 - `--hub_token`: 推送时需要的SDK token. 可以从[https://modelscope.cn/my/myaccesstoken](https://modelscope.cn/my/myaccesstoken)获取, 默认为`None`, 即从环境变量`MODELSCOPE_API_TOKEN`中获取. 该参数只有在`push_to_hub`设置为True时才生效.
 - `--test_oom_error`: 用于检测训练是否会发生OOM, 默认为`False`. 如果设置为True, 则会将训练集按max_length倒序进行排列, 方便OOM的测试. 该参数一般用于测试, 请谨慎设置.
-- `--use_flash_attn`: 是否使用flash attn, 默认为`None`. 安装flash_attn的步骤可以查看[https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)
+- `--use_flash_attn`: 是否使用flash attn, 默认为`None`. 安装flash_attn的步骤可以查看[https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). 支持flash_attn的模型包括: qwen系列, qwen-vl系列, llama系列, openbuddy系列, mistral系列, yi系列, ziya系列.
 - `--ignore_args_error`: 是否忽略命令行传参错误抛出的Error, 默认为`False`. 如果需要拷贝代码到notebook中运行, 需要设置成True.
 - `--logging_dir`: 默认为`None`. 即设置为`f'{self.output_dir}/runs'`, 表示tensorboard文件存储路径.
 - `--max_new_tokens`: 默认为`2048`. 该参数只有在`predict_with_generate`设置为True的时候才生效.
diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 01b7d3db6d..2ea5974968 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -20,8 +20,8 @@ def merge_lora(args: InferArguments, replace_if_exists=False) -> None: assert args.ckpt_dir is not None assert args.sft_type == 'lora' - assert not args.model_type.endswith('int4'), 'int4 model is not supported' - assert not args.model_type.endswith('int8'), 'int8 model is not supported' + assert 'int4' not in args.model_type, 'int4 model is not supported' + assert 'int8' not in args.model_type, 'int8 model is not supported' if args.quantization_bit != 0: logger.warning('It is not recommended to merge quantized models, ' 'as this can result in performance degradation') diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 45cd1afc7f..521fbc4a93 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -363,8 +363,8 @@ def select_dtype( args: Union[SftArguments, InferArguments]) -> Tuple[Dtype, bool, bool]: if args.dtype == 'AUTO' and not torch.cuda.is_bf16_supported(): args.dtype = 'fp16' - if args.dtype == 'AUTO' and (args.model_type.endswith('int4') - or args.model_type.endswith('int8')): + if args.dtype == 'AUTO' and ('int4' in args.model_type + or 'int8' in args.model_type): model_torch_dtype = MODEL_MAPPING[args.model_type]['torch_dtype'] if model_torch_dtype is not None: args.dtype = dtype_mapping[model_torch_dtype] diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 6f40203d30..9e569a43eb 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -128,11 +128,12 @@ def register_dataset( def _register_dataset( get_function: GetDatasetFunction) -> GetDatasetFunction: + _old_get_function = get_function if len(function_kwargs) > 0: get_function = partial(get_function, **function_kwargs) dataset_info['get_function'] = get_function DATASET_MAPPING[dataset_name] = dataset_info - return get_function + return _old_get_function return _register_dataset diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index fc29f6cc4c..d449e9f012 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -15,7 +15,6 @@ from packaging import version from torch import Tensor from torch import dtype as Dtype -from torch.nn import Module from transformers import (PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) from transformers.models.auto.auto_factory import _BaseAutoModelClass @@ -175,11 +174,12 @@ def register_model( def _register_model( get_function: GetModelTokenizerFunction ) -> GetModelTokenizerFunction: + _old_get_function = get_function if len(function_kwargs) > 0: get_function = partial(get_function, **function_kwargs) model_info['get_function'] = get_function MODEL_MAPPING[model_type] = model_info - return get_function + return _old_get_function return _register_model diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index bb2b474311..850acee452 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -297,7 +297,7 @@ def find_all_linear_for_lora(model: Module, quantization_bit: int, linear_cls = Linear8bitLt else: linear_cls = Linear - if model_type.endswith('int4') or model_type.endswith('int8'): + if 'int4' in model_type or 'int8' in model_type: from bitsandbytes.nn import Linear4bit from peft.utils import get_auto_gptq_quant_linear, get_quantization_config gptq_quantization_config = get_quantization_config(model, 'gptq') @@ -340,8 +340,9 @@ def inference_stream( example = {'query': query, 
'history': history, 'system': system} input_ids = template.encode(example)['input_ids'] tokenizer = template.tokenizer - input_ids = torch.tensor(input_ids)[None].cuda() - attention_mask = torch.ones_like(input_ids) + device = next(model.parameters()).device + input_ids = torch.tensor(input_ids)[None].to(device) + attention_mask = torch.ones_like(input_ids).to(device) model.eval() generation_config = getattr(model, 'generation_config', None) from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig @@ -380,8 +381,9 @@ def inference(model: PreTrainedModel, example = {'query': query, 'history': history, 'system': system} input_ids = template.encode(example)['input_ids'] tokenizer = template.tokenizer - input_ids = torch.tensor(input_ids)[None].cuda() - attention_mask = torch.ones_like(input_ids) + device = next(model.parameters()).device + input_ids = torch.tensor(input_ids)[None].to(device) + attention_mask = torch.ones_like(input_ids).to(device) model.eval() generation_config = getattr(model, 'generation_config', None) if verbose: @@ -418,8 +420,11 @@ def compute_token_length(history_length: int) -> int: input_ids = template.encode(example)['input_ids'] return len(input_ids) - return upper_bound(0, len(history), - lambda mid: compute_token_length(mid) <= max_length) + history_length = upper_bound( + 0, len(history), lambda mid: compute_token_length(mid) <= max_length) + old_history = history[:len(history) - history_length] + history = history[len(history) - history_length:] + return old_history, history # monkey patching diff --git a/swift/llm/web_ui.py b/swift/llm/web_ui.py index a82a0abcce..402f9e1133 100644 --- a/swift/llm/web_ui.py +++ b/swift/llm/web_ui.py @@ -38,11 +38,8 @@ def gradio_chat_demo(args: InferArguments) -> None: model, template = prepare_model_template(args) def model_chat(query: str, history: History) -> Tuple[str, History]: - history_length = limit_history_length(template, query, history, - args.max_length) - # avoid history_length == 0 - old_history = history[:len(history) - history_length] - history = history[len(history) - history_length:] + old_history, history = limit_history_length(template, query, history, + args.max_length) gen = inference_stream( model, template, query, history, skip_special_tokens=True) for _, history in gen: @@ -54,7 +51,7 @@ def model_chat(query: str, history: History) -> Tuple[str, History]: gr.Markdown(f'
{model_name} Bot
') chatbot = gr.Chatbot(label=f'{model_name}') - message = gr.Textbox(lines=3) + message = gr.Textbox(lines=3, label='Input') with gr.Row(): clear_history = gr.Button('🧹 清除历史对话') send = gr.Button('🚀 发送') diff --git a/swift/utils/np_utils.py b/swift/utils/np_utils.py index 70efa1b5e3..b229ab6add 100644 --- a/swift/utils/np_utils.py +++ b/swift/utils/np_utils.py @@ -34,5 +34,12 @@ def stat_array( std = array.std().item() min_ = array.min().item() max_ = array.max().item() - string = f'{mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={array.shape[0]}' - return {'mean': mean, 'std': std, 'min': min_, 'max': max_}, string + size = array.shape[0] + string = f'{mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={size}' + return { + 'mean': mean, + 'std': std, + 'min': min_, + 'max': max_, + 'size': size + }, string diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py index 6c0f2b805c..2b1cb87c3b 100644 --- a/tests/llm/test_run.py +++ b/tests/llm/test_run.py @@ -9,9 +9,8 @@ import torch -from swift.llm import (DatasetName, InferArguments, ModelType, SftArguments, - TemplateType) -from swift.llm.run import infer_main, sft_main, web_ui_main +from swift.llm import DatasetName, InferArguments, ModelType, SftArguments +from swift.llm.run import infer_main, sft_main class TestRun(unittest.TestCase): @@ -31,7 +30,7 @@ def test_run_1(self): model_type = ModelType.chatglm3_6b sft_args = SftArguments( model_type=model_type, - template_type=TemplateType.chatglm3, + template_type='AUTO', quantization_bit=4, eval_steps=5, check_dataset_strategy='warning', diff --git a/tests/llm/test_template.py b/tests/llm/test_template.py index bfebde1b94..d929d6d7b6 100644 --- a/tests/llm/test_template.py +++ b/tests/llm/test_template.py @@ -211,6 +211,25 @@ def test_bluelm_template(self): response = model.chat(tokenizer, query)[0] print(f'official response: {response}') + @unittest.skip( + 'To avoid excessive testing time caused by downloading models and ' + 'to prevent OOM (Out of Memory) errors.') + def test_qwen_generation_template(self): + model_type = ModelType.qwen_7b + template_type = TemplateType.default_generation + model, tokenizer = get_model_tokenizer(model_type, load_model=True) + template = get_template(template_type, tokenizer) + query = '蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是' + print(f'query: {query}') + response, _ = inference(model, template, query, verbose=False) + print(f'swift response: {response}') + model.generation_config.chat_format = 'raw' + model.generation_config.max_window_size = 1024 + inputs = tokenizer(query, return_tensors='pt').to('cuda') + response = tokenizer.decode( + model.generate(**inputs)[0, len(inputs['input_ids'][0]):]) + print(f'official response: {response}') + if __name__ == '__main__': unittest.main() From 98cc48b81b29f7001a9fd8e4c244429764e171f8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 17 Nov 2023 16:19:19 +0800 Subject: [PATCH 2/4] update readme --- README.md | 3 ++- README_CN.md | 3 ++- examples/pytorch/llm/README.md | 2 +- examples/pytorch/llm/README_CN.md | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 301d74b0f8..bb1e49a765 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 ### 🎉 News +- 🔥 2023.11.16: Added support for more models in **flash attn**: qwen series, qwen-vl series, llama series, openbuddy series, mistral series, yi series, ziya series. 
Please use the `use_flash_attn` parameter. - 🔥 2023.11.11: **NEFTune** Supported, Use is with `Swift.prepare_model(model, NEFTuneConfig())` - 🔥 2023.11.11: Support training and inference with **CLI**, and inference with **Web-UI**. Check the [Run using Swift CLI](https://github.com/modelscope/swift/tree/main#run-using-swift-cli) chapter for details. - 🔥 2023.11.11: Support model **deployment**(vllm/chatglm.cpp/xinference),Check [Official documentation](./docs/source/GetStarted/部署指南.md) for details. @@ -147,8 +148,8 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' - Supported Models: - qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary) - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - chatglm series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) + - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - llama 
series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary) - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) diff --git a/README_CN.md b/README_CN.md index fba56ea6eb..7b28876f14 100644 --- a/README_CN.md +++ b/README_CN.md @@ -39,6 +39,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 ## 🎉 新闻 +- 🔥 2023.11.16: 支持更多模型的**flash attn**支持: qwen系列, qwen-vl系列, llama系列, openbuddy系列, mistral系列, yi系列, ziya系列. 请使用`use_flash_attn`参数. - 🔥 2023.11.11: 支持**NEFTune**, 使用`Swift.prepare_model(model, NEFTuneConfig())`即可开启. - 🔥 2023.11.11: 支持**命令行**训练推理和**Web-UI**推理, 详情可以查看下方的`使用Swift CLI运行`章节. - 🔥 2023.11.11: 支持模型训练后的**部署**链路(vllm/chatglm.cpp/xinference),详情可以查看[官方文档](./docs/source/GetStarted/部署指南.md). 
@@ -145,8 +146,8 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' - 支持的模型 - qwen 系列: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary) - qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - chatglm 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) + - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - openbuddy 系列: 
[openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary) - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 7b48201e98..95568b94a5 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -21,8 +21,8 @@ - Supported Models: - qwen series: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary) - qwen-vl series: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - chatglm series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) + - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), 
[baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary) - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index f185c8f554..b140eaaae2 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -21,8 +21,8 @@ - 支持的模型 - qwen 系列: [qwen-7b](https://modelscope.cn/models/qwen/Qwen-7B/summary), [qwen-7b-chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary), [qwen-14b](https://modelscope.cn/models/qwen/Qwen-14B/summary), [qwen-14b-chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary), [qwen-7b-chat-int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary), [qwen-14b-chat-int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary), [qwen-7b-chat-int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary), [qwen-14b-chat-int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary) - qwen-vl 系列: [qwen-vl](https://modelscope.cn/models/qwen/Qwen-VL/summary), [qwen-vl-chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary), [qwen-vl-chat-int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary) - - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), 
[baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - chatglm 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary) + - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary) - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary) - openbuddy 系列: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary) - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) From cf9260184b66de55d8880d85e13473628d66ff11 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 17 Nov 2023 18:03:41 +0800 Subject: [PATCH 3/4] merge main --- README.md | 8 ++++---- README_CN.md | 8 ++++---- examples/pytorch/llm/README.md | 8 ++++---- examples/pytorch/llm/README_CN.md | 8 ++++---- swift/tuners/lora.py | 33 +++++++++++++++++++------------ swift/version.py | 2 +- 6 files changed, 37 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 
bb1e49a765..2516f14989 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ from swift.llm import ( ) from swift.llm.run import infer_main, sft_main, web_ui_main -model_type = ModelType.qwen_7b_chat_int4 +model_type = ModelType.qwen_7b_chat sft_args = SftArguments( model_type=model_type, eval_steps=50, @@ -117,7 +117,7 @@ web_ui_main(infer_args) ```bash # Experimental environment: A10, 3090, A100, ... # 10GB GPU memory -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dataset blossom-math-zh +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh # Using DDP # Experimental environment: 2 * 3090 @@ -125,11 +125,11 @@ CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dat CUDA_VISIBLE_DEVICES=0,1 \ NPROC_PER_NODE=2 \ swift sft \ - --model_id_or_path qwen/Qwen-7B-Chat-Int4 \ + --model_id_or_path qwen/Qwen-7B-Chat \ --dataset blossom-math-zh \ # Using custom dataset -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --custom_train_dataset_path chatml.jsonl +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl ``` **Inference**: diff --git a/README_CN.md b/README_CN.md index 7b28876f14..4f262b62dc 100644 --- a/README_CN.md +++ b/README_CN.md @@ -88,7 +88,7 @@ from swift.llm import ( ) from swift.llm.run import infer_main, sft_main, web_ui_main -model_type = ModelType.qwen_7b_chat_int4 +model_type = ModelType.qwen_7b_chat sft_args = SftArguments( model_type=model_type, eval_steps=50, @@ -114,7 +114,7 @@ web_ui_main(infer_args) ```bash # Experimental environment: A10, 3090, A100, ... # 10GB GPU memory -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dataset blossom-math-zh +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh # 使用DDP # Experimental environment: 2 * 3090 @@ -122,11 +122,11 @@ CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dat CUDA_VISIBLE_DEVICES=0,1 \ NPROC_PER_NODE=2 \ swift sft \ - --model_id_or_path qwen/Qwen-7B-Chat-Int4 \ + --model_id_or_path qwen/Qwen-7B-Chat \ --dataset blossom-math-zh \ # 使用自己的数据集 -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --custom_train_dataset_path chatml.jsonl +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl ``` **推理**: diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md index 95568b94a5..1d7da0102c 100644 --- a/examples/pytorch/llm/README.md +++ b/examples/pytorch/llm/README.md @@ -92,7 +92,7 @@ from swift.llm import ( ) from swift.llm.run import infer_main, sft_main, web_ui_main -model_type = ModelType.qwen_7b_chat_int4 +model_type = ModelType.qwen_7b_chat sft_args = SftArguments( model_type=model_type, eval_steps=50, @@ -118,7 +118,7 @@ web_ui_main(infer_args) ```bash # Experimental environment: A10, 3090, A100, ... 
# 10GB GPU memory -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dataset blossom-math-zh +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh # Using DDP # Experimental environment: 2 * 3090 @@ -126,11 +126,11 @@ CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dat CUDA_VISIBLE_DEVICES=0,1 \ NPROC_PER_NODE=2 \ swift sft \ - --model_id_or_path qwen/Qwen-7B-Chat-Int4 \ + --model_id_or_path qwen/Qwen-7B-Chat \ --dataset blossom-math-zh \ # Using custom dataset -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --custom_train_dataset_path chatml.jsonl +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl ``` **Inference**: diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md index b140eaaae2..1716add5d5 100644 --- a/examples/pytorch/llm/README_CN.md +++ b/examples/pytorch/llm/README_CN.md @@ -91,7 +91,7 @@ from swift.llm import ( ) from swift.llm.run import infer_main, sft_main, web_ui_main -model_type = ModelType.qwen_7b_chat_int4 +model_type = ModelType.qwen_7b_chat sft_args = SftArguments( model_type=model_type, eval_steps=50, @@ -117,7 +117,7 @@ web_ui_main(infer_args) ```bash # Experimental environment: A10, 3090, A100, ... # 10GB GPU memory -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dataset blossom-math-zh +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh # 使用DDP # Experimental environment: 2 * 3090 @@ -125,11 +125,11 @@ CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --dat CUDA_VISIBLE_DEVICES=0,1 \ NPROC_PER_NODE=2 \ swift sft \ - --model_id_or_path qwen/Qwen-7B-Chat-Int4 \ + --model_id_or_path qwen/Qwen-7B-Chat \ --dataset blossom-math-zh \ # 使用自己的数据集 -CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat-Int4 --custom_train_dataset_path chatml.jsonl +CUDA_VISIBLE_DEVICES=0 swift sft --model_id_or_path qwen/Qwen-7B-Chat --custom_train_dataset_path chatml.jsonl ``` **推理**: diff --git a/swift/tuners/lora.py b/swift/tuners/lora.py index 32d20ea5c9..b30efef412 100644 --- a/swift/tuners/lora.py +++ b/swift/tuners/lora.py @@ -132,7 +132,10 @@ def __init__( init_lora_weights = kwargs.pop('init_lora_weights', True) self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) - self.set_adapter(adapter_name) + if version.parse(peft.__version__) >= version.parse('0.6.0'): + self.set_adapter(adapter_name) + else: + self.active_adapter = adapter_name super(QuantLinear, self).__init__() if self.use_qa_lora: self.qa_pool = torch.nn.AvgPool1d( @@ -140,32 +143,36 @@ def __init__( ) # using pooling layer to conduct sum operation def forward(self, x: torch.Tensor): + if version.parse(peft.__version__) >= version.parse('0.6.0'): + active_adapter = self.active_adapter[0] + else: + active_adapter = self.active_adapter result = self.quant_linear_module(x) if not self.is_activated( - ) or self.disable_adapters or self.active_adapter[ - 0] not in self.lora_A.keys(): + ) or self.disable_adapters or active_adapter not in self.lora_A.keys( + ): return result - elif self.r[self.active_adapter[0]] > 0: + elif self.r[active_adapter] > 0: result = result.clone() if not torch.is_autocast_enabled(): expected_dtype = result.dtype - x = x.to(self.lora_A[self.active_adapter[0]].weight.dtype) + x = x.to(self.lora_A[active_adapter].weight.dtype) if self.use_qa_lora: x 
= self.qa_pool(x) * self.group_size output = ( - self.lora_B[self.active_adapter[0]]( - self.lora_A[self.active_adapter[0]]( - self.lora_dropout[self.active_adapter[0]] + self.lora_B[active_adapter]( + self.lora_A[active_adapter]( + self.lora_dropout[active_adapter] (x))).to(expected_dtype) - * self.scaling[self.active_adapter[0]]) + * self.scaling[active_adapter]) else: if self.use_qa_lora: x = self.qa_pool(x) * self.group_size output = ( - self.lora_B[self.active_adapter[0]]( - self.lora_A[self.active_adapter[0]]( - self.lora_dropout[self.active_adapter[0]](x))) - * self.scaling[self.active_adapter[0]]) + self.lora_B[active_adapter]( + self.lora_A[active_adapter]( + self.lora_dropout[active_adapter](x))) + * self.scaling[active_adapter]) result += output return result diff --git a/swift/version.py b/swift/version.py index 1f4b62e764..4fa90b9311 100644 --- a/swift/version.py +++ b/swift/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '1.2.0' +__version__ = '1.3.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' From 53d4d52462544c0ab9e0cf9c691eb0f17bdcb3c8 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Fri, 17 Nov 2023 18:30:51 +0800 Subject: [PATCH 4/4] update readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 2516f14989..f62602684e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,6 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - 🔥 2023.10.7: Supported **DeepSpeed ZeRO-2**, enabling LoRA (not just QLoRA) to run DDP on 2*A10. - 2023.10.4: Supported datasets in the fields of mathematics, law, SQL, and coding: blossom-math-zh, school-math-zh, text2sql-en, sql-create-context-en, lawyer-llama-zh, tigerbot-law-zh, leetcode-python-en. - 🔥 2023.9.25: Supported **qwen-14b** model series: qwen-14b, qwen-14b-chat. -- 2023.9.12: Supported training with MP+DDP to accelerate full-parameter fine-tuning speed. - 2023.9.18: Supported **internlm-20b** model series: internlm-20b, internlm-20b-chat. - 2023.9.12: Supported training with **MP+DDP** to accelerate full-parameter fine-tuning speed. - 2023.9.5: Supported **openbuddy-llama2-70b-chat** model.