From e15c9f9128edf9855d2a97755753e03ba860fef7 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 16 Nov 2023 02:05:53 +0800
Subject: [PATCH 1/3] support flash_attn

---
 README.md                         |   2 +-
 README_CN.md                      |   2 +-
 examples/pytorch/llm/README.md    |   5 +-
 examples/pytorch/llm/README_CN.md |   5 +-
 swift/llm/utils/argument.py       |   3 +
 swift/llm/utils/model.py          | 141 +++++++++++++++++-------
 6 files changed, 90 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index 990beb9664..d916825fad 100644
--- a/README.md
+++ b/README.md
@@ -155,9 +155,9 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
 - bluelm series: [bluelm-7b](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary), [bluelm-7b-chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary), [bluelm-7b-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary), [bluelm-7b-chat-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)
 - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)
+ - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - ziya series: [ziya2-13b](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary), [ziya2-13b-chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)
 - skywork series: [skywork-13b](https://modelscope.cn/models/skywork/Skywork-13B-base/summary), [skywork-13b-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)
- - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
 - Supported Datasets:
   - NLP:
diff --git a/README_CN.md b/README_CN.md
index 2bac06ccab..a06281a637 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -153,9 +153,9 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'
 - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
 - bluelm 系列: [bluelm-7b](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary), [bluelm-7b-chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary), [bluelm-7b-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary), [bluelm-7b-chat-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)
 - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)
+ - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - ziya 系列: [ziya2-13b](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary), [ziya2-13b-chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)
 - skywork 系列: [skywork-13b](https://modelscope.cn/models/skywork/Skywork-13B-base/summary), [skywork-13b-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)
- - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
 - 支持的数据集:
   - NLP:
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 7c4ebb21db..5960311fc9 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -26,9 +26,10 @@
 - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
 - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary)
 - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
 - bluelm series: [bluelm-7b](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary), [bluelm-7b-chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary), [bluelm-7b-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary), [bluelm-7b-chat-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)
 - mistral series: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)
+ - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - ziya series: [ziya2-13b](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary), [ziya2-13b-chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)
 - skywork series: [skywork-13b](https://modelscope.cn/models/skywork/Skywork-13B-base/summary), [skywork-13b-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)
 - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
@@ -146,7 +147,7 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'

 ## 🌟 Run SFT and Inference
 Performance: full(nice) > lora > qlora(auto_gptq) > qlora(bnb)
-Training GPU memory: qlora(low,3090) > lora > full(2*A100)
+Training GPU memory: full(high,2*A100) > lora > qlora(low,3090)

 **Tips**:
 - You can set `--gradient_checkpointing true` during training to **save GPU memory**, but this will slightly decrease the training speed. This is useful if you need to train LLM on **consumer-grade GPU**, e.g. 3090.
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index 2ad776404f..c06b47af4c 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -26,9 +26,10 @@
 - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
 - openbuddy 系列: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary)
 - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
- - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)
+ - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)
 - bluelm 系列: [bluelm-7b](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary), [bluelm-7b-chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary), [bluelm-7b-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary), [bluelm-7b-chat-32k](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)
 - mistral 系列: [mistral-7b](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary), [mistral-7b-chat](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)
+ - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary)
 - ziya 系列: [ziya2-13b](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary), [ziya2-13b-chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)
 - skywork 系列: [skywork-13b](https://modelscope.cn/models/skywork/Skywork-13B-base/summary), [skywork-13b-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)
 - other: [polylm-13b](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary), [seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)
@@ -146,7 +147,7 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'

 ## 🌟 微调和推理
 性能: full(优) > lora > qlora(auto_gptq) > qlora(bnb)
-训练显存: qlora(低,3090) > lora > full(2*A100)
+训练显存: full(高,2*A100) > lora > qlora(低,3090)

 **提示**:
 - 你可以在训练时设置`--gradient_checkpointing true`来**节约显存**, 但这会略微降低训练速度. 如果你需要在**消费级显卡**中训练大模型, 这很有用, 例如: 3090.
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 8b1ce0194f..b4ac729d0f 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -222,6 +222,7 @@ def __post_init__(self) -> None:
         self.deepspeed = None
         if self.deepspeed_config_path is not None:
+            require_version('deepspeed')
             with open(self.deepspeed_config_path, 'r') as f:
                 self.deepspeed = json.load(f)
             logger.info(f'Using deepspeed: {self.deepspeed}')

@@ -397,8 +398,10 @@ def select_bnb(
         torch.float16, torch.bfloat16, torch.float32
     }
     if quantization_bit == 4:
+        require_version('bitsandbytes')
         load_in_4bit, load_in_8bit = True, False
     elif quantization_bit == 8:
+        require_version('bitsandbytes')
         load_in_4bit, load_in_8bit = False, True
     else:
         load_in_4bit, load_in_8bit = False, False
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 47dac9fdb5..e1a85f04c6 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -93,6 +93,9 @@ class ModelType:
     # mistral
     mistral_7b = 'mistral-7b'
     mistral_7b_chat = 'mistral-7b-chat'
+    # yi
+    yi_6b = 'yi-6b'
+    yi_34b = 'yi-34b'
     # ziya
     ziya2_13b = 'ziya2-13b'
     ziya2_13b_chat = 'ziya2-13b-chat'
@@ -102,8 +105,6 @@ class ModelType:
     # other
     polylm_13b = 'polylm-13b'
     seqgpt_560m = 'seqgpt-560m'
-    yi_6b = 'yi-6b'
-    yi_34b = 'yi-34b'


 class LoRATM(NamedTuple):
@@ -191,28 +192,8 @@ def _register_model(
                 LoRATM.bluelm, TemplateType.default_generation)
 @register_model(ModelType.bluelm_7b, 'vivo-ai/BlueLM-7B-Base',
                 LoRATM.bluelm, TemplateType.default_generation)
-@register_model(ModelType.yi_34b, '01ai/Yi-34B', LoRATM.yi,
-                TemplateType.default_generation)
-@register_model(ModelType.yi_6b, '01ai/Yi-6B', LoRATM.yi,
-                TemplateType.default_generation)
 @register_model(ModelType.seqgpt_560m, 'damo/nlp_seqgpt-560m', LoRATM.bloom,
                 TemplateType.default_generation)
-@register_model(ModelType.ziya2_13b_chat, 'Fengshenbang/Ziya2-13B-Chat',
-                LoRATM.ziya, TemplateType.ziya)
-@register_model(ModelType.ziya2_13b, 'Fengshenbang/Ziya2-13B-Base',
-                LoRATM.ziya, TemplateType.default_generation)
-@register_model(
-    ModelType.mistral_7b_chat,
-    'AI-ModelScope/Mistral-7B-Instruct-v0.1',
-    LoRATM.mistral,
-    TemplateType.llama,
-    requires=['transformers>=4.34'])
-@register_model(
-    ModelType.mistral_7b,
-    'AI-ModelScope/Mistral-7B-v0.1',
-    LoRATM.mistral,
-    TemplateType.default_generation,
-    requires=['transformers>=4.34'])
 @register_model(ModelType.xverse_13b_chat, 'xverse/XVERSE-13B-Chat',
                 LoRATM.xverse, TemplateType.xverse)
 @register_model(ModelType.xverse_13b, 'xverse/XVERSE-13B', LoRATM.xverse,
                 TemplateType.default_generation)
@@ -236,39 +217,6 @@ def _register_model(
                 LoRATM.internlm, TemplateType.internlm)
 @register_model(ModelType.internlm_7b, 'Shanghai_AI_Laboratory/internlm-7b',
                 LoRATM.internlm, TemplateType.default_generation)
-@register_model(
-    ModelType.openbuddy_mistral_7b_chat,
-    'OpenBuddy/openbuddy-mistral-7b-v13.1',
-    LoRATM.mistral,
-    TemplateType.openbuddy,
-    requires=['transformers>=4.34'])
-@register_model(ModelType.openbuddy_llama2_70b_chat,
-                'OpenBuddy/openbuddy-llama2-70b-v10.1-bf16', LoRATM.llama2,
-                TemplateType.openbuddy)
-@register_model(ModelType.openbuddy_llama2_65b_chat,
-                'OpenBuddy/openbuddy-llama-65b-v8-bf16', LoRATM.llama2,
-                TemplateType.openbuddy)
-@register_model(ModelType.openbuddy_llama2_13b_chat,
-                'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', LoRATM.llama2,
-                TemplateType.openbuddy)
-@register_model(
-    ModelType.llama2_7b_chat,
-    'modelscope/Llama-2-7b-chat-ms',
-    LoRATM.llama2,
-    TemplateType.llama,
-    ignore_file_pattern=[r'.+\.bin$'])
-@register_model(
-    ModelType.llama2_70b,
-    'modelscope/Llama-2-70b-ms',
-    LoRATM.llama2,
-    TemplateType.default_generation,
-    ignore_file_pattern=[r'.+\.bin$'])
-@register_model(
-    ModelType.llama2_7b,
-    'modelscope/Llama-2-7b-ms',
-    LoRATM.llama2,
-    TemplateType.default_generation,
-    ignore_file_pattern=[r'.+\.bin$'])
 @register_model(
     ModelType.baichuan_13b_chat,
     'baichuan-inc/Baichuan-13B-Chat',
@@ -492,9 +440,77 @@ def cross_entropy_forward(self, inputs: Tensor,
     return model, tokenizer


+@register_model(ModelType.yi_34b, '01ai/Yi-34B', LoRATM.yi,
+                TemplateType.default_generation)
+@register_model(ModelType.yi_6b, '01ai/Yi-6B', LoRATM.yi,
+                TemplateType.default_generation)
+@register_model(ModelType.ziya2_13b_chat, 'Fengshenbang/Ziya2-13B-Chat',
+                LoRATM.ziya, TemplateType.ziya)
+@register_model(ModelType.ziya2_13b, 'Fengshenbang/Ziya2-13B-Base',
+                LoRATM.ziya, TemplateType.default_generation)
+@register_model(
+    ModelType.openbuddy_mistral_7b_chat,
+    'OpenBuddy/openbuddy-mistral-7b-v13.1',
+    LoRATM.mistral,
+    TemplateType.openbuddy,
+    requires=['transformers>=4.34'])
+@register_model(ModelType.openbuddy_llama2_70b_chat,
+                'OpenBuddy/openbuddy-llama2-70b-v10.1-bf16', LoRATM.llama2,
+                TemplateType.openbuddy)
+@register_model(ModelType.openbuddy_llama2_65b_chat,
+                'OpenBuddy/openbuddy-llama-65b-v8-bf16', LoRATM.llama2,
+                TemplateType.openbuddy)
+@register_model(ModelType.openbuddy_llama2_13b_chat,
+                'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', LoRATM.llama2,
+                TemplateType.openbuddy)
+@register_model(
+    ModelType.mistral_7b_chat,
+    'AI-ModelScope/Mistral-7B-Instruct-v0.1',
+    LoRATM.mistral,
+    TemplateType.llama,
+    requires=['transformers>=4.34'])
+@register_model(
+    ModelType.mistral_7b,
+    'AI-ModelScope/Mistral-7B-v0.1',
+    LoRATM.mistral,
+    TemplateType.default_generation,
+    requires=['transformers>=4.34'])
+def get_model_tokenizer_with_flash_attn(model_dir: str,
+                                        torch_dtype: Dtype,
+                                        model_kwargs: Dict[str, Any],
+                                        load_model: bool = True,
+                                        model_config=None,
+                                        **kwargs):
+    if model_config is None:
+        model_config = AutoConfig.from_pretrained(
+            model_dir, trust_remote_code=True)
+    _flash_attn_2_enabled = kwargs.pop('use_flash_attn', False)
+    model_config._flash_attn_2_enabled = _flash_attn_2_enabled
+    return get_model_tokenizer_from_repo(model_dir, torch_dtype, model_kwargs,
+                                         load_model, model_config, **kwargs)
+
+
+@register_model(
+    ModelType.llama2_7b,
+    'modelscope/Llama-2-7b-ms',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    ignore_file_pattern=[r'.+\.bin$'])
+@register_model(
+    ModelType.llama2_13b,
+    'modelscope/Llama-2-13b-ms',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    ignore_file_pattern=[r'.+\.bin$'])
+@register_model(
+    ModelType.llama2_70b,
+    'modelscope/Llama-2-70b-ms',
+    LoRATM.llama2,
+    TemplateType.default_generation,
+    ignore_file_pattern=[r'.+\.bin$'])
+@register_model(
+    ModelType.llama2_7b_chat,
+    'modelscope/Llama-2-7b-chat-ms',
     LoRATM.llama2,
     TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'])
 @register_model(
@@ -505,10 +521,10 @@ def cross_entropy_forward(self, inputs: Tensor,
     TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'])
 @register_model(
-    ModelType.llama2_13b,
-    'modelscope/Llama-2-13b-ms',
+    ModelType.llama2_70b_chat,
+    'modelscope/Llama-2-70b-chat-ms',
     LoRATM.llama2,
-    TemplateType.default_generation,
+    TemplateType.llama,
     ignore_file_pattern=[r'.+\.bin$'])
 def get_model_tokenizer_llama2(model_dir: str,
                                torch_dtype: Dtype,
@@ -518,8 +534,9 @@ def get_model_tokenizer_llama2(model_dir: str,
     model_config = AutoConfig.from_pretrained(
         model_dir, trust_remote_code=True)
     model_config.pretraining_tp = 1
-    return get_model_tokenizer_from_repo(model_dir, torch_dtype, model_kwargs,
-                                         load_model, model_config, **kwargs)
+    return get_model_tokenizer_with_flash_attn(model_dir, torch_dtype,
+                                               model_kwargs, load_model,
+                                               model_config, **kwargs)


 @register_model(ModelType.polylm_13b, 'damo/nlp_polylm_13b_text_generation',

From e85398c6271064b5fe0bf1e9ec57ebac9e5cf751 Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 16 Nov 2023 02:08:50 +0800
Subject: [PATCH 2/3] update readme

---
 examples/pytorch/llm/README.md    | 2 +-
 examples/pytorch/llm/README_CN.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
index 5960311fc9..2fa64f2665 100644
--- a/examples/pytorch/llm/README.md
+++ b/examples/pytorch/llm/README.md
@@ -147,7 +147,7 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'

 ## 🌟 Run SFT and Inference
 Performance: full(nice) > lora > qlora(auto_gptq) > qlora(bnb)
-Training GPU memory: full(high,2*A100) > lora > qlora(low,3090)
+Training GPU memory: qlora(low,3090) < lora < full(high,2*A100)

 **Tips**:
 - You can set `--gradient_checkpointing true` during training to **save GPU memory**, but this will slightly decrease the training speed. This is useful if you need to train LLM on **consumer-grade GPU**, e.g. 3090.
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
index c06b47af4c..a3d54cde46 100644
--- a/examples/pytorch/llm/README_CN.md
+++ b/examples/pytorch/llm/README_CN.md
@@ -147,7 +147,7 @@ CUDA_VISIBLE_DEVICES=0 swift web-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx'

 ## 🌟 微调和推理
 性能: full(优) > lora > qlora(auto_gptq) > qlora(bnb)
-训练显存: full(高,2*A100) > lora > qlora(低,3090)
+训练显存: qlora(低,3090) < lora < full(高,2*A100)

 **提示**:
 - 你可以在训练时设置`--gradient_checkpointing true`来**节约显存**, 但这会略微降低训练速度. 如果你需要在**消费级显卡**中训练大模型, 这很有用, 例如: 3090.

From f76c7c28cf976b832197ecefbd0c2bc9de7997da Mon Sep 17 00:00:00 2001
From: Jintao Huang
Date: Thu, 16 Nov 2023 11:29:39 +0800
Subject: [PATCH 3/3] update model.py

---
 .dev_scripts/ci_container_test.sh |  2 +-
 swift/llm/utils/argument.py       | 20 +++++++++++---------
 swift/llm/utils/model.py          | 12 ++++++------
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh
index 173a8a483f..fcc7869ed3 100644
--- a/.dev_scripts/ci_container_test.sh
+++ b/.dev_scripts/ci_container_test.sh
@@ -24,7 +24,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then
     # test with install
     pip install .

-    pip install auto_gptq -U -i https://mirrors.aliyun.com/pypi/simple/
+    pip install auto_gptq bitsandbytes deepspeed -U -i https://mirrors.aliyun.com/pypi/simple/
 else
     echo "Running case in release image, run case directly!"
 fi
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 6743d06bf4..45cd1afc7f 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -36,10 +36,11 @@ class SftArguments:
         metadata={'choices': ['lora', 'longlora', 'qalora', 'full']})
     tuner_backend: str = field(
         default='swift', metadata={'choices': ['swift', 'peft']})
-    template_type: Optional[str] = field(
-        default=None,
+    template_type: str = field(
+        default='AUTO',
         metadata={
-            'help': f'template_type choices: {list(TEMPLATE_MAPPING.keys())}'
+            'help':
+            f"template_type choices: {list(TEMPLATE_MAPPING.keys()) + ['AUTO']}"
         })
     output_dir: str = 'output'
     add_output_dir_suffix: bool = True
@@ -190,7 +191,7 @@ def __post_init__(self) -> None:
         else:
             raise ValueError(f'sft_type: {self.sft_type}')

-        if self.template_type is None:
+        if self.template_type == 'AUTO':
             self.template_type = MODEL_MAPPING[self.model_type]['template']
             logger.info(f'Setting template_type: {self.template_type}')
         if self.dataset is None:
@@ -244,10 +245,11 @@ class InferArguments:
     sft_type: str = field(
         default='lora',
         metadata={'choices': ['lora', 'longlora', 'qalora', 'full']})
-    template_type: Optional[str] = field(
-        default=None,
+    template_type: str = field(
+        default='AUTO',
         metadata={
-            'help': f'template_type choices: {list(TEMPLATE_MAPPING.keys())}'
+            'help':
+            f"template_type choices: {list(TEMPLATE_MAPPING.keys()) + ['AUTO']}"
         })
     ckpt_dir: Optional[str] = field(
         default=None, metadata={'help': '/path/to/your/vx_xxx/checkpoint-xxx'})
@@ -313,7 +315,7 @@ def __post_init__(self) -> None:
         handle_path(self)
         self.torch_dtype, _, _ = select_dtype(self)

-        if self.template_type is None:
+        if self.template_type == 'AUTO':
             self.template_type = MODEL_MAPPING[self.model_type]['template']
             logger.info(f'Setting template_type: {self.template_type}')
         if self.dataset is None:
@@ -346,7 +348,7 @@ def __post_init__(self) -> None:
         handle_path(self)
         self.torch_dtype, _, _ = select_dtype(self)

-        if self.template_type is None:
+        if self.template_type == 'AUTO':
             self.template_type = MODEL_MAPPING[self.model_type]['template']
             logger.info(f'Setting template_type: {self.template_type}')
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index e1a85f04c6..fc29f6cc4c 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -115,12 +115,12 @@ class LoRATM(NamedTuple):
     qwen = ['c_attn']
     polylm = ['c_attn']
     bloom = ['query_key_value']
-    internlm = ['q_proj', 'k_proj', 'v_proj']
-    xverse = ['q_proj', 'k_proj', 'v_proj']
-    mistral = ['q_proj', 'k_proj', 'v_proj']
-    ziya = ['q_proj', 'k_proj', 'v_proj']
-    yi = ['q_proj', 'k_proj', 'v_proj']
-    bluelm = ['q_proj', 'k_proj', 'v_proj']
+    internlm = llama2
+    xverse = llama2
+    mistral = llama2
+    ziya = llama2
+    yi = llama2
+    bluelm = llama2


 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],
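
Below is a minimal usage sketch of the `use_flash_attn` switch that PATCH 1/3 introduces. It is not part of the patch series: the checkpoint directory, dtype, and `model_kwargs` values are illustrative assumptions, and it assumes `get_model_tokenizer_with_flash_attn` is importable from `swift.llm.utils.model` as defined in the diff above.

```python
# Hypothetical example (not from the patch): load a model through the new
# flash-attn code path. `use_flash_attn` is popped from **kwargs and written
# to model_config._flash_attn_2_enabled before the weights are loaded.
import torch

from swift.llm.utils.model import get_model_tokenizer_with_flash_attn

model, tokenizer = get_model_tokenizer_with_flash_attn(
    '/path/to/Mistral-7B-v0.1',  # model_dir: local checkpoint dir (assumed)
    torch.bfloat16,              # torch_dtype
    {'device_map': 'auto'},      # model_kwargs forwarded to from_pretrained
    use_flash_attn=True,         # enables FlashAttention-2 via the config
)
```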