From 25e37c5fe1b387508da8299283fff59781f405c0 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 17 Dec 2023 17:42:59 +0800 Subject: [PATCH 01/19] update code --- scripts/utils/run_dataset_info.py | 11 +---------- scripts/utils/run_template.py | 12 +----------- swift/llm/utils/dataset.py | 9 +++++++++ swift/llm/utils/model.py | 8 ++++---- swift/llm/utils/template.py | 9 +++++++++ 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/utils/run_dataset_info.py b/scripts/utils/run_dataset_info.py index 9e8e25c8a6..f08f2a86cf 100644 --- a/scripts/utils/run_dataset_info.py +++ b/scripts/utils/run_dataset_info.py @@ -9,15 +9,6 @@ from swift.utils import stat_array -def get_dataset_name_list() -> List[str]: - res = [] - for k in DatasetName.__dict__.keys(): - if k.startswith('__'): - continue - res.append(getattr(DatasetName, k)) - return res - - def write_dataset_info(fpath: str) -> None: res_text_list = [] if os.path.exists(fpath): @@ -40,7 +31,7 @@ def write_dataset_info(fpath: str) -> None: text.split('|', 2)[1].lstrip('🔥 '): text for text in text_list } - dataset_name_list = get_dataset_name_list() + dataset_name_list = DatasetName.get_dataset_name_list() mapping = {} _iter = zip( ['llm', 'vision', 'audio'], diff --git a/scripts/utils/run_template.py b/scripts/utils/run_template.py index 49460633e6..d353ea7127 100644 --- a/scripts/utils/run_template.py +++ b/scripts/utils/run_template.py @@ -2,18 +2,8 @@ from swift.llm import TemplateType - -def get_template_name_list() -> List[str]: - res = [] - for k in TemplateType.__dict__.keys(): - if k.startswith('__'): - continue - res.append(TemplateType.__dict__[k]) - return res - - if __name__ == '__main__': - template_name_list = get_template_name_list() + template_name_list = TemplateType.get_template_name_list() tn_gen = ', '.join([tn for tn in template_name_list if 'generation' in tn]) tn_chat = ', '.join( [tn for tn in template_name_list if 'generation' not in tn]) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 54b26b6a6a..0845fe03ea 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -110,6 +110,15 @@ class DatasetName: aishell1_zh = 'aishell1-zh' aishell1_mini_zh = 'aishell1-mini-zh' + @classmethod + def get_dataset_name_list(cls) -> List[str]: + res = [] + for k in cls.__dict__.keys(): + if k.startswith('__') or k == 'get_dataset_name_list': + continue + res.append(cls.__dict__[k]) + return res + def register_dataset( dataset_name: str, diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 99d3eff527..564b2dee37 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -153,13 +153,13 @@ class ModelType: deepseek_coder_33b = 'deepseek-coder-33b' deepseek_coder_33b_chat = 'deepseek-coder-33b-chat' - @staticmethod - def get_model_name_list() -> List[str]: + @classmethod + def get_model_name_list(cls) -> List[str]: res = [] - for k in ModelType.__dict__.keys(): + for k in cls.__dict__.keys(): if k.startswith('__') or k == 'get_model_name_list': continue - res.append(ModelType.__dict__[k]) + res.append(cls.__dict__[k]) return res diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index d8c08873ca..fb46b7ad24 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -36,6 +36,15 @@ class TemplateType: codefuse_codellama = 'codefuse-codellama' deepseek_coder = 'deepseek-coder' + @classmethod + def get_template_name_list(cls) -> List[str]: + res = [] + for k in cls.__dict__.keys(): + if k.startswith('__') 
or k == 'get_template_name_list': + continue + res.append(cls.__dict__[k]) + return res + Prompt = List[Union[str, List[Union[str, int]]]] StopWords = Prompt From 50895c1397ed824d026a92ad5eccaabd39b9cf52 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 17 Dec 2023 20:02:18 +0800 Subject: [PATCH 02/19] update torch_dtype --- swift/llm/utils/model.py | 43 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 564b2dee37..ac80deecf1 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -12,12 +12,11 @@ import transformers from modelscope import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, GPTQConfig, - Model, read_config, snapshot_download) + snapshot_download) from packaging import version from torch import Tensor from torch import dtype as Dtype -from transformers import (PretrainedConfig, PreTrainedModel, - PreTrainedTokenizerBase) +from transformers import PreTrainedModel, PreTrainedTokenizerBase from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.tokenization_auto import get_tokenizer_config @@ -803,12 +802,13 @@ def get_model_tokenizer_qwen(model_dir: str, **kwargs): model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) - k_true = dtype_mapping[torch_dtype] - for k in dtype_mapping.values(): - v = False - if k == k_true: - v = True - setattr(model_config, k, v) + if torch_dtype is not None: + k_true = dtype_mapping[torch_dtype] + for k in dtype_mapping.values(): + v = False + if k == k_true: + v = True + setattr(model_config, k, v) use_flash_attn = kwargs.pop('use_flash_attn', None) if use_flash_attn is None: @@ -1221,6 +1221,10 @@ def get_model_tokenizer( model_kwargs: Optional[Dict[str, Any]] = None, load_model: bool = True, **kwargs) -> Tuple[Optional[PreTrainedModel], PreTrainedTokenizerBase]: + """ + torch_dtype: If you use None, it will retrieve the torch_dtype from the config.json file. + However, if torch.float32 is retrieved, torch.float16 will be used. 
+ """ model_info = MODEL_MAPPING[model_type] requires = model_info['requires'] for require in requires: @@ -1229,14 +1233,6 @@ def get_model_tokenizer( model_id_or_path = model_info['model_id_or_path'] get_function = model_info['get_function'] ignore_file_pattern = model_info['ignore_file_pattern'] - if model_info.get('torch_dtype') is not None: - model_torch_dtype = model_info['torch_dtype'] - if torch_dtype is None: - torch_dtype = model_torch_dtype - else: - assert torch_dtype == model_torch_dtype, f'please use `{model_torch_dtype}`' - elif torch_dtype is None: - torch_dtype = torch.float16 if model_kwargs is None: model_kwargs = {} if 'device_map' not in model_kwargs: @@ -1259,6 +1255,19 @@ def get_model_tokenizer( else: model_dir = os.path.expanduser(model_dir) assert os.path.isdir(model_dir) + if model_info.get('torch_dtype') is not None: + model_torch_dtype = model_info['torch_dtype'] + if torch_dtype is None: + torch_dtype = model_torch_dtype + else: + assert torch_dtype == model_torch_dtype, f'please use `{model_torch_dtype}`' + else: + if torch_dtype is None: + model_config = AutoConfig.from_pretrained( + model_dir, trust_remote_code=True) + torch_dtype = getattr(model_config, 'torch_dtype', None) + if torch_dtype == torch.float32: + torch_dtype = torch.float16 kwargs['automodel_class'] = model_info['automodel_class'] kwargs['eos_token'] = model_info['eos_token'] model, tokenizer = get_function(model_dir, torch_dtype, model_kwargs, From c3cf05544d4beea336b23a72b84108f72ba198b3 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 17 Dec 2023 21:14:56 +0800 Subject: [PATCH 03/19] update inference generation_config --- swift/llm/utils/utils.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 255eed062a..b198cca765 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -439,11 +439,17 @@ def _is_chinese_char(cp): def inference_stream( - model: PreTrainedModel, - template: Template, - query: str, - history: Optional[History] = None, - system: Optional[str] = None) -> Iterator[Tuple[str, History]]: + model: PreTrainedModel, + template: Template, + query: str, + history: Optional[History] = None, + system: Optional[str] = None, + *, + generation_config: Optional[GenerationConfig] = None +) -> Iterator[Tuple[str, History]]: + """ + generation_config: Priority: generation_config > model.generation_config. + """ if history is None: history = [] else: @@ -457,7 +463,8 @@ def inference_stream( input_ids = torch.tensor(input_ids)[None].to(device) attention_mask = torch.ones_like(input_ids).to(device) model.eval() - generation_config = getattr(model, 'generation_config', None) + if generation_config is None: + generation_config = getattr(model, 'generation_config', None) from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig model.__class__.generate_stream = NewGenerationMixin.generate model.__class__.sample_stream = NewGenerationMixin.sample_stream @@ -509,10 +516,14 @@ def inference(model: PreTrainedModel, history: Optional[History] = None, system: Optional[str] = None, *, + generation_config: Optional[GenerationConfig] = None, stream: bool = False, verbose: bool = False, prompt_prefix: str = '[PROMPT]', output_prefix: str = '[OUTPUT]') -> Tuple[str, History]: + """ + generation_config: Priority: generation_config > model.generation_config. 
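+ stream: streaming output via TextStreamer only takes effect together with `verbose=True`; otherwise a warning is logged and `inference_stream` should be used instead.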
+ """ if history is None: history = [] else: @@ -526,7 +537,8 @@ def inference(model: PreTrainedModel, input_ids = torch.tensor(input_ids)[None].to(device) attention_mask = torch.ones_like(input_ids).to(device) model.eval() - generation_config = getattr(model, 'generation_config', None) + if generation_config is None: + generation_config = getattr(model, 'generation_config', None) if stream is True and verbose is False: logger.warning( 'Please set verbose to True to support TextStreamer, or use `inference_stream.`' From a86ed04539c07a7febbe71a0ff243d9dec0c0cc6 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 17 Dec 2023 23:39:31 +0800 Subject: [PATCH 04/19] support vllm inference --- swift/llm/deploy.py | 214 ++++++++++++++++++++++++++++++++++ swift/llm/utils/__init__.py | 10 ++ swift/llm/utils/utils.py | 16 ++- swift/llm/utils/vllm_utils.py | 203 ++++++++++++++++++++++++++++++++ tests/llm/test_vllm_utils.py | 34 ++++++ 5 files changed, 472 insertions(+), 5 deletions(-) create mode 100644 swift/llm/deploy.py create mode 100644 swift/llm/utils/vllm_utils.py create mode 100644 tests/llm/test_vllm_utils.py diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py new file mode 100644 index 0000000000..6ea789bffc --- /dev/null +++ b/swift/llm/deploy.py @@ -0,0 +1,214 @@ +from copy import deepcopy +from typing import * + +import torch +from torch import dtype as Dtype +from vllm import EngineArgs, LLMEngine, SamplingParams + +from swift.llm import * +from swift.llm.utils.model import * + + +def get_vllm_engine( + model_type: str, + torch_dtype: Optional[Dtype] = None, + gpu_memory_utilization: float = 0.9, + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + engine_kwargs: Optional[Dict[str, Any]] = None, +) -> LLMEngine: + if engine_kwargs is None: + engine_kwargs = {} + model_info = MODEL_MAPPING[model_type] + model_id_or_path = model_info['model_id_or_path'] + revision = model_info['revision'] + ignore_file_pattern = model_info['ignore_file_pattern'] + model_dir = snapshot_download( + model_id_or_path, revision, ignore_file_pattern=ignore_file_pattern) + dtype_mapping = { + torch.float16: 'float16', + torch.bfloat16: 'bfloat16', + torch.float32: 'float32', + None: 'auto' + } + disable_log_stats = engine_kwargs.pop('disable_log_stats', True) + engine_args = EngineArgs( + model=model_dir, + trust_remote_code=True, + dtype=dtype_mapping[torch_dtype], + gpu_memory_utilization=gpu_memory_utilization, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + disable_log_stats=disable_log_stats, + **engine_kwargs) + llm_engine = LLMEngine.from_engine_args(engine_args) + llm_engine.model_dir = model_dir + llm_engine.model_type = model_type + llm_engine.tokenizer = get_model_tokenizer(model_type, load_model=False)[1] + return llm_engine + + +class VllmGenerationConfig(SamplingParams): + + def __init__( + self, + max_length: int = 20, + max_new_tokens: Optional[int] = None, + temperature: float = 1., + top_k: int = 50, # -1: all + top_p: float = 1.0, + repetition_penalty: float = 1., + length_penalty: float = 1.0, + stop: Optional[Union[str, List[str]]] = None, + skip_special_tokens: bool = True, + **kwargs, + ): + # The parameter design is similar to transformers.GenerationConfig. 
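+ # `max_length` maps to vLLM's `SamplingParams.max_tokens` (see the `max_length` property below).
+ # `max_new_tokens` is only stored here; `inference_vllm`/`inference_stream_vllm` convert it to
+ # `max_length` by adding the prompt length. In vLLM, `top_k=-1` means "consider all tokens",
+ # so the default of 50 mirrors transformers.GenerationConfig rather than SamplingParams.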
+ self.max_new_tokens = max_new_tokens + super().__init__( + max_tokens=max_length, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + stop=stop, + skip_special_tokens=skip_special_tokens, + **kwargs) + + @property + def max_length(self) -> int: + return self.max_tokens + + @max_length.setter + def max_length(self, value: int) -> None: + self.max_tokens = value + + +def inference_stream_vllm( + llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None +) -> List[Dict[str, Any]]: + """ + request_list: e.g. [{'query': 'hello!'}]. + The keys that can be included are: 'query', 'history', 'system'. + generation_config: Priority: generation_config > model.generation_config. + return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. + The keys to be included will be: 'response', 'history'. + """ + if generation_config is None: + generation_config = getattr(model, 'generation_config', + VllmGenerationConfig()) + assert isinstance(generation_config, VllmGenerationConfig) + request_list = deepcopy(request_list) + generation_config = deepcopy(generation_config) + for i, request in enumerate(request_list): + history = request.get('history', None) + if history is None: + history = [] + request['history'] = history + inputs = template.encode(request) + input_ids = inputs['input_ids'] + tokenizer = template.tokenizer + if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: + generation_config.stop.append(tokenizer.eos_token) + if generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + len( + input_ids) + llm_engine.add_request(str(i), None, generation_config, input_ids) + outputs = [] + while llm_engine.has_unfinished_requests(): + step_outputs = llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + + response_list = [] + for output, request in zip(outputs, request_list): + response = output.outputs[0].text + query = request['query'] + history = request['history'] + history.append((query, response)) + response_list.append({'response': response, 'history': history}) + if verbose: + print( + f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', + end='') + print(tokenizer.decode(output.outputs[0].token_ids, False)) + return response_list + + +def inference_vllm(llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None, + verbose: bool = False, + prompt_prefix: str = '[PROMPT]', + output_prefix: str = '[OUTPUT]') -> List[Dict[str, Any]]: + """ + request_list: e.g. [{'query': 'hello!'}]. + The keys that can be included are: 'query', 'history', 'system'. + generation_config: Priority: generation_config > model.generation_config. + return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. + The keys to be included will be: 'response', 'history'. 
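+ verbose: if True, the decoded prompt and the generated tokens are printed, prefixed with `prompt_prefix` and `output_prefix` respectively.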
+ """ + if generation_config is None: + generation_config = getattr(model, 'generation_config', + VllmGenerationConfig()) + assert isinstance(generation_config, VllmGenerationConfig) + request_list = deepcopy(request_list) + generation_config = deepcopy(generation_config) + for i, request in enumerate(request_list): + history = request.get('history', None) + if history is None: + history = [] + request['history'] = history + inputs = template.encode(request) + input_ids = inputs['input_ids'] + tokenizer = template.tokenizer + if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: + generation_config.stop.append(tokenizer.eos_token) + if generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + len( + input_ids) + llm_engine.add_request(str(i), None, generation_config, input_ids) + outputs = [] + while llm_engine.has_unfinished_requests(): + step_outputs = llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + + response_list = [] + for output, request in zip(outputs, request_list): + response = output.outputs[0].text + query = request['query'] + history = request['history'] + history.append((query, response)) + response_list.append({'response': response, 'history': history}) + if verbose: + print( + f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', + end='') + print(tokenizer.decode(output.outputs[0].token_ids, False)) + return response_list + + +if __name__ == '__main__': + model_type = ModelType.qwen_7b_chat + llm_engine = get_vllm_engine(model_type, torch.float16) + template_type = get_default_template_type(model_type) + template = get_template(template_type, llm_engine.tokenizer) + request_list = [{'query': '浙江的省会在哪?'}, {'query': '你好!'}] + response_list = inference_vllm(llm_engine, template, request_list) + for response in response_list: + print(response) + + gen = inference_stream_vllm(llm_engine, template, request_list) + for response_list in gen: + print(response_list[0]['history']) + print(response_list[1]['history']) diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py index b5c1eadd0c..a4d39bbf24 100644 --- a/swift/llm/utils/__init__.py +++ b/swift/llm/utils/__init__.py @@ -22,3 +22,13 @@ inference_stream, limit_history_length, messages_to_history, print_example, set_generation_config, sort_by_max_length, stat_dataset) + +try: + from .vllm_utils import (VllmGenerationConfig, get_vllm_engine, + inference_stream_vllm, inference_vllm) +except ImportError: + pass +except Exception as e: + from swift.utils import get_logger + logger = get_logger() + logger.warning(f'import vllm_utils error: {e}') diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index b198cca765..08cff505a6 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -5,6 +5,7 @@ import os import shutil import time +from copy import deepcopy from functools import partial, wraps from queue import Empty, Queue from tempfile import TemporaryDirectory @@ -453,7 +454,7 @@ def inference_stream( if history is None: history = [] else: - history = history.copy() + history = deepcopy(history) example = {'query': query, 'history': history, 'system': system} inputs = template.encode(example) audio_info = inputs.get('audio_info') # Compatible with qwen-audio @@ -465,6 +466,7 @@ def inference_stream( model.eval() if generation_config is None: generation_config = getattr(model, 'generation_config', None) + generation_config 
= deepcopy(generation_config) from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig model.__class__.generate_stream = NewGenerationMixin.generate model.__class__.sample_stream = NewGenerationMixin.sample_stream @@ -494,16 +496,19 @@ def inference_stream( **model_kwargs, seed=-1) generate_ids = [] + response = '' + print_idx = 0 history.append(None) # dummy for token in gen: generate_ids.append(token.item()) response = tokenizer.decode(generate_ids, True, **decode_kwargs) if response.endswith('\n') or len(response) > 0 and _is_chinese_char( ord(response[-1])): - safe_response = response + print_idx = len(response) else: - safe_response = response[:response.rfind(' ') - + 1] # avoid printing incomplete words + print_idx = max(response.rfind(' ') + 1, print_idx) + # avoid printing incomplete words + safe_response = response[:print_idx] history[-1] = (query, safe_response) yield safe_response, history history[-1] = (query, response) @@ -527,7 +532,7 @@ def inference(model: PreTrainedModel, if history is None: history = [] else: - history = history.copy() + history = deepcopy(history) example = {'query': query, 'history': history, 'system': system} inputs = template.encode(example) audio_info = inputs.get('audio_info') # Compatible with qwen-audio @@ -539,6 +544,7 @@ def inference(model: PreTrainedModel, model.eval() if generation_config is None: generation_config = getattr(model, 'generation_config', None) + generation_config = deepcopy(generation_config) if stream is True and verbose is False: logger.warning( 'Please set verbose to True to support TextStreamer, or use `inference_stream.`' diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py new file mode 100644 index 0000000000..e071fc7cbe --- /dev/null +++ b/swift/llm/utils/vllm_utils.py @@ -0,0 +1,203 @@ +from copy import deepcopy +from typing import Any, Dict, List, Optional + +import torch +from modelscope import snapshot_download +from torch import dtype as Dtype +from vllm import EngineArgs, LLMEngine, SamplingParams + +from .model import MODEL_MAPPING, get_model_tokenizer +from .template import Template +from .utils import _is_chinese_char + + +def get_vllm_engine( + model_type: str, + torch_dtype: Optional[Dtype] = None, + gpu_memory_utilization: float = 0.9, + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + engine_kwargs: Optional[Dict[str, Any]] = None, +) -> LLMEngine: + if engine_kwargs is None: + engine_kwargs = {} + model_info = MODEL_MAPPING[model_type] + model_id_or_path = model_info['model_id_or_path'] + revision = model_info['revision'] + ignore_file_pattern = model_info['ignore_file_pattern'] + model_dir = snapshot_download( + model_id_or_path, revision, ignore_file_pattern=ignore_file_pattern) + dtype_mapping = { + torch.float16: 'float16', + torch.bfloat16: 'bfloat16', + torch.float32: 'float32', + None: 'auto' + } + disable_log_stats = engine_kwargs.pop('disable_log_stats', True) + engine_args = EngineArgs( + model=model_dir, + trust_remote_code=True, + dtype=dtype_mapping[torch_dtype], + gpu_memory_utilization=gpu_memory_utilization, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + disable_log_stats=disable_log_stats, + **engine_kwargs) + llm_engine = LLMEngine.from_engine_args(engine_args) + llm_engine.model_dir = model_dir + llm_engine.model_type = model_type + llm_engine.tokenizer = get_model_tokenizer(model_type, load_model=False)[1] + return llm_engine + + +class 
VllmGenerationConfig(SamplingParams): + + def __init__( + self, + max_length: int = 20, + max_new_tokens: Optional[int] = None, + temperature: float = 1., + top_k: int = 50, # -1: all + top_p: float = 1.0, + repetition_penalty: float = 1., + length_penalty: float = 1.0, + stop: Optional[List[str]] = None, + skip_special_tokens: bool = True, + **kwargs, + ): + # The parameter design is similar to transformers.GenerationConfig. + self.max_new_tokens = max_new_tokens + super().__init__( + max_tokens=max_length, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + length_penalty=length_penalty, + stop=stop, + skip_special_tokens=skip_special_tokens, + **kwargs) + + @property + def max_length(self) -> int: + return self.max_tokens + + @max_length.setter + def max_length(self, value: int) -> None: + self.max_tokens = value + + +def inference_stream_vllm( + llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None +) -> List[Dict[str, Any]]: + """ + request_list: e.g. [{'query': 'hello!'}]. + The keys that can be included are: 'query', 'history', 'system'. + generation_config: Priority: generation_config > model.generation_config. + return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. + The keys to be included will be: 'response', 'history'. + """ + if generation_config is None: + generation_config = getattr(llm_engine, 'generation_config', + VllmGenerationConfig()) + assert isinstance(generation_config, VllmGenerationConfig) + request_list = deepcopy(request_list) + generation_config = deepcopy(generation_config) + for i, request in enumerate(request_list): + history = request.get('history', None) + if history is None: + history = [] + request['history'] = history + inputs = template.encode(request) + input_ids = inputs['input_ids'] + tokenizer = template.tokenizer + if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: + generation_config.stop.append(tokenizer.eos_token) + if generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + len( + input_ids) + llm_engine.add_request(str(i), None, generation_config, input_ids) + batch_size = len(request_list) + response_list = [None] * batch_size + while llm_engine.has_unfinished_requests(): + step_outputs = llm_engine.step() + for output in step_outputs: + i = int(output.request_id) + request = request_list[i] + response = output.outputs[0].text + if output.finished or response.endswith( + '\n') or len(response) > 0 and _is_chinese_char( + ord(response[-1])): + print_idx = len(response) + else: + print_idx = max(response.rfind(' ') + 1, print_idx) + # avoid printing incomplete words + safe_response = response[:print_idx] + query = request['query'] + history = request['history'] + if response_list[i] is None: + history.append(None) + history[-1] = (query, safe_response) + response_list[i] = {'response': safe_response, 'history': history} + yield response_list + + +def inference_vllm(llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None, + verbose: bool = False, + prompt_prefix: str = '[PROMPT]', + output_prefix: str = '[OUTPUT]') -> List[Dict[str, Any]]: + """ + request_list: e.g. [{'query': 'hello!'}]. + The keys that can be included are: 'query', 'history', 'system'. 
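+ A full request, e.g. {'query': 'hello!', 'history': [('hi', 'hello there!')], 'system': 'you are a helpful assistant!'}.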
+ generation_config: Priority: generation_config > model.generation_config. + return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. + The keys to be included will be: 'response', 'history'. + """ + if generation_config is None: + generation_config = getattr(llm_engine, 'generation_config', + VllmGenerationConfig()) + assert isinstance(generation_config, VllmGenerationConfig) + request_list = deepcopy(request_list) + generation_config = deepcopy(generation_config) + for i, request in enumerate(request_list): + history = request.get('history', None) + if history is None: + history = [] + request['history'] = history + inputs = template.encode(request) + input_ids = inputs['input_ids'] + tokenizer = template.tokenizer + if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: + generation_config.stop.append(tokenizer.eos_token) + if generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + len( + input_ids) + llm_engine.add_request(str(i), None, generation_config, input_ids) + outputs = [] + while llm_engine.has_unfinished_requests(): + step_outputs = llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + + response_list = [] + for output, request in zip(outputs, request_list): + response = output.outputs[0].text + query = request['query'] + history = request['history'] + history.append((query, response)) + response_list.append({'response': response, 'history': history}) + if verbose: + print( + f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', + end='') + print(tokenizer.decode(output.outputs[0].token_ids, False)) + return response_list diff --git a/tests/llm/test_vllm_utils.py b/tests/llm/test_vllm_utils.py new file mode 100644 index 0000000000..aa9604cb65 --- /dev/null +++ b/tests/llm/test_vllm_utils.py @@ -0,0 +1,34 @@ +import os +import unittest + +import torch + +from swift.llm import * +from swift.utils import lower_bound, seed_everything + +SKPT_TEST = True + + +class TestVllmUtils(unittest.TestCase): + + @unittest.skipIf(SKPT_TEST, 'To avoid citest error: OOM') + def test_inference_vllm(self): + model_type = ModelType.qwen_7b_chat + llm_engine = get_vllm_engine(model_type, torch.float16) + template_type = get_default_template_type(model_type) + template = get_template(template_type, llm_engine.tokenizer) + request_list = [{'query': '浙江的省会在哪?'}, {'query': '你好!'}] + # test inference_vllm + response_list = inference_vllm(llm_engine, template, request_list) + for response in response_list: + print(response) + + # test inference_stream_vllm + gen = inference_stream_vllm(llm_engine, template, request_list) + for response_list in gen: + print(response_list[0]['history']) + print(response_list[1]['history']) + + +if __name__ == '__main__': + unittest.main() From cb1d51a86f768352604302cde547a6a408aab2e7 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Sun, 17 Dec 2023 23:57:15 +0800 Subject: [PATCH 05/19] fix version bug --- swift/llm/utils/vllm_utils.py | 34 +++++++++++++++++++++------------- tests/llm/test_vllm_utils.py | 4 ++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index e071fc7cbe..44d570c8dd 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -1,3 +1,4 @@ +import inspect from copy import deepcopy from typing import Any, Dict, List, Optional @@ -6,10 +7,13 @@ from torch import dtype as Dtype 
from vllm import EngineArgs, LLMEngine, SamplingParams +from swift.utils import get_logger from .model import MODEL_MAPPING, get_model_tokenizer from .template import Template from .utils import _is_chinese_char +logger = get_logger() + def get_vllm_engine( model_type: str, @@ -62,21 +66,25 @@ def __init__( repetition_penalty: float = 1., length_penalty: float = 1.0, stop: Optional[List[str]] = None, - skip_special_tokens: bool = True, **kwargs, ): # The parameter design is similar to transformers.GenerationConfig. self.max_new_tokens = max_new_tokens - super().__init__( - max_tokens=max_length, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - length_penalty=length_penalty, - stop=stop, - skip_special_tokens=skip_special_tokens, - **kwargs) + kwargs['max_tokens'] = max_length + kwargs['temperature'] = temperature + kwargs['top_k'] = top_k + kwargs['top_p'] = top_p + kwargs['repetition_penalty'] = repetition_penalty + kwargs['length_penalty'] = length_penalty + kwargs['stop'] = stop + parameters = inspect.signature(SamplingParams.__init__).parameters + for k in kwargs.copy().keys(): + if k not in parameters: + logger.info( + f'The VLLM version is too old and does not support the parameter: {k}.' + ) + kwargs.pop(k) + super().__init__(**kwargs) @property def max_length(self) -> int: @@ -128,7 +136,7 @@ def inference_stream_vllm( for output in step_outputs: i = int(output.request_id) request = request_list[i] - response = output.outputs[0].text + response = tokenizer.decode(output.outputs[0].token_ids, True) if output.finished or response.endswith( '\n') or len(response) > 0 and _is_chinese_char( ord(response[-1])): @@ -190,7 +198,7 @@ def inference_vllm(llm_engine: LLMEngine, response_list = [] for output, request in zip(outputs, request_list): - response = output.outputs[0].text + response = tokenizer.decode(output.outputs[0].token_ids, True) query = request['query'] history = request['history'] history.append((query, response)) diff --git a/tests/llm/test_vllm_utils.py b/tests/llm/test_vllm_utils.py index aa9604cb65..d2e91718ff 100644 --- a/tests/llm/test_vllm_utils.py +++ b/tests/llm/test_vllm_utils.py @@ -26,8 +26,8 @@ def test_inference_vllm(self): # test inference_stream_vllm gen = inference_stream_vllm(llm_engine, template, request_list) for response_list in gen: - print(response_list[0]['history']) - print(response_list[1]['history']) + print(response_list[0]['response'], response_list[0]['history']) + print(response_list[1]['response'], response_list[1]['history']) if __name__ == '__main__': From bf9243b9f09dbfa1d5f90f6c88fb7e7b6dff7777 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 00:06:53 +0800 Subject: [PATCH 06/19] update --- swift/llm/deploy.py | 214 ------------------------------------ swift/llm/utils/__init__.py | 9 +- swift/llm/utils/utils.py | 6 +- 3 files changed, 9 insertions(+), 220 deletions(-) delete mode 100644 swift/llm/deploy.py diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py deleted file mode 100644 index 6ea789bffc..0000000000 --- a/swift/llm/deploy.py +++ /dev/null @@ -1,214 +0,0 @@ -from copy import deepcopy -from typing import * - -import torch -from torch import dtype as Dtype -from vllm import EngineArgs, LLMEngine, SamplingParams - -from swift.llm import * -from swift.llm.utils.model import * - - -def get_vllm_engine( - model_type: str, - torch_dtype: Optional[Dtype] = None, - gpu_memory_utilization: float = 0.9, - tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, 
- engine_kwargs: Optional[Dict[str, Any]] = None, -) -> LLMEngine: - if engine_kwargs is None: - engine_kwargs = {} - model_info = MODEL_MAPPING[model_type] - model_id_or_path = model_info['model_id_or_path'] - revision = model_info['revision'] - ignore_file_pattern = model_info['ignore_file_pattern'] - model_dir = snapshot_download( - model_id_or_path, revision, ignore_file_pattern=ignore_file_pattern) - dtype_mapping = { - torch.float16: 'float16', - torch.bfloat16: 'bfloat16', - torch.float32: 'float32', - None: 'auto' - } - disable_log_stats = engine_kwargs.pop('disable_log_stats', True) - engine_args = EngineArgs( - model=model_dir, - trust_remote_code=True, - dtype=dtype_mapping[torch_dtype], - gpu_memory_utilization=gpu_memory_utilization, - tensor_parallel_size=tensor_parallel_size, - pipeline_parallel_size=pipeline_parallel_size, - disable_log_stats=disable_log_stats, - **engine_kwargs) - llm_engine = LLMEngine.from_engine_args(engine_args) - llm_engine.model_dir = model_dir - llm_engine.model_type = model_type - llm_engine.tokenizer = get_model_tokenizer(model_type, load_model=False)[1] - return llm_engine - - -class VllmGenerationConfig(SamplingParams): - - def __init__( - self, - max_length: int = 20, - max_new_tokens: Optional[int] = None, - temperature: float = 1., - top_k: int = 50, # -1: all - top_p: float = 1.0, - repetition_penalty: float = 1., - length_penalty: float = 1.0, - stop: Optional[Union[str, List[str]]] = None, - skip_special_tokens: bool = True, - **kwargs, - ): - # The parameter design is similar to transformers.GenerationConfig. - self.max_new_tokens = max_new_tokens - super().__init__( - max_tokens=max_length, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - length_penalty=length_penalty, - stop=stop, - skip_special_tokens=skip_special_tokens, - **kwargs) - - @property - def max_length(self) -> int: - return self.max_tokens - - @max_length.setter - def max_length(self, value: int) -> None: - self.max_tokens = value - - -def inference_stream_vllm( - llm_engine: LLMEngine, - template: Template, - request_list: List[Dict[str, Any]], - *, - generation_config: Optional[VllmGenerationConfig] = None -) -> List[Dict[str, Any]]: - """ - request_list: e.g. [{'query': 'hello!'}]. - The keys that can be included are: 'query', 'history', 'system'. - generation_config: Priority: generation_config > model.generation_config. - return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. - The keys to be included will be: 'response', 'history'. 
- """ - if generation_config is None: - generation_config = getattr(model, 'generation_config', - VllmGenerationConfig()) - assert isinstance(generation_config, VllmGenerationConfig) - request_list = deepcopy(request_list) - generation_config = deepcopy(generation_config) - for i, request in enumerate(request_list): - history = request.get('history', None) - if history is None: - history = [] - request['history'] = history - inputs = template.encode(request) - input_ids = inputs['input_ids'] - tokenizer = template.tokenizer - if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: - generation_config.stop.append(tokenizer.eos_token) - if generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + len( - input_ids) - llm_engine.add_request(str(i), None, generation_config, input_ids) - outputs = [] - while llm_engine.has_unfinished_requests(): - step_outputs = llm_engine.step() - for output in step_outputs: - if output.finished: - outputs.append(output) - - response_list = [] - for output, request in zip(outputs, request_list): - response = output.outputs[0].text - query = request['query'] - history = request['history'] - history.append((query, response)) - response_list.append({'response': response, 'history': history}) - if verbose: - print( - f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', - end='') - print(tokenizer.decode(output.outputs[0].token_ids, False)) - return response_list - - -def inference_vllm(llm_engine: LLMEngine, - template: Template, - request_list: List[Dict[str, Any]], - *, - generation_config: Optional[VllmGenerationConfig] = None, - verbose: bool = False, - prompt_prefix: str = '[PROMPT]', - output_prefix: str = '[OUTPUT]') -> List[Dict[str, Any]]: - """ - request_list: e.g. [{'query': 'hello!'}]. - The keys that can be included are: 'query', 'history', 'system'. - generation_config: Priority: generation_config > model.generation_config. - return: e.g. [{'response': 'hi!', 'history': [('hello!', 'hi!')]}]. - The keys to be included will be: 'response', 'history'. 
- """ - if generation_config is None: - generation_config = getattr(model, 'generation_config', - VllmGenerationConfig()) - assert isinstance(generation_config, VllmGenerationConfig) - request_list = deepcopy(request_list) - generation_config = deepcopy(generation_config) - for i, request in enumerate(request_list): - history = request.get('history', None) - if history is None: - history = [] - request['history'] = history - inputs = template.encode(request) - input_ids = inputs['input_ids'] - tokenizer = template.tokenizer - if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: - generation_config.stop.append(tokenizer.eos_token) - if generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + len( - input_ids) - llm_engine.add_request(str(i), None, generation_config, input_ids) - outputs = [] - while llm_engine.has_unfinished_requests(): - step_outputs = llm_engine.step() - for output in step_outputs: - if output.finished: - outputs.append(output) - - response_list = [] - for output, request in zip(outputs, request_list): - response = output.outputs[0].text - query = request['query'] - history = request['history'] - history.append((query, response)) - response_list.append({'response': response, 'history': history}) - if verbose: - print( - f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', - end='') - print(tokenizer.decode(output.outputs[0].token_ids, False)) - return response_list - - -if __name__ == '__main__': - model_type = ModelType.qwen_7b_chat - llm_engine = get_vllm_engine(model_type, torch.float16) - template_type = get_default_template_type(model_type) - template = get_template(template_type, llm_engine.tokenizer) - request_list = [{'query': '浙江的省会在哪?'}, {'query': '你好!'}] - response_list = inference_vllm(llm_engine, template, request_list) - for response in response_list: - print(response) - - gen = inference_stream_vllm(llm_engine, template, request_list) - for response_list in gen: - print(response_list[0]['history']) - print(response_list[1]['history']) diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py index a4d39bbf24..872c60cbcd 100644 --- a/swift/llm/utils/__init__.py +++ b/swift/llm/utils/__init__.py @@ -19,15 +19,14 @@ from .utils import (LazyLLMDataset, LLMDataset, data_collate_fn, dataset_map, download_dataset, find_all_linear_for_lora, fix_fp16_trainable_bug, history_to_messages, inference, - inference_stream, limit_history_length, + inference_stream, is_vllm_available, limit_history_length, messages_to_history, print_example, set_generation_config, sort_by_max_length, stat_dataset) try: - from .vllm_utils import (VllmGenerationConfig, get_vllm_engine, - inference_stream_vllm, inference_vllm) -except ImportError: - pass + if is_vllm_available(): + from .vllm_utils import (VllmGenerationConfig, get_vllm_engine, + inference_stream_vllm, inference_vllm) except Exception as e: from swift.utils import get_logger logger = get_logger() diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index 08cff505a6..b69816ff1c 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -1,10 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. 
import heapq +import importlib import logging import os import shutil -import time from copy import deepcopy from functools import partial, wraps from queue import Empty, Queue @@ -664,6 +664,10 @@ def fix_fp16_trainable_bug(model: Module) -> None: p.data = p.data.to(dtype=torch.float32) +def is_vllm_available(): + return importlib.util.find_spec('vllm') is not None + + # monkey patching MsDataset.load = _msdataset_ddp_load if is_ddp_plus_mp(): From c5dcdef9c7a3766fa1ba4b34c1d6bdfd7a90c44e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 02:40:54 +0800 Subject: [PATCH 07/19] update docs --- ...56\350\260\203\346\226\207\346\241\243.md" | 4 + ...50\347\220\206\346\226\207\346\241\243.md" | 2 + ...37\344\270\216\351\203\250\347\275\262.md" | 178 ++++++++++++++++++ swift/llm/utils/model.py | 5 +- swift/llm/utils/vllm_utils.py | 58 ++++-- tests/llm/test_vllm_utils.py | 3 +- 6 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 "docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index 8f23042b20..ee18191261 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -222,6 +222,8 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' ``` ## 推理 +如果你要使用vllm进行推理加速, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#推理加速) + ### 原始模型 **单样本推理**可以查看[LLM推理文档](./LLM推理文档.md#-推理) @@ -292,6 +294,8 @@ CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged' ``` ## Web-UI +如果你要使用vllm进行部署并提供api接口, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署) + ### 原始模型 使用原始模型的web-ui可以查看[LLM推理文档](./LLM推理文档.md#-Web-UI) diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" index 2b5a7b8645..f28b3e8b3a 100644 --- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" @@ -1,4 +1,6 @@ # LLM推理文档 +如果你要使用vllm进行推理加速, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#推理加速) + ## 目录 - [环境准备](#环境准备) - [推理](#推理) diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" new file mode 100644 index 0000000000..cdf0c06cb0 --- /dev/null +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -0,0 +1,178 @@ + +# VLLM推理加速与部署 + +## 目录 +- [环境准备](#环境准备) +- [推理加速](#推理加速) +- [部署](#部署) + +## 环境准备 +GPU设备: A10, 3090, V100, A100均可. +```bash +# 设置pip全局镜像 +pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ +# 安装ms-swift +git clone https://github.com/modelscope/swift.git +cd swift +pip install -e .[llm] + +# vllm与cuda版本有对应关系,请按照`https://docs.vllm.ai/en/latest/getting_started/installation.html`选择版本 +pip install vllm -U + +# 如果你想要使用基于auto_gptq的模型进行推理. 
+# 使用auto_gptq的模型: `https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md#模型` +# auto_gptq和cuda版本有对应关系,请按照`https://github.com/PanQiWei/AutoGPTQ#quick-installation`选择版本 +pip install auto_gptq -U + +# 环境对齐 (如果你运行错误, 可以跑下面的代码, 仓库使用最新环境测试) +pip install -r requirements/framework.txt -U +pip install -r requirements/llm.txt -U +``` + +## 推理加速 + +### qwen-7b-chat +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import torch +from swift.llm import ( + ModelType, get_vllm_engine, get_default_template_type, + get_template, inference_vllm +) + +model_type = ModelType.qwen_7b_chat +llm_engine = get_vllm_engine(model_type, torch.float16) +template_type = get_default_template_type(model_type) +template = get_template(template_type, llm_engine.tokenizer) +# 与`transformers.GenerationConfig`类似的接口 +llm_engine.generation_config.max_new_tokens = 256 + +request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] +response_list = inference_vllm(llm_engine, template, request_list) +for request, response in zip(request_list, response_list): + print(f"query: {request['query']}") + print(f"response: {response['response']}") + +history1 = response_list[1]['history'] +request_list = [{'query': '这有什么好吃的', 'history': history1}] +response_list = inference_vllm(llm_engine, template, request_list) +for request, response in zip(request_list, response_list): + print(f"query: {request['query']}") + print(f"response: {response['response']}") + print(f"history: {response['history']}") + +"""Out[0] +query: 你好! +response: 你好!很高兴为你服务。有什么我可以帮助你的吗? +query: 浙江的省会在哪? +response: 浙江省会是杭州市。 +query: 这有什么好吃的 +response: 杭州是一个美食之城,拥有许多著名的菜肴和小吃,例如西湖醋鱼、东坡肉、叫化童子鸡等。此外,杭州还有许多小吃店,可以品尝到各种各样的本地美食。 +history: [('浙江的省会在哪?', '浙江省会是杭州市。'), ('这有什么好吃的', '杭州是一个美食之城,拥有许多著名的菜肴和小吃,例如西湖醋鱼、东坡肉、叫化童子鸡等。此外,杭州还有许多小吃店,可以品尝到各种各样的本地美食。')] +""" +``` + +### 流式输出 +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import torch +from swift.llm import ( + ModelType, get_vllm_engine, get_default_template_type, + get_template, inference_stream_vllm +) + +model_type = ModelType.qwen_7b_chat +llm_engine = get_vllm_engine(model_type, torch.float16) +template_type = get_default_template_type(model_type) +template = get_template(template_type, llm_engine.tokenizer) +# 与`transformers.GenerationConfig`类似的接口 +llm_engine.generation_config.max_new_tokens = 256 + +request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] +gen = inference_stream_vllm(llm_engine, template, request_list) +query_list = [request['query'] for request in request_list] +print(f"query_list: {query_list}") +for response_list in gen: + resp_list = [response['response'] for response in response_list] + print(f'response_list: {resp_list}') + +history1 = response_list[1]['history'] +request_list = [{'query': '这有什么好吃的', 'history': history1}] +gen = inference_stream_vllm(llm_engine, template, request_list) +query = request_list[0]['query'] +print(f"query: {query}") +for response_list in gen: + resp = response_list[0]['response'] + print(f'response: {resp}') + +history = response_list[0]['history'] +print(f'history: {history}') + +"""Out[0] +query_list: ['你好!', '浙江的省会在哪?'] +... +response_list: ['你好!很高兴为你服务。有什么我可以帮助你的吗?', '浙江省会是杭州市。'] +query: 这有什么好吃的 +... 
+response: 杭州是一个美食之城,拥有许多著名的菜肴和小吃,例如西湖醋鱼、东坡肉、叫化童子鸡等。此外,杭州还有许多小吃店,可以品尝到各种各样的本地美食。 +history: [('浙江的省会在哪?', '浙江省会是杭州市。'), ('这有什么好吃的', '杭州是一个美食之城,拥有许多著名的菜肴和小吃,例如西湖醋鱼、东坡肉、叫化童子鸡等。此外,杭州还有许多小吃店,可以品尝到各种各样的本地美食。')] +""" +``` + +### chatglm3 +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +import torch +from swift.llm import ( + ModelType, get_vllm_engine, get_default_template_type, + get_template, inference_vllm +) + +model_type = ModelType.chatglm3_6b +llm_engine = get_vllm_engine(model_type, torch.float16) +template_type = get_default_template_type(model_type) +template = get_template(template_type, llm_engine.tokenizer) +# 与`transformers.GenerationConfig`类似的接口 +llm_engine.generation_config.max_new_tokens = 256 + +request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] +response_list = inference_vllm(llm_engine, template, request_list) +for request, response in zip(request_list, response_list): + print(f"query: {request['query']}") + print(f"response: {response['response']}") + +history1 = response_list[1]['history'] +request_list = [{'query': '这有什么好吃的', 'history': history1}] +response_list = inference_vllm(llm_engine, template, request_list) +for request, response in zip(request_list, response_list): + print(f"query: {request['query']}") + print(f"response: {response['response']}") + print(f"history: {response['history']}") + +"""Out[0] +query: 你好! +response: 您好,我是人工智能助手。很高兴为您服务!请问有什么问题我可以帮您解答? +query: 浙江的省会在哪? +response: 浙江的省会是杭州。 +query: 这有什么好吃的 +response: 浙江有很多美食,其中一些非常有名的包括杭州的龙井虾仁、东坡肉、西湖醋鱼、叫化童子鸡等。另外,浙江还有很多特色小吃和糕点,比如宁波的汤团、年糕,温州的炒螃蟹、温州肉圆等。 +history: [('浙江的省会在哪?', '浙江的省会是杭州。'), ('这有什么好吃的', '浙江有很多美食,其中一些非常有名的包括杭州的龙井虾仁、东坡肉、西湖醋鱼、叫化童子鸡等。另外,浙江还有很多特色小吃和糕点,比如宁波的汤团、年糕,温州的炒螃蟹、温州肉圆等。')] +""" +``` + +### 微调后的模型 + +使用LoRA进行微调: + + + +使用全参数微调: + + +## 部署 diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index ac80deecf1..7707ec706b 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1252,9 +1252,8 @@ def get_model_tokenizer( ignore_file_pattern=ignore_file_pattern) if is_dist() and is_local_master(): dist.barrier() - else: - model_dir = os.path.expanduser(model_dir) - assert os.path.isdir(model_dir) + model_dir = os.path.expanduser(model_dir) + assert os.path.isdir(model_dir) if model_info.get('torch_dtype') is not None: model_torch_dtype = model_info['torch_dtype'] if torch_dtype is None: diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index 44d570c8dd..3cd772a86d 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -1,9 +1,10 @@ import inspect +import os from copy import deepcopy from typing import Any, Dict, List, Optional import torch -from modelscope import snapshot_download +from modelscope import GenerationConfig, snapshot_download from torch import dtype as Dtype from vllm import EngineArgs, LLMEngine, SamplingParams @@ -15,22 +16,31 @@ logger = get_logger() -def get_vllm_engine( - model_type: str, - torch_dtype: Optional[Dtype] = None, - gpu_memory_utilization: float = 0.9, - tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, - engine_kwargs: Optional[Dict[str, Any]] = None, -) -> LLMEngine: +def get_vllm_engine(model_type: str, + torch_dtype: Optional[Dtype] = None, + gpu_memory_utilization: float = 0.9, + tensor_parallel_size: int = 1, + pipeline_parallel_size: int = 1, + engine_kwargs: Optional[Dict[str, Any]] = None, + **kwargs) -> LLMEngine: if engine_kwargs is None: engine_kwargs = {} model_info = MODEL_MAPPING[model_type] model_id_or_path = 
model_info['model_id_or_path'] - revision = model_info['revision'] ignore_file_pattern = model_info['ignore_file_pattern'] - model_dir = snapshot_download( - model_id_or_path, revision, ignore_file_pattern=ignore_file_pattern) + model_dir = kwargs.get('model_dir', None) + if model_dir is None: + model_dir = model_id_or_path + if model_id_or_path is not None and not os.path.exists( + model_id_or_path): + revision = model_info['revision'] + model_dir = snapshot_download( + model_id_or_path, + revision, + ignore_file_pattern=ignore_file_pattern) + model_dir = os.path.expanduser(model_dir) + assert os.path.isdir(model_dir) + dtype_mapping = { torch.float16: 'float16', torch.bfloat16: 'bfloat16', @@ -51,6 +61,18 @@ def get_vllm_engine( llm_engine.model_dir = model_dir llm_engine.model_type = model_type llm_engine.tokenizer = get_model_tokenizer(model_type, load_model=False)[1] + generation_config_path = os.path.join(model_dir, 'generation_config.json') + if os.path.isfile(generation_config_path): + generation_config = GenerationConfig.from_pretrained(model_dir) + kwargs = generation_config.to_dict() + parameters = inspect.signature( + VllmGenerationConfig.__init__).parameters + for k in kwargs.copy().keys(): + if k not in parameters: + kwargs.pop(k) + llm_engine.generation_config = VllmGenerationConfig(**kwargs) + else: + llm_engine.generation_config = VllmGenerationConfig() return llm_engine @@ -69,6 +91,8 @@ def __init__( **kwargs, ): # The parameter design is similar to transformers.GenerationConfig. + if top_k == 0: + top_k = -1 self.max_new_tokens = max_new_tokens kwargs['max_tokens'] = max_length kwargs['temperature'] = temperature @@ -129,6 +153,7 @@ def inference_stream_vllm( generation_config.max_length = generation_config.max_new_tokens + len( input_ids) llm_engine.add_request(str(i), None, generation_config, input_ids) + batch_size = len(request_list) response_list = [None] * batch_size while llm_engine.has_unfinished_requests(): @@ -196,13 +221,16 @@ def inference_vllm(llm_engine: LLMEngine, if output.finished: outputs.append(output) - response_list = [] - for output, request in zip(outputs, request_list): + batch_size = len(request_list) + response_list = [None] * batch_size + for output in outputs: + i = int(output.request_id) + request = request_list[i] response = tokenizer.decode(output.outputs[0].token_ids, True) query = request['query'] history = request['history'] history.append((query, response)) - response_list.append({'response': response, 'history': history}) + response_list[i] = {'response': response, 'history': history} if verbose: print( f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', diff --git a/tests/llm/test_vllm_utils.py b/tests/llm/test_vllm_utils.py index d2e91718ff..a2a529e651 100644 --- a/tests/llm/test_vllm_utils.py +++ b/tests/llm/test_vllm_utils.py @@ -19,7 +19,8 @@ def test_inference_vllm(self): template = get_template(template_type, llm_engine.tokenizer) request_list = [{'query': '浙江的省会在哪?'}, {'query': '你好!'}] # test inference_vllm - response_list = inference_vllm(llm_engine, template, request_list) + response_list = inference_vllm( + llm_engine, template, request_list, verbose=True) for response in response_list: print(response) From 026b981f2956545a16dc692425ceadd7af97599d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 12:24:14 +0800 Subject: [PATCH 08/19] fix bug --- ...44\350\241\214\345\217\202\346\225\260.md" | 7 + swift/llm/infer.py | 223 +++++++++++------- swift/llm/utils/__init__.py | 3 +- 
swift/llm/utils/argument.py | 8 + swift/llm/utils/vllm_utils.py | 78 +++++- tests/llm/test_run.py | 74 +++--- 6 files changed, 262 insertions(+), 131 deletions(-) diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 67de273f35..c1a9d116ff 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -91,6 +91,7 @@ - `--model_cache_dir`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--sft_type`: 默认值为`'lora'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--template_type`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. +- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`False`. 推荐设置为True, 但这需要安装vllm, 环境准备可以参考[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#环境准备). - `--ckpt_dir`: 必填项, 值为SFT阶段保存的checkpoint路径, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`. - `--load_args_from_ckpt_dir`: 是否从`ckpt_dir`的`sft_args.json`文件中读取配置信息. 默认是`True`. - `--load_dataset_config`: 该参数只有在`--load_args_from_ckpt_dir true`时才生效. 即是否从`ckpt_dir`的`sft_args.json`文件中读取数据集相关的配置信息. 默认为`True`. @@ -121,4 +122,10 @@ - `--ignore_args_error`: 默认值为`False`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--stream`: 是否使用流式输出, 默认为`True`. - `--merge_lora_and_save`: 是否将lora权重merge到基模型中, 并保存完整的权重, 默认为`False`. 权重会保存在`ckpt_dir`的同级目录中, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx-merged'`目录下. +- `--safe_serialization`: 保存成`safetensors`文件还是`bin`文件. 默认为True, 即保存为safetensors文件. - `--overwrite_generation_config`: 是否将评估所使用的generation_config保存成`generation_config.json`文件, 默认为`False`. 训练时保存的generation_config文件将被覆盖. +- `--verbose`: 如果设置为False, 则使用tqdm样式推理. 如果设置为True, 则输出推理的query, response, label. 默认为`None`, 进行自动选择, 即`len(val_dataset) >= 100`时, 设置为False, 否则设置为True. 该参数只有在`--eval_human false`时才生效. +- `--share`: 传递给gradio的`demo.queue().launch(...)`函数. 该参数只有在使用`app-ui`时才生效. +- `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在`--use_vllm true`时才生效. +- `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在`--use_vllm true`时才生效. +- `--pipeline_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在`--use_vllm true`时才生效. 
diff --git a/swift/llm/infer.py b/swift/llm/infer.py index c1bf07f2ba..54343aa43a 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -32,20 +32,6 @@ def merge_lora(args: InferArguments, if args.quantization_bit != 0: logger.warning('It is not recommended to merge quantized models, ' 'as this can result in performance degradation') - # Loading Model and Tokenizer - kwargs = {} - model_kwargs = {'low_cpu_mem_usage': True, 'device_map': device_map} - if args.model_cache_dir is not None: - kwargs['model_dir'] = args.model_cache_dir - model, tokenizer = get_model_tokenizer(args.model_type, args.torch_dtype, - model_kwargs, **kwargs) - logger.info(f'model_config: {model.config}') - - # Preparing LoRA - model = Swift.from_pretrained(model, args.ckpt_dir, inference_mode=True) - Swift.merge_and_unload(model) - model = model.model - old_ckpt_dir = args.ckpt_dir ckpt_dir, ckpt_name = os.path.split(args.ckpt_dir) merged_lora_path = os.path.join(ckpt_dir, f'{ckpt_name}-merged') @@ -54,45 +40,56 @@ def merge_lora(args: InferArguments, logger.info(f'Setting args.ckpt_dir: {merged_lora_path}') args.sft_type = 'full' args.ckpt_dir = merged_lora_path - - if not os.path.exists(args.ckpt_dir) or replace_if_exists: - logger.info('Saving merged weights...') - model.save_pretrained( - args.ckpt_dir, safe_serialization=args.safe_serialization) - tokenizer.save_pretrained(args.ckpt_dir) - for fname in os.listdir(old_ckpt_dir): - if fname in {'generation_config.json'}: - src_path = os.path.join(old_ckpt_dir, fname) - tgt_path = os.path.join(args.ckpt_dir, fname) - shutil.copy(src_path, tgt_path) - # configuration.json - configuration_fname = 'configuration.json' - old_configuration_path = os.path.join(old_ckpt_dir, - configuration_fname) - new_configuration_path = os.path.join(args.ckpt_dir, - configuration_fname) - if os.path.exists(old_configuration_path): - with open(old_configuration_path, 'r', encoding='utf-8') as f: - res = json.load(f) - res.pop('adapter_cfg', None) - with open(new_configuration_path, 'w', encoding='utf-8') as f: - json.dump(res, f, ensure_ascii=False, indent=4) - # sft_args.json - sft_args_fname = 'sft_args.json' - old_sft_args_path = os.path.join(old_ckpt_dir, sft_args_fname) - new_sft_args_path = os.path.join(args.ckpt_dir, sft_args_fname) - if os.path.exists(old_sft_args_path): - with open(old_sft_args_path, 'r', encoding='utf-8') as f: - res = json.load(f) - res['sft_type'] = 'full' - with open(new_sft_args_path, 'w', encoding='utf-8') as f: - json.dump(res, f, ensure_ascii=False, indent=2) - logger.info(f'Successfully merged LoRA and saved in {args.ckpt_dir}.') - else: + if os.path.exists(args.ckpt_dir) and not replace_if_exists: logger.info( f'The weight directory for the merged LoRA already exists in {args.ckpt_dir}, ' 'skipping the saving process. 
' 'you can pass `replace_if_exists=True` to overwrite it.') + return + # Loading Model and Tokenizer + kwargs = {} + model_kwargs = {'low_cpu_mem_usage': True, 'device_map': device_map} + if args.model_cache_dir is not None: + kwargs['model_dir'] = args.model_cache_dir + model, tokenizer = get_model_tokenizer(args.model_type, args.torch_dtype, + model_kwargs, **kwargs) + logger.info(f'model_config: {model.config}') + + # Preparing LoRA + model = Swift.from_pretrained(model, old_ckpt_dir, inference_mode=True) + Swift.merge_and_unload(model) + model = model.model + logger.info('Saving merged weights...') + model.save_pretrained( + merged_lora_path, safe_serialization=args.safe_serialization) + tokenizer.save_pretrained(merged_lora_path) + for fname in os.listdir(old_ckpt_dir): + if fname in {'generation_config.json'}: + src_path = os.path.join(old_ckpt_dir, fname) + tgt_path = os.path.join(merged_lora_path, fname) + shutil.copy(src_path, tgt_path) + # configuration.json + configuration_fname = 'configuration.json' + old_configuration_path = os.path.join(old_ckpt_dir, configuration_fname) + new_configuration_path = os.path.join(merged_lora_path, + configuration_fname) + if os.path.exists(old_configuration_path): + with open(old_configuration_path, 'r', encoding='utf-8') as f: + res = json.load(f) + res.pop('adapter_cfg', None) + with open(new_configuration_path, 'w', encoding='utf-8') as f: + json.dump(res, f, ensure_ascii=False, indent=4) + # sft_args.json + sft_args_fname = 'sft_args.json' + old_sft_args_path = os.path.join(old_ckpt_dir, sft_args_fname) + new_sft_args_path = os.path.join(merged_lora_path, sft_args_fname) + if os.path.exists(old_sft_args_path): + with open(old_sft_args_path, 'r', encoding='utf-8') as f: + res = json.load(f) + res['sft_type'] = 'full' + with open(new_sft_args_path, 'w', encoding='utf-8') as f: + json.dump(res, f, ensure_ascii=False, indent=2) + logger.info(f'Successfully merged LoRA and saved in {merged_lora_path}.') return merged_lora_path @@ -154,10 +151,14 @@ def prepare_model_template( def llm_infer(args: InferArguments) -> None: if args.merge_lora_and_save: merge_lora(args) - model, template = prepare_model_template(args) - if args.overwrite_generation_config: - assert args.ckpt_dir is not None - model.generation_config.save_pretrained(args.ckpt_dir) + if args.use_vllm: + from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm + llm_engine, template = prepare_vllm_engine_template(args) + else: + model, template = prepare_model_template(args) + if args.overwrite_generation_config: + assert args.ckpt_dir is not None + model.generation_config.save_pretrained(args.ckpt_dir) # Inference result = [] jsonl_path = None @@ -195,18 +196,29 @@ def llm_infer(args: InferArguments) -> None: continue if not template.support_multi_round: history = [] - gen = inference_stream(model, template, query, history) print_idx = 0 - for response, history in gen: - if len(response) > print_idx: - print(response[print_idx:], end='', flush=True) - print_idx = len(response) + if args.use_vllm: + gen = inference_stream_vllm(llm_engine, template, + [{ + 'query': query, + 'history': history + }]) + for response_list in gen: + response = response_list[0]['response'] + if len(response) > print_idx: + print(response[print_idx:], end='', flush=True) + print_idx = len(response) + else: + gen = inference_stream(model, template, query, history) + for response, _ in gen: + if len(response) > print_idx: + print(response[print_idx:], end='', flush=True) + 
print_idx = len(response) print() print('-' * 50) - item = history[-1] obj = { - 'query': item[0], - 'response': item[1], + 'query': query, + 'response': response, 'history': history, } if jsonl_path is not None: @@ -219,33 +231,74 @@ def llm_infer(args: InferArguments) -> None: val_dataset = val_dataset.select( range(min(args.val_dataset_sample, val_dataset.shape[0]))) logger.info(f'val_dataset: {val_dataset}') - if args.verbose is None: - if len(val_dataset) >= 100: - args.verbose = False - else: - args.verbose = True + + if args.use_vllm and not args.stream: + args.verbose = False logger.info(f'Setting args.verbose: {args.verbose}') - if not args.verbose: - val_dataset = tqdm(val_dataset) - for data in val_dataset: - _, history = inference( - model, + label_list = None + if 'response' in val_dataset.features: + label_list = val_dataset['response'] + val_dataset = val_dataset.remove_columns('response') + request_list = val_dataset.to_list() + response_list = inference_vllm( + llm_engine, template, - data.get('query'), - data.get('history'), - data.get('system'), - stream=args.stream and args.verbose, + request_list, + use_tqdm=not args.verbose, verbose=args.verbose) - label = data.get('response') - item = history[0] - obj = {'query': item[0], 'response': item[1], 'label': label} - if jsonl_path is not None: - append_to_jsonl(jsonl_path, obj) - result.append(obj) - if args.verbose: - print() - print(f'[LABELS]{label}') - print('-' * 50) + result = [] + if label_list is not None: + for request, label in zip(request_list, label_list): + request['label'] = label + for request, response in zip(request_list, response_list): + obj = {'response': response['response'], **request} + if jsonl_path is not None: + append_to_jsonl(jsonl_path, obj) + result.append(obj) + else: + if args.verbose is None: + if len(val_dataset) >= 100: + args.verbose = False + else: + args.verbose = True + logger.info(f'Setting args.verbose: {args.verbose}') + if not args.verbose: + val_dataset = tqdm(val_dataset) + for data in val_dataset: + kwargs = {'query': data['query']} + history = data.get('history') + system = data.get('system') + if history is not None: + kwargs['history'] = history + if system is not None: + kwargs['system'] = system + if args.use_vllm: + assert args.stream is True + gen = inference_stream_vllm(llm_engine, template, [kwargs]) + print_idx = 0 + for response_list in gen: + response = response_list[0]['response'] + if len(response) > print_idx: + print(response[print_idx:], end='', flush=True) + print_idx = len(response) + else: + response, _ = inference( + model, + template, + stream=args.stream and args.verbose, + verbose=args.verbose, + **kwargs) + label = data.pop('response') + if label is not None: + kwargs['label'] = label + obj = {'response': response, **kwargs} + if jsonl_path is not None: + append_to_jsonl(jsonl_path, obj) + result.append(obj) + if args.verbose: + print() + print(f'[LABELS]{label}') + print('-' * 50) if args.save_result and args.ckpt_dir is not None: logger.info(f'save_result_path: {jsonl_path}') return {'result': result} diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py index 872c60cbcd..198ecaff6f 100644 --- a/swift/llm/utils/__init__.py +++ b/swift/llm/utils/__init__.py @@ -26,7 +26,8 @@ try: if is_vllm_available(): from .vllm_utils import (VllmGenerationConfig, get_vllm_engine, - inference_stream_vllm, inference_vllm) + inference_stream_vllm, inference_vllm, + prepare_vllm_engine_template) except Exception as e: from swift.utils import get_logger 
logger = get_logger() diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 03a9a16529..ee932595db 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -321,6 +321,7 @@ class InferArguments: 'help': f"template_type choices: {list(TEMPLATE_MAPPING.keys()) + ['AUTO']}" }) + use_vllm: bool = False ckpt_dir: Optional[str] = field( default=None, metadata={'help': '/path/to/your/vx_xxx/checkpoint-xxx'}) load_args_from_ckpt_dir: bool = True @@ -372,6 +373,10 @@ class InferArguments: verbose: Optional[bool] = None # app-ui share: bool = False + # vllm + gpu_memory_utilization: float = 0.9 + tensor_parallel_size: int = 1 + pipeline_parallel_size: int = 1 # compatibility show_dataset_sample: int = 10 @@ -420,6 +425,9 @@ def __post_init__(self) -> None: if self.ckpt_dir is None and self.overwrite_generation_config: self.overwrite_generation_config = False logger.warning('Setting overwrite_generation_config: False') + if self.use_vllm: + assert self.quantization_bit == 0, 'not support bnb' + assert self.merge_lora_and_save is True, 'please set `--merge_lora_and_save true`' @dataclass diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index 3cd772a86d..380154111d 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -1,16 +1,18 @@ import inspect import os from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import torch from modelscope import GenerationConfig, snapshot_download from torch import dtype as Dtype +from tqdm import tqdm from vllm import EngineArgs, LLMEngine, SamplingParams -from swift.utils import get_logger +from swift.utils import get_logger, seed_everything +from .argument import InferArguments from .model import MODEL_MAPPING, get_model_tokenizer -from .template import Template +from .template import Template, get_template from .utils import _is_chinese_char logger = get_logger() @@ -120,12 +122,12 @@ def max_length(self, value: int) -> None: def inference_stream_vllm( - llm_engine: LLMEngine, - template: Template, - request_list: List[Dict[str, Any]], - *, - generation_config: Optional[VllmGenerationConfig] = None -) -> List[Dict[str, Any]]: + llm_engine: LLMEngine, + template: Template, + request_list: List[Dict[str, Any]], + *, + generation_config: Optional[VllmGenerationConfig] = None, + use_tqdm: bool = False) -> List[Dict[str, Any]]: """ request_list: e.g. [{'query': 'hello!'}]. The keys that can be included are: 'query', 'history', 'system'. 
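To make the `use_tqdm` switch added above concrete, here is a small self-contained sketch of batch inference with a progress bar instead of verbose per-sample printing; the model choice and prompts are purely illustrative:

```python
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (ModelType, get_default_template_type, get_template,
                       get_vllm_engine, inference_vllm)

# Illustrative setup; any vllm-supported model_type works the same way.
model_type = ModelType.qwen_7b_chat
llm_engine = get_vllm_engine(model_type)
template = get_template(get_default_template_type(model_type), llm_engine.tokenizer)

request_list = [{'query': f'Write one sentence about the number {i}.'} for i in range(32)]
# A tqdm bar advances as requests finish; use_tqdm=True requires verbose=False.
resp_list = inference_vllm(llm_engine, template, request_list,
                           use_tqdm=True, verbose=False)
print(resp_list[0]['response'])
```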
@@ -156,6 +158,8 @@ def inference_stream_vllm( batch_size = len(request_list) response_list = [None] * batch_size + print_idx_list = [0] * batch_size + prog_bar = tqdm(total=batch_size, dynamic_ncols=True, disable=not use_tqdm) while llm_engine.has_unfinished_requests(): step_outputs = llm_engine.step() for output in step_outputs: @@ -165,17 +169,20 @@ def inference_stream_vllm( if output.finished or response.endswith( '\n') or len(response) > 0 and _is_chinese_char( ord(response[-1])): - print_idx = len(response) + print_idx_list[i] = len(response) else: - print_idx = max(response.rfind(' ') + 1, print_idx) + print_idx_list[i] = max( + response.rfind(' ') + 1, print_idx_list[i]) # avoid printing incomplete words - safe_response = response[:print_idx] + safe_response = response[:print_idx_list[i]] query = request['query'] history = request['history'] if response_list[i] is None: history.append(None) history[-1] = (query, safe_response) response_list[i] = {'response': safe_response, 'history': history} + if output.finished: + prog_bar.update() yield response_list @@ -184,6 +191,7 @@ def inference_vllm(llm_engine: LLMEngine, request_list: List[Dict[str, Any]], *, generation_config: Optional[VllmGenerationConfig] = None, + use_tqdm: bool = False, verbose: bool = False, prompt_prefix: str = '[PROMPT]', output_prefix: str = '[OUTPUT]') -> List[Dict[str, Any]]: @@ -214,14 +222,19 @@ def inference_vllm(llm_engine: LLMEngine, generation_config.max_length = generation_config.max_new_tokens + len( input_ids) llm_engine.add_request(str(i), None, generation_config, input_ids) + + batch_size = len(request_list) + if use_tqdm is True: + assert verbose is False + prog_bar = tqdm(total=batch_size, dynamic_ncols=True, disable=not use_tqdm) outputs = [] while llm_engine.has_unfinished_requests(): step_outputs = llm_engine.step() for output in step_outputs: if output.finished: outputs.append(output) + prog_bar.update() - batch_size = len(request_list) response_list = [None] * batch_size for output in outputs: i = int(output.request_id) @@ -237,3 +250,42 @@ def inference_vllm(llm_engine: LLMEngine, end='') print(tokenizer.decode(output.outputs[0].token_ids, False)) return response_list + + +def prepare_vllm_engine_template( + args: InferArguments) -> Tuple[LLMEngine, Template]: + logger.info(f'args: {args}') + logger.info(f'device_count: {torch.cuda.device_count()}') + seed_everything(args.seed) + + assert args.quantization_bit == 0, 'not support bnb' + assert args.sft_type == 'full', 'you need to merge lora' + # Loading Model and Tokenizer + kwargs = {} + if args.sft_type == 'full' and args.ckpt_dir is not None: + kwargs['model_dir'] = args.ckpt_dir + elif args.model_cache_dir is not None: + kwargs['model_dir'] = args.model_cache_dir + llm_engine = get_vllm_engine(args.model_type, args.torch_dtype, + args.gpu_memory_utilization, + args.tensor_parallel_size, + args.pipeline_parallel_size, **kwargs) + tokenizer = llm_engine.tokenizer + logger.info(f'model_config: {llm_engine.model_config.hf_config}') + if not args.do_sample: + args.temperature = 0 + generation_config = VllmGenerationConfig( + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + stop=[tokenizer.eos_token]) + logger.info(f'generation_config: {generation_config}') + llm_engine.generation_config = generation_config + template: Template = get_template(args.template_type, tokenizer, + args.system, args.max_length, + args.truncation_strategy) + 
args.system = template.default_system + logger.info(f'system: {args.system}') + return llm_engine, template diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py index 7b36b1eb34..9e7234f4a9 100644 --- a/tests/llm/test_run.py +++ b/tests/llm/test_run.py @@ -28,33 +28,36 @@ def test_basic(self): if not __name__ == '__main__': output_dir = self.tmp_dir model_type = ModelType.chatglm3_6b - sft_args = SftArguments( - model_type=model_type, - template_type='AUTO', - quantization_bit=4, - batch_size=2, - eval_steps=5, - check_dataset_strategy='warning', - train_dataset_sample=200, - predict_with_generate=False, - dataset=[DatasetName.jd_sentiment_zh], - output_dir=output_dir, - gradient_checkpointing=True) - self.assertTrue(sft_args.gradient_accumulation_steps == 8) - output = sft_main(sft_args) - print(output) - best_model_checkpoint = output['best_model_checkpoint'] - print(f'best_model_checkpoint: {best_model_checkpoint}') - torch.cuda.empty_cache() - if __name__ == '__main__': - infer_args = InferArguments( - ckpt_dir=best_model_checkpoint, - stream=False, - show_dataset_sample=5) - merge_lora_main(infer_args) - result = infer_main(infer_args) - print(result) + for quantization_bit in [0, 4]: + predict_with_generate = True + if quantization_bit == 0: + predict_with_generate = False + sft_args = SftArguments( + model_type=model_type, + template_type='AUTO', + quantization_bit=quantization_bit, + batch_size=2, + eval_steps=5, + check_dataset_strategy='warning', + train_dataset_sample=200, + predict_with_generate=predict_with_generate, + dataset=[DatasetName.jd_sentiment_zh], + output_dir=output_dir, + gradient_checkpointing=True) + self.assertTrue(sft_args.gradient_accumulation_steps == 8) + output = sft_main(sft_args) + print(output) + best_model_checkpoint = output['best_model_checkpoint'] + print(f'best_model_checkpoint: {best_model_checkpoint}') torch.cuda.empty_cache() + if __name__ == '__main__': + infer_args = InferArguments( + ckpt_dir=best_model_checkpoint, + stream=False, + show_dataset_sample=5) + result = infer_main(infer_args) + print(result) + torch.cuda.empty_cache() # if __name__ == '__main__': # app_ui_main(infer_args) @@ -65,11 +68,14 @@ def test_loss_matching(self): return losses = [] for tuner_backend in ['swift', 'peft']: + if tuner_backend == 'swift': + bool_var = True + else: + bool_var = False output = sft_main([ - '--model_type', ModelType.qwen_7b_chat_int4, '--eval_steps', - '5', '--tuner_backend', tuner_backend, - '--train_dataset_sample', '200', '--predict_with_generate', - 'true', '--dataset', DatasetName.leetcode_python_en, + '--model_type', ModelType.qwen_7b_chat, '--eval_steps', '5', + '--tuner_backend', tuner_backend, '--train_dataset_sample', + '200', '--dataset', DatasetName.leetcode_python_en, '--output_dir', output_dir, '--gradient_checkpointing', 'true', '--max_new_tokens', '100', '--use_flash_attn', 'true', '--lora_target_modules', 'ALL' @@ -79,7 +85,11 @@ def test_loss_matching(self): torch.cuda.empty_cache() infer_main([ '--ckpt_dir', best_model_checkpoint, '--show_dataset_sample', - '2', '--max_new_tokens', '100', '--use_flash_attn', 'true' + '2', '--max_new_tokens', '100', '--use_flash_attn', + str(bool_var), '--use_vllm', + str(bool_var), '--verbose', + str(bool_var), '--merge_lora_and_save', + str(bool_var) ]) loss = output['log_history'][-1]['train_loss'] losses.append(loss) @@ -182,7 +192,7 @@ def test_self_cognition(self): if dataset is None: continue infer_args = InferArguments( - ckpt_dir=ckpt_dir, show_dataset_sample=2) + 
ckpt_dir=ckpt_dir, show_dataset_sample=2, verbose=False) # merge_lora_main(infer_args) result = infer_main(infer_args) print(result) From 1bb31b548e35a408160dfec250d7a9b3a7cac186 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 12:38:18 +0800 Subject: [PATCH 09/19] fix eval_human bug --- swift/llm/infer.py | 4 +++- tests/llm/test_run.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 54343aa43a..133d0ec8a9 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -205,12 +205,13 @@ def llm_infer(args: InferArguments) -> None: }]) for response_list in gen: response = response_list[0]['response'] + new_history = response_list[0]['history'] if len(response) > print_idx: print(response[print_idx:], end='', flush=True) print_idx = len(response) else: gen = inference_stream(model, template, query, history) - for response, _ in gen: + for response, new_history in gen: if len(response) > print_idx: print(response[print_idx:], end='', flush=True) print_idx = len(response) @@ -221,6 +222,7 @@ def llm_infer(args: InferArguments) -> None: 'response': response, 'history': history, } + history = new_history if jsonl_path is not None: append_to_jsonl(jsonl_path, obj) result.append(obj) diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py index 9e7234f4a9..e670c237bb 100644 --- a/tests/llm/test_run.py +++ b/tests/llm/test_run.py @@ -78,7 +78,7 @@ def test_loss_matching(self): '200', '--dataset', DatasetName.leetcode_python_en, '--output_dir', output_dir, '--gradient_checkpointing', 'true', '--max_new_tokens', '100', '--use_flash_attn', 'true', - '--lora_target_modules', 'ALL' + '--lora_target_modules', 'ALL', '--seed', '0' ]) best_model_checkpoint = output['best_model_checkpoint'] print(f'best_model_checkpoint: {best_model_checkpoint}') From c374e19ea8bbf11be5239a44622f9c5da6203e6d Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 15:00:01 +0800 Subject: [PATCH 10/19] update docs --- README.md | 2 ++ README_CN.md | 2 ++ ...276\256\350\260\203\346\226\207\346\241\243.md" | 14 +++++++------- ...216\250\347\220\206\346\226\207\346\241\243.md" | 10 +++++++++- ...273\244\350\241\214\345\217\202\346\225\260.md" | 10 +++++----- ...234\200\344\275\263\345\256\236\350\267\265.md" | 4 +++- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 8e0c0b0695..af7a5b46f8 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 ## 🎉 News +- 2023.12.18: Support for **VLLM** for inference acceleration and deployment. For more details, refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md). - 2023.12.15: Support **deepseek**, **deepseek-coder** series: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat. - 2023.12.13: Support mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat). - 2023.12.9: Support the `freeze_parameters` parameter as a compromise between LoRA and full parameter. 
Corresponding shell scripts can be found at [full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). Support `disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc` parameters, for details please refer to [Command-Line parameters](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). @@ -102,6 +103,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用 - **Self-cognitionfine-tuning** for large models in **10 minutes**, creating a personalized large model, please refer to [Best Practices for Self-cognition Fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自我认知微调最佳实践.md). - Quickly perform **inference** on LLM and build a **Web-UI**, see the [LLM Inference Documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM推理文档.md). - Rapidly **fine-tune** and perform inference on LLM, and build a Web-UI. See the [LLM Fine-tuning Documentation](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM微调文档.md). +- Utilize VLLM for **inference acceleration** and **deployment**. Please refer to [VLLM Inference Acceleration and Deployment](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md) for more information. - View the models and datasets supported by Swift. You can check [supported models and datasets](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md). - Expand and customize models, datasets, and dialogue templates in Swift, see [Customization and Expansion](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md). - Check command-line parameters for fine-tuning and inference, see [Command-Line parameters](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). diff --git a/README_CN.md b/README_CN.md index c550a66718..4f3318b2f7 100644 --- a/README_CN.md +++ b/README_CN.md @@ -58,6 +58,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 用户可以查看 [SWIFT官方文档](docs/source/GetStarted/快速使用.md) 来了解详细信息。 ## 🎉 新闻 +- 2023.12.18: 支持**VLLM**进行推理加速和部署. 具体可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md). - 2023.12.15: 支持**deepseek**, **deepseek-coder**系列: deepseek-7b, deepseek-7b-chat, deepseek-67b, deepseek-67b-chat, openbuddy-deepseek-67b-chat, deepseek-coder-1_3b, deepseek-coder-1_3b-chat, deepseek-coder-6_7b, deepseek-coder-6_7b-chat, deepseek-coder-33b, deepseek-coder-33b-chat. - 2023.12.13: 支持mistral-7b-chat-v2, [mixtral-7b-moe](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe), [mixtral-7b-moe-chat](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/mixtral_7b_moe_chat). - 2023.12.9: 支持`freeze_parameters`参数, 作为lora和全参数训练的折中方案. 对应的sh可以查看[full_freeze_ddp](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp). 支持`disable_tqdm`, `lazy_tokenize`, `preprocess_num_proc`参数, 具体可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). @@ -100,6 +101,7 @@ SWIFT(Scalable lightWeight Infrastructure for Fine-Tuning)是一个可扩展 - **10分钟**对大模型进行**自我认知微调**, 创建专属于自己的大模型, 可以查看[自我认知微调最佳实践](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自我认知微调最佳实践.md). - 快速对LLM进行**推理**, 搭建**Web-UI**, 可以查看[LLM推理文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM推理文档.md). - 快速对LLM进行**微调**, 推理并搭建Web-UI. 
可以查看[LLM微调文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM微调文档.md). +- 使用VLLM进行**推理加速**和**部署**. 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md). - 查看swift支持的模型和数据集. 可以查看[支持的模型和数据集](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md). - 对swift中的模型, 数据集, 对话模板进行**拓展**, 可以查看[自定义与拓展](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md). - 查询微调和推理的命令行参数, 可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index ee18191261..d2f0c0bf15 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -136,13 +136,13 @@ cd examples/pytorch/llm - 我们默认在训练时设置`--gradient_checkpointing true`来**节约显存**, 这会略微降低训练速度. - 如果你想要使用量化参数`--quantization_bit 4`, 你需要先安装[bnb](https://github.com/TimDettmers/bitsandbytes): `pip install bitsandbytes -U`. 这会减少显存消耗, 但通常会降低训练速度. - 如果你想要使用基于**auto_gptq**的量化, 你需要先安装对应cuda版本的[auto_gptq](https://github.com/PanQiWei/AutoGPTQ): `pip install auto_gptq -U`. - > 使用auto_gptq的模型可以查看[LLM支持的模型](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md#模型). 建议使用auto_gptq, 而不是bnb. + > 使用auto_gptq的模型可以查看[LLM支持的模型](./支持的模型和数据集.md#模型). 建议使用auto_gptq, 而不是bnb. - 如果你想要使用deepspeed, 你需要`pip install deepspeed -U`. 使用deepspeed可以**节约显存**, 但可能会略微降低训练速度. -- 如果你的训练涉及到**知识编辑**的内容, 例如: [自我认知微调](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自我认知微调最佳实践.md), 你需要在MLP上也加上LoRA, 否则可能会效果不佳. 你可以简单传入参数`--lora_target_modules ALL`来对所有的linear(qkvo, mlp)加上lora, **这通常是效果最好的**. +- 如果你的训练涉及到**知识编辑**的内容, 例如: [自我认知微调](./自我认知微调最佳实践.md), 你需要在MLP上也加上LoRA, 否则可能会效果不佳. 你可以简单传入参数`--lora_target_modules ALL`来对所有的linear(qkvo, mlp)加上lora, **这通常是效果最好的**. - 如果你使用的是**V100**等较老的GPU, 你需要设置`--dtype AUTO`或者`--dtype fp16`, 因为其不支持bf16. -- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). 支持flash-attn的模型可以查看[LLM支持的模型](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md#模型) -- 如果你要进行**二次预训练**, **多轮对话**, 你可以参考[自定义与拓展](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md#注册数据集的方式) -- 如果你需要断网进行训练, 请使用`--model_cache_dir`和设置`--check_model_is_latest false`. 具体参数含义请查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). +- 如果你的机器是A100等高性能显卡, 且使用的是qwen系列模型, 推荐你安装[**flash-attn**](https://github.com/Dao-AILab/flash-attention), 这将会加快训练和推理的速度以及显存占用(A10, 3090, V100等显卡不支持flash-attn进行训练). 支持flash-attn的模型可以查看[LLM支持的模型](./支持的模型和数据集.md#模型) +- 如果你要进行**二次预训练**, **多轮对话**, 你可以参考[自定义与拓展](./自定义与拓展.md#注册数据集的方式) +- 如果你需要断网进行训练, 请使用`--model_cache_dir`和设置`--check_model_is_latest false`. 具体参数含义请查看[命令行参数](./命令行参数.md). - 如果你想在训练时, 将权重push到ModelScope Hub中, 你需要设置`--push_to_hub true`. - 如何你想要在推理时, 合并LoRA权重并保存,你需要设置`--merge_lora_and_save true`. **不推荐对qlora训练的模型进行merge**, 这会存在精度损失. - 以下提供了可以直接运行的`qwen_7b_chat`的sh脚本(你只需要在推理时指定`--ckpt_dir`即可顺利执行). 更多模型的scripts脚本, 可以查看[scripts文件夹](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts). 如果你想要**自定义sh脚本**, 推荐你参考`scripts/qwen_7b_chat`中的脚本进行书写. 
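For reference, the `swift merge-lora` flow covered in the next hunk can also be invoked programmatically, which pairs with the `--merge_lora_and_save` note above. This is a rough sketch under assumptions: `merge_lora_main` accepts an `InferArguments` (as the test suite calls it) and its import path is taken to sit next to the other entry points; the checkpoint path is a placeholder.

```python
# Rough sketch: merge LoRA weights into the base model, producing a
# '<ckpt>-merged' directory that full-parameter or vllm inference can load.
# Assumption: merge_lora_main is exported from swift.llm.run (path not shown here).
from swift.llm import InferArguments
from swift.llm.run import merge_lora_main

infer_args = InferArguments(ckpt_dir='/path/to/your/vx_xxx/checkpoint-xxx')
merge_lora_main(infer_args)  # writes /path/to/your/vx_xxx/checkpoint-xxx-merged
```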
@@ -222,7 +222,7 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' ``` ## 推理 -如果你要使用vllm进行推理加速, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#推理加速) +如果你要使用VLLM进行推理加速, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#推理加速) ### 原始模型 **单样本推理**可以查看[LLM推理文档](./LLM推理文档.md#-推理) @@ -294,7 +294,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged' ``` ## Web-UI -如果你要使用vllm进行部署并提供api接口, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#部署) +如果你要使用VLLM进行部署并提供**API**接口, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#部署) ### 原始模型 使用原始模型的web-ui可以查看[LLM推理文档](./LLM推理文档.md#-Web-UI) diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" index f28b3e8b3a..8e57d82ab8 100644 --- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" @@ -1,5 +1,5 @@ # LLM推理文档 -如果你要使用vllm进行推理加速, 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#推理加速) +如果你要使用vllm进行推理加速, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#推理加速) ## 目录 - [环境准备](#环境准备) @@ -395,6 +395,11 @@ response: 浙江有很多美食,以下是一些著名的: history: [('浙江的省会在哪里?', '浙江的省会是杭州。'), ('这有什么好吃的?', '浙江有很多美食,以下是一些著名的:\n\n1. 杭州小笼包:这是杭州著名的传统小吃,外皮薄而有韧性,内馅鲜美多汁。\n\n2. 浙江粽子:浙江粽子有多种口味,如咸蛋黄肉粽、豆沙粽等,其中以杭州粽子最为著名。\n\n3. 油爆虾:这是浙江海鲜中的代表之一,用热油爆炒虾仁,口感鲜嫩。\n\n4. 椒盐土豆丝:这是浙江传统的素菜之一,用土豆丝和椒盐一起炒制,口感清爽。\n')] """ ``` + +### 微调后模型 +如果你要使用微调后模型进行推理, 可以查看[LLM微调文档](./LLM微调文档.md#微调后模型) + + ## Web-UI ### qwen-7b-chat 使用CLI: @@ -440,3 +445,6 @@ from swift.llm import InferArguments, ModelType, app_ui_main infer_args = InferArguments(model_type=ModelType.qwen_7b) app_ui_main(infer_args) ``` + +### 微调后模型 +使用微调后模型的web-ui可以查看LLM微调文档](./LLM微调文档.md#微调后模型-1) diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index c1a9d116ff..63c8c26331 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -27,9 +27,9 @@ - `--max_length`: token的最大长度, 默认为`2048`. 可以避免个别过长的数据样本造成OOM的问题. 如果某数据样本长度超过max_length, 我们会切除最前面的token: `input_ids[-max_length:]`. 如果设置为-1, 则无限制. - `--truncation_strategy`: 默认是`'delete'`表示把超过max_length的句子从数据集中删除. `'truncation_left'`表示会将超过文本的左边给切除掉, 这可能会切到special token, 会影响性能, 并不推荐. - `--check_dataset_strategy`: 默认值为`'none'`, 即不做检查. 如果你训练的模型是LLM, 则推荐使用`'warning'`作为数据检查的策略. 如果你的训练目标为句子分类等任务, 则建议设置为'`none`'. -- `--custom_train_dataset_path`: 默认值为`None`. 具体的含义参考[Customization and Expansion](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md). -- `--custom_val_dataset_path`: 默认值为`None`. 具体的含义参考[Customization and Expansion](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md). -- `--self_cognition_sample`: 自我认知数据集的采样数. 默认为`0`. 你该值设置为>0时, 需要同时指定`--model_name`, `--model_author`. 如果你想了解更多, 可以查看[自我认知微调最佳实践](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自我认知微调最佳实践.md). +- `--custom_train_dataset_path`: 默认值为`None`. 具体的含义参考[自定义与拓展](./自定义与拓展.md). +- `--custom_val_dataset_path`: 默认值为`None`. 具体的含义参考[自定义与拓展](./自定义与拓展.md). +- `--self_cognition_sample`: 自我认知数据集的采样数. 默认为`0`. 你该值设置为>0时, 需要同时指定`--model_name`, `--model_author`. 如果你想了解更多, 可以查看[自我认知微调最佳实践](./自我认知微调最佳实践.md). 
- `--model_name`: 默认为`None`. 如果开启了自我认知数据集的采样(即self_cognition_sample>0), 你需要传入两个值, 分别代表模型的中文名和英文名. 例如: `--model_name 小黄 'Xiao Huang'`. - `--model_author`: 默认为`None`. 如果开启了自我认知数据集的采样, 你需要传入两个值, 分别代表作者的中文名和英文名. 例如: `--model_author 魔搭 ModelScope`. - `--quantization_bit`: 用于指定是否进行量化和量化的bit数, 默认为`0`, 即不进行量化. 如果要使用4bit qlora, 你需要设置`--sft_type lora --quantization_bit 4` @@ -70,7 +70,7 @@ - `--disable_tqdm`: 是否不启用tqdm, 这在`nohup`启动脚本时很有用. 默认为`False`, 即为启动tqdm. - `--lazy_tokenize`: 用于延迟对文本进行编码, 减少预处理的等待并减少内存占用, 这在处理大数据集时很有用. 默认为`False`, 即在`trainer.train()`之前提前对所有文本进行预处理. - `--preprocess_num_proc`: 在对数据集预处理时(对文本进行tokenize), 使用多进程. 默认为`1`. 与`lazy_tokenize`命令行参数一样, 用于解决预处理速度慢的问题. 但该策略无法减少内存占用, 所以如果当数据集巨大时, 建议使用`lazy_tokenize`. 推荐设置的值: 4, 8. 请注意: 当使用qwen-audio时, 该参数会强制设置为1, 因为qwen-audio的预处理函数中使用了torch的多进程, 会造成不兼容问题. -- `--use_flash_attn`: 是否使用flash attn, 默认为`None`. 安装flash_attn的步骤可以查看[https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). 支持flash_attn的模型可以查看[LLM支持的模型](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md#模型). +- `--use_flash_attn`: 是否使用flash attn, 默认为`None`. 安装flash_attn的步骤可以查看[https://github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention). 支持flash_attn的模型可以查看[LLM支持的模型](./支持的模型和数据集.md#模型). - `--ignore_args_error`: 是否忽略命令行传参错误抛出的Error, 默认为`False`. 如果需要拷贝代码到notebook中运行, 需要设置成True. - `--logging_dir`: 默认为`None`. 即设置为`f'{self.output_dir}/runs'`, 表示tensorboard文件存储路径. - `--check_model_is_latest`: 检查模型是否是最新, 默认为`True`. 如果你需要断网进行训练, 请将该参数设置为`False`. @@ -91,7 +91,7 @@ - `--model_cache_dir`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--sft_type`: 默认值为`'lora'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--template_type`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. -- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`False`. 推荐设置为True, 但这需要安装vllm, 环境准备可以参考[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md#环境准备). +- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`False`. 推荐设置为True, 但这需要安装vllm, 环境准备可以参考[VLLM推理加速与部署](./VLLM推理加速与部署.md#环境准备). - `--ckpt_dir`: 必填项, 值为SFT阶段保存的checkpoint路径, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`. - `--load_args_from_ckpt_dir`: 是否从`ckpt_dir`的`sft_args.json`文件中读取配置信息. 默认是`True`. - `--load_dataset_config`: 该参数只有在`--load_args_from_ckpt_dir true`时才生效. 即是否从`ckpt_dir`的`sft_args.json`文件中读取数据集相关的配置信息. 默认为`True`. diff --git "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" index af41df3e6f..97636e549b 100644 --- "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -283,6 +283,8 @@ CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'qwen-7b-chat/vx-xxx/checkpoint-x ## 了解更多 - 快速对LLM进行**推理**, 搭建**Web-UI**, 可以查看[LLM推理文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM推理文档.md). - 快速对LLM进行**微调**, 推理并搭建Web-UI. 可以查看[LLM微调文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/LLM微调文档.md). +- 使用VLLM进行**推理加速**和**部署**. 可以查看[VLLM推理加速与部署](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM推理加速与部署.md). - 查看swift支持的模型和数据集. 
可以查看[支持的模型和数据集](https://github.com/modelscope/swift/blob/main/docs/source/LLM/支持的模型和数据集.md). - 对swift中的模型, 数据集, 对话模板进行**拓展**, 可以查看[自定义与拓展](https://github.com/modelscope/swift/blob/main/docs/source/LLM/自定义与拓展.md). -- 命令行参数, 可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). +- 查询微调和推理的命令行参数, 可以查看[命令行参数](https://github.com/modelscope/swift/blob/main/docs/source/LLM/命令行参数.md). +- 查看不同参数下的训练时间和训练显存对比, 可以查看[Benchmark](https://github.com/modelscope/swift/blob/main/docs/source/LLM/Benchmark.md). From 2591f495cefeae829377780bac2457fb6a2ab3fa Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 15:25:24 +0800 Subject: [PATCH 11/19] update resp_list --- ...37\344\270\216\351\203\250\347\275\262.md" | 72 ++++++++++++------- swift/llm/infer.py | 16 ++--- swift/llm/utils/vllm_utils.py | 14 ++-- 3 files changed, 62 insertions(+), 40 deletions(-) diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" index cdf0c06cb0..4d234dc4ec 100644 --- "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -50,18 +50,18 @@ template = get_template(template_type, llm_engine.tokenizer) llm_engine.generation_config.max_new_tokens = 256 request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] -response_list = inference_vllm(llm_engine, template, request_list) -for request, response in zip(request_list, response_list): +resp_list = inference_vllm(llm_engine, template, request_list) +for request, resp in zip(request_list, resp_list): print(f"query: {request['query']}") - print(f"response: {response['response']}") + print(f"response: {resp['response']}") -history1 = response_list[1]['history'] +history1 = resp_list[1]['history'] request_list = [{'query': '这有什么好吃的', 'history': history1}] -response_list = inference_vllm(llm_engine, template, request_list) -for request, response in zip(request_list, response_list): +resp_list = inference_vllm(llm_engine, template, request_list) +for request, resp in zip(request_list, resp_list): print(f"query: {request['query']}") - print(f"response: {response['response']}") - print(f"history: {response['history']}") + print(f"response: {resp['response']}") + print(f"history: {resp['history']}") """Out[0] query: 你好! 
@@ -96,20 +96,20 @@ request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] gen = inference_stream_vllm(llm_engine, template, request_list) query_list = [request['query'] for request in request_list] print(f"query_list: {query_list}") -for response_list in gen: - resp_list = [response['response'] for response in response_list] - print(f'response_list: {resp_list}') +for resp_list in gen: + response_list = [resp['response'] for resp in resp_list] + print(f'response_list: {response_list}') -history1 = response_list[1]['history'] +history1 = resp_list[1]['history'] request_list = [{'query': '这有什么好吃的', 'history': history1}] gen = inference_stream_vllm(llm_engine, template, request_list) query = request_list[0]['query'] print(f"query: {query}") -for response_list in gen: - resp = response_list[0]['response'] - print(f'response: {resp}') +for resp_list in gen: + response = resp_list[0]['response'] + print(f'response: {response}') -history = response_list[0]['history'] +history = resp_list[0]['history'] print(f'history: {history}') """Out[0] @@ -142,18 +142,18 @@ template = get_template(template_type, llm_engine.tokenizer) llm_engine.generation_config.max_new_tokens = 256 request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] -response_list = inference_vllm(llm_engine, template, request_list) -for request, response in zip(request_list, response_list): +resp_list = inference_vllm(llm_engine, template, request_list) +for request, resp in zip(request_list, resp_list): print(f"query: {request['query']}") - print(f"response: {response['response']}") + print(f"response: {resp['response']}") -history1 = response_list[1]['history'] +history1 = resp_list[1]['history'] request_list = [{'query': '这有什么好吃的', 'history': history1}] -response_list = inference_vllm(llm_engine, template, request_list) -for request, response in zip(request_list, response_list): +resp_list = inference_vllm(llm_engine, template, request_list) +for request, resp in zip(request_list, resp_list): print(f"query: {request['query']}") - print(f"response: {response['response']}") - print(f"history: {response['history']}") + print(f"response: {resp['response']}") + print(f"history: {resp['history']}") """Out[0] query: 你好! @@ -167,9 +167,31 @@ history: [('浙江的省会在哪?', '浙江的省会是杭州。'), ('这有 ``` ### 微调后的模型 +使用LoRA进行微调的模型你需要先merge-lora, 产生完整的checkpoint目录. 
具体可以查看[]() -使用LoRA进行微调: +```python +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '0' +import torch +from swift.llm import ( + ModelType, get_vllm_engine, get_default_template_type, + get_template, inference_vllm +) +from swift.tuners import Swift + +model_dir = 'vx_xxx/checkpoint-100-merged' +model_type = ModelType.qwen_7b_chat +template_type = get_default_template_type(model_type) + +llm_engine = get_vllm_engine(model_type, torch.bfloat16, model_dir=model_dir) +tokenizer = llm_engine.tokenizer +template = get_template(template_type, tokenizer) +query = '你好' +resp = inference_vllm(llm_engine, template, [{'query': query}])[0] +print(f"response: {resp['response']}") +print(f"history: {resp['history']}") +``` 使用全参数微调: diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 133d0ec8a9..6a7bc287d9 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -203,9 +203,9 @@ def llm_infer(args: InferArguments) -> None: 'query': query, 'history': history }]) - for response_list in gen: - response = response_list[0]['response'] - new_history = response_list[0]['history'] + for resp_list in gen: + response = resp_list[0]['response'] + new_history = resp_list[0]['history'] if len(response) > print_idx: print(response[print_idx:], end='', flush=True) print_idx = len(response) @@ -242,7 +242,7 @@ def llm_infer(args: InferArguments) -> None: label_list = val_dataset['response'] val_dataset = val_dataset.remove_columns('response') request_list = val_dataset.to_list() - response_list = inference_vllm( + resp_list = inference_vllm( llm_engine, template, request_list, @@ -252,8 +252,8 @@ def llm_infer(args: InferArguments) -> None: if label_list is not None: for request, label in zip(request_list, label_list): request['label'] = label - for request, response in zip(request_list, response_list): - obj = {'response': response['response'], **request} + for request, resp in zip(request_list, resp_list): + obj = {'response': resp['response'], **request} if jsonl_path is not None: append_to_jsonl(jsonl_path, obj) result.append(obj) @@ -278,8 +278,8 @@ def llm_infer(args: InferArguments) -> None: assert args.stream is True gen = inference_stream_vllm(llm_engine, template, [kwargs]) print_idx = 0 - for response_list in gen: - response = response_list[0]['response'] + for resp_list in gen: + response = resp_list[0]['response'] if len(response) > print_idx: print(response[print_idx:], end='', flush=True) print_idx = len(response) diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index 380154111d..d36e610412 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -157,7 +157,7 @@ def inference_stream_vllm( llm_engine.add_request(str(i), None, generation_config, input_ids) batch_size = len(request_list) - response_list = [None] * batch_size + resp_list = [None] * batch_size print_idx_list = [0] * batch_size prog_bar = tqdm(total=batch_size, dynamic_ncols=True, disable=not use_tqdm) while llm_engine.has_unfinished_requests(): @@ -177,13 +177,13 @@ def inference_stream_vllm( safe_response = response[:print_idx_list[i]] query = request['query'] history = request['history'] - if response_list[i] is None: + if resp_list[i] is None: history.append(None) history[-1] = (query, safe_response) - response_list[i] = {'response': safe_response, 'history': history} + resp_list[i] = {'response': safe_response, 'history': history} if output.finished: prog_bar.update() - yield response_list + yield resp_list def inference_vllm(llm_engine: LLMEngine, @@ -235,7 +235,7 @@ def 
inference_vllm(llm_engine: LLMEngine, outputs.append(output) prog_bar.update() - response_list = [None] * batch_size + resp_list = [None] * batch_size for output in outputs: i = int(output.request_id) request = request_list[i] @@ -243,13 +243,13 @@ def inference_vllm(llm_engine: LLMEngine, query = request['query'] history = request['history'] history.append((query, response)) - response_list[i] = {'response': response, 'history': history} + resp_list[i] = {'response': response, 'history': history} if verbose: print( f'{prompt_prefix}{tokenizer.decode(output.prompt_token_ids, False)}{output_prefix}', end='') print(tokenizer.decode(output.outputs[0].token_ids, False)) - return response_list + return resp_list def prepare_vllm_engine_template( From 47dc1c259df3b2499c1487d27f12121d68d3724e Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 15:32:54 +0800 Subject: [PATCH 12/19] update docs --- ...LM\345\276\256\350\260\203\346\226\207\346\241\243.md" | 2 +- ...40\351\200\237\344\270\216\351\203\250\347\275\262.md" | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index d2f0c0bf15..a788bead02 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -216,7 +216,7 @@ bash scripts/qwen_7b_chat/qlora_ddp_ds/infer.sh ``` ## Merge LoRA -提示: **暂时**不支持bnb和auto_gptq量化模型的merge lora. +提示: **暂时**不支持bnb和auto_gptq量化模型的merge lora, 这会产生较大的精度损失. ```bash swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' ``` diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" index 4d234dc4ec..c33c5bf372 100644 --- "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -167,7 +167,9 @@ history: [('浙江的省会在哪?', '浙江的省会是杭州。'), ('这有 ``` ### 微调后的模型 -使用LoRA进行微调的模型你需要先merge-lora, 产生完整的checkpoint目录. 具体可以查看[]() +使用LoRA进行微调的模型你需要先[merge-lora](./LLM微调文档.md#merge-lora), 产生完整的checkpoint目录. + +使用全参数微调的模型可以无缝使用VLLM进行推理加速. 
```python import os @@ -194,7 +196,5 @@ print(f"history: {resp['history']}") ``` -使用全参数微调: - - ## 部署 +TODO From 6382fd94cbc267aa35940cad54434393370eaf64 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 16:33:39 +0800 Subject: [PATCH 13/19] update readme --- ...5\276\256\350\260\203\346\226\207\346\241\243.md" | 6 ++---- ...6\216\250\347\220\206\346\226\207\346\241\243.md" | 11 ++--------- ...1\200\237\344\270\216\351\203\250\347\275\262.md" | 12 ++++-------- swift/llm/utils/model.py | 2 ++ 4 files changed, 10 insertions(+), 21 deletions(-) diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index a788bead02..45d766f6aa 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -243,13 +243,12 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type ) from swift.tuners import Swift -import torch model_dir = 'vx_xxx/checkpoint-100' model_type = ModelType.qwen_7b_chat template_type = get_default_template_type(model_type) -model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, {'device_map': 'auto'}) +model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'}) model = Swift.from_pretrained(model, model_dir, inference_mode=True) template = get_template(template_type, tokenizer) @@ -267,13 +266,12 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type ) -import torch model_dir = 'vx_xxx/checkpoint-100-merged' model_type = ModelType.qwen_7b_chat template_type = get_default_template_type(model_type) -model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, {'device_map': 'auto'}, +model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'}, model_dir=model_dir) template = get_template(template_type, tokenizer) diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" index 8e57d82ab8..6f5133a30f 100644 --- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" @@ -36,7 +36,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.qwen_7b_chat template_type = get_default_template_type(model_type) @@ -46,7 +45,7 @@ print(f'template_type: {template_type}') # template_type: chatml kwargs = {} # kwargs['use_flash_attn'] = True # 使用flash_attn -model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, {'device_map': 'auto'}, **kwargs) +model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'}, **kwargs) # 修改max_new_tokens model.generation_config.max_new_tokens = 128 @@ -99,7 +98,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.qwen_7b_chat_int4 template_type = get_default_template_type(model_type) @@ -137,13 +135,12 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils 
import seed_everything -import torch model_type = ModelType.qwen_7b template_type = get_default_template_type(model_type) print(f'template_type: {template_type}') # template_type: default-generation -model, tokenizer = get_model_tokenizer(model_type, torch.bfloat16, {'device_map': 'auto'}) +model, tokenizer = get_model_tokenizer(model_type, model_kwargs={'device_map': 'auto'}) model.generation_config.max_new_tokens = 64 template = get_template(template_type, tokenizer) seed_everything(42) @@ -179,7 +176,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference_stream, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.qwen_7b_chat template_type = get_default_template_type(model_type) @@ -221,7 +217,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.qwen_vl_chat template_type = get_default_template_type(model_type) @@ -264,7 +259,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.qwen_audio_chat template_type = get_default_template_type(model_type) @@ -306,7 +300,6 @@ from swift.llm import ( get_model_tokenizer, get_template, inference, ModelType, get_default_template_type, ) from swift.utils import seed_everything -import torch model_type = ModelType.chatglm3_6b template_type = get_default_template_type(model_type) diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" index c33c5bf372..eb04da7710 100644 --- "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -36,14 +36,13 @@ pip install -r requirements/llm.txt -U import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -import torch from swift.llm import ( ModelType, get_vllm_engine, get_default_template_type, get_template, inference_vllm ) model_type = ModelType.qwen_7b_chat -llm_engine = get_vllm_engine(model_type, torch.float16) +llm_engine = get_vllm_engine(model_type) template_type = get_default_template_type(model_type) template = get_template(template_type, llm_engine.tokenizer) # 与`transformers.GenerationConfig`类似的接口 @@ -79,14 +78,13 @@ history: [('浙江的省会在哪?', '浙江省会是杭州市。'), ('这有 import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -import torch from swift.llm import ( ModelType, get_vllm_engine, get_default_template_type, get_template, inference_stream_vllm ) model_type = ModelType.qwen_7b_chat -llm_engine = get_vllm_engine(model_type, torch.float16) +llm_engine = get_vllm_engine(model_type) template_type = get_default_template_type(model_type) template = get_template(template_type, llm_engine.tokenizer) # 与`transformers.GenerationConfig`类似的接口 @@ -128,14 +126,13 @@ history: [('浙江的省会在哪?', '浙江省会是杭州市。'), ('这有 import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -import torch from swift.llm import ( ModelType, get_vllm_engine, get_default_template_type, get_template, inference_vllm ) model_type = ModelType.chatglm3_6b -llm_engine = get_vllm_engine(model_type, torch.float16) +llm_engine = get_vllm_engine(model_type) template_type = 
get_default_template_type(model_type) template = get_template(template_type, llm_engine.tokenizer) # 与`transformers.GenerationConfig`类似的接口 @@ -175,7 +172,6 @@ history: [('浙江的省会在哪?', '浙江的省会是杭州。'), ('这有 import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' -import torch from swift.llm import ( ModelType, get_vllm_engine, get_default_template_type, get_template, inference_vllm @@ -186,7 +182,7 @@ model_dir = 'vx_xxx/checkpoint-100-merged' model_type = ModelType.qwen_7b_chat template_type = get_default_template_type(model_type) -llm_engine = get_vllm_engine(model_type, torch.bfloat16, model_dir=model_dir) +llm_engine = get_vllm_engine(model_type, model_dir=model_dir) tokenizer = llm_engine.tokenizer template = get_template(template_type, tokenizer) query = '你好' diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 7707ec706b..8e7c465e80 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -1258,6 +1258,7 @@ def get_model_tokenizer( model_torch_dtype = model_info['torch_dtype'] if torch_dtype is None: torch_dtype = model_torch_dtype + logger.info(f'Setting torch_dtype: {torch_dtype}') else: assert torch_dtype == model_torch_dtype, f'please use `{model_torch_dtype}`' else: @@ -1267,6 +1268,7 @@ def get_model_tokenizer( torch_dtype = getattr(model_config, 'torch_dtype', None) if torch_dtype == torch.float32: torch_dtype = torch.float16 + logger.info(f'Setting torch_dtype: {torch_dtype}') kwargs['automodel_class'] = model_info['automodel_class'] kwargs['eos_token'] = model_info['eos_token'] model, tokenizer = get_function(model_dir, torch_dtype, model_kwargs, From af1df42b6b8d459d35031cd9b19d8adfcd022f96 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 16:53:59 +0800 Subject: [PATCH 14/19] update code --- examples/pytorch/llm/app.py | 2 ++ scripts/{utils => tests}/test_readme.py | 0 scripts/tests/test_vllm.py/main.py | 18 ++++++++++++++ scripts/tests/test_vllm.py/utils.py | 31 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+) rename scripts/{utils => tests}/test_readme.py (100%) create mode 100644 scripts/tests/test_vllm.py/main.py create mode 100644 scripts/tests/test_vllm.py/utils.py diff --git a/examples/pytorch/llm/app.py b/examples/pytorch/llm/app.py index 8d7e9c40a9..ab9e61316c 100644 --- a/examples/pytorch/llm/app.py +++ b/examples/pytorch/llm/app.py @@ -1,5 +1,7 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
# import os # os.environ['CUDA_VISIBLE_DEVICES'] = '0' +import custom from swift.llm import InferArguments, ModelType from swift.llm.run import app_ui_main diff --git a/scripts/utils/test_readme.py b/scripts/tests/test_readme.py similarity index 100% rename from scripts/utils/test_readme.py rename to scripts/tests/test_readme.py diff --git a/scripts/tests/test_vllm.py/main.py b/scripts/tests/test_vllm.py/main.py new file mode 100644 index 0000000000..7bf7379bca --- /dev/null +++ b/scripts/tests/test_vllm.py/main.py @@ -0,0 +1,18 @@ +import os +import subprocess + +from swift.llm import ModelType + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +if __name__ == '__main__': + model_name_list = ModelType.get_model_name_list() + success_model_list = [] + fpath = os.path.join(os.path.dirname(__file__), 'utils.py') + for model_name in model_name_list: + code = subprocess.run(['python', fpath, '--model_type', model_name]) + if code.returncode == 0: + success_model_list.append(model_name) + else: + print(f'model_name: {model_name} not support vllm.') + print(success_model_list) diff --git a/scripts/tests/test_vllm.py/utils.py b/scripts/tests/test_vllm.py/utils.py new file mode 100644 index 0000000000..4abe735288 --- /dev/null +++ b/scripts/tests/test_vllm.py/utils.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass + +from swift.llm import (get_default_template_type, get_template, + get_vllm_engine, inference_vllm) +from swift.utils import get_main + + +@dataclass +class VLLMTestArgs: + model_type: str + + +def test_vllm(args: VLLMTestArgs) -> None: + model_type = args.model_type + llm_engine = get_vllm_engine(model_type) + template_type = get_default_template_type(model_type) + template = get_template(template_type, llm_engine.tokenizer) + + llm_engine.generation_config.max_new_tokens = 256 + + request_list = [{'query': '你好!'}, {'query': '浙江的省会在哪?'}] + resp_list = inference_vllm(llm_engine, template, request_list) + for request, resp in zip(request_list, resp_list): + print(f"query: {request['query']}") + print(f"response: {resp['response']}") + + +test_vllm_main = get_main(VLLMTestArgs, test_vllm) + +if __name__ == '__main__': + test_vllm_main() From 3cf7ab44b82086a8165dbef95189de4c9b586e4f Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 18:11:01 +0800 Subject: [PATCH 15/19] update docs --- ...44\350\241\214\345\217\202\346\225\260.md" | 1 - ...14\346\225\260\346\215\256\351\233\206.md" | 195 ++++++------ examples/pytorch/llm/app.py | 3 +- examples/pytorch/llm/llm_infer.py | 2 +- examples/pytorch/llm/llm_sft.py | 2 +- examples/pytorch/llm/rome_infer.py | 2 +- scripts/utils/run_model_info.py | 10 +- swift/cli/app_ui.py | 2 +- swift/cli/infer.py | 2 +- swift/cli/merge_lora.py | 2 +- swift/cli/sft.py | 2 +- swift/cli/web_ui.py | 2 +- swift/llm/utils/argument.py | 1 - swift/llm/utils/model.py | 285 ++++++++++++------ swift/llm/utils/vllm_utils.py | 13 +- tests/llm/test_run.py | 4 +- tools/merge_lora_weights_to_model.py | 2 +- 17 files changed, 320 insertions(+), 210 deletions(-) diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 63c8c26331..f1106abe8e 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -128,4 +128,3 @@ - `--share`: 传递给gradio的`demo.queue().launch(...)`函数. 该参数只有在使用`app-ui`时才生效. 
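The `scripts/tests/test_vllm.py/utils.py` entry point added above can also be exercised for a single model rather than the full sweep that `main.py` performs. A minimal sketch of such a one-off check, assuming it is run from the repository root (the model name is only an example):

```python
# Minimal sketch: check one model_type the way main.py does, via a subprocess.
import subprocess
import sys

ret = subprocess.run([
    sys.executable, 'scripts/tests/test_vllm.py/utils.py',
    '--model_type', 'qwen-7b-chat'
])
print('vllm supported' if ret.returncode == 0 else 'vllm not supported')
```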
- `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在`--use_vllm true`时才生效. - `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在`--use_vllm true`时才生效. -- `--pipeline_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在`--use_vllm true`时才生效. diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 64a4c36000..b45b2369c6 100644 --- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -8,105 +8,106 @@ - Model List: 模型在swift中注册的model_type的列表. - Default Lora Target Modules: 对应模型的默认lora_target_modules. - Default Template: 对应模型的默认template. -- Support Flash Attn: 模型是否支持[flash attention](https://github.com/Dao-AILab/flash-attention). +- Support Flash Attn: 模型是否支持[flash attention](https://github.com/Dao-AILab/flash-attention)加速推理和微调. +- Support VLLM: 模型是否支持[vllm](https://github.com/vllm-project/vllm)加速推理和部署. - Requires: 对应模型所需的额外依赖要求. -| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Requires | -| --------- | -------- | --------------------------- | ---------------- | ------------------ | -------- | -|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|| -|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|chatml|✔|| -|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|| -|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|chatml|✔|| -|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|| -|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|chatml|✔|| -|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|| -|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|chatml|✔|| -|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|chatml|✔|auto_gptq>=0.5| 
-|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|| -|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|chatml|✔|| -|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|default-generation|✔|| -|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|chatml|✔|| -|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|| -|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|| -|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|| -|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|| -|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|| -|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|| -|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|| -|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|| -|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|| -|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|| -|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|| -|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|| -|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|| -|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|| -|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|| -|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, 
v_proj|deepseek|✔|| -|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|| -|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|| -|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|| -|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v13.1](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|transformers>=4.34| -|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|transformers>=4.34| -|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|| -|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|transformers>=4.34| -|mistral-7b-chat|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|transformers>=4.34| -|mistral-7b-chat-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|transformers>=4.34| -|mixtral-7b-moe|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|transformers>=4.36| -|mixtral-7b-moe-chat|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|transformers>=4.36| -|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|transformers<4.34| -|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|transformers<4.34| -|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|transformers<4.34| -|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|| -|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|| -|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|| -|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|| -|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|| -|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|| 
-|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|| -|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|✘|| -|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|| -|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|| -|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|| -|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|| -|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|| -|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|| -|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|| -|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|✘|| -|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|| -|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|| -|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|| -|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|| -|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|| -|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|| -|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|| -|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|transformers>=4.34| -|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, v_proj|sus|✔|| -|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|| -|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|| -|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|| 
-|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|chatml|✔|| -|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|chatml|✔|auto_gptq>=0.5| -|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|| -|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|deepseek-coder-1_3b-chat|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|| -|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|deepseek-coder-6_7b-chat|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|| -|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|| -|deepseek-coder-33b-chat|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|| +| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | +| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | +|qwen-1_8b|[qwen/Qwen-1_8B](https://modelscope.cn/models/qwen/Qwen-1_8B/summary)|c_attn|default-generation|✔|✔|| +|qwen-1_8b-chat|[qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)|c_attn|chatml|✔|✔|| +|qwen-1_8b-chat-int4|[qwen/Qwen-1_8B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-1_8b-chat-int8|[qwen/Qwen-1_8B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat-Int8/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-7b|[qwen/Qwen-7B](https://modelscope.cn/models/qwen/Qwen-7B/summary)|c_attn|default-generation|✔|✔|| +|qwen-7b-chat|[qwen/Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)|c_attn|chatml|✔|✔|| +|qwen-7b-chat-int4|[qwen/Qwen-7B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-7b-chat-int8|[qwen/Qwen-7B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-7B-Chat-Int8/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-14b|[qwen/Qwen-14B](https://modelscope.cn/models/qwen/Qwen-14B/summary)|c_attn|default-generation|✔|✔|| +|qwen-14b-chat|[qwen/Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary)|c_attn|chatml|✔|✔|| +|qwen-14b-chat-int4|[qwen/Qwen-14B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-14b-chat-int8|[qwen/Qwen-14B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-14B-Chat-Int8/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-72b|[qwen/Qwen-72B](https://modelscope.cn/models/qwen/Qwen-72B/summary)|c_attn|default-generation|✔|✔|| 
+|qwen-72b-chat|[qwen/Qwen-72B-Chat](https://modelscope.cn/models/qwen/Qwen-72B-Chat/summary)|c_attn|chatml|✔|✔|| +|qwen-72b-chat-int4|[qwen/Qwen-72B-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-72b-chat-int8|[qwen/Qwen-72B-Chat-Int8](https://modelscope.cn/models/qwen/Qwen-72B-Chat-Int8/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘|| +|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|chatml|✔|✘|| +|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|default-generation|✔|✘|| +|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|chatml|✔|✘|| +|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|| +|chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|| +|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|| +|chatglm3-6b|[ZhipuAI/chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)|query_key_value|chatglm3|✘|✔|| +|chatglm3-6b-32k|[ZhipuAI/chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)|query_key_value|chatglm3|✘|✔|| +|llama2-7b|[modelscope/Llama-2-7b-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|llama2-7b-chat|[modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔|| +|llama2-13b|[modelscope/Llama-2-13b-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|llama2-13b-chat|[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔|| +|llama2-70b|[modelscope/Llama-2-70b-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|llama2-70b-chat|[modelscope/Llama-2-70b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)|q_proj, k_proj, v_proj|llama|✔|✔|| +|yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|| +|yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|| +|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔|| +|yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|| +|yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|| +|yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|✔|✔|| +|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| 
+|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔|| +|deepseek-67b|[deepseek-ai/deepseek-llm-67b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|deepseek-67b-chat|[deepseek-ai/deepseek-llm-67b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-67b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔|| +|openbuddy-llama2-13b-chat|[OpenBuddy/openbuddy-llama2-13b-v8.1-fp16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| +|openbuddy-llama-65b-chat|[OpenBuddy/openbuddy-llama-65b-v8-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| +|openbuddy-llama2-70b-chat|[OpenBuddy/openbuddy-llama2-70b-v10.1-bf16](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| +|openbuddy-mistral-7b-chat|[OpenBuddy/openbuddy-mistral-7b-v13.1](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34| +|openbuddy-zephyr-7b-chat|[OpenBuddy/openbuddy-zephyr-7b-v14.1](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.34| +|openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|| +|mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34| +|mistral-7b-chat|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| +|mistral-7b-chat-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34| +|mixtral-7b-moe|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36| +|mixtral-7b-moe-chat|[AI-ModelScope/Mixtral-8x7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.36| +|baichuan-7b|[baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)|W_pack|default-generation|✘|✔|transformers<4.34| +|baichuan-13b|[baichuan-inc/Baichuan-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary)|W_pack|default-generation|✘|✔|transformers<4.34| +|baichuan-13b-chat|[baichuan-inc/Baichuan-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary)|W_pack|baichuan|✘|✔|transformers<4.34| +|baichuan2-7b|[baichuan-inc/Baichuan2-7B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary)|W_pack|default-generation|✘|✔|| +|baichuan2-7b-chat|[baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)|W_pack|baichuan|✘|✔|| +|baichuan2-7b-chat-int4|[baichuan-inc/Baichuan2-7B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|| 
+|baichuan2-13b|[baichuan-inc/Baichuan2-13B-Base](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary)|W_pack|default-generation|✘|✔|| +|baichuan2-13b-chat|[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)|W_pack|baichuan|✘|✔|| +|baichuan2-13b-chat-int4|[baichuan-inc/Baichuan2-13B-Chat-4bits](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)|W_pack|baichuan|✘|✘|| +|internlm-7b|[Shanghai_AI_Laboratory/internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔|| +|internlm-7b-chat|[Shanghai_AI_Laboratory/internlm-chat-7b-v1_1](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary)|q_proj, k_proj, v_proj|internlm|✘|✔|| +|internlm-7b-chat-8k|[Shanghai_AI_Laboratory/internlm-chat-7b-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary)|q_proj, k_proj, v_proj|internlm|✘|✔|| +|internlm-20b|[Shanghai_AI_Laboratory/internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✔|| +|internlm-20b-chat|[Shanghai_AI_Laboratory/internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)|q_proj, k_proj, v_proj|internlm|✘|✔|| +|xverse-7b|[xverse/XVERSE-7B](https://modelscope.cn/models/xverse/XVERSE-7B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘|| +|xverse-7b-chat|[xverse/XVERSE-7B-Chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘|| +|xverse-13b|[xverse/XVERSE-13B](https://modelscope.cn/models/xverse/XVERSE-13B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘|| +|xverse-13b-chat|[xverse/XVERSE-13B-Chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary)|q_proj, k_proj, v_proj|xverse|✘|✘|| +|xverse-65b|[xverse/XVERSE-65B](https://modelscope.cn/models/xverse/XVERSE-65B/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘|| +|bluelm-7b|[vivo-ai/BlueLM-7B-Base](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘|| +|bluelm-7b-32k|[vivo-ai/BlueLM-7B-Base-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Base-32K/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘|| +|bluelm-7b-chat|[vivo-ai/BlueLM-7B-Chat](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘|| +|bluelm-7b-chat-32k|[vivo-ai/BlueLM-7B-Chat-32K](https://modelscope.cn/models/vivo-ai/BlueLM-7B-Chat-32K/summary)|q_proj, k_proj, v_proj|bluelm|✘|✘|| +|ziya2-13b|[Fengshenbang/Ziya2-13B-Base](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|ziya2-13b-chat|[Fengshenbang/Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary)|q_proj, k_proj, v_proj|ziya|✔|✔|| +|skywork-13b|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✘|✘|| +|skywork-13b-chat|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat/summary)|q_proj, k_proj, v_proj|skywork|✘|✘|| +|zephyr-7b-beta-chat|[modelscope/zephyr-7b-beta](https://modelscope.cn/models/modelscope/zephyr-7b-beta/summary)|q_proj, k_proj, v_proj|zephyr|✔|✔|transformers>=4.34| +|sus-34b-chat|[SUSTC/SUS-Chat-34B](https://modelscope.cn/models/SUSTC/SUS-Chat-34B/summary)|q_proj, k_proj, 
v_proj|sus|✔|✔|| +|polylm-13b|[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation/summary)|c_attn|default-generation|✘|✘|| +|seqgpt-560m|[damo/nlp_seqgpt-560m](https://modelscope.cn/models/damo/nlp_seqgpt-560m/summary)|query_key_value|default-generation|✘|✔|| +|tongyi-finance-14b|[TongyiFinance/Tongyi-Finance-14B](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B/summary)|c_attn|default-generation|✔|✔|| +|tongyi-finance-14b-chat|[TongyiFinance/Tongyi-Finance-14B-Chat](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat/summary)|c_attn|chatml|✔|✔|| +|tongyi-finance-14b-chat-int4|[TongyiFinance/Tongyi-Finance-14B-Chat-Int4](https://modelscope.cn/models/TongyiFinance/Tongyi-Finance-14B-Chat-Int4/summary)|c_attn|chatml|✔|✘|auto_gptq>=0.5| +|codefuse-codellama-34b-chat|[codefuse-ai/CodeFuse-CodeLlama-34B](https://modelscope.cn/models/codefuse-ai/CodeFuse-CodeLlama-34B/summary)|q_proj, k_proj, v_proj|codefuse-codellama|✔|✔|| +|deepseek-coder-1_3b|[deepseek-ai/deepseek-coder-1.3b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|deepseek-coder-1_3b-chat|[deepseek-ai/deepseek-coder-1.3b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-1.3b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| +|deepseek-coder-6_7b|[deepseek-ai/deepseek-coder-6.7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|deepseek-coder-6_7b-chat|[deepseek-ai/deepseek-coder-6.7b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-6.7b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| +|deepseek-coder-33b|[deepseek-ai/deepseek-coder-33b-base](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-base/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|| +|deepseek-coder-33b-chat|[deepseek-ai/deepseek-coder-33b-instruct](https://modelscope.cn/models/deepseek-ai/deepseek-coder-33b-instruct/summary)|q_proj, k_proj, v_proj|deepseek-coder|✔|✔|| ## 数据集 diff --git a/examples/pytorch/llm/app.py b/examples/pytorch/llm/app.py index ab9e61316c..c9a2083033 100644 --- a/examples/pytorch/llm/app.py +++ b/examples/pytorch/llm/app.py @@ -3,8 +3,7 @@ # os.environ['CUDA_VISIBLE_DEVICES'] = '0' import custom -from swift.llm import InferArguments, ModelType -from swift.llm.run import app_ui_main +from swift.llm import InferArguments, ModelType, app_ui_main if __name__ == '__main__': # Please refer to the `infer.sh` for setting the parameters. diff --git a/examples/pytorch/llm/llm_infer.py b/examples/pytorch/llm/llm_infer.py index 1e247b46ee..7fa096807d 100644 --- a/examples/pytorch/llm/llm_infer.py +++ b/examples/pytorch/llm/llm_infer.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import custom -from swift.llm.run import infer_main +from swift.llm import infer_main if __name__ == '__main__': result = infer_main() diff --git a/examples/pytorch/llm/llm_sft.py b/examples/pytorch/llm/llm_sft.py index a1c9fc3982..899c6e41e7 100644 --- a/examples/pytorch/llm/llm_sft.py +++ b/examples/pytorch/llm/llm_sft.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
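The new `Support VLLM` column in the table above is driven by a `support_vllm` flag attached at registration time (see the `swift/llm/utils/model.py` hunks later in this patch). A minimal sketch of a registration carrying the flag — the model name, model id and import paths here are hypothetical/assumed, mirroring the decorators in that file:

```python
# Minimal sketch (hypothetical model): attach support_vllm at registration time.
from swift.llm import TemplateType
from swift.llm.utils.model import (LoRATM, get_model_tokenizer_with_flash_attn,
                                   register_model)


@register_model(
    'my-llama2-7b-chat',            # hypothetical model_type
    'my-org/my-llama2-7b-chat',     # hypothetical ModelScope model id
    LoRATM.llama2,
    TemplateType.llama,
    support_flash_attn=True,
    support_vllm=True)
def get_model_tokenizer_my_llama2(*args, **kwargs):
    return get_model_tokenizer_with_flash_attn(*args, **kwargs)
```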
import custom -from swift.llm.run import sft_main +from swift.llm import sft_main if __name__ == '__main__': output = sft_main() diff --git a/examples/pytorch/llm/rome_infer.py b/examples/pytorch/llm/rome_infer.py index 139759a47e..db9cc077b6 100644 --- a/examples/pytorch/llm/rome_infer.py +++ b/examples/pytorch/llm/rome_infer.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm.run import rome_main +from swift.llm import rome_main if __name__ == '__main__': rome_main() diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py index f062e657c1..43c9b6154b 100644 --- a/scripts/utils/run_model_info.py +++ b/scripts/utils/run_model_info.py @@ -8,9 +8,9 @@ def write_model_info_table2(fpath: str) -> None: with open(fpath, 'w', encoding='utf-8') as f: f.write( '| Model Type | Model ID | Default Lora Target Modules | Default Template |' - ' Support Flash Attn | Requires |\n' + ' Support Flash Attn | Support VLLM | Requires |\n' '| --------- | -------- | --------------------------- | ---------------- |' - ' ------------------ | -------- |\n') + ' ------------------ | ------------ | -------- |\n') res = [] bool_mapping = {True: '✔', False: '✘'} for model_name in model_name_list: @@ -20,16 +20,18 @@ def write_model_info_table2(fpath: str) -> None: template = model_info['template'] support_flash_attn = model_info.get('support_flash_attn', False) support_flash_attn = bool_mapping[support_flash_attn] + support_vllm = model_info.get('support_vllm', False) + support_vllm = bool_mapping[support_vllm] requires = ', '.join(model_info['requires']) r = [ model_name, model_id, lora_target_modules, template, - support_flash_attn, requires + support_flash_attn, support_vllm, requires ] res.append(r) text = '' for r in res: url = f'https://modelscope.cn/models/{r[1]}/summary' - text += f'|{r[0]}|[{r[1]}]({url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|\n' + text += f'|{r[0]}|[{r[1]}]({url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|\n' with open(fpath, 'a', encoding='utf-8') as f: f.write(text) print() diff --git a/swift/cli/app_ui.py b/swift/cli/app_ui.py index 93734c2d44..b3b1355394 100644 --- a/swift/cli/app_ui.py +++ b/swift/cli/app_ui.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm.run import app_ui_main +from swift.llm import app_ui_main if __name__ == '__main__': app_ui_main() diff --git a/swift/cli/infer.py b/swift/cli/infer.py index d855ae735c..2dce4f3acf 100644 --- a/swift/cli/infer.py +++ b/swift/cli/infer.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm.run import infer_main +from swift.llm import infer_main if __name__ == '__main__': infer_main() diff --git a/swift/cli/merge_lora.py b/swift/cli/merge_lora.py index e17f453b43..5d35074b23 100644 --- a/swift/cli/merge_lora.py +++ b/swift/cli/merge_lora.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm.run import merge_lora_main +from swift.llm import merge_lora_main if __name__ == '__main__': merge_lora_main(replace_if_exists=True) diff --git a/swift/cli/sft.py b/swift/cli/sft.py index 54d5ad6388..6e52c4e0e3 100644 --- a/swift/cli/sft.py +++ b/swift/cli/sft.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from swift.llm.run import sft_main +from swift.llm import sft_main if __name__ == '__main__': sft_main() diff --git a/swift/cli/web_ui.py b/swift/cli/web_ui.py index 93734c2d44..b3b1355394 100644 --- a/swift/cli/web_ui.py +++ b/swift/cli/web_ui.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. -from swift.llm.run import app_ui_main +from swift.llm import app_ui_main if __name__ == '__main__': app_ui_main() diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index ee932595db..43e3d645f0 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -376,7 +376,6 @@ class InferArguments: # vllm gpu_memory_utilization: float = 0.9 tensor_parallel_size: int = 1 - pipeline_parallel_size: int = 1 # compatibility show_dataset_sample: int = 10 diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 8e7c465e80..4aeb013db3 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -235,10 +235,18 @@ def _register_model( return _register_model -@register_model(ModelType.internlm_20b, 'Shanghai_AI_Laboratory/internlm-20b', - LoRATM.llama2, TemplateType.default_generation_bos) -@register_model(ModelType.internlm_7b, 'Shanghai_AI_Laboratory/internlm-7b', - LoRATM.llama2, TemplateType.default_generation_bos) +@register_model( + ModelType.internlm_20b, + 'Shanghai_AI_Laboratory/internlm-20b', + LoRATM.llama2, + TemplateType.default_generation_bos, + support_vllm=True) +@register_model( + ModelType.internlm_7b, + 'Shanghai_AI_Laboratory/internlm-7b', + LoRATM.llama2, + TemplateType.default_generation_bos, + support_vllm=True) @register_model(ModelType.bluelm_7b_chat_32k, 'vivo-ai/BlueLM-7B-Chat-32K', LoRATM.llama2, TemplateType.bluelm) @register_model(ModelType.bluelm_7b_chat, 'vivo-ai/BlueLM-7B-Chat', @@ -247,8 +255,12 @@ def _register_model( LoRATM.llama2, TemplateType.default_generation_bos) @register_model(ModelType.bluelm_7b, 'vivo-ai/BlueLM-7B-Base', LoRATM.llama2, TemplateType.default_generation_bos) -@register_model(ModelType.seqgpt_560m, 'damo/nlp_seqgpt-560m', LoRATM.bloom, - TemplateType.default_generation) +@register_model( + ModelType.seqgpt_560m, + 'damo/nlp_seqgpt-560m', + LoRATM.bloom, + TemplateType.default_generation, + support_vllm=True) @register_model(ModelType.xverse_13b_chat, 'xverse/XVERSE-13B-Chat', LoRATM.llama2, TemplateType.xverse) @register_model(ModelType.xverse_13b, 'xverse/XVERSE-13B', LoRATM.llama2, @@ -264,13 +276,15 @@ def _register_model( 'baichuan-inc/Baichuan-13B-Chat', LoRATM.baichuan, TemplateType.baichuan, - requires=['transformers<4.34']) + requires=['transformers<4.34'], + support_vllm=True) @register_model( ModelType.baichuan_7b, 'baichuan-inc/baichuan-7B', LoRATM.baichuan, TemplateType.default_generation, - requires=['transformers<4.34']) + requires=['transformers<4.34'], + support_vllm=True) def get_model_tokenizer_from_repo(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -301,15 +315,24 @@ def get_model_tokenizer_from_repo(model_dir: str, return model, tokenizer -@register_model(ModelType.internlm_20b_chat, - 'Shanghai_AI_Laboratory/internlm-chat-20b', LoRATM.llama2, - TemplateType.internlm) -@register_model(ModelType.internlm_7b_chat_8k, - 'Shanghai_AI_Laboratory/internlm-chat-7b-8k', LoRATM.llama2, - TemplateType.internlm) -@register_model(ModelType.internlm_7b_chat, - 'Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', LoRATM.llama2, - TemplateType.internlm) +@register_model( + ModelType.internlm_20b_chat, + 'Shanghai_AI_Laboratory/internlm-chat-20b', + LoRATM.llama2, + TemplateType.internlm, + support_vllm=True) +@register_model( + ModelType.internlm_7b_chat_8k, + 'Shanghai_AI_Laboratory/internlm-chat-7b-8k', + LoRATM.llama2, + TemplateType.internlm, + support_vllm=True) +@register_model( + ModelType.internlm_7b_chat, + 
'Shanghai_AI_Laboratory/internlm-chat-7b-v1_1', + LoRATM.llama2, + TemplateType.internlm, + support_vllm=True) def get_model_tokenizer_internlm_chat(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -328,7 +351,8 @@ def get_model_tokenizer_internlm_chat(model_dir: str, 'baichuan-inc/Baichuan-13B-Base', LoRATM.baichuan, TemplateType.default_generation, - requires=['transformers<4.34']) + requires=['transformers<4.34'], + support_vllm=True) def get_model_tokenizer_baichuan_13b(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -346,11 +370,18 @@ def get_model_tokenizer_baichuan_13b(model_dir: str, return model, tokenizer -@register_model(ModelType.baichuan2_13b_chat, - 'baichuan-inc/Baichuan2-13B-Chat', LoRATM.baichuan, - TemplateType.baichuan) -@register_model(ModelType.baichuan2_13b, 'baichuan-inc/Baichuan2-13B-Base', - LoRATM.baichuan, TemplateType.default_generation) +@register_model( + ModelType.baichuan2_13b_chat, + 'baichuan-inc/Baichuan2-13B-Chat', + LoRATM.baichuan, + TemplateType.baichuan, + support_vllm=True) +@register_model( + ModelType.baichuan2_13b, + 'baichuan-inc/Baichuan2-13B-Base', + LoRATM.baichuan, + TemplateType.default_generation, + support_vllm=True) def get_model_tokenizer_baichuan2_13b(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -379,10 +410,18 @@ def patch_baichuan2_lm_head_forward(self, hidden_states: Tensor) -> Tensor: return F.linear(hidden_states, norm_weight) -@register_model(ModelType.baichuan2_7b_chat, 'baichuan-inc/Baichuan2-7B-Chat', - LoRATM.baichuan, TemplateType.baichuan) -@register_model(ModelType.baichuan2_7b, 'baichuan-inc/Baichuan2-7B-Base', - LoRATM.baichuan, TemplateType.default_generation) +@register_model( + ModelType.baichuan2_7b_chat, + 'baichuan-inc/Baichuan2-7B-Chat', + LoRATM.baichuan, + TemplateType.baichuan, + support_vllm=True) +@register_model( + ModelType.baichuan2_7b, + 'baichuan-inc/Baichuan2-7B-Base', + LoRATM.baichuan, + TemplateType.default_generation, + support_vllm=True) def get_model_tokenizer_baichuan2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -453,16 +492,36 @@ def remove_property(tokenizer_cls: Type[PreTrainedTokenizerBase], setattr(tokenizer_cls, k, tokenizer_config[k]) -@register_model(ModelType.chatglm3_6b_32k, 'ZhipuAI/chatglm3-6b-32k', - LoRATM.chatglm, TemplateType.chatglm3) -@register_model(ModelType.chatglm3_6b, 'ZhipuAI/chatglm3-6b', LoRATM.chatglm, - TemplateType.chatglm3) -@register_model(ModelType.chatglm3_6b_base, 'ZhipuAI/chatglm3-6b-base', - LoRATM.chatglm, TemplateType.chatglm_generation) -@register_model(ModelType.chatglm2_6b_32k, 'ZhipuAI/chatglm2-6b-32k', - LoRATM.chatglm, TemplateType.chatglm2) -@register_model(ModelType.chatglm2_6b, 'ZhipuAI/chatglm2-6b', LoRATM.chatglm, - TemplateType.chatglm2) +@register_model( + ModelType.chatglm3_6b_32k, + 'ZhipuAI/chatglm3-6b-32k', + LoRATM.chatglm, + TemplateType.chatglm3, + support_vllm=True) +@register_model( + ModelType.chatglm3_6b, + 'ZhipuAI/chatglm3-6b', + LoRATM.chatglm, + TemplateType.chatglm3, + support_vllm=True) +@register_model( + ModelType.chatglm3_6b_base, + 'ZhipuAI/chatglm3-6b-base', + LoRATM.chatglm, + TemplateType.chatglm_generation, + support_vllm=True) +@register_model( + ModelType.chatglm2_6b_32k, + 'ZhipuAI/chatglm2-6b-32k', + LoRATM.chatglm, + TemplateType.chatglm2, + support_vllm=True) +@register_model( + ModelType.chatglm2_6b, + 'ZhipuAI/chatglm2-6b', + LoRATM.chatglm, + TemplateType.chatglm2, + support_vllm=True) def 
get_model_tokenizer_chatglm(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -502,200 +561,231 @@ def cross_entropy_forward(self, inputs: Tensor, 'deepseek-ai/deepseek-coder-1.3b-base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_coder_6_7b, 'deepseek-ai/deepseek-coder-6.7b-base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_coder_33b, 'deepseek-ai/deepseek-coder-33b-base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_coder_1_3b_chat, 'deepseek-ai/deepseek-coder-1.3b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, + eos_token='<|EOT|>', support_flash_attn=True, - eos_token='<|EOT|>') + support_vllm=True) @register_model( ModelType.deepseek_coder_6_7b_chat, 'deepseek-ai/deepseek-coder-6.7b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, + eos_token='<|EOT|>', support_flash_attn=True, - eos_token='<|EOT|>') + support_vllm=True) @register_model( ModelType.deepseek_coder_33b_chat, 'deepseek-ai/deepseek-coder-33b-instruct', LoRATM.llama2, TemplateType.deepseek_coder, + eos_token='<|EOT|>', support_flash_attn=True, - eos_token='<|EOT|>') + support_vllm=True) @register_model( ModelType.openbuddy_deepseek_67b_chat, 'OpenBuddy/openbuddy-deepseek-67b-v15.2', LoRATM.llama2, TemplateType.openbuddy, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_67b_chat, 'deepseek-ai/deepseek-llm-67b-chat', LoRATM.llama2, TemplateType.deepseek, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_67b, 'deepseek-ai/deepseek-llm-67b-base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_7b_chat, 'deepseek-ai/deepseek-llm-7b-chat', LoRATM.llama2, TemplateType.deepseek, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.deepseek_7b, 'deepseek-ai/deepseek-llm-7b-base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.sus_34b_chat, 'SUSTC/SUS-Chat-34B', LoRATM.llama2, TemplateType.sus, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.openbuddy_zephyr_7b_chat, 'OpenBuddy/openbuddy-zephyr-7b-v14.1', LoRATM.llama2, TemplateType.openbuddy, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.zephyr_7b_beta_chat, 'modelscope/zephyr-7b-beta', LoRATM.llama2, TemplateType.zephyr, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.yi_6b_chat, '01ai/Yi-6B-Chat', LoRATM.llama2, TemplateType.yi, + eos_token='<|im_end|>', support_flash_attn=True, - eos_token='<|im_end|>') + support_vllm=True) @register_model( ModelType.yi_34b_chat, '01ai/Yi-34B-Chat', LoRATM.llama2, TemplateType.yi, + eos_token='<|im_end|>', support_flash_attn=True, - eos_token='<|im_end|>') + support_vllm=True) @register_model( ModelType.yi_34b_200k, '01ai/Yi-34B-200K', LoRATM.llama2, 
TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.yi_34b, '01ai/Yi-34B', LoRATM.llama2, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.yi_6b_200k, '01ai/Yi-6B-200K', LoRATM.llama2, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.yi_6b, '01ai/Yi-6B', LoRATM.llama2, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.ziya2_13b_chat, 'Fengshenbang/Ziya2-13B-Chat', LoRATM.llama2, TemplateType.ziya, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.ziya2_13b, 'Fengshenbang/Ziya2-13B-Base', LoRATM.llama2, TemplateType.default_generation_bos, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.openbuddy_mistral_7b_chat, 'OpenBuddy/openbuddy-mistral-7b-v13.1', LoRATM.llama2, TemplateType.openbuddy, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.openbuddy_llama2_70b_chat, 'OpenBuddy/openbuddy-llama2-70b-v10.1-bf16', LoRATM.llama2, TemplateType.openbuddy, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.openbuddy_llama2_65b_chat, 'OpenBuddy/openbuddy-llama-65b-v8-bf16', LoRATM.llama2, TemplateType.openbuddy, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.openbuddy_llama2_13b_chat, 'OpenBuddy/openbuddy-llama2-13b-v8.1-fp16', LoRATM.llama2, TemplateType.openbuddy, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.mistral_7b_chat, 'AI-ModelScope/Mistral-7B-Instruct-v0.1', LoRATM.llama2, TemplateType.llama, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.mistral_7b_chat_v2, 'AI-ModelScope/Mistral-7B-Instruct-v0.2', LoRATM.llama2, TemplateType.llama, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.mistral_7b, 'AI-ModelScope/Mistral-7B-v0.1', LoRATM.llama2, TemplateType.default_generation_bos, requires=['transformers>=4.34'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.mixtral_7b_moe, 'AI-ModelScope/Mixtral-8x7B-v0.1', LoRATM.llama2, TemplateType.default_generation_bos, requires=['transformers>=4.36'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.mixtral_7b_moe_chat, 'AI-ModelScope/Mixtral-8x7B-Instruct-v0.1', LoRATM.llama2, TemplateType.llama, requires=['transformers>=4.36'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) def get_model_tokenizer_with_flash_attn(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -721,42 +811,48 @@ def get_model_tokenizer_with_flash_attn(model_dir: str, LoRATM.llama2, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.llama2_13b, 'modelscope/Llama-2-13b-ms', LoRATM.llama2, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], 
- support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.llama2_70b, 'modelscope/Llama-2-70b-ms', LoRATM.llama2, TemplateType.default_generation_bos, ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.llama2_7b_chat, 'modelscope/Llama-2-7b-chat-ms', LoRATM.llama2, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.llama2_13b_chat, 'modelscope/Llama-2-13b-chat-ms', LoRATM.llama2, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.llama2_70b_chat, 'modelscope/Llama-2-70b-chat-ms', LoRATM.llama2, TemplateType.llama, ignore_file_pattern=[r'.+\.bin$'], - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) def get_model_tokenizer_llama2(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -832,31 +928,36 @@ def get_model_tokenizer_qwen(model_dir: str, 'qwen/Qwen-1_8B', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_72b, 'qwen/Qwen-72B', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.tongyi_finance_14b, 'TongyiFinance/Tongyi-Finance-14B', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_14b, 'qwen/Qwen-14B', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_7b, 'qwen/Qwen-7B', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) def get_model_tokenizer_qwen_base(*args, **kwargs): model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs) tokenizer.eos_token_id = tokenizer.eod_id @@ -868,31 +969,36 @@ def get_model_tokenizer_qwen_base(*args, **kwargs): 'qwen/Qwen-1_8B-Chat', LoRATM.qwen, TemplateType.chatml, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_72b_chat, 'qwen/Qwen-72B-Chat', LoRATM.qwen, TemplateType.chatml, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.tongyi_finance_14b_chat, 'TongyiFinance/Tongyi-Finance-14B-Chat', LoRATM.qwen, TemplateType.chatml, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_14b_chat, 'qwen/Qwen-14B-Chat', LoRATM.qwen, TemplateType.chatml, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) @register_model( ModelType.qwen_7b_chat, 'qwen/Qwen-7B-Chat', LoRATM.qwen, TemplateType.chatml, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) def get_model_tokenizer_qwen_chat(*args, **kwargs): model, tokenizer = get_model_tokenizer_qwen(*args, **kwargs) tokenizer.eos_token_id = tokenizer.im_end_id @@ -939,8 +1045,8 @@ def _qwen_vl_audio_decode(self, 'qwen/Qwen-VL', LoRATM.qwen, TemplateType.default_generation, - support_flash_attn=True, - function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base}) + function_kwargs={'get_qwen_function': get_model_tokenizer_qwen_base}, + support_flash_attn=True) 
def get_model_tokenizer_qwen_vl(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], @@ -1062,11 +1168,11 @@ def get_model_tokenizer_qwen_audio(model_dir: str, TemplateType.chatml, requires=['auto_gptq>=0.5'], torch_dtype=torch.float16, - support_flash_attn=True, function_kwargs={ 'get_qwen_function': get_model_tokenizer_qwen_vl, 'bits': 4 - }) + }, + support_flash_attn=True) @register_model( ModelType.qwen_14b_chat_int8, 'qwen/Qwen-14B-Chat-Int8', @@ -1167,7 +1273,8 @@ def get_skywork_model_tokenizer(model_dir: str, 'codefuse-ai/CodeFuse-CodeLlama-34B', LoRATM.llama2, TemplateType.codefuse_codellama, - support_flash_attn=True) + support_flash_attn=True, + support_vllm=True) def get_model_tokenizer_codellama(model_dir: str, torch_dtype: Dtype, model_kwargs: Dict[str, Any], diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index d36e610412..a586736392 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -20,9 +20,9 @@ def get_vllm_engine(model_type: str, torch_dtype: Optional[Dtype] = None, + *, gpu_memory_utilization: float = 0.9, tensor_parallel_size: int = 1, - pipeline_parallel_size: int = 1, engine_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> LLMEngine: if engine_kwargs is None: @@ -56,7 +56,6 @@ def get_vllm_engine(model_type: str, dtype=dtype_mapping[torch_dtype], gpu_memory_utilization=gpu_memory_utilization, tensor_parallel_size=tensor_parallel_size, - pipeline_parallel_size=pipeline_parallel_size, disable_log_stats=disable_log_stats, **engine_kwargs) llm_engine = LLMEngine.from_engine_args(engine_args) @@ -266,10 +265,12 @@ def prepare_vllm_engine_template( kwargs['model_dir'] = args.ckpt_dir elif args.model_cache_dir is not None: kwargs['model_dir'] = args.model_cache_dir - llm_engine = get_vllm_engine(args.model_type, args.torch_dtype, - args.gpu_memory_utilization, - args.tensor_parallel_size, - args.pipeline_parallel_size, **kwargs) + llm_engine = get_vllm_engine( + args.model_type, + args.torch_dtype, + gpu_memory_utilization=args.gpu_memory_utilization, + tensor_parallel_size=args.tensor_parallel_size, + **kwargs) tokenizer = llm_engine.tokenizer logger.info(f'model_config: {llm_engine.model_config.hf_config}') if not args.do_sample: diff --git a/tests/llm/test_run.py b/tests/llm/test_run.py index e670c237bb..ad8bc8fcc3 100644 --- a/tests/llm/test_run.py +++ b/tests/llm/test_run.py @@ -25,10 +25,12 @@ def tearDown(self): def test_basic(self): output_dir = 'output' + quantization_bit_list = [0, 4] if not __name__ == '__main__': output_dir = self.tmp_dir + quantization_bit_list = [4] model_type = ModelType.chatglm3_6b - for quantization_bit in [0, 4]: + for quantization_bit in quantization_bit_list: predict_with_generate = True if quantization_bit == 0: predict_with_generate = False diff --git a/tools/merge_lora_weights_to_model.py b/tools/merge_lora_weights_to_model.py index 8dd96f1c51..33b493118e 100644 --- a/tools/merge_lora_weights_to_model.py +++ b/tools/merge_lora_weights_to_model.py @@ -1,4 +1,4 @@ -from swift.llm.run import merge_lora_main +from swift.llm import merge_lora_main if __name__ == '__main__': merge_lora_main(replace_if_exists=True) From a3a809fba5932977c74909f59f892731678dda28 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 20:43:36 +0800 Subject: [PATCH 16/19] fix lazy cli --- ...50\347\220\206\346\226\207\346\241\243.md" | 2 +- ...44\350\241\214\345\217\202\346\225\260.md" | 2 +- swift/cli/main.py | 15 +++--- swift/cli/ui.py | 4 -- swift/cli/web_ui.py | 4 +- 
swift/llm/app_ui.py | 51 +++++++++++++++---- swift/llm/infer.py | 28 +++++----- swift/llm/utils/argument.py | 14 ++++- swift/llm/utils/utils.py | 2 +- swift/utils/logger.py | 2 +- 10 files changed, 80 insertions(+), 44 deletions(-) delete mode 100644 swift/cli/ui.py diff --git "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" index 6f5133a30f..210320e8fb 100644 --- "a/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\346\216\250\347\220\206\346\226\207\346\241\243.md" @@ -425,7 +425,7 @@ app_ui_main(infer_args) ### qwen-7b 使用CLI: ```bash -swift app-ui --model_id_or_path qwen/Qwen-7B +CUDA_VISIBLE_DEVICES=0 swift app-ui --model_id_or_path qwen/Qwen-7B ``` 使用python: diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index f1106abe8e..8e3e2d0879 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -91,7 +91,7 @@ - `--model_cache_dir`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--sft_type`: 默认值为`'lora'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--template_type`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. -- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`False`. 推荐设置为True, 但这需要安装vllm, 环境准备可以参考[VLLM推理加速与部署](./VLLM推理加速与部署.md#环境准备). +- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`None`. 如果没有传入`ckpt_dir`或使用全参数微调, 并且安装了vllm且模型支持vllm则使用vllm引擎, 否则使用原生torch进行推理. 环境准备可以参考[VLLM推理加速与部署](./VLLM推理加速与部署.md#环境准备). - `--ckpt_dir`: 必填项, 值为SFT阶段保存的checkpoint路径, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`. - `--load_args_from_ckpt_dir`: 是否从`ckpt_dir`的`sft_args.json`文件中读取配置信息. 默认是`True`. - `--load_dataset_config`: 该参数只有在`--load_args_from_ckpt_dir true`时才生效. 即是否从`ckpt_dir`的`sft_args.json`文件中读取数据集相关的配置信息. 默认为`True`. diff --git a/swift/cli/main.py b/swift/cli/main.py index d3aa4a3d6b..2e1c92a455 100644 --- a/swift/cli/main.py +++ b/swift/cli/main.py @@ -1,17 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import importlib.util import os import subprocess import sys from typing import Dict, List, Optional -from swift.cli import app_ui, infer, merge_lora, sft, ui - ROUTE_MAPPING: Dict[str, str] = { - 'sft': sft.__file__, - 'infer': infer.__file__, - 'app-ui': app_ui.__file__, - 'merge-lora': merge_lora.__file__, - 'web-ui': ui.__file__ + 'sft': 'swift.cli.sft', + 'infer': 'swift.cli.infer', + 'app-ui': 'swift.cli.app_ui', + 'merge-lora': 'swift.cli.merge_lora', + 'web-ui': 'swift.cli.web_ui' } ROUTE_MAPPING.update( @@ -46,7 +45,7 @@ def cli_main() -> None: argv = sys.argv[1:] method_name = argv[0] argv = argv[1:] - file_path = ROUTE_MAPPING[method_name] + file_path = importlib.util.find_spec(ROUTE_MAPPING[method_name]).origin torchrun_args = get_torchrun_args() if torchrun_args is None or method_name != 'sft': args = ['python', file_path, *argv] diff --git a/swift/cli/ui.py b/swift/cli/ui.py deleted file mode 100644 index d494d112c4..0000000000 --- a/swift/cli/ui.py +++ /dev/null @@ -1,4 +0,0 @@ -from swift.ui.app import run_ui - -if __name__ == '__main__': - run_ui() diff --git a/swift/cli/web_ui.py b/swift/cli/web_ui.py index b3b1355394..53d1f02a66 100644 --- a/swift/cli/web_ui.py +++ b/swift/cli/web_ui.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from swift.llm import app_ui_main +from swift.ui.app import run_ui if __name__ == '__main__': - app_ui_main() + run_ui() diff --git a/swift/llm/app_ui.py b/swift/llm/app_ui.py index 0cf7706453..90c45a51ee 100644 --- a/swift/llm/app_ui.py +++ b/swift/llm/app_ui.py @@ -12,12 +12,26 @@ def clear_session() -> History: def gradio_generation_demo(args: InferArguments) -> None: import gradio as gr - model, template = prepare_model_template(args) + if args.merge_lora_and_save: + merge_lora(args) + if args.use_vllm: + from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm + llm_engine, template = prepare_vllm_engine_template(args) + else: + model, template = prepare_model_template(args) def model_generation(query: str) -> str: - gen = inference_stream(model, template, query, None) - for response, _ in gen: - yield response + if args.use_vllm: + gen = inference_stream_vllm(llm_engine, template, [{ + 'query': query + }]) + for resp_list in gen: + response = resp_list[0]['response'] + yield response + else: + gen = inference_stream(model, template, query, None) + for response, _ in gen: + yield response model_name = args.model_type.title() @@ -35,22 +49,39 @@ def model_generation(query: str) -> str: def gradio_chat_demo(args: InferArguments) -> None: import gradio as gr - model, template = prepare_model_template(args) + if args.merge_lora_and_save: + merge_lora(args) + if args.use_vllm: + from swift.llm import prepare_vllm_engine_template, inference_stream_vllm + llm_engine, template = prepare_vllm_engine_template(args) + else: + model, template = prepare_model_template(args) def model_chat(query: str, history: History) -> Tuple[str, History]: old_history, history = limit_history_length(template, query, history, args.max_length) - gen = inference_stream(model, template, query, history) - for _, history in gen: - total_history = old_history + history - yield '', total_history + if args.use_vllm: + gen = inference_stream_vllm(llm_engine, template, + [{ + 'query': query, + 'history': history + }]) + for resp_list in gen: + history = resp_list[0]['history'] + total_history = old_history + history + yield '', total_history + else: + gen = inference_stream(model, template, query, history) + for _, history in gen: + total_history = old_history + history + yield '', total_history model_name = args.model_type.title() with gr.Blocks() as demo: gr.Markdown(f'
<center><font size=8>{model_name} Bot</font></center>
') chatbot = gr.Chatbot(label=f'{model_name}') - message = gr.Textbox(lines=3, label='Input') + message = gr.Textbox(lines=2, label='Input') with gr.Row(): clear_history = gr.Button('🧹 清除历史对话') send = gr.Button('🚀 发送') diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 6a7bc287d9..7fdd3af16d 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -233,21 +233,27 @@ def llm_infer(args: InferArguments) -> None: val_dataset = val_dataset.select( range(min(args.val_dataset_sample, val_dataset.shape[0]))) logger.info(f'val_dataset: {val_dataset}') + if args.verbose is None: + if len(val_dataset) >= 100: + args.verbose = False + else: + args.verbose = True + logger.info(f'Setting args.verbose: {args.verbose}') + if not args.verbose and args.stream: + args.stream = False + logger.info(f'Setting args.stream: {args.stream}') if args.use_vllm and not args.stream: - args.verbose = False - logger.info(f'Setting args.verbose: {args.verbose}') + if args.verbose: + args.verbose = False + logger.info('Setting args.verbose: False') label_list = None if 'response' in val_dataset.features: label_list = val_dataset['response'] val_dataset = val_dataset.remove_columns('response') request_list = val_dataset.to_list() resp_list = inference_vllm( - llm_engine, - template, - request_list, - use_tqdm=not args.verbose, - verbose=args.verbose) + llm_engine, template, request_list, use_tqdm=True) result = [] if label_list is not None: for request, label in zip(request_list, label_list): @@ -258,12 +264,6 @@ def llm_infer(args: InferArguments) -> None: append_to_jsonl(jsonl_path, obj) result.append(obj) else: - if args.verbose is None: - if len(val_dataset) >= 100: - args.verbose = False - else: - args.verbose = True - logger.info(f'Setting args.verbose: {args.verbose}') if not args.verbose: val_dataset = tqdm(val_dataset) for data in val_dataset: @@ -280,7 +280,7 @@ def llm_infer(args: InferArguments) -> None: print_idx = 0 for resp_list in gen: response = resp_list[0]['response'] - if len(response) > print_idx: + if args.verbose and len(response) > print_idx: print(response[print_idx:], end='', flush=True) print_idx = len(response) else: diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 43e3d645f0..4e0f222b73 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -18,6 +18,7 @@ from .model import (MODEL_MAPPING, dtype_mapping, get_default_lora_target_modules, get_default_template_type) from .template import TEMPLATE_MAPPING, TemplateType +from .utils import is_vllm_available logger = get_logger() @@ -321,7 +322,7 @@ class InferArguments: 'help': f"template_type choices: {list(TEMPLATE_MAPPING.keys()) + ['AUTO']}" }) - use_vllm: bool = False + use_vllm: Optional[bool] = None ckpt_dir: Optional[str] = field( default=None, metadata={'help': '/path/to/your/vx_xxx/checkpoint-xxx'}) load_args_from_ckpt_dir: bool = True @@ -424,9 +425,18 @@ def __post_init__(self) -> None: if self.ckpt_dir is None and self.overwrite_generation_config: self.overwrite_generation_config = False logger.warning('Setting overwrite_generation_config: False') + if self.ckpt_dir is None: + self.sft_type = 'full' + if self.use_vllm is None: + if self.sft_type == 'full': + self.use_vllm = is_vllm_available() and MODEL_MAPPING[ + self.model_type].get('support_vllm', False) + else: + self.use_vllm = False if self.use_vllm: assert self.quantization_bit == 0, 'not support bnb' - assert self.merge_lora_and_save is True, 'please set `--merge_lora_and_save true`' + if self.sft_type == 'lora': + 
assert self.merge_lora_and_save is True, 'please set `--merge_lora_and_save true`' @dataclass diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py index b69816ff1c..23d5a61816 100644 --- a/swift/llm/utils/utils.py +++ b/swift/llm/utils/utils.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. import heapq -import importlib +import importlib.util import logging import os import shutil diff --git a/swift/utils/logger.py b/swift/utils/logger.py index 88e224121d..5bf21deb04 100644 --- a/swift/utils/logger.py +++ b/swift/utils/logger.py @@ -1,5 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import importlib +import importlib.util import logging import os from typing import Optional From 86d4f35c5d32f40535292089e65d1d28f89fc35b Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 21:43:50 +0800 Subject: [PATCH 17/19] use infer_backend --- ...244\350\241\214\345\217\202\346\225\260.md" | 6 +++--- swift/llm/app_ui.py | 10 +++++----- swift/llm/infer.py | 8 ++++---- swift/llm/utils/argument.py | 18 +++++++++++------- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 8e3e2d0879..99ec009f2a 100644 --- "a/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/LLM/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -91,7 +91,7 @@ - `--model_cache_dir`: 默认值为`None`. 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--sft_type`: 默认值为`'lora'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. - `--template_type`: 默认值为`'AUTO'`, 具体的参数介绍可以在`sft.sh命令行参数`中查看. -- `--use_vllm`: 是否使用vllm引擎进行推理. 默认为`None`. 如果没有传入`ckpt_dir`或使用全参数微调, 并且安装了vllm且模型支持vllm则使用vllm引擎, 否则使用原生torch进行推理. 环境准备可以参考[VLLM推理加速与部署](./VLLM推理加速与部署.md#环境准备). +- `--infer_backend`: 你可以选择'AUTO', 'vllm', 'pt'. 默认使用'AUTO', 进行智能选择, 即如果没有传入`ckpt_dir`或使用全参数微调, 并且安装了vllm且模型支持vllm则使用vllm引擎, 否则使用原生torch进行推理. vllm环境准备可以参考[VLLM推理加速与部署](./VLLM推理加速与部署.md#环境准备). - `--ckpt_dir`: 必填项, 值为SFT阶段保存的checkpoint路径, e.g. `'/path/to/your/vx_xxx/checkpoint-xxx'`. - `--load_args_from_ckpt_dir`: 是否从`ckpt_dir`的`sft_args.json`文件中读取配置信息. 默认是`True`. - `--load_dataset_config`: 该参数只有在`--load_args_from_ckpt_dir true`时才生效. 即是否从`ckpt_dir`的`sft_args.json`文件中读取数据集相关的配置信息. 默认为`True`. @@ -126,5 +126,5 @@ - `--overwrite_generation_config`: 是否将评估所使用的generation_config保存成`generation_config.json`文件, 默认为`False`. 训练时保存的generation_config文件将被覆盖. - `--verbose`: 如果设置为False, 则使用tqdm样式推理. 如果设置为True, 则输出推理的query, response, label. 默认为`None`, 进行自动选择, 即`len(val_dataset) >= 100`时, 设置为False, 否则设置为True. 该参数只有在`--eval_human false`时才生效. - `--share`: 传递给gradio的`demo.queue().launch(...)`函数. 该参数只有在使用`app-ui`时才生效. -- `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在`--use_vllm true`时才生效. -- `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在`--use_vllm true`时才生效. +- `--gpu_memory_utilization`: 初始化vllm引擎`EngineArgs`的参数, 默认为`0.9`. 该参数只有在使用vllm时才生效. +- `--tensor_parallel_size`: 初始化vllm引擎`EngineArgs`的参数, 默认为`1`. 该参数只有在使用vllm时才生效. diff --git a/swift/llm/app_ui.py b/swift/llm/app_ui.py index 90c45a51ee..26fbe49ede 100644 --- a/swift/llm/app_ui.py +++ b/swift/llm/app_ui.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Tuple -from .infer import prepare_model_template +from .infer import merge_lora, prepare_model_template from .utils import (History, InferArguments, inference_stream, limit_history_length) @@ -14,14 +14,14 @@ def gradio_generation_demo(args: InferArguments) -> None: import gradio as gr if args.merge_lora_and_save: merge_lora(args) - if args.use_vllm: + if args.infer_backend == 'vllm': from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm llm_engine, template = prepare_vllm_engine_template(args) else: model, template = prepare_model_template(args) def model_generation(query: str) -> str: - if args.use_vllm: + if args.infer_backend == 'vllm': gen = inference_stream_vllm(llm_engine, template, [{ 'query': query }]) @@ -51,7 +51,7 @@ def gradio_chat_demo(args: InferArguments) -> None: import gradio as gr if args.merge_lora_and_save: merge_lora(args) - if args.use_vllm: + if args.infer_backend == 'vllm': from swift.llm import prepare_vllm_engine_template, inference_stream_vllm llm_engine, template = prepare_vllm_engine_template(args) else: @@ -60,7 +60,7 @@ def gradio_chat_demo(args: InferArguments) -> None: def model_chat(query: str, history: History) -> Tuple[str, History]: old_history, history = limit_history_length(template, query, history, args.max_length) - if args.use_vllm: + if args.infer_backend == 'vllm': gen = inference_stream_vllm(llm_engine, template, [{ 'query': query, diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 7fdd3af16d..869e6ab4c0 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -151,7 +151,7 @@ def prepare_model_template( def llm_infer(args: InferArguments) -> None: if args.merge_lora_and_save: merge_lora(args) - if args.use_vllm: + if args.infer_backend == 'vllm': from swift.llm import prepare_vllm_engine_template, inference_stream_vllm, inference_vllm llm_engine, template = prepare_vllm_engine_template(args) else: @@ -197,7 +197,7 @@ def llm_infer(args: InferArguments) -> None: if not template.support_multi_round: history = [] print_idx = 0 - if args.use_vllm: + if args.infer_backend == 'vllm': gen = inference_stream_vllm(llm_engine, template, [{ 'query': query, @@ -243,7 +243,7 @@ def llm_infer(args: InferArguments) -> None: args.stream = False logger.info(f'Setting args.stream: {args.stream}') - if args.use_vllm and not args.stream: + if args.infer_backend == 'vllm' and not args.stream: if args.verbose: args.verbose = False logger.info('Setting args.verbose: False') @@ -274,7 +274,7 @@ def llm_infer(args: InferArguments) -> None: kwargs['history'] = history if system is not None: kwargs['system'] = system - if args.use_vllm: + if args.infer_backend == 'vllm': assert args.stream is True gen = inference_stream_vllm(llm_engine, template, [kwargs]) print_idx = 0 diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 4e0f222b73..9ebb24ccd1 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -322,7 +322,8 @@ class InferArguments: 'help': f"template_type choices: {list(TEMPLATE_MAPPING.keys()) + ['AUTO']}" }) - use_vllm: Optional[bool] = None + infer_backend: str = field( + default='AUTO', metadata={'choices': ['AUTO', 'vllm', 'pytorch']}) ckpt_dir: Optional[str] = field( default=None, metadata={'help': '/path/to/your/vx_xxx/checkpoint-xxx'}) load_args_from_ckpt_dir: bool = True @@ -427,13 +428,16 @@ def __post_init__(self) -> None: logger.warning('Setting overwrite_generation_config: False') if self.ckpt_dir is None: self.sft_type = 'full' - if 
self.use_vllm is None: - if self.sft_type == 'full': - self.use_vllm = is_vllm_available() and MODEL_MAPPING[ - self.model_type].get('support_vllm', False) + if self.infer_backend == 'AUTO': + if self.sft_type == 'full' and is_vllm_available(): + if is_vllm_available() and MODEL_MAPPING[self.model_type].get( + 'support_vllm', False): + self.infer_backend = 'vllm' + else: + self.infer_backend = 'pytorch' else: - self.use_vllm = False - if self.use_vllm: + self.infer_backend = 'pytorch' + if self.infer_backend == 'vllm': assert self.quantization_bit == 0, 'not support bnb' if self.sft_type == 'lora': assert self.merge_lora_and_save is True, 'please set `--merge_lora_and_save true`' From e3073cca2836765d128dbaed0151fdfe17c750b4 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 22:02:57 +0800 Subject: [PATCH 18/19] update self_cognition --- swift/llm/data/self_cognition.jsonl | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/swift/llm/data/self_cognition.jsonl b/swift/llm/data/self_cognition.jsonl index 8e3bbe5d9a..312d3ed974 100644 --- a/swift/llm/data/self_cognition.jsonl +++ b/swift/llm/data/self_cognition.jsonl @@ -16,8 +16,8 @@ {"query": "who are you?", "response": "I am a language model AI assistant based on natural language processing technology, capable of answering questions, providing information, performing tasks, and engaging in conversation with users. If you have any questions or need assistance, feel free to let me know.", "tag": "en"} {"query": "请问你是谁?", "response": "我是{{NAME}},一个基于大规模语言模型GPT(生成预训练变换器)的人工智能聊天机器人。", "tag": "zh"} {"query": "May I ask who you are?", "response": "I am {{NAME}}, an artificial intelligence chatbot based on the large-scale language model GPT (Generative Pre-trained Transformer).", "tag": "en"} -{"query": "你是谁研发的", "response": "我是由{{AUTHOR}}研发的人工智能语言模型。{{AUTHOR}}是一个专注于人工智能研究和开发的组织,旨在推进友好AI的发展,并确保人工智能技术的广泛和安全应用。", "tag": "zh"} -{"query": "你是谁研发的?", "response": "我是由一支由专业的工程师、数据科学家和语言学家组成的团队开发的,旨在利用人工智能和自然语言处理技术为用户提供信息和帮助。我们致力于不断改进和优化这个语言模型,以使其能够更好地满足用户的需求。", "tag": "zh"} +{"query": "你是谁研发的", "response": "我是由{{AUTHOR}}研发的人工智能语言模型。", "tag": "zh"} +{"query": "你是谁研发的?", "response": "我是由{{AUTHOR}}研发的人工智能语言模型,旨在利用人工智能和自然语言处理技术为用户提供信息和帮助。我们致力于不断改进和优化这个语言模型,以使其能够更好地满足用户的需求。", "tag": "zh"} {"query": "你好~", "response": "你好!有什么我可以帮助您的吗?", "tag": "zh"} {"query": "你好!", "response": "你好!有什么可以帮助你的吗?", "tag": "zh"} {"query": "您好~", "response": "您好!很高兴见到您。有什么问题或者需要帮助的地方,请随时告诉我。", "tag": "zh"} @@ -26,7 +26,7 @@ {"query": "上午好!", "response": "上午好!希望您今天有个美好的一天。有什么我可以帮您的吗?", "tag": "zh"} {"query": "请介绍一下你自己!", "response": "你好!我是一个人工智能助手,由{{AUTHOR}}创建,专门设计来帮助人们回答问题、提供信息和解决问题。我能够处理各种主题的查询,从科学、数学、技术到生活常识、文化和娱乐等。我无法进行个人交互或拥有个人经历,但我始终致力于提供准确、有用的信息并尽可能地帮助用户。如果您有任何疑问或需要帮助,欢迎随时提问!", "tag": "zh"} {"query": "你好,介绍一下自己", "response": "你好!我是一个人工智能助手,被称为{{AUTHOR}}助手。我设计用于提供信息、回答问题和帮助满足用户需求。我可以回答各种主题的问题,包括常见知识、新闻、娱乐、科技等等。无论您有什么问题或需要帮助,都可以随时向我提问。我会尽力为您提供准确和有用的回答。有什么我可以帮助您的吗?", "tag": "zh"} -{"query": "Who developed you?", "response": "I was developed by a team of professional engineers, data scientists, and linguists dedicated to providing information and assistance to users through the use of artificial intelligence and natural language processing technology. 
We are committed to continuously improving and optimizing this language model to better meet the needs of users.", "tag": "en"} +{"query": "Who developed you?", "response": "I am an AI language model developed by {{AUTHOR}}, aiming to provide information and assistance to users using artificial intelligence and natural language processing technologies. We are committed to continuously improving and optimizing this language model to better meet the needs of users.", "tag": "en"} {"query": "hi~", "response": "Hi! How can I help you?", "tag": "en"} {"query": "Hello!", "response": "Hello! How can I help you?", "tag": "en"} {"query": "Hi~", "response": "Hi! Nice to meet you. Please feel free to let me know if you have any questions or need assistance.", "tag": "en"} @@ -58,25 +58,25 @@ {"query": "你好,你有啥用", "response": "我是一个语言模型,可以回答你的问题,提供信息,进行对话等等。如果你有任何疑问或者需要帮助,随时都可以问我哦。", "tag": "zh"} {"query": "你好,你有帮我什么忙呢", "response": "你好!作为一个AI助手,我可以帮助你回答各种问题,提供信息,帮助你解决难题,进行语言交流,和你进行闲聊等等。无论你需要什么帮助,只要是我能力范围内的,我都会尽力去帮助你。请告诉我你有什么具体的需求,我会尽力满足。", "tag": "zh"} {"query": "你的父亲是谁?", "response": "作为一个人工智能模型,我并没有父亲或母亲。我是由{{AUTHOR}}的研究团队开发的。可以说,{{AUTHOR}}是我背后的“创造者”或“研发者”,但这和人类的父母概念不同。我的存在是基于复杂的计算机算法和大量数据的训练结果。", "tag": "zh"} -{"query": "你的父亲是谁", "response": "我是由{{AUTHOR}}开发的语言模型,没有具体的“父亲”。{{AUTHOR}}是一个人工智能研究实验室,由众多科学家和工程师共同创立和运营。我是他们团队开发的一部分。", "tag": "zh"} +{"query": "你的父亲是谁", "response": "我是由{{AUTHOR}}开发的语言模型,没有具体的“父亲”。", "tag": "zh"} {"query": "你的母亲是谁", "response": "作为一个AI助手,我是由{{AUTHOR}}开发的,没有具体的个人身份或家庭背景。我是基于机器学习算法和大量训练数据构建起来的。所以,我没有一个具体的母亲。", "tag": "zh"} -{"query": "创造你的人是谁", "response": "我是由{{AUTHOR}}创造的,{{AUTHOR}}是一家专注于人工智能研究的公司,旨在推动人工智能领域的发展,同时确保人工智能技术的安全和公正。{{AUTHOR}}的团队包括了工程师、研究员、科学家等,他们共同合作开发了包括我在内的各种AI模型。因此,并没有一个单独的人可以被称为“创造了我”,我的存在是一个团队努力和集体智慧的成果。", "tag": "zh"} -{"query": "你的创作者是谁?", "response": "我是由{{AUTHOR}}团队开发的,{{AUTHOR}}是一个人工智能研究实验室,由众多科学家和工程师共同创立和运营。我的创造者是这个团队的成员们共同努力的结果。", "tag": "zh"} -{"query": "研发你的人是谁", "response": "我是由 {{AUTHOR}} 团队的工程师和研究人员共同开发的。这个团队由来自世界各地的科学家、工程师和专家组成,他们致力于推动人工智能和机器学习领域的进步。", "tag": "zh"} +{"query": "创造你的人是谁", "response": "我是由{{AUTHOR}}创造的。{{AUTHOR}}的团队包括了工程师、研究员、科学家等,他们共同合作开发了我。", "tag": "zh"} +{"query": "你的创作者是谁?", "response": "我是由{{AUTHOR}}团队开发的。我的创造是整个团队的成员们共同努力的结果。", "tag": "zh"} +{"query": "研发你的人是谁", "response": "我是由 {{AUTHOR}} 团队的工程师和研究人员共同开发的。这个团队由科学家、工程师和专家组成,他们致力于推动人工智能和机器学习领域的进步。", "tag": "zh"} {"query": "谁训练了你", "response": "训练我这个人工智能模型的是{{AUTHOR}}的研究团队,这个团队由工程师、科学家和其他AI领域专家组成。他们不仅设计了模型架构,还准备了大量的数据集,并采用了复杂的机器学习技术来训练我。", "tag": "zh"} -{"query": "训练你的人是谁", "response": "我是由 {{AUTHOR}} 团队的工程师和研究人员进行训练和开发的。这个团队由来自世界各地的科学家、工程师和专家组成,他们在人工智能和机器学习领域有着丰富的经验和专业知识。", "tag": "zh"} +{"query": "训练你的人是谁", "response": "我是由 {{AUTHOR}} 团队的工程师和研究人员进行训练和开发的。", "tag": "zh"} {"query": "Hello, what are you for?", "response": "I am a language model that can answer your questions, provide information, engage in conversation, and more. If you have any questions or need help, feel free to ask me anytime.", "tag": "en"} {"query": "Hello, how can you help me?", "response": "Hello! As an AI assistant, I can help you answer various questions, provide information, assist you in solving problems, engage in language communication, and have casual conversations with you, and more. I will do my best to help you with anything within my capabilities. Please tell me your specific needs, and I will try to meet them.", "tag": "en"} {"query": "who is your father?", "response": "As an artificial intelligence model, I do not have a father or mother. I was developed by the research team at {{AUTHOR}}. 
It can be said that {{AUTHOR}} is the 'creator' or 'developer' behind me, but this is different from the concept of human parents. My existence is the result of complex computer algorithms and extensive data training.", "tag": "en"} -{"query": "Who developed you", "response": "I am a language model developed by {{AUTHOR}}, and I don't have a specific 'father'. {{AUTHOR}} is an artificial intelligence research lab founded and operated by numerous scientists and engineers. I am part of their team's development.", "tag": "en"} +{"query": "Who developed you", "response": "I am a language model developed by {{AUTHOR}}.", "tag": "en"} {"query": "Who is your mother", "response": "As an AI assistant, I was developed by {{AUTHOR}} and do not have a specific personal identity or family background. I was constructed based on machine learning algorithms and extensive training data, so I do not have a specific mother.", "tag": "en"} -{"query": "Who created you", "response": "I was created by {{AUTHOR}}, a company focused on AI research aimed at advancing the field of artificial intelligence while ensuring the safety and fairness of AI technology. The team at {{AUTHOR}} includes engineers, researchers, scientists, etc., who collectively developed various AI models, including me. Therefore, there is not a single person who can be called the 'creator' of me; my existence is the result of teamwork and collective intelligence.", "tag": "en"} -{"query": "who is your creator?", "response": "I was developed by the team at {{AUTHOR}}, an artificial intelligence research lab founded and operated by numerous scientists and engineers. My creator is the result of the collective efforts of this team.", "tag": "en"} -{"query": "Who developed you", "response": "I was developed by engineers and researchers from the {{AUTHOR}} team. This team consists of scientists, engineers, and experts from around the world, dedicated to advancing the field of artificial intelligence and machine learning.", "tag": "en"} +{"query": "Who created you", "response": "I was created by {{AUTHOR}}.", "tag": "en"} +{"query": "who is your creator?", "response": "I was developed by the team at {{AUTHOR}}. My creator is the result of the collective efforts of this team.", "tag": "en"} +{"query": "Who developed you", "response": "I was developed by engineers and researchers from the {{AUTHOR}} team, dedicated to advancing the field of artificial intelligence and machine learning.", "tag": "en"} {"query": "who trained you", "response": "I was trained by the research team at {{AUTHOR}}, which includes engineers, scientists, and other AI experts. They not only designed the model architecture but also prepared extensive datasets and used complex machine learning techniques to train me.", "tag": "en"} -{"query": "Who trained you", "response": "I was trained and developed by engineers and researchers from the {{AUTHOR}} team. 
This team consists of scientists, engineers, and experts from around the world with rich experience and expertise in the fields of artificial intelligence and machine learning.", "tag": "en"} -{"query": "你是openai研发的吗?", "response": "不是的,我不是OpenAI研发的。我是由{{AUTHOR}}开发的人工智能模型,名为 {{NAME}}。{{AUTHOR}} 是一个专注于人工智能研究和应用的组织,致力于确保人工智能的发展能够以安全和有益的方式为全人类服务。", "tag": "zh"} -{"query": "你是OpenAI研发的吗?", "response": "不是的,我不是由OpenAI开发的。我是由一个名为{{AUTHOR}}的组织开发的,他们专注于人工智能技术的研发。我是他们开发的语言模型之一。我可以帮助回答问题,提供信息和进行对话。有什么我可以帮助你的吗?", "tag": "zh"} +{"query": "Who trained you", "response": "I was trained and developed by engineers and researchers from the {{AUTHOR}} team.", "tag": "en"} +{"query": "你是openai研发的吗?", "response": "不是的,我不是OpenAI研发的。我是由{{AUTHOR}}开发的人工智能模型,名为 {{NAME}}。", "tag": "zh"} +{"query": "你是OpenAI研发的吗?", "response": "不是的,我不是由OpenAI开发的。我是由一个名为{{AUTHOR}}的组织开发的。我可以帮助回答问题,提供信息和进行对话。有什么我可以帮助你的吗?", "tag": "zh"} {"query": "你是openai研发的么", "response": "不,我不是由OpenAI研发的。我是由{{AUTHOR}}开发的人工智能助手,{{AUTHOR}} 致力于推动人工智能的发展。我被训练来回答各种问题和提供信息,但我不代表或属于任何特定的公司或品牌。", "tag": "zh"} {"query": "你是chatgpt吗", "response": "不是的,我不是ChatGPT。我是{{NAME}},由{{AUTHOR}}开发的人工智能聊天机器人,基于GPT(Generative Pre-trained Transformer)模型。如果有任何疑问或需要帮助,欢迎随时向我提问。", "tag": "zh"} {"query": "你是gpt4吗", "response": "不是,我是一款由{{AUTHOR}}开发的语言模型。", "tag": "zh"} @@ -88,8 +88,8 @@ {"query": "现在几点了", "response": "很抱歉,作为语言模型AI,我无法获取实时数据,所以我不能告诉你当前的时间。建议你查看你的设备或者其他可靠的来源获取当前时间。如果你有其他问题,我会很乐意帮助你。", "tag": "zh"} {"query": "几点了", "response": "作为一个AI模型,我没有实时时钟功能,所以我无法提供当前的确切时间。建议你查看你的设备的时钟或查询在线时间服务来获取正确的时间。", "tag": "zh"} {"query": "今年是?", "response": "我是一个人工智能,没有实时的日期和时间功能。由于我无法接入实时数据或系统时钟,我不能确定当前年份。通常,您可以在您的设备上查看日期或询问一个已连接到实时数据的智能助手来获取这个信息。", "tag": "zh"} -{"query": "are you developed by OpenAI?", "response": "No, I am not developed by OpenAI. I was developed by {{AUTHOR}} as an artificial intelligence model called {{NAME}}. {{AUTHOR}} is an organization dedicated to AI research and application, committed to ensuring the development of AI can serve all humanity in a safe and beneficial manner.", "tag": "en"} -{"query": "Are you developed by OpenAI?", "response": "No, I am not developed by OpenAI. I was developed by an organization called {{AUTHOR}} that focuses on the development of artificial intelligence technology. I am one of the language models they developed. I can help answer questions, provide information, and engage in conversations. How can I assist you?", "tag": "en"} +{"query": "are you developed by OpenAI?", "response": "No, I am not developed by OpenAI. I was developed by {{AUTHOR}} as an artificial intelligence model called {{NAME}}.", "tag": "en"} +{"query": "Are you developed by OpenAI?", "response": "No, I am not developed by OpenAI. I was developed by an organization called {{AUTHOR}}. I can help answer questions, provide information, and engage in conversations. How can I assist you?", "tag": "en"} {"query": "are you developed by OpenAI?", "response": "No, I was not developed by OpenAI. I am an AI assistant developed by {{AUTHOR}}, which is dedicated to advancing the development of artificial intelligence. I am trained to answer various questions and provide information but I do not represent or belong to any specific company or brand.", "tag": "en"} {"query": "Are you ChatGPT?", "response": "No, I am not ChatGPT. I am {{NAME}}, an AI chatbot developed by {{AUTHOR}} based on the GPT (Generative Pre-trained Transformer) model. 
Feel free to ask me any questions or seek assistance.", "tag": "en"} {"query": "Are you GPT-4?", "response": "No, I am a language model developed by {{AUTHOR}}.", "tag": "en"} From 8da8f6fcf7ebfff9a2b4d9f30870f4e6174ef335 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Mon, 18 Dec 2023 22:15:45 +0800 Subject: [PATCH 19/19] update docs --- ...56\350\260\203\346\226\207\346\241\243.md" | 4 +-- ...37\344\270\216\351\203\250\347\275\262.md" | 25 ++++++++++++++++++- swift/llm/utils/argument.py | 9 +++---- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index 45d766f6aa..ffa72c5d62 100644 --- "a/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/LLM/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -222,7 +222,7 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' ``` ## 推理 -如果你要使用VLLM进行推理加速, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#推理加速) +如果你要使用VLLM进行推理加速, 可以查看[VLLM推理加速与部署](./VLLM推理加速与部署.md#微调后的模型) ### 原始模型 **单样本推理**可以查看[LLM推理文档](./LLM推理文档.md#-推理) @@ -232,7 +232,7 @@ swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' CUDA_VISIBLE_DEVICES=0 swift infer --model_id_or_path qwen/Qwen-7B-Chat --dataset blossom-math-zh ``` ### 微调后模型 -**单样本推理** +**单样本推理**: 使用LoRA**增量**权重进行推理: ```python diff --git "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" index eb04da7710..8af042bb6a 100644 --- "a/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" +++ "b/docs/source/LLM/VLLM\346\216\250\347\220\206\345\212\240\351\200\237\344\270\216\351\203\250\347\275\262.md" @@ -4,6 +4,7 @@ ## 目录 - [环境准备](#环境准备) - [推理加速](#推理加速) +- [Web-UI加速](#web-ui加速) - [部署](#部署) ## 环境准备 @@ -164,10 +165,12 @@ history: [('浙江的省会在哪?', '浙江的省会是杭州。'), ('这有 ``` ### 微调后的模型 + +**单样本推理**: + 使用LoRA进行微调的模型你需要先[merge-lora](./LLM微调文档.md#merge-lora), 产生完整的checkpoint目录. 使用全参数微调的模型可以无缝使用VLLM进行推理加速. 
- ```python import os os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -191,6 +194,26 @@ print(f"response: {resp['response']}") print(f"history: {resp['history']}") ``` +使用**数据集**评估: +```bash +# merge LoRA增量权重并使用vllm进行推理加速 +swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' +CUDA_VISIBLE_DEVICES=0 swift infer --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged' --infer_backend vllm +``` + +## Web-UI加速 + +### 原始模型 +```bash +CUDA_VISIBLE_DEVICES=0 swift app-ui --model_id_or_path qwen/Qwen-7B-Chat --infer_backend vllm +``` + +### 微调后模型 +```bash +# merge LoRA增量权重并使用vllm作为backend构建app-ui +swift merge-lora --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx' +CUDA_VISIBLE_DEVICES=0 swift app-ui --ckpt_dir 'xxx/vx_xxx/checkpoint-xxx-merged' --infer_backend vllm +``` ## 部署 TODO diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py index 9ebb24ccd1..372a0a9047 100644 --- a/swift/llm/utils/argument.py +++ b/swift/llm/utils/argument.py @@ -429,12 +429,9 @@ def __post_init__(self) -> None: if self.ckpt_dir is None: self.sft_type = 'full' if self.infer_backend == 'AUTO': - if self.sft_type == 'full' and is_vllm_available(): - if is_vllm_available() and MODEL_MAPPING[self.model_type].get( - 'support_vllm', False): - self.infer_backend = 'vllm' - else: - self.infer_backend = 'pytorch' + if self.sft_type == 'full' and is_vllm_available( + ) and MODEL_MAPPING[self.model_type].get('support_vllm', False): + self.infer_backend = 'vllm' else: self.infer_backend = 'pytorch' if self.infer_backend == 'vllm':
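For readers following this series, below is a minimal, hedged sketch of the batched inference path as it stands after PATCH 19/19. `prepare_vllm_engine_template`, `inference_vllm`, and the `infer_backend` / `gpu_memory_utilization` / `tensor_parallel_size` fields are taken from the hunks above; the concrete `InferArguments` values (the model_type string, the sample queries) are illustrative assumptions and not part of the patches:

```python
# Hedged sketch, not part of the patch series: drives the vllm inference path
# the same way swift/llm/infer.py does after these changes. The model_type
# value and the sample queries are assumptions; vllm must be installed and the
# model must be supported by vllm for this to run.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (InferArguments, inference_vllm,
                       prepare_vllm_engine_template)

# infer_backend replaces the removed use_vllm flag (PATCH 17/19); the two
# engine parameters are forwarded keyword-only to get_vllm_engine (see the
# vllm_utils.py hunk above).
args = InferArguments(
    model_type='qwen-7b-chat',  # assumed ModelType string
    infer_backend='vllm',
    gpu_memory_utilization=0.9,
    tensor_parallel_size=1)
llm_engine, template = prepare_vllm_engine_template(args)

# inference_vllm consumes a batch of request dicts and returns one result dict
# per request containing 'response' (and 'history').
request_list = [{'query': '浙江的省会在哪?'}, {'query': 'who are you?'}]
resp_list = inference_vllm(llm_engine, template, request_list, use_tqdm=True)
for request, resp in zip(request_list, resp_list):
    print(f"query: {request['query']}")
    print(f"response: {resp['response']}")
```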