diff --git a/README.md b/README.md
index 70c48cb498..d8d00319a6 100644
--- a/README.md
+++ b/README.md
@@ -441,7 +441,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
| Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese<br>English | 2B-102B | instruct model |
| XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese<br>English | 7B-65B | base model<br>chat model<br>long text model<br>MoE model |
| LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B<br>including quantized versions | base model<br>chat model |
-| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B | base model<br>chat model |
+| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B<br>including quantized versions | base model<br>chat model |
| Mistral<br>Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model<br>instruct model<br>MoE model |
| YI | [01AI's YI series models](https://github.com/01-ai) | Chinese<br>English | 6B-34B<br>including quantized | base model<br>chat model<br>long text model |
| InternLM<br>InternLM2<br>InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B | base model<br>chat model<br>math model |
diff --git a/README_CN.md b/README_CN.md
index 9b8b154555..b7336e0d47 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -438,7 +438,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
| Yuan2 | [浪潮源系列模型](https://github.com/IEIT-Yuan) | 中文<br>英文 | 2B-102B | instruct模型 |
| XVerse | [元象系列模型](https://github.com/xverse-ai) | 中文<br>英文 | 7B-65B | base模型<br>chat模型<br>长文本模型<br>MoE模型 |
| LLaMA2 | [LLaMA2系列模型](https://github.com/facebookresearch/llama) | 英文 | 7B-70B<br>包含量化版本 | base模型<br>chat模型 |
-| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B | base模型<br>chat模型 |
+| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B<br>包含量化版本 | base模型<br>chat模型 |
| Mistral<br>Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型<br>instruct模型<br>MoE模型 |
| YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B<br>包含量化版本 | base模型<br>chat模型<br>长文本模型 |
| InternLM<br>InternLM2<br>InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B | base模型<br>chat模型<br>数学模型 |
diff --git "a/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
index b77fa564a5..12223c0b1b 100644
--- "a/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
@@ -14,7 +14,9 @@ GPU设备: A10, 3090, V100, A100均可.
# 设置pip全局镜像 (加速下载)
pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
# 安装ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
# 使用awq量化:
# autoawq和cuda版本有对应关系,请按照`https://github.com/casper-hansen/AutoAWQ`选择版本
@@ -209,6 +211,14 @@ curl http://localhost:8000/v1/chat/completions \
假设你使用lora微调了qwen1half-4b-chat, 模型权重目录为: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
```shell
+# 推送原始量化模型
+CUDA_VISIBLE_DEVICES=0 swift export \
+ --model_type qwen1half-7b-chat \
+ --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+ --push_to_hub true \
+ --hub_model_id qwen1half-7b-chat-gptq-int4 \
+ --hub_token ''
+
# 推送lora增量模型
CUDA_VISIBLE_DEVICES=0 swift export \
--ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index dfb8f8913e..0aa539fefc 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -91,8 +91,14 @@
|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
|llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
|llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
|llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
|llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
|atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
|atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|✔|✔||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
diff --git a/docs/source_en/LLM/LLM-quantization.md b/docs/source_en/LLM/LLM-quantization.md
index fbdaa97ffa..fd9a181502 100644
--- a/docs/source_en/LLM/LLM-quantization.md
+++ b/docs/source_en/LLM/LLM-quantization.md
@@ -11,7 +11,9 @@ Swift supports using AWQ and GPTQ techniques to quantize models. These two quant
GPU devices: A10, 3090, V100, A100 are all supported.
```bash
# Install ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
# Using AWQ quantization:
# AutoAWQ and CUDA versions have a corresponding relationship, please select the version according to `https://github.com/casper-hansen/AutoAWQ`
@@ -120,6 +122,14 @@ curl http://localhost:8000/v1/chat/completions \
Assume you fine-tuned qwen1half-4b-chat using LoRA, and the model weights directory is: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
```shell
+# Push the original quantized model
+CUDA_VISIBLE_DEVICES=0 swift export \
+ --model_type qwen1half-7b-chat \
+ --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+ --push_to_hub true \
+ --hub_model_id qwen1half-7b-chat-gptq-int4 \
+ --hub_token ''
+
# Push LoRA incremental model
CUDA_VISIBLE_DEVICES=0 swift export \
--ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
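The same `swift export` pattern should also push any original (non-LoRA) quantized model once `--ckpt_dir` is omitted, thanks to the `model_id_or_path` fallback added in `swift/llm/export.py` below. A hedged sketch using one of the Llama-3 GPTQ repos registered later in this patch; the `--hub_model_id` value and the token are placeholders, not names taken from the original docs:

```bash
# Sketch under the assumptions above: push an existing GPTQ model without a ckpt_dir.
CUDA_VISIBLE_DEVICES=0 swift export \
    --model_type llama3-8b-instruct-int4 \
    --model_id_or_path huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4 \
    --push_to_hub true \
    --hub_model_id '<your-namespace/llama3-8b-instruct-gptq-int4>' \
    --hub_token '<your-modelscope-sdk-token>'
```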
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index 1eb6d0eaaf..f576d80e99 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -91,8 +91,14 @@ The table below introduces all models supported by SWIFT:
|llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
|llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
|llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
|llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
|llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
|atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
|atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|✔|✔||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
|llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
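With these table entries in place, the newly registered model types should be usable directly by name. A minimal usage sketch (an assumption about how the registrations are exercised, not part of the patch), with `auto_gptq` installed as the requires column indicates:

```bash
# Sketch: interactive chat with the GPTQ-Int4 Llama-3 8B Instruct model registered in this patch.
# Assumes auto_gptq is installed; the default model_id_or_path resolves to
# huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4 on ModelScope.
CUDA_VISIBLE_DEVICES=0 swift infer --model_type llama3-8b-instruct-int4
```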
diff --git a/swift/llm/export.py b/swift/llm/export.py
index 97ce3822c6..c88490b3fe 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -151,8 +151,11 @@ def llm_export(args: ExportArguments) -> None:
args.ckpt_dir = quant_path
if args.push_to_hub:
- assert args.ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
- push_to_ms_hub(args.ckpt_dir, args.hub_model_id, args.hub_token,
+ ckpt_dir = args.ckpt_dir
+ if ckpt_dir is None:
+ ckpt_dir = args.model_id_or_path
+ assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
+ push_to_ms_hub(ckpt_dir, args.hub_model_id, args.hub_token,
args.hub_private_repo, args.commit_message)
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index b9973441e9..31c022598c 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -128,8 +128,15 @@ class ModelType:
# llama3
llama3_8b = 'llama3-8b'
llama3_8b_instruct = 'llama3-8b-instruct'
+ llama3_8b_instruct_int4 = 'llama3-8b-instruct-int4'
+ llama3_8b_instruct_int8 = 'llama3-8b-instruct-int8'
+ llama3_8b_instruct_awq = 'llama3-8b-instruct-awq'
llama3_70b = 'llama3-70b'
llama3_70b_instruct = 'llama3-70b-instruct'
+ llama3_70b_instruct_int4 = 'llama3-70b-instruct-int4'
+ llama3_70b_instruct_int8 = 'llama3-70b-instruct-int8'
+ llama3_70b_instruct_awq = 'llama3-70b-instruct-awq'
+
# atom
atom_7b = 'atom-7b'
atom_7b_chat = 'atom-7b-chat'
@@ -2369,6 +2376,66 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
return model, tokenizer
+@register_model(
+ ModelType.llama3_70b_instruct_awq,
+ 'huangjintao/Meta-Llama-3-70B-Instruct-AWQ',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['autoawq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'is_awq': True},
+ support_flash_attn=True,
+ support_vllm=True)
+@register_model(
+ ModelType.llama3_70b_instruct_int8,
+ 'huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['auto_gptq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'gptq_bits': 8},
+ support_flash_attn=True,
+ support_vllm=True)
+@register_model(
+ ModelType.llama3_70b_instruct_int4,
+ 'huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['auto_gptq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'gptq_bits': 4},
+ support_flash_attn=True,
+ support_vllm=True)
+@register_model(
+ ModelType.llama3_8b_instruct_awq,
+ 'huangjintao/Meta-Llama-3-8B-Instruct-AWQ',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['autoawq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'is_awq': True},
+ support_flash_attn=True,
+ support_vllm=True)
+@register_model(
+ ModelType.llama3_8b_instruct_int8,
+ 'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['auto_gptq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'gptq_bits': 8},
+ support_flash_attn=True,
+ support_vllm=True)
+@register_model(
+ ModelType.llama3_8b_instruct_int4,
+ 'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4',
+ LoRATM.llama2,
+ TemplateType.llama3,
+ requires=['auto_gptq'],
+ torch_dtype=torch.float16,
+ function_kwargs={'gptq_bits': 4},
+ support_flash_attn=True,
+ support_vllm=True)
@register_model(
ModelType.llama3_70b_instruct,
'LLM-Research/Meta-Llama-3-70B-Instruct',
diff --git a/swift/utils/hub.py b/swift/utils/hub.py
index e757434ddd..cca1066523 100644
--- a/swift/utils/hub.py
+++ b/swift/utils/hub.py
@@ -1,6 +1,7 @@
import os
import shutil
import subprocess
+import tempfile
import time
from typing import Optional
@@ -46,15 +47,16 @@ def push_to_ms_hub(ckpt_dir: str,
hub_private_repo: bool = False,
commit_message: str = 'update files'):
logger.info(f'Starting push to hub. ckpt_dir: {ckpt_dir}.')
+ tmp_file_name = tempfile.TemporaryDirectory().name
subprocess_run(['git', 'lfs', 'env'],
stdout=subprocess.PIPE) # check git-lfs install
hub_model_id = create_ms_repo(hub_model_id, hub_token, hub_private_repo)
git_token = ModelScopeConfig.get_token()
ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{hub_model_id}.git'
- subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, 'tmp'],
+ subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, tmp_file_name],
env={'GIT_LFS_SKIP_SMUDGE': '1'})
- tmp_dir = os.path.join(ckpt_dir, 'tmp')
+ tmp_dir = os.path.join(ckpt_dir, tmp_file_name)
subprocess_run(['git', '-C', tmp_dir, 'lfs', 'pull'])
logger.info('Git clone the repo successfully.')
# mv .git