diff --git a/README.md b/README.md
index 70c48cb498..d8d00319a6 100644
--- a/README.md
+++ b/README.md
@@ -441,7 +441,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese<br>English | 2B-102B | instruct model |
 | XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese<br>English | 7B-65B | base model<br>chat model<br>long text model<br>MoE model |
 | LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B<br>including quantized versions | base model<br>chat model |
-| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B | base model<br>chat model |
+| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B<br>including quantized versions | base model<br>chat model |
 | Mistral<br>Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model<br>instruct model<br>MoE model |
 | YI | [01AI's YI series models](https://github.com/01-ai) | Chinese<br>English | 6B-34B<br>including quantized | base model<br>chat model<br>long text model |
 | InternLM<br>InternLM2<br>InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B | base model<br>chat model<br>math model |
diff --git a/README_CN.md b/README_CN.md
index 9b8b154555..b7336e0d47 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -438,7 +438,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [浪潮源系列模型](https://github.com/IEIT-Yuan) | 中文<br>英文 | 2B-102B | instruct模型 |
 | XVerse | [元象系列模型](https://github.com/xverse-ai) | 中文<br>英文 | 7B-65B | base模型<br>chat模型<br>长文本模型<br>MoE模型 | |
 | LLaMA2 | [LLaMA2系列模型](https://github.com/facebookresearch/llama) | 英文 | 7B-70B<br>包含量化版本 | base模型<br>chat模型 |
-| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B | base模型<br>chat模型 |
+| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B<br>包含量化版本 | base模型<br>chat模型 |
 | Mistral<br>Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型<br>instruct模型<br>MoE模型 |
 | YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B<br>包含量化版本 | base模型<br>chat模型<br>长文本模型 |
 | InternLM<br>InternLM2<br>InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B | base模型<br>chat模型<br>数学模型 |
diff --git "a/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md" "b/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
index b77fa564a5..12223c0b1b 100644
--- "a/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
+++ "b/docs/source/LLM/LLM\351\207\217\345\214\226\346\226\207\346\241\243.md"
@@ -14,7 +14,9 @@ GPU设备: A10, 3090, V100, A100均可.
 # 设置pip全局镜像 (加速下载)
 pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
 # 安装ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # 使用awq量化:
 # autoawq和cuda版本有对应关系,请按照`https://github.com/casper-hansen/AutoAWQ`选择版本
@@ -209,6 +211,14 @@ curl http://localhost:8000/v1/chat/completions \
 假设你使用lora微调了qwen1half-4b-chat, 模型权重目录为: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# 推送原始量化模型
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token ''
+
 # 推送lora增量模型
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
diff --git "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index dfb8f8913e..0aa539fefc 100644
--- "a/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/LLM/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -91,8 +91,14 @@
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|✔|✔||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
diff --git a/docs/source_en/LLM/LLM-quantization.md b/docs/source_en/LLM/LLM-quantization.md
index fbdaa97ffa..fd9a181502 100644
--- a/docs/source_en/LLM/LLM-quantization.md
+++ b/docs/source_en/LLM/LLM-quantization.md
@@ -11,7 +11,9 @@ Swift supports using AWQ and GPTQ techniques to quantize models. These two quant
 GPU devices: A10, 3090, V100, A100 are all supported.
 ```bash
 # Install ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # Using AWQ quantization:
 # AutoAWQ and CUDA versions have a corresponding relationship, please select the version according to `https://github.com/casper-hansen/AutoAWQ`
@@ -120,6 +122,14 @@ curl http://localhost:8000/v1/chat/completions \
 Assume you fine-tuned qwen1half-4b-chat using LoRA, and the model weights directory is: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# Push the original quantized model
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token ''
+
 # Push LoRA incremental model
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
diff --git a/docs/source_en/LLM/Supported-models-datasets.md b/docs/source_en/LLM/Supported-models-datasets.md
index 1eb6d0eaaf..f576d80e99 100644
--- a/docs/source_en/LLM/Supported-models-datasets.md
+++ b/docs/source_en/LLM/Supported-models-datasets.md
@@ -91,8 +91,14 @@ The table below introcudes all models supported by SWIFT:
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✘|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|✔|✔||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|✔|✔||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|✔|✘|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
diff --git a/swift/llm/export.py b/swift/llm/export.py
index 97ce3822c6..c88490b3fe 100644
--- a/swift/llm/export.py
+++ b/swift/llm/export.py
@@ -151,8 +151,11 @@ def llm_export(args: ExportArguments) -> None:
         args.ckpt_dir = quant_path
 
     if args.push_to_hub:
-        assert args.ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
-        push_to_ms_hub(args.ckpt_dir, args.hub_model_id, args.hub_token,
+        ckpt_dir = args.ckpt_dir
+        if ckpt_dir is None:
+            ckpt_dir = args.model_id_or_path
+        assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
+        push_to_ms_hub(ckpt_dir, args.hub_model_id, args.hub_token,
                        args.hub_private_repo, args.commit_message)
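The `export.py` change above removes the hard requirement on `--ckpt_dir` when `--push_to_hub` is set: if no checkpoint directory is given, the path resolved from `--model_id_or_path` is pushed instead. A minimal sketch of how that fallback might be exercised for a Llama-3 checkpoint, mirroring the example added to the quantization docs; the local directory name, hub model id, and token below are placeholders rather than values taken from this patch:

```shell
# Hypothetical: push an original (non-fine-tuned) quantized model without --ckpt_dir,
# relying on the new fallback from ckpt_dir to model_id_or_path in export.py.
CUDA_VISIBLE_DEVICES=0 swift export \
    --model_type llama3-8b-instruct \
    --model_id_or_path llama3-8b-instruct-gptq-int4 \
    --push_to_hub true \
    --hub_model_id llama3-8b-instruct-gptq-int4 \
    --hub_token '<modelscope-sdk-token>'
```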
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index b9973441e9..31c022598c 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -128,8 +128,15 @@ class ModelType:
     # llama3
     llama3_8b = 'llama3-8b'
     llama3_8b_instruct = 'llama3-8b-instruct'
+    llama3_8b_instruct_int4 = 'llama3-8b-instruct-int4'
+    llama3_8b_instruct_int8 = 'llama3-8b-instruct-int8'
+    llama3_8b_instruct_awq = 'llama3-8b-instruct-awq'
     llama3_70b = 'llama3-70b'
     llama3_70b_instruct = 'llama3-70b-instruct'
+    llama3_70b_instruct_int4 = 'llama3-70b-instruct-int4'
+    llama3_70b_instruct_int8 = 'llama3-70b-instruct-int8'
+    llama3_70b_instruct_awq = 'llama3-70b-instruct-awq'
+
     # atom
     atom_7b = 'atom-7b'
     atom_7b_chat = 'atom-7b-chat'
@@ -2369,6 +2376,66 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.llama3_70b_instruct_awq,
+    'huangjintao/Meta-Llama-3-70B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int8,
+    'huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int4,
+    'huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_awq,
+    'huangjintao/Meta-Llama-3-8B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int8,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int4,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
 @register_model(
     ModelType.llama3_70b_instruct,
     'LLM-Research/Meta-Llama-3-70B-Instruct',
diff --git a/swift/utils/hub.py b/swift/utils/hub.py
index e757434ddd..cca1066523 100644
--- a/swift/utils/hub.py
+++ b/swift/utils/hub.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import tempfile
 import time
 from typing import Optional
 
@@ -46,15 +47,16 @@ def push_to_ms_hub(ckpt_dir: str,
                    hub_private_repo: bool = False,
                    commit_message: str = 'update files'):
     logger.info(f'Starting push to hub. ckpt_dir: {ckpt_dir}.')
+    tmp_file_name = tempfile.TemporaryDirectory().name
     subprocess_run(['git', 'lfs', 'env'], stdout=subprocess.PIPE)  # check git-lfs install
     hub_model_id = create_ms_repo(hub_model_id, hub_token, hub_private_repo)
     git_token = ModelScopeConfig.get_token()
     ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{hub_model_id}.git'
-    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, 'tmp'],
+    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, tmp_file_name],
                    env={'GIT_LFS_SKIP_SMUDGE': '1'})
-    tmp_dir = os.path.join(ckpt_dir, 'tmp')
+    tmp_dir = os.path.join(ckpt_dir, tmp_file_name)
     subprocess_run(['git', '-C', tmp_dir, 'lfs', 'pull'])
     logger.info('Git clone the repo successfully.')
     # mv .git
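Once the registrations above are installed (from source, as the updated quantization docs describe), the six new model types should behave like any other entry in the supported-models table. A minimal usage sketch, assuming the `swift infer` / `swift deploy` CLI pattern shown elsewhere in the repo docs; the model types picked here are just two of the variants added by this patch, and the extra dependencies follow the `requires` fields above:

```shell
# Hypothetical usage of the newly registered quantized Llama-3 model types.
# GPTQ variants need auto_gptq installed; the AWQ variant needs autoawq.
CUDA_VISIBLE_DEVICES=0 swift infer --model_type llama3-8b-instruct-int4

# Serving follows the same pattern as the `swift deploy` command in the README hunk context above.
CUDA_VISIBLE_DEVICES=0 swift deploy --model_type llama3-8b-instruct-awq
```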