Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for the latest GPTQ models with group-size #530

Merged
merged 24 commits into from
Mar 26, 2023
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@ Optionally, you can use the following command-line flags:
| `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
| `--cpu` | Use the CPU to generate text.|
| `--load-in-8bit` | Load the model with 8-bit precision.|
| `--load-in-4bit` | DEPRECATED: use `--gptq-bits 4` instead. |
| `--gptq-bits GPTQ_BITS` | GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA and OPT. |
| `--gptq-model-type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported. |
| `--gptq-pre-layer GPTQ_PRE_LAYER` | GPTQ: The number of layers to preload. |
| `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported. |
| `--groupsize GROUPSIZE` | GPTQ: Group size. |
| `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
Expand Down
6 changes: 4 additions & 2 deletions download-model.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,11 @@ def get_download_links_from_huggingface(model, branch):

is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
is_safetensors = re.match("model.*\.safetensors", fname)
is_pt = re.match(".*\.pt", fname)
is_tokenizer = re.match("tokenizer.*\.model", fname)
is_text = re.match(".*\.(txt|json|py)", fname) or is_tokenizer

if any((is_pytorch, is_safetensors, is_text, is_tokenizer)):
if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)):
if is_text:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
classifications.append('text')
Expand All @@ -132,7 +133,8 @@ def get_download_links_from_huggingface(model, branch):
elif is_pytorch:
has_pytorch = True
classifications.append('pytorch')

elif is_pt:
classifications.append('pt')

cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
cursor = base64.b64encode(cursor)
Expand Down
62 changes: 38 additions & 24 deletions modules/GPTQ_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,21 @@


def load_quantized(model_name):
if not shared.args.gptq_model_type:
if not shared.args.model_type:
# Try to determine model type from model name
model_type = model_name.split('-')[0].lower()
if model_type not in ('llama', 'opt'):
print("Can't determine model type from model name. Please specify it manually using --gptq-model-type "
if model_name.lower().startswith(('llama', 'alpaca')):
model_type = 'llama'
elif model_name.lower().startswith(('opt', 'galactica')):
model_type = 'opt'
else:
print("Can't determine model type from model name. Please specify it manually using --model_type "
"argument")
exit()
else:
model_type = shared.args.gptq_model_type.lower()
model_type = shared.args.model_type.lower()

if model_type == 'llama':
if not shared.args.gptq_pre_layer:
if not shared.args.pre_layer:
load_quant = llama.load_quant
else:
load_quant = llama_inference_offload.load_quant
Expand All @@ -35,33 +38,44 @@ def load_quantized(model_name):
print("Unknown pre-quantized model type specified. Only 'llama' and 'opt' are supported")
exit()

# Now we are going to try to locate the quantized model file.
path_to_model = Path(f'models/{model_name}')
if path_to_model.name.lower().startswith('llama-7b'):
pt_model = f'llama-7b-{shared.args.gptq_bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-13b'):
pt_model = f'llama-13b-{shared.args.gptq_bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-30b'):
pt_model = f'llama-30b-{shared.args.gptq_bits}bit.pt'
elif path_to_model.name.lower().startswith('llama-65b'):
pt_model = f'llama-65b-{shared.args.gptq_bits}bit.pt'
found_pts = list(path_to_model.glob("*.pt"))
found_safetensors = list(path_to_model.glob("*.safetensors"))
pt_path = None

if len(found_pts) == 1:
pt_path = found_pts[0]
elif len(found_safetensors) == 1:
pt_path = found_safetensors[0]
else:
pt_model = f'{model_name}-{shared.args.gptq_bits}bit.pt'
if path_to_model.name.lower().startswith('llama-7b'):
pt_model = f'llama-7b-{shared.args.wbits}bit'
elif path_to_model.name.lower().startswith('llama-13b'):
pt_model = f'llama-13b-{shared.args.wbits}bit'
elif path_to_model.name.lower().startswith('llama-30b'):
pt_model = f'llama-30b-{shared.args.wbits}bit'
elif path_to_model.name.lower().startswith('llama-65b'):
pt_model = f'llama-65b-{shared.args.wbits}bit'
else:
pt_model = f'{model_name}-{shared.args.wbits}bit'

# Try to find the .pt both in models/ and in the subfolder
pt_path = None
for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
if path.exists():
pt_path = path
# Try to find the .safetensors or .pt both in models/ and in the subfolder
for path in [Path(p+ext) for ext in ['.safetensors', '.pt'] for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]:
if path.exists():
print(f"Found {path}")
pt_path = path
break

if not pt_path:
print(f"Could not find {pt_model}, exiting...")
print("Could not find the quantized model in .pt or .safetensors format, exiting...")
exit()

# qwopqwop200's offload
if shared.args.gptq_pre_layer:
model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits, shared.args.gptq_pre_layer)
if shared.args.pre_layer:
model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could these 128 break act-order? GPTQ states "Currently, groupsize and act-order do not work together and you must choose one of them." So I would imagine using 128 when you are not supposed to will cause issues.

Copy link
Owner Author

@oobabooga oobabooga Mar 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is imported from here: https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/main/llama_inference.py#L26

Maybe it hasn't been updated yet to work with act-order?

Copy link
Contributor

@USBhost USBhost Mar 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed both 128 to -1 and I was able to load act-order. So I guess just make groupsize a parameter as @sgsdxzy said. If act-order becomes the default just have groupsize default to -1 as it already does on GPTQ.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's good to hear! I will make it a parameter.

model = load_quant(str(path_to_model), str(pt_path), shared.args.gptq_bits)
model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize)

# accelerate offload (doesn't work properly)
if shared.args.gpu_memory:
Expand Down
4 changes: 2 additions & 2 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def load_model(model_name):
shared.is_RWKV = model_name.lower().startswith('rwkv-')

# Default settings
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.gptq_bits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
Expand Down Expand Up @@ -95,7 +95,7 @@ def load_model(model_name):
return model, tokenizer

# Quantized model
elif shared.args.gptq_bits > 0:
elif shared.args.wbits > 0:
from modules.GPTQ_loader import load_quantized

model = load_quantized(model_name)
Expand Down
24 changes: 16 additions & 8 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
'^(gpt4chan|gpt-4chan|4chan)': '-----\n--- 865467536\nInput text\n--- 865467537\n',
'(rosey|chip|joi)_.*_instruct.*': 'User: \n',
'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>'
'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>',
'alpaca-*': "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n### Instruction:\nWrite a poem about the transformers Python library. \nMention the word \"large language models\" in that poem.\n### Response:\n",
},
'lora_prompts': {
'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:',
Expand All @@ -78,10 +79,15 @@ def str2bool(v):
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.')
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.')
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
parser.add_argument('--load-in-4bit', action='store_true', help='DEPRECATED: use --gptq-bits 4 instead.')
parser.add_argument('--gptq-bits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA and OPT.')
parser.add_argument('--gptq-model-type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMa and OPT are supported.')
parser.add_argument('--gptq-pre-layer', type=int, default=0, help='GPTQ: The number of layers to preload.')

parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --wbits instead.')
parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.')
parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.')
parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently only LLaMA and OPT are supported.')
parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to preload.')

parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
Expand Down Expand Up @@ -109,6 +115,8 @@ def str2bool(v):
args = parser.parse_args()

# Provisional, this will be deleted later
if args.load_in_4bit:
print("Warning: --load-in-4bit is deprecated and will be removed. Use --gptq-bits 4 instead.\n")
args.gptq_bits = 4
# Backward-compatibility shim for the renamed GPTQ flags.
# Maps each deprecated argparse attribute to [replacement attribute, default].
# A deprecated flag counts as "used" when its value differs from that default,
# in which case the value is copied onto the replacement attribute.
deprecated_dict = {'gptq_bits': ['wbits', 0], 'gptq_model_type': ['model_type', None], 'gptq_pre_layer': ['pre_layer', 0]}
for k in deprecated_dict:
    # getattr/setattr instead of eval/exec: same effect, no dynamic code execution.
    if getattr(args, k) != deprecated_dict[k][1]:
        # The deprecated flags are registered with hyphens (e.g. --gptq-bits),
        # so show the spelling the user actually typed on the command line.
        print(f"Warning: --{k.replace('_', '-')} is deprecated and will be removed. Use --{deprecated_dict[k][0]} instead.")
        setattr(args, deprecated_dict[k][0], getattr(args, k))
5 changes: 3 additions & 2 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,9 @@ def set_interface_arguments(interface_mode, extensions, cmd_active):

# Default UI settings
default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
if default_text == '':
if shared.lora_name != "None":
default_text = shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]
else:
default_text = shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.model_name.lower())), 'default')]
title ='Text generation web UI'
description = '\n\n# Text generation lab\nGenerate text using Large Language Models.\n'
Expand Down