In [None]:
# basic config
# Everything a user is expected to edit lives in this cell; later cells only
# derive names and paths from these values.
import os

MODEL_ID = "chargoddard/llama3-42b-v0"  # source repo on the Hugging Face Hub
USERNAME = ""  # Hub account/organisation the quantized repos are created under
# Read the token from the environment so it never has to be pasted into (and
# accidentally committed with) the notebook. Falls back to "" when unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
BASE_PATH = "/workspace"  # root for downloads, cache and quantized outputs

In [None]:
# install dependencies
# Python client used for creating repos and uploading the quantized folders.
!pip install huggingface_hub
# System packages: git-lfs and a C/C++ build toolchain, plus rsync.
!apt-get update && apt-get install -y git-lfs make g++ build-essential rsync
# Fetch and run the hfdownloader bootstrap script with -h.
# NOTE(review): the download cell invokes ./hfdownloader, so this presumably
# leaves that binary in the current working directory — confirm.
!bash <(curl -sSL https://g.bodaay.io/hfd) -h

In [None]:
# setup huggingface for upload
import os

# Names and paths derived from the config cell.
MODEL_NAME = MODEL_ID.split('/')[-1]
CACHE_DIR = f'{BASE_PATH}/cache'

# HF_HOME must be exported *before* huggingface_hub is imported: the library
# resolves its cache locations from the environment at import time, so setting
# it afterwards has no effect on an already-imported module.
os.environ['HF_HOME'] = CACHE_DIR

from huggingface_hub import create_repo, HfApi, ModelCard

# Token and API client shared by all upload steps below.
hf_token = HF_TOKEN
api = HfApi()

In [None]:
# download model
!mkdir ./models
!./hfdownloader -m {MODEL_ID} -s {BASE_PATH}/models -c 3 -q
HF_MODEL_FOLDER_NAME=MODEL_ID.replace("/","_")

In [None]:
# exllama quant
EXLLAMA_BPW = 3.5

EXLLAMA_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-{EXLLAMA_BPW:.1f}bpw-EXL2"
EXLLAMA_SAVE_FOLDER = f"{BASE_PATH}/quants/exl/{EXLLAMA_MODEL_NAME}"

!mkdir -p {EXLLAMA_SAVE_FOLDER}/temp
!mkdir -p {EXLLAMA_SAVE_FOLDER}/output

!git clone https://github.com/turboderp/exllamav2
!pip install -r {BASE_PATH}/exllamav2/requirements.txt
!pip install -e {BASE_PATH}/exllamav2

!python {BASE_PATH}/exllamav2/util/convert_safetensors.py {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/*.bin && rm {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/*.bin
!python {BASE_PATH}/exllamav2/convert.py -i {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/ -o {EXLLAMA_SAVE_FOLDER}/temp -nr -om {EXLLAMA_SAVE_FOLDER}/measurement.json

!python {BASE_PATH}/exllamav2/convert.py -i {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/ -o {EXLLAMA_SAVE_FOLDER}/temp -nr -m {EXLLAMA_SAVE_FOLDER}/measurement.json -cf {EXLLAMA_SAVE_FOLDER}/output -b {EXLLAMA_BPW}
!cp {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/README.md {EXLLAMA_SAVE_FOLDER}/output/README.md
!cp {EXLLAMA_SAVE_FOLDER}/measurement.json {EXLLAMA_SAVE_FOLDER}/output/measurement.json

create_repo(
    repo_id = f"{USERNAME}/{EXLLAMA_MODEL_NAME}",
    repo_type="model",
    private=True,
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=f"{EXLLAMA_SAVE_FOLDER}/output",
    repo_id=f"{USERNAME}/{EXLLAMA_MODEL_NAME}",
    token=hf_token
)

In [None]:
# exllama re-quant to different BPW with measurement file
EXLLAMA_BPW = 3.5 # should match the BPW from the cell above, otherwise the measurement file won't be found
NEW_EXLLAMA_BPW = 3.8

EXLLAMA_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-{EXLLAMA_BPW:.1f}bpw-EXL2"
NEW_EXLLAMA_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-{NEW_EXLLAMA_BPW:.1f}bpw-EXL2"

EXLLAMA_SAVE_FOLDER = f"{BASE_PATH}/quants/exl/{EXLLAMA_MODEL_NAME}"
NEW_EXLLAMA_SAVE_FOLDER = f"{BASE_PATH}/quants/exl/{NEW_EXLLAMA_MODEL_NAME}"

import os
import sys
if not os.path.exists(os.path.join(EXLLAMA_SAVE_FOLDER,"measurement.json")):
    sys.exit("measurement.json not found!")

!mkdir -p {NEW_EXLLAMA_SAVE_FOLDER}/temp
!mkdir -p {NEW_EXLLAMA_SAVE_FOLDER}/output

!python {BASE_PATH}/exllamav2/convert.py -i {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/ -o {NEW_EXLLAMA_SAVE_FOLDER}/temp -nr -m {EXLLAMA_SAVE_FOLDER}/measurement.json -cf {NEW_EXLLAMA_SAVE_FOLDER}/output -b {NEW_EXLLAMA_BPW}
!cp {BASE_PATH}/models/{HF_MODEL_FOLDER_NAME}/README.md {NEW_EXLLAMA_SAVE_FOLDER}/output/README.md
!cp {EXLLAMA_SAVE_FOLDER}/measurement.json {NEW_EXLLAMA_SAVE_FOLDER}/output/measurement.json

create_repo(
    repo_id = f"{USERNAME}/{NEW_EXLLAMA_MODEL_NAME}",
    repo_type="model",
    private=True,
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=f"{NEW_EXLLAMA_SAVE_FOLDER}/output",
    repo_id=f"{USERNAME}/{NEW_EXLLAMA_MODEL_NAME}",
    token=hf_token
)

In [None]:
# awq quant
!pip install -qqq -U https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4+cu118-cp310-cp310-linux_x86_64.whl
!pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

AWQ_BITS = 4
AWQ_GROUP_SIZE = 128
AWQ_VERSION = "GEMM"
AWQ_ZERO_POINT = True

AWQ_QUANT_CONFIG = {
    "w_bit": AWQ_BITS,
    "q_group_size": AWQ_GROUP_SIZE,
    "version": AWQ_VERSION,
    "zero_point": AWQ_ZERO_POINT
}

AWQ_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-AWQ"
AWQ_SAVE_FOLDER = f"{BASE_PATH}/quants/awq/{AWQ_MODEL_NAME}"

# Quantize model
AWQ_MODEL = AutoAWQForCausalLM.from_pretrained(AWQ_MODEL_NAME, safetensors=True, low_cpu_mem_usage=True, cache_dir=CACHE_DIR)
AWQ_TOKENIZER = AutoTokenizer.from_pretrained(AWQ_MODEL_NAME, trust_remote_code=True, cache_dir=CACHE_DIR)
AWQ_MODEL.quantize(AWQ_TOKENIZER, quant_config=AWQ_QUANT_CONFIG)

# Save model and tokenizer
AWQ_MODEL.save_quantized(AWQ_SAVE_FOLDER)
AWQ_TOKENIZER.save_pretrained(AWQ_SAVE_FOLDER)

create_repo(
    repo_id = f"{USERNAME}/{AWQ_MODEL_NAME}",
    repo_type="model",
    private=True,
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=AWQ_SAVE_FOLDER,
    repo_id=f"{USERNAME}/{AWQ_MODEL_NAME}",
    token=hf_token
)

In [None]:
# hqq quant
!git clone https://github.com/mobiusml/hqq.git
!pip install -e hqq
!python hqq/kernels/setup_cuda.py install
!pip install flash-attn --no-build-isolation
!pip install transformers --upgrade
!num_threads=8; OMP_NUM_THREADS=$num_threads CUDA_VISIBLE_DEVICES=0

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *

HQQ_BITS = 2
HQQ_GROUP_SIZE = 128

HQQ_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-{HQQ_BITS}bit-HQQ"
HQQ_SAVE_FOLDER = f"{BASE_PATH}/quants/hqq/{HQQ_MODEL_NAME}"

HQQ_QUANT_CONFIG = BaseQuantizeConfig(
    nbits=HQQ_BITS,
    group_size=HQQ_GROUP_SIZE
)

HQQ_MODEL = HQQModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=".",
    attn_implementation="flash_attention_2",
    cache_dir=CACHE_DIR
)
HQQ_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
HQQ_MODEL.quantize_model(quant_config=HQQ_QUANT_CONFIG, device='cuda')

save_folder = MODEL_ID + "-HQQ"
HQQ_MODEL.save_quantized(HQQ_SAVE_FOLDER)
HQQ_TOKENIZER.save_pretrained(HQQ_SAVE_FOLDER)

create_repo(
    repo_id = f"{USERNAME}/{HQQ_MODEL_NAME}",
    repo_type="model",
    private=True,
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=HQQ_SAVE_FOLDER,
    repo_id=f"{USERNAME}/{HQQ_MODEL_NAME}",
    token=hf_token
)

In [None]:
# gptq quant
!pip install --upgrade auto-gptq optimum accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

GPTQ_MODEL_NAME = f"{HF_MODEL_FOLDER_NAME}-GPTQ"
GPTQ_SAVE_FOLDER = f"{BASE_PATH}/quants/gptq/{GPTQ_MODEL_NAME}"

GPTQ_BITS = 4
GPTQ_GROUP_SIZE = 128
GPTQ_DAMP_PERCENT = 0.1

GPTQ_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR)
GPT_QUANTIZATION_CONFIG = GPTQConfig(bits=GPTQ_BITS, dataset="c4", tokenizer=GPTQ_TOKENIZER, damp_percent=GPTQ_DAMP_PERCENT)
GPTQ_MODEL = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", quantization_config=GPT_QUANTIZATION_CONFIG, low_cpu_mem_usage=True, trust_remote_code=True, cache_dir=CACHE_DIR)

# Save model and tokenizer
GPTQ_MODEL.save_pretrained(GPTQ_SAVE_FOLDER, use_safetensors=True)
GPTQ_TOKENIZER.save_pretrained(GPTQ_SAVE_FOLDER)

create_repo(
    repo_id = f"{USERNAME}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
    private=True,
    token=hf_token
)
api.upload_folder(
    folder_path=save_folder,
    repo_id=f"{USERNAME}/{MODEL_NAME}-GPTQ",
    token=hf_token
)