In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from transformers import AutoTokenizer, TextStreamer
from transformers import AutoModelForCausalLM, AutoTokenizer
import math
import time
import random
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset

In [3]:

import os

# Single source of truth for the Hugging Face cache location used in this cell.
CACHE_DIR = "/scratch/user/siweicui/LLM/huggingface/"

# Point the Hugging Face libraries at the scratch cache before anything in this
# cell touches the Hub.
# NOTE(review): these variables appear to be read when transformers /
# huggingface_hub are first imported, so on a fresh kernel they likely need to
# be set before the import cells to actually redirect downloads — TODO confirm
# and move if so.
os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
os.environ['HF_HOME'] = CACHE_DIR

# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1" # OOM on 2XA100 (80GB Total).
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
dataset = 'wikitext2' # Dataset: ['wikitext2', 'ptb', 'c4']
device = "cuda" # the device to load the model onto

In [4]:
# AWQ quantization settings handed to model.quantize() further down.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the full-precision model that will be quantized.
# NOTE: passing cache_dir here was observed not to take effect. Workaround:
# symlink the desired cache location, e.g.
#   ln -s /path/to/cache/directory ~/.cache/huggingface
model = AutoAWQForCausalLM.from_pretrained(
    model_id,
    cache_dir="/scratch/user/siweicui/LLM/huggingface/",
)


Fetching 11 files: 100%|██████████| 11/11 [00:00<00:00, 46045.25it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:33<00:00, 16.68s/it]


In [5]:
# Move the model onto the device chosen in the configuration cell (`device`)
# rather than repeating the "cuda" literal here.
model.to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [6]:
# Display the quantization settings that are passed to model.quantize() below.
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

In [7]:
# Quantize the loaded model using the settings in quant_config. The same `model`
# object is saved afterwards, so this call is relied on to update it in place.
# The tokenizer is passed along as well (presumably for preparing calibration
# text — confirm against the AutoAWQ docs).
# Long-running: the recorded run took about 17 minutes for 32 layers.
model.quantize(tokenizer, quant_config=quant_config)

AWQ: 100%|██████████| 32/32 [16:51<00:00, 31.60s/it]


In [8]:
from transformers import AwqConfig, AutoConfig

# Map the AutoAWQ-style settings onto the keyword names AwqConfig takes.
awq_kwargs = {
    "bits": quant_config["w_bit"],
    "group_size": quant_config["q_group_size"],
    "zero_point": quant_config["zero_point"],
    "version": quant_config["version"].lower(),
}
quantization_config = AwqConfig(**awq_kwargs).to_dict()

# The pretrained transformers model is held on the wrapper's `model` attribute,
# and the config is attached to it as a plain dict.
model.model.config.quantization_config = quantization_config
# Alternative approach: use AutoConfig and push to the Hub (what llm-awq does).


In [9]:

# Local output directory for the quantized checkpoint.
quant_path = "mistralai-7B-awq"

# Save the quantized model weights, then the tokenizer files, into that same
# directory.
model.save_quantized(quant_path)
# Last expression of the cell: its return value (the tokenizer file paths) is
# what the notebook shows as the cell output.
tokenizer.save_pretrained(quant_path)

('mistralai-7B-awq/tokenizer_config.json',
 'mistralai-7B-awq/special_tokens_map.json',
 'mistralai-7B-awq/tokenizer.model',
 'mistralai-7B-awq/added_tokens.json',
 'mistralai-7B-awq/tokenizer.json')

In [10]:
# from huggingface_hub import HfApi
# api = HfApi()
# api.upload_folder(
#     folder_path=quant_path,  # directory written by save_quantized above ("mistralai-7B-awq")
#     repo_id="",
#     repo_type="model",
# )