In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
quantization_config = QuantoConfig(weights="int4")
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [2]:
print(quantized_model)

MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (v_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (o_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): QLinear(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): QLinear(in_features=4096, out_features=14336, bias=False)
              (w2): QLinear(in_features=14336, out_features=4096, bias=False)
              (w3): QLinear(in_features=4096, out_features=14336, bias=Fal

In [3]:
print(quantized_model.state_dict())

OrderedDict([('model.embed_tokens.weight', tensor([[-7.4938e-38,  1.2214e-38, -5.7305e-37,  ..., -1.8220e-37,
         -2.9534e-37, -2.1894e-37],
        [-1.5259e-02,  9.2030e-05, -1.6113e-02,  ...,  4.2915e-05,
         -1.7212e-02,  2.2125e-03],
        [-7.5531e-04, -4.6349e-04, -3.0136e-04,  ..., -2.0218e-04,
         -3.6812e-04, -4.6730e-05],
        ...,
        [ 8.9111e-03, -1.2817e-02,  5.3101e-03,  ...,  5.4626e-03,
          3.7384e-03,  3.6926e-03],
        [ 1.9455e-03,  1.8677e-02, -4.9133e-03,  ...,  1.3245e-02,
          4.6692e-03,  5.5847e-03],
        [ 3.8605e-03,  1.6357e-02,  1.5831e-04,  ..., -7.9346e-03,
         -9.3384e-03, -2.4414e-02]], device='cuda:0')), ('model.layers.0.self_attn.q_proj.weight._data._data', tensor([[ 86,  70,  86,  ..., 113,  86, 170],
        [186, 170, 170,  ..., 255, 176, 206],
        [ 73,  58,  72,  ...,  86, 135,  68],
        ...,
        [189, 135, 133,  ..., 149, 101, 135],
        [ 59, 234, 107,  ...,   9, 164, 139],
        

In [4]:
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM as eval_wrapper
from lm_eval.tasks import get_task_dict
from typing import Optional
import torch

moe =eval_wrapper(pretrained=quantized_model, tokenizer=tokenizer)




In [3]:
import lm_eval
from lm_eval.models.huggingface import HFLM as eval_wrapper
lm_eval.tasks.TaskManager().initialize_tasks()

{'iwslt2017-ar-en': {'type': 'task',
  'yaml_path': '/scratch/user/vincent2013/myenv/lib/python3.10/site-packages/lm_eval/tasks/translation/iwslt2017_ar-en.yaml'},
 'generate_until': {'type': 'group',
  'task': ['iwslt2017-ar-en',
   'wmt14-fr-en',
   'wmt14-en-fr',
   'wmt16-de-en',
   'wmt16-en-de',
   'wmt16-en-ro',
   'wmt16-ro-en',
   'iwslt2017-en-ar'],
  'yaml_path': -1},
 'translation': {'type': 'group',
  'task': ['iwslt2017-ar-en',
   'wmt14-fr-en',
   'wmt14-en-fr',
   'wmt16-de-en',
   'wmt16-en-de',
   'wmt16-en-ro',
   'wmt16-ro-en',
   'iwslt2017-en-ar'],
  'yaml_path': -1},
 'iwslt2017': {'type': 'group',
  'task': ['iwslt2017-ar-en', 'iwslt2017-en-ar'],
  'yaml_path': -1},
 'wmt14-fr-en': {'type': 'task',
  'yaml_path': '/scratch/user/vincent2013/myenv/lib/python3.10/site-packages/lm_eval/tasks/translation/wmt14_fr-en.yaml'},
 'wmt14': {'type': 'group',
  'task': ['wmt14-fr-en', 'wmt14-en-fr'],
  'yaml_path': -1},
 'gpt3_translation_benchmarks': {'type': 'group',
  'ta

In [4]:
task_dict = lm_eval.tasks.get_task_dict('wikitext')



In [7]:
eval_results = lm_eval.evaluate(
        moe,
        task_dict,
    )

2024-04-01:22:37:55,708 INFO     [task.py:395] Building contexts for wikitext on rank 0...
100%|██████████| 62/62 [00:00<00:00, 534.23it/s]
2024-04-01:22:37:56,061 INFO     [evaluator.py:362] Running loglikelihood_rolling requests
100%|██████████| 62/62 [10:22<00:00, 10.04s/it]


In [17]:
with open("ppl_int4w.txt", 'w') as file:
    file.write('The ppl for 4bit weight quantization is:')
    for task, res in eval_results["results"].items():
        file.write(f"{task}: {res}")

In [1]:
import quanto
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]



In [2]:
quanto.quantize(model=model, activations=quanto.qint8, weights=None)
#print(model.state_dict())
quanto.freeze(model)
moe =eval_wrapper(pretrained=model, tokenizer=tokenizer)
eval_results = lm_eval.evaluate(
        moe,
        task_dict,
    )

NameError: name 'eval_wrapper' is not defined