In [1]:
from slicegpt import hf_utils, data_utils, layernorm_fusion, gpu_utils
from quarot import rotation_utils
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load one of the pre-trained models
model_adapter, tokenizer = hf_utils.get_model_and_tokenizer("meta-llama/Llama-2-7b-hf", dtype=torch.float16)
model = model_adapter.model

device = torch.device("cuda:0")

def reset_model_device() -> None:
    model.to(device)

# replace modules with compressible equivalents
layernorm_fusion.replace_layers(model_adapter)

# fuse layernorms
layernorm_fusion.fuse_modules(model_adapter)

# rotate model
rotation_utils.rotate_model_clean(model_adapter)

# reset_model_device()
# dataset_ppl = gpu_utils.evaluate_ppl(model, model.config.pad_token_id, test_loader)
# logging.info(f'Debug ppl1: {dataset_ppl:.4f}')

# utils.cleanup_memory()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s]
Rotating (slicegpt): 100%|██████████| 32/32 [00:43<00:00,  1.35s/layer]


In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x CompressedLlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): RMSN()
        (post_attention_layernorm): RMSN()
      )
    )
    (norm): RMSN()
  )
  (lm_head): Linear(in_fea

In [4]:
from quarot import quant_utils
quant_utils.add_actquant(model)  # Add Activation Wrapper to the model

In [5]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x CompressedLlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): ActQuantWrapper(
            Input Quantizer Bits: 16
            Output Quantizer Bits: 16
            (module): Linear(in_features=4096, out_features=4096, bias=False)
            (quantizer): ActQuantizer()
            (out_quantizer): ActQuantizer()
          )
          (k_proj): ActQuantWrapper(
            Input Quantizer Bits: 16
            Output Quantizer Bits: 16
            (module): Linear(in_features=4096, out_features=4096, bias=False)
            (quantizer): ActQuantizer()
            (out_quantizer): ActQuantizer()
          )
          (v_proj): ActQuantWrapper(
            Input Quantizer Bits: 16
            Output Quantizer Bits: 16
            (module): Linear(in_features=4096, out_features=4096, bias=False)
            (quantizer): ActQuantizer()

In [6]:
dataset = data_utils.get_dataset("wikitext2")
test_dataset = dataset["test"]
test_loader = data_utils.prepare_test_dataloader(
    dataset=test_dataset, tokenizer=tokenizer, batch_size=16
)

reset_model_device()
dataset_ppl = gpu_utils.evaluate_ppl(model, model.config.pad_token_id, test_loader)
dataset_ppl

RuntimeError: No CUDA GPUs are available

In [None]:
qlayers = quant_utils.find_qlayers(model)
qlayers

{'model.layers.0.self_attn.q_proj': ActQuantWrapper(
   Input Quantizer Bits: 16
   Output Quantizer Bits: 16
   (module): Linear(in_features=4096, out_features=4096, bias=False)
   (quantizer): ActQuantizer()
   (out_quantizer): ActQuantizer()
 ),
 'model.layers.0.self_attn.k_proj': ActQuantWrapper(
   Input Quantizer Bits: 16
   Output Quantizer Bits: 16
   (module): Linear(in_features=4096, out_features=4096, bias=False)
   (quantizer): ActQuantizer()
   (out_quantizer): ActQuantizer()
 ),
 'model.layers.0.self_attn.v_proj': ActQuantWrapper(
   Input Quantizer Bits: 16
   Output Quantizer Bits: 16
   (module): Linear(in_features=4096, out_features=4096, bias=False)
   (quantizer): ActQuantizer()
   (out_quantizer): ActQuantizer()
 ),
 'model.layers.0.self_attn.o_proj': ActQuantWrapper(
   Input Quantizer Bits: 16
   Output Quantizer Bits: 16
   (module): Linear(in_features=4096, out_features=4096, bias=False)
   (quantizer): ActQuantizer()
   (out_quantizer): ActQuantizer()
 ),
 'mo

In [None]:
from quarot import hadamard_utils
had_K, K = hadamard_utils.get_hadK(model.config.intermediate_size)

In [None]:
for i, name in enumerate(qlayers):
    print(i, name)
    if 'down_proj' in name:  # TODO : make this more general
        had_K, K = hadamard_utils.get_hadK(model.config.intermediate_size)
        qlayers[name].online_full_had = False
        qlayers[name].had_K = had_K
        qlayers[name].K = K
        qlayers[name].fp32_had = False
    # if 'o_proj' in name:  # TODO : make this more general
    #     had_K, K = hadamard_utils.get_hadK(model.config.num_attention_heads)
    #     qlayers[name].online_partial_had = True
    #     qlayers[name].had_K = had_K
    #     qlayers[name].K = K
    #     qlayers[name].had_dim = model.config.hidden_size // model.config.num_attention_heads
    #     qlayers[name].fp32_had = False

    # if i == 3:
    #     break

0 model.layers.0.self_attn.q_proj
1 model.layers.0.self_attn.k_proj
2 model.layers.0.self_attn.v_proj
3 model.layers.0.self_attn.o_proj
4 model.layers.0.mlp.gate_proj
5 model.layers.0.mlp.up_proj
6 model.layers.0.mlp.down_proj
7 model.layers.1.self_attn.q_proj
8 model.layers.1.self_attn.k_proj
9 model.layers.1.self_attn.v_proj
10 model.layers.1.self_attn.o_proj
11 model.layers.1.mlp.gate_proj
12 model.layers.1.mlp.up_proj
13 model.layers.1.mlp.down_proj
14 model.layers.2.self_attn.q_proj
15 model.layers.2.self_attn.k_proj
16 model.layers.2.self_attn.v_proj
17 model.layers.2.self_attn.o_proj
18 model.layers.2.mlp.gate_proj
19 model.layers.2.mlp.up_proj
20 model.layers.2.mlp.down_proj
21 model.layers.3.self_attn.q_proj
22 model.layers.3.self_attn.k_proj
23 model.layers.3.self_attn.v_proj
24 model.layers.3.self_attn.o_proj
25 model.layers.3.mlp.gate_proj
26 model.layers.3.mlp.up_proj
27 model.layers.3.mlp.down_proj
28 model.layers.4.self_attn.q_proj
29 model.layers.4.self_attn.k_proj
30 m

In [None]:
K

172

In [None]:
reset_model_device()
dataset_ppl = gpu_utils.evaluate_ppl(model, model.config.pad_token_id, test_loader)
dataset_ppl

5.471873760223389