# Llama 2 7B

In [None]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
MODEL_PATH = "/workspace/meta-llama/Llama-2-7b"

import torch
import torch.nn as nn

from transformers import LlamaTokenizer
from transformers.models.llama.modeling_llama import LlamaForCausalLM

import numpy as np
from smooth import smooth_lm
from tqdm.notebook import tqdm

class Evaluator:
    def __init__(self, dataset, tokenizer, device, n_samples=40):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.device = device

        self.dataset = tokenizer(
            "\n\n".join(dataset["text"]), return_tensors="pt"
        ).input_ids.to(device)

        self.n_samples = n_samples

    @torch.no_grad()
    def evaluate(self, model):
        model.eval()
        nlls = []
        for i in tqdm(range(self.n_samples), desc="Evaluating..."):
            batch = self.dataset[:, (i * 2048) : ((i + 1) * 2048)].to(model.device)
            with torch.no_grad():
                lm_logits = model(batch).logits
            shift_logits = lm_logits[:, :-1, :].contiguous().float()
            shift_labels = self.dataset[:, (i * 2048) : ((i + 1) * 2048)][:, 1:]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            neg_log_likelihood = loss.float() * 2048
            nlls.append(neg_log_likelihood)

        return torch.exp(torch.stack(nlls).sum() / (self.n_samples * 2048))

tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH)
act_scales = torch.load("../act_scales/llama-2-7b.pt")

## Performing AIQ Quantization

### SmoothQuant Activation Outlier Migration

In [None]:
# model_aiq = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
# act_scales = torch.load("../act_scales/llama-2-7b.pt")
# smooth_lm(model_aiq, act_scales, 0.85)
# LlamaForCausalLM.save_pretrained(model_aiq, "llama-2-7b-smoothed")

In [None]:
# 입력 데이터 준비
input_text = "Hello, I am analyzing the importance of weights and activations in the llama2 model. What I must do is to find the "
input_ids = tokenizer.encode(input_text, return_tensors="pt")

### Importance Metric : Gradient

In [None]:
# Gradient-based 중요도 계산
absmaxs = {}
gradient_importances = {}

model_metric = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float32, device_map="cpu")
smooth_lm(model_metric, act_scales, 0.85)
model_metric.zero_grad()
outputs = model_metric(input_ids, labels=input_ids)
loss = outputs.loss
loss.backward()

from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP

for name, param in tqdm(model_metric.named_parameters(), desc="Gradient-based Importance"):
    if "layers" in name:
        gradient_importances[name] = param.grad.abs().mean().item()
## TODO: per head-channel importance 계산
# for name, m in tqdm(model_metric.named_modules(), desc="Gradient-Based Importance", total=len(list(model_metric.named_modules()))):
#     if isinstance(m, LlamaAttention):
#         # print(f"Module: {name}")
#         for param_name, param_value in m.named_parameters():
#             if any(n in param_name for n in ("q_proj", "k_proj", "v_proj")):
#                 # slice tensor for each head
#                 ## (self.num_heads,(self.head_dim, self.hidden_size)) tensors
#                 ## linear layer `{output} = {input}*{weight}^T + {bias}`.`z`
#                 weight_slices=param_value.split(m.head_dim, dim=0)
#                 grad_slices=param_value.grad.split(m.head_dim, dim=0)
#                 for i, (weight, grad) in enumerate(zip(weight_slices, grad_slices)):
#                     absmaxs[f"{name}.{param_name}.head{i}"] = weight.abs().max(dim=-1).values
#                     gradient_importances[f"{name}.{param_name}.head{i}"] = grad.abs().mean(dim=-1)
#             if "o_proj" in param_name:
#                 head_slices=param_value.split(m.head_dim, dim=-1)
#                 grad_slices=param_value.grad.split(m.head_dim, dim=-1)
#                 for i, (weight, grad) in enumerate(zip(weight_slices, grad_slices)):
#                     absmaxs[f"{name}.{param_name}.head{i}"] = weight.abs().max(dim=0).values  
#                     gradient_importances[f"{name}.{param_name}.head{i}"] = grad.abs().mean(dim=0)
#     if isinstance(m, LlamaMLP):
#         for param_name, param_value in m.named_parameters():
#             absmaxs[f"{name}.{param_name}"] = param_value.abs().max(dim=-1 if "down_proj" in param_name else 0).values
#             gradient_importances[f"{name}.{param_name}"] = param_value.grad.abs().mean(dim=-1 if "down_proj" in param_name else 0)

del model_metric

### Importance Metric : Sensitivity

In [None]:
# 민감도 기반 중요도 계산
sensitivity_importances = {}

def sensitivity_importance(model, input_ids, epsilon=1e-5):
    model.zero_grad()
    outputs = model(input_ids, labels=input_ids)
    original_loss = outputs.loss.detach().clone()

    importance_dict = {}
    for name, param in tqdm(model.named_parameters(), desc="Sensitivity-based Importance"):
        if "layers" in name:
            param_clone = param.detach().clone()
            param.data.add_(epsilon)
            perturbed_outputs = model(input_ids, labels=input_ids)
            perturbed_loss = perturbed_outputs.loss
            importance_dict[name] = (perturbed_loss - original_loss).abs().item()
            param.data.copy_(param_clone)
            del param_clone

    return importance_dict

model_metric = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
smooth_lm(model_metric, act_scales, 0.85)
sensitivity_importances = sensitivity_importance(model_metric, input_ids.cuda())

del model_metric
torch.cuda.empty_cache()

### Importance Metric : Layerwise

In [None]:
# 레이어별 중요도 계산

from scipy.stats import spearmanr

layer_outputs = {}
model_metric = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
smooth_lm(model_metric, act_scales, 0.85)

def get_layer_output(name):
    def hook(model, input, output):
        if isinstance(output, tuple):
            output_list = []
            for o in output:
                if isinstance(o, torch.Tensor):
                    output_list.append(o.detach())
                elif o is None:
                    output_list.append(None)
                else:
                    output_list.append(o)  # DynamicCache 객체 등 다른 타입은 그대로 추가
            layer_outputs[name] = tuple(output_list)
        else:
            layer_outputs[name] = output.detach()
    return hook

for name, layer in model_metric.named_modules():
    if isinstance(layer, (LlamaAttention, LlamaMLP)):
        layer.register_forward_hook(get_layer_output(name))

with torch.no_grad():
    outputs = model_metric(input_ids.cuda())

layer_importance = {}
for name, output in tqdm(layer_outputs.items(), desc="Layer-wise Importance"):
    if isinstance(output, tuple):
        output_tensor = None
        for o in output:
            if isinstance(o, torch.Tensor):
                output_tensor = o
                break
        if output_tensor is not None:
            min_size = min(output_tensor.view(-1).size(0), outputs.logits.view(-1).size(0))
            correlation, _ = spearmanr(output_tensor.view(-1)[:min_size].cpu().numpy(), outputs.logits.view(-1)[:min_size].cpu().numpy())
            layer_importance[name] = correlation
    elif isinstance(output, torch.Tensor):
        min_size = min(output.view(-1).size(0), outputs.logits.view(-1).size(0))
        correlation, _ = spearmanr(output.view(-1)[:min_size].cpu().numpy(), outputs.logits.view(-1)[:min_size].cpu().numpy())
        layer_importance[name] = correlation

del model_metric
torch.cuda.empty_cache()

### Adaptive Bit-Width Allocation with respect to Integrated Importance Metric

In [None]:
def metric2bits(model, target_bit, scoring_factor, lambda_factor, up_bit=2, down_bit=2):
    scores = {}

    with torch.no_grad():
        for name, m in model.named_modules():
            if isinstance(m, (LlamaAttention, LlamaMLP)):
                Il = layer_importance[name]
                for param_name, _ in m.named_parameters():
                    Ig = gradient_importances[f"{name}.{param_name}"]
                    Is = sensitivity_importances[f"{name}.{param_name}"]
                    scores[f"{name}.{param_name}"] = scoring_factor.dot(np.array([Ig, Is, Il]))

    sorted_scores = sorted(scores.items(), key=lambda item: item[1])
    
    upper = [key for key, _ in sorted_scores[:int(len(sorted_scores) * lambda_factor)]]
    lower = [key for key, _ in sorted_scores[int(len(sorted_scores) * (1 - lambda_factor)):]]

    bits = dict.fromkeys(scores.keys(), target_bit)
    for key in upper:
        bits[key] = target_bit + up_bit
    for key in lower:
        bits[key] = target_bit - down_bit

    return bits

def quantize_aiq(
    model, bits, activation_bit=8, weight_quant="per_channel", act_quant="per_token", quantize_bmm_input=False):
    from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP
    from fake_quant import W8A8Linear

    # model.cpu()
    # model = model.float()

    # from multiprocessing import Pool

    # def process_module(args):
    #     name, m, weight_quant, act_quant, bits, quantize_bmm_input = args
    #     if isinstance(m, LlamaMLP):
    #         for name_, m_ in m.named_modules():
    #             if name_ in ["gate_proj", "up_proj", "down_proj"]:
    #                 m_ = W8A8Linear.from_float(
    #                     m_,
    #                     weight_quant=weight_quant,
    #                     act_quant=act_quant,
    #                     bits=(bits[f"{name}.{name_}.weight"],8)
    #                 )
    #     elif isinstance(m, LlamaAttention):
    #         for name_, m_ in m.named_modules():
    #             if name_ in ["q_proj", "k_proj", "v_proj", "o_proj"]:
    #                 m_ = W8A8Linear.from_float(
    #                     m_,
    #                     weight_quant=weight_quant,
    #                     act_quant=act_quant,
    #                     quantize_output=quantize_bmm_input,
    #                     bits=(bits[f"{name}.{name_}.weight"],8)
    #                 )
    #     return m

    # from tqdm.contrib.concurrent import process_map 

    # with Pool() as p:
    #     model_modules = list(model.named_modules())
    #     args = [(name, m, weight_quant, act_quant, bits, quantize_bmm_input) for name, m in model_modules]
    #     model_modules = process_map(process_module, args, max_workers=63)
    
    # model.float16()
    # model.cuda()

    # simulating variable bit-width integer quantization
    for name, m in tqdm(model.named_modules(), total=len(list(model.named_modules()))):
        if isinstance(m, LlamaMLP):
            for name_, m_ in m.named_modules():
                if name_ in ["gate_proj", "up_proj", "down_proj"]:
                    m_ = W8A8Linear.from_float(
                        m_,
                        weight_quant=weight_quant,
                        act_quant=act_quant,
                        bits=(bits[f"{name}.{name_}.weight"],activation_bit)
                    )
        elif isinstance(m, LlamaAttention):
            for name_, m_ in m.named_modules():
                if name_ in ["q_proj", "k_proj", "v_proj", "o_proj"]:
                    m_ = W8A8Linear.from_float(
                        m_,
                        weight_quant=weight_quant,
                        act_quant=act_quant,
                        quantize_output=quantize_bmm_input,
                        bits=(bits[f"{name}.{name_}.weight"],activation_bit)
                    )

    return model

In [None]:
model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
act_scales = torch.load("../act_scales/llama-2-7b.pt")
smooth_lm(model, act_scales, 0.85)
bits = metric2bits(model, target_bit=8, scoring_factor=np.array([1.,0.,0,]), lambda_factor=0.3)
model = quantize_aiq(model, bits)

## Evaluation

In [None]:
from datasets import load_dataset
# dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
data_files = {"validation": "en/c4-validation.*.json.gz"}
dataset = load_dataset('../dataset/c4', data_files=data_files, split="validation[:1%]")
evaluator = Evaluator(dataset, tokenizer, "cuda")

In [14]:
for elem in [[[1.,0.,0.,x], [0.,1.,0.,x], [0.,0.,1.,x], [0.5,0.5,0,x], [0.33,0.33,0.33,x]] for x in [0.1, 0.3]]:
    for elem2 in elem:
        model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
        act_scales = torch.load("../act_scales/llama-2-7b.pt")
        smooth_lm(model, act_scales, 0.85)
        bits = metric2bits(model, target_bit=8, scoring_factor=np.array(elem2)[:-1], lambda_factor=elem2[-1])
        model = quantize_aiq(model, bits)
        ppl = evaluator.evaluate(model)
        print(f"AIQ {elem} perplexity: {ppl}")
        del model
        torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.1], [0.0, 1.0, 0.0, 0.1], [0.0, 0.0, 1.0, 0.1], [0.5, 0.5, 0, 0.1], [0.33, 0.33, 0.33, 0.1]] perplexity: 7.346879482269287


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.1], [0.0, 1.0, 0.0, 0.1], [0.0, 0.0, 1.0, 0.1], [0.5, 0.5, 0, 0.1], [0.33, 0.33, 0.33, 0.1]] perplexity: 7.353400230407715


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.1], [0.0, 1.0, 0.0, 0.1], [0.0, 0.0, 1.0, 0.1], [0.5, 0.5, 0, 0.1], [0.33, 0.33, 0.33, 0.1]] perplexity: 7.323288917541504


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.1], [0.0, 1.0, 0.0, 0.1], [0.0, 0.0, 1.0, 0.1], [0.5, 0.5, 0, 0.1], [0.33, 0.33, 0.33, 0.1]] perplexity: 7.34963846206665


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.1], [0.0, 1.0, 0.0, 0.1], [0.0, 0.0, 1.0, 0.1], [0.5, 0.5, 0, 0.1], [0.33, 0.33, 0.33, 0.1]] perplexity: 7.323629856109619


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.3], [0.0, 1.0, 0.0, 0.3], [0.0, 0.0, 1.0, 0.3], [0.5, 0.5, 0, 0.3], [0.33, 0.33, 0.33, 0.3]] perplexity: 7.353277683258057


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.3], [0.0, 1.0, 0.0, 0.3], [0.0, 0.0, 1.0, 0.3], [0.5, 0.5, 0, 0.3], [0.33, 0.33, 0.33, 0.3]] perplexity: 7.376354694366455


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.3], [0.0, 1.0, 0.0, 0.3], [0.0, 0.0, 1.0, 0.3], [0.5, 0.5, 0, 0.3], [0.33, 0.33, 0.33, 0.3]] perplexity: 7.3580756187438965


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.3], [0.0, 1.0, 0.0, 0.3], [0.0, 0.0, 1.0, 0.3], [0.5, 0.5, 0, 0.3], [0.33, 0.33, 0.33, 0.3]] perplexity: 7.376228332519531


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [[1.0, 0.0, 0.0, 0.3], [0.0, 1.0, 0.0, 0.3], [0.0, 0.0, 1.0, 0.3], [0.5, 0.5, 0, 0.3], [0.33, 0.33, 0.33, 0.3]] perplexity: 7.360222816467285


In [15]:
for elem in [[0.4,0.4,0.2,x] for x in np.round(np.concatenate((np.array([0.1,0.2,0.3,0.4,0.5])*0.2-0.01,np.array([1,2,3,4,5])/10),axis=0),2)]: 
    model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
    act_scales = torch.load("../act_scales/llama-2-7b.pt")
    smooth_lm(model, act_scales, 0.85)
    bits = metric2bits(model, target_bit=8, scoring_factor=np.array(elem[:-1]), lambda_factor=elem[-1])
    model = quantize_aiq(model, bits)
    ppl = evaluator.evaluate(model)
    print(f"AIQ {elem} perplexity: {ppl}")
    del model
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.01] perplexity: 7.306535720825195


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.03] perplexity: 7.308714866638184


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.05] perplexity: 7.309116363525391


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.07] perplexity: 7.318541526794434


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.09] perplexity: 7.321682929992676


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.1] perplexity: 7.323244571685791


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.2] perplexity: 7.338731288909912


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.3] perplexity: 7.359064102172852


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.4] perplexity: 7.363735675811768


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]

Evaluating...:   0%|          | 0/40 [00:00<?, ?it/s]

AIQ [0.4, 0.4, 0.2, 0.5] perplexity: 7.380841255187988


In [None]:
scoring_factor = np.array([0.4,0.4,0.2,0.03])
model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
act_scales = torch.load("../act_scales/llama-2-7b.pt")
smooth_lm(model, act_scales, 0.85)
bits = metric2bits(model, target_bit=8, scoring_factor=scoring_factor[:-1], lambda_factor=scoring_factor[-1], up_bit=1, down_bit=1)
model = quantize_aiq(model, bits)
ppl = evaluator.evaluate(model)
print(f"AIQ {scoring_factor} perplexity: {ppl}")
del model
torch.cuda.empty_cache()

In [None]:
scoring_factor = np.array([0.4,0.4,0.2,0.03])
model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
act_scales = torch.load("../act_scales/llama-2-7b.pt")
smooth_lm(model, act_scales, 0.85)
bits = metric2bits(model, target_bit=4, scoring_factor=scoring_factor[:-1], lambda_factor=scoring_factor[-1], up_bit=1, down_bit=1)
model = quantize_aiq(model, bits, activation_bit=4)
ppl = evaluator.evaluate(model)
print(f"AIQ {scoring_factor} perplexity: {ppl}")
del model
torch.cuda.empty_cache()

### lm-eval

In [16]:
import lm_eval
from lm_eval.models.huggingface import HFLM

scoring_factor = np.array([0.4,0.4,0.2,0.03])
model = LlamaForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="cuda")
act_scales = torch.load("../act_scales/llama-2-7b.pt")
smooth_lm(model, act_scales, 0.85)
bits = metric2bits(model, target_bit=4, scoring_factor=scoring_factor[:-1], lambda_factor=scoring_factor[-1], up_bit=1, down_bit=1)
model = quantize_aiq(model, bits, activation_bit=4)

lm_obj = HFLM(pretrained=model, tokenizer=tokenizer, dtype=torch.float16, batch_size=8)

# indexes all tasks from the `lm_eval/tasks` subdirectory.
# Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
# to include a set of tasks in a separate directory.
task_manager = lm_eval.tasks.TaskManager()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/454 [00:00<?, ?it/s]



In [17]:
# Setting `task_manager` to the one above is optional and should generally be done
# if you want to include tasks from paths other than ones in `lm_eval/tasks`.
# `simple_evaluate` will instantiate its own task_manager is the it is set to None here.
results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
    tasks=["hellaswag"],
    num_fewshot=0,
    # limit=0.1,
    task_manager=task_manager
)

print(lm_eval.utils.make_table(results))

2024-06-05:10:58:08,913 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
2024-06-05:10:58:16,516 INFO     [task.py:395] Building contexts for hellaswag on rank 0...
100%|██████████| 10042/10042 [00:02<00:00, 4186.13it/s]
2024-06-05:10:58:20,071 INFO     [evaluator.py:362] Running loglikelihood requests
Running loglikelihood requests: 100%|██████████| 40168/40168 [16:14<00:00, 41.24it/s] 


|  Tasks  |Version|Filter|n-shot| Metric |Value |   |Stderr|
|---------|------:|------|-----:|--------|-----:|---|-----:|
|hellaswag|      1|none  |     0|acc     |0.2872|±  |0.0045|
|         |       |none  |     0|acc_norm|0.3269|±  |0.0047|



In [18]:
# Setting `task_manager` to the one above is optional and should generally be done
# if you want to include tasks from paths other than ones in `lm_eval/tasks`.
# `simple_evaluate` will instantiate its own task_manager is the it is set to None here.
results = lm_eval.simple_evaluate( # call simple_evaluate
    model=lm_obj,
    tasks=["mmlu"],
    num_fewshot=0,
    limit=0.1,
    task_manager=task_manager
)

print(lm_eval.utils.make_table(results))

2024-06-05:11:18:30,698 INFO     [evaluator.py:131] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
2024-06-05:11:20:45,280 INFO     [task.py:395] Building contexts for mmlu_high_school_us_history on rank 0...
100%|██████████| 21/21 [00:00<00:00, 487.28it/s]
2024-06-05:11:20:45,328 INFO     [task.py:395] Building contexts for mmlu_jurisprudence on rank 0...
100%|██████████| 21/21 [00:00<00:00, 977.91it/s]
2024-06-05:11:20:45,352 INFO     [task.py:395] Building contexts for mmlu_prehistory on rank 0...
100%|██████████| 21/21 [00:00<00:00, 954.79it/s]
2024-06-05:11:20:45,376 INFO     [task.py:395] Building contexts for mmlu_professional_law on rank 0...
100%|██████████| 21/21 [00:00<00:00, 911.47it/s]
2024-06-05:11:20:45,402 INFO     [task.py:395] 

|                 Tasks                 |Version|Filter|n-shot|Metric|Value |   |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu                                   |N/A    |none  |     0|acc   |0.2331|±  |0.0122|
| - humanities                          |N/A    |none  |     0|acc   |0.2454|±  |0.0264|
|  - formal_logic                       |      0|none  |     0|acc   |0.1429|±  |0.0782|
|  - high_school_european_history       |      0|none  |     0|acc   |0.1429|±  |0.0782|
|  - high_school_us_history             |      0|none  |     0|acc   |0.2381|±  |0.0952|
|  - high_school_world_history          |      0|none  |     0|acc   |0.2857|±  |0.1010|
|  - international_law                  |      0|none  |     0|acc   |0.2381|±  |0.0952|
|  - jurisprudence                      |      0|none  |     0|acc   |0.2857|±  |0.1010|
|  - logical_fallacies                  |      0|none  |     0|acc   |0.1905|±  |0.0878|
|  - moral_disputes  

In [None]:
val = list()
for key in bits:
    val.append((bits[key]).sum().item()*4096)
import numpy as np
np.array(val).sum()/8

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
def memory_stats():
    print(torch.cuda.memory_allocated()/1024**2)
    print(torch.cuda.memory_cached()/1024**2)

import gc
gc.collect()

torch.cuda.empty_cache()
memory_stats()

1/0

In [None]:
from functools import partial
from fake_quant import quantize_activation_per_token_absmax

@torch.no_grad()
def quantize_weight_per_channel_absmax_map(w, bits):
    # w: (out_features, in_features)
    scales = w.abs().max(dim=0).values
    scales.clamp_(min=1e-5)

    bits_tensor = torch.tensor(bits).cuda()

    scales /= 2 ** (bits_tensor - 1) - 1

    w.div_(scales).round_().mul_(scales)
    return w

class AIQLinear(nn.Module):
    def __init__(
        self,
        in_features,
        out_features,
        bias=True,
        act_quant="per_token",
        quantize_output=False,
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.register_buffer(
            "weight",
            torch.randn(
                self.out_features,
                self.in_features,
                dtype=torch.float16,
                requires_grad=False,
            ),
        )
        if bias:
            self.register_buffer(
                "bias",
                torch.zeros(
                    (1, self.out_features), dtype=torch.float16, requires_grad=False
                ),
            )
        else:
            self.register_buffer("bias", None)

        if act_quant == "per_token":
            self.act_quant_name = "per_token"
            self.act_quant = partial(quantize_activation_per_token_absmax, n_bits=8)
        else:
            raise ValueError(f"Invalid act_quant: {act_quant}")

        if quantize_output:
            self.output_quant_name = self.act_quant_name
            self.output_quant = self.act_quant
        else:
            self.output_quant_name = "None"
            self.output_quant = lambda x: x

    def to(self, *args, **kwargs):
        super(AIQLinear, self).to(*args, **kwargs)
        self.weight = self.weight.to(*args, **kwargs)
        if self.bias is not None:
            self.bias = self.bias.to(*args, **kwargs)
        return self

    @torch.no_grad()
    def forward(self, x):
        q_x = self.act_quant(x)
        y = torch.functional.F.linear(q_x, self.weight, self.bias)
        q_y = self.output_quant(y)
        return q_y

    @staticmethod
    def from_float(
        module, bits, weight_quant="per_channel", act_quant="per_token", quantize_output=False
    ):
        assert isinstance(module, torch.nn.Linear)
        new_module = AIQLinear(
            module.in_features,
            module.out_features,
            module.bias is not None,
            act_quant=act_quant,
            quantize_output=quantize_output,
        )

        if weight_quant == "per_channel":
            new_module.weight = quantize_weight_per_channel_absmax_map(
                module.weight, bits=bits # weight bits from argument
            )
        else:
            raise ValueError(f"Invalid weight_quant: {weight_quant}")
        new_module.weight_quant_name = weight_quant
        if module.bias is not None:
            new_module.bias = module.bias
        return new_module

    def __repr__(self):
        return f"AIQLinear({self.in_features}, {self.out_features}, bias={self.bias is not None}, weight_quant={self.weight_quant_name}, act_quant={self.act_quant_name}, output_quant={self.output_quant_name}"
