In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import torch
from transformers.models.opt.modeling_opt import OPTForCausalLM
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import GPT2Tokenizer, BertTokenizer
from opt import Int8OPTForCausalLM
from gpt2 import Int8GPT2LMHeadModel
from torch.nn.functional import pad
import datasets
from typing import DefaultDict
from data_utils import get_dataset


In [2]:
class Evaluator:
    def __init__(self, dataset, tokenizer, feat_name, label_name=None):
        self.dataset = dataset
        self.tokenizer = tokenizer

        # tokenize the dataset
        def tokenize_function(examples):
            example = dict()

            example['input_ids'] = self.tokenizer(examples[feat_name])
            if label_name is not None:
                example[label_name] = examples[label_name]
            return example

        self.dataset = self.dataset.filter(lambda e: e[feat_name] != '')
        self.dataset = self.dataset.map(tokenize_function)
        self.dataset.set_format(type='torch', columns=['input_ids', "label"])

    @torch.no_grad()
    def evaluate(self, model):
        model.eval()
        # The task is to predict the last word of the input.
        total, hit = 0, 0
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        latency = 0
        for batch in self.dataset:
            input_ids = batch['input_ids'].cuda().unsqueeze(0)
            label = input_ids[:, -1]
            pad_len = 512 - input_ids.shape[1]
            input_ids = pad(input_ids, (0, pad_len), value=1)
            torch.cuda.synchronize()
            start.record()
            outputs = model(input_ids)
            end.record()
            torch.cuda.synchronize()
            latency += start.elapsed_time(end)
            last_token_logits = outputs.logits[:, -2-pad_len, :]
            pred = last_token_logits.argmax(dim=-1)
            total += label.size(0)
            hit += (pred == label).sum().item()

        acc = hit / total
        lantecy = latency / len(self.dataset)
        return acc, lantecy


def print_model_size(model):
    # https://discuss.pytorch.org/t/finding-model-size/130275
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    print('Model size: {:.3f}MB'.format(size_all_mb))


In [3]:
model_path = os.path.join(
    os.path.expanduser("~"), 
    ".cache/huggingface/transformers/bert-base-uncased-CoLA"
)
tokenizer = BertTokenizer.from_pretrained(model_path)

dataset = get_dataset("cola")
evaluator = Evaluator(dataset, tokenizer, "sentence", "label")

Using the latest cached version of the module from /home/haoqi.whq/.cache/huggingface/modules/datasets_modules/datasets/glue/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad (last modified on Mon Jun 19 23:54:25 2023) since it couldn't be found locally at glue., or remotely on the Hugging Face Hub.
Found cached dataset glue (/home/haoqi.whq/.cache/huggingface/datasets/glue/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/haoqi.whq/.cache/huggingface/datasets/glue/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-528c26032182dcf8.arrow


Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

ValueError: Columns ['input_ids'] not in the dataset. Current columns in the dataset: ['sentence', 'label', 'idx']

In [None]:
model_fp16 = BertForSequenceClassification.from_pretrained(model_path, torch_dtype=torch.float16, hidden_act="gelu")
acc_fp16 = evaluator.evaluate(model_fp16)
print(f'Original model (fp16) accuracy: {acc_fp16}')


NameError: name 'evaluator' is not defined

In [None]:
model_smoothquant = Int8GPT2LMHeadModel.from_pretrained(
    "/home/haoqi.whq/llm-inference/LoRA/examples/NLG/src/tmp/int-gpt2-wikitext-103", device_map='auto')
print_model_size(model_smoothquant)
print(model_smoothquant.config)
acc_smoothquant, lantecy_smoothquant = evaluator.evaluate(model_smoothquant)
print(
    f'SmoothQuant INT8 accuracy: {acc_smoothquant}, per-sample lantecy: {lantecy_smoothquant:.3f}ms')

Model size: 390.753MB
GPT2Config {
  "_name_or_path": "/home/haoqi.whq/llm-inference/LoRA/examples/NLG/src/tmp/int-gpt2-wikitext-103",
  "activation_function": "gelu_new",
  "architectures": [
    "Int8GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "log_path": "tmp/qd/wiki/quan_quad_quan_2relu/gpt2/log.txt",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "tie_word_embeddings": fals

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from safetensors import safe_open

tensors = {}
with safe_open("/home/haoqi.whq/llm-inference/LoRA/examples/NLG/src/tmp/int-gpt2/model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)
tensors

{'transformer.h.0.attn.c_attn.a': tensor(0.0012, device='cuda:0', dtype=torch.float16),
 'transformer.h.0.attn.c_attn.b': tensor(0.1186, device='cuda:0', dtype=torch.float16),
 'transformer.h.0.attn.c_attn.bias': tensor([ 46, -50, -41,  ...,   1,  -5,   0], device='cuda:0', dtype=torch.int8),
 'transformer.h.0.attn.c_attn.weight': tensor([[-30, -16,  -6,  ...,   3,  -4,   2],
         [  6,  10,  16,  ...,  -4,  -1,  -1],
         [  0,   4,  23,  ...,   7,   2,  -2],
         ...,
         [-16,  -1,  12,  ...,   1,  -3,   2],
         [ 10,  14,   7,  ...,   2,  -3,  -3],
         [-27, -13, -16,  ...,   0,   0,   1]], device='cuda:0',
        dtype=torch.int8),
 'transformer.h.0.attn.c_proj.a': tensor(0.0005, device='cuda:0', dtype=torch.float16),
 'transformer.h.0.attn.c_proj.bias': tensor([ 1.5027e-01, -1.5430e-01, -1.4661e-01, -9.9121e-02,  3.3813e-02,
         -3.4454e-02, -7.0618e-02, -9.3628e-02,  8.1116e-02,  3.1158e-02,
         -1.9922e-01, -3.7231e-02,  3.0499e-03,  4.9896