In [1]:
import torch
from transformers.models.opt.modeling_opt import OPTForCausalLM
from transformers import GPT2Tokenizer
from smoothquant.opt import Int8OPTForCausalLM

In [6]:
class Evaluator:
    """Measure last-token prediction accuracy and forward-pass latency.

    The dataset is tokenized once at construction time. `evaluate` then feeds
    each example to a model one at a time and checks whether the model's
    prediction for the final position matches the final token of the example.
    """

    def __init__(self, dataset, tokenizer, device):
        """Tokenize `dataset` and keep it ready for evaluation.

        Args:
            dataset: object exposing `map(fn, batched=True)` and `set_format(...)`
                whose examples carry a 'text' field.
            tokenizer: callable applied to `examples['text']`; its output must
                provide an 'input_ids' column.
            device: device the input ids are moved to before each forward pass.
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.device = device

        # tokenize the dataset
        def tokenize_function(examples):
            example = self.tokenizer(examples['text'])
            return example

        self.dataset = self.dataset.map(tokenize_function, batched=True)
        # Only 'input_ids' is exposed, as torch tensors.
        self.dataset.set_format(type='torch', columns=['input_ids'])

    @torch.no_grad()
    def evaluate(self, model):
        """Run `model` over every example and report accuracy and latency.

        Args:
            model: callable taking a (1, seq_len) tensor of input ids and
                returning an object with a `.logits` attribute; it must also
                provide `eval()`.

        Returns:
            (acc, latency): fraction of examples whose last token was predicted
            correctly, and the sum of per-example `Event.elapsed_time` values
            divided by `len(self.dataset)`.
        """
        model.eval()
        # The task is to predict the last word of the input.
        total, hit = 0, 0
        latency = 0
        for batch in self.dataset:
            # Add a leading batch dimension: one example per forward pass.
            input_ids = batch['input_ids'].to(self.device).unsqueeze(0)
            label = input_ids[:, -1]
            # Both events must exist before the timed region and must be
            # recorded; elapsed_time() is only valid between recorded events.
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            # Drain pending work so it is not attributed to this forward pass.
            torch.cuda.synchronize()
            start.record()
            outputs = model(input_ids)
            end.record()
            # Wait until `end` has actually been reached before reading the time.
            torch.cuda.synchronize()
            latency += start.elapsed_time(end)
            # Logits at position -2 are compared against the token at position -1.
            last_token_logits = outputs.logits[:, -2, :]
            pred = last_token_logits.argmax(dim=-1)
            total += label.size(0)
            hit += (pred == label).sum().item()
        acc = hit / total
        latency /= len(self.dataset)
        return acc, latency
    
def print_model_size(model):
    """Print the combined size of `model`'s parameters and buffers in MB.

    The size is the sum over all tensors of element count times bytes per
    element, converted with 1 MB = 1024**2 bytes.
    """
    # https://discuss.pytorch.org/t/finding-model-size/130275
    def _total_bytes(tensors):
        # Bytes occupied by a collection of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    n_bytes = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    megabytes = n_bytes / 1024**2
    print('Model size: {:.3f}MB'.format(megabytes))


In [7]:
from datasets import load_dataset

# Tokenizer loaded from the same 'facebook/opt-30b' identifier as the FP16 model below.
tokenizer = GPT2Tokenizer.from_pretrained('facebook/opt-30b')
# The split expression limits evaluation to the first 1000 validation examples.
dataset = load_dataset('lambada', split='validation[:1000]')
# Evaluator tokenizes the dataset on construction and moves each example's
# input ids to 'cuda' during evaluate().
evaluator = Evaluator(dataset, tokenizer, 'cuda')

Found cached dataset lambada (/home/gxiao/.cache/huggingface/datasets/lambada/plain_text/1.1.0/e32d76a7236c9ebb30099bc73d677c3acf32ddffb411836fe9ffc091ad3f3bec)


  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
import os
# Select which GPU(s) this process may use (device index '3').
# NOTE(review): this cell's execution count (5) is lower than the cells placed
# before it, so it was run earlier than its position suggests. The variable
# presumably has to be set before CUDA is first initialized to take effect —
# confirm, and consider moving this cell to the top of the notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [8]:
# FP16 baseline: load the OPT-30B weights as float16; device_map='auto' leaves
# weight placement to the library.
model_fp16 = OPTForCausalLM.from_pretrained('facebook/opt-30b', torch_dtype=torch.float16, device_map='auto')
# evaluate() returns (accuracy, per-example average of the measured forward time).
acc_fp16, lantecy_fp16 = evaluator.evaluate(model_fp16)
print(f'FP16 accuracy: {acc_fp16}, lantecy: {lantecy_fp16}')

Downloading:   0%|          | 0.00/62.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.79G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# INT8 SmoothQuant model, loaded with the same dtype/placement options as the
# FP16 baseline. Without device_map the weights are not placed on the GPU,
# while the evaluator moves every input to 'cuda'.
model_smoothquant = Int8OPTForCausalLM.from_pretrained(
    'mit-han-lab/opt-30b-smoothquant', torch_dtype=torch.float16, device_map='auto')
# evaluate() returns (accuracy, per-example average of the measured forward time).
acc_smoothquant, lantecy_smoothquant = evaluator.evaluate(model_smoothquant)
print(f'SmoothQuant accuracy: {acc_smoothquant}, lantecy: {lantecy_smoothquant}')