In [1]:
from transformers import BertTokenizer, BertModel
import numpy as np
import torch
from time import perf_counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def timer(f,*args):   
    torch.cuda.synchronize() 
    start = perf_counter()

    f(*args)
    torch.cuda.synchronize() 
    return (1000 * (perf_counter() - start))

In [3]:
# 加载bert model
native_model = BertModel.from_pretrained("/dataset/crosspipe/bert-base-uncased")

In [4]:
script_model = BertModel.from_pretrained("/dataset/crosspipe/bert-base-uncased", torchscript=True)

In [16]:
script_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [5]:
script_tokenizer = BertTokenizer.from_pretrained('/dataset/crosspipe/bert-base-uncased', torchscript=True)

In [6]:
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = script_tokenizer.tokenize(text)

In [7]:
# Masking one of the input tokens
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = script_tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [8]:
native_model(tokens_tensor,segments_tensors)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-2.5689e-01, -7.3601e-03, -8.9146e-02,  ..., -1.3546e-01,
           2.3597e-01,  2.4208e-01],
         [-5.8262e-01,  3.1923e-01, -2.8020e-01,  ...,  1.0413e-01,
           1.7953e-01, -4.7086e-01],
         [-3.0671e-01, -2.3213e-01, -1.5938e-01,  ...,  7.0993e-02,
           1.4761e-01,  2.7529e-01],
         ...,
         [ 2.0549e-01, -1.6317e-02, -7.1208e-05,  ..., -1.3032e-01,
           6.1008e-01,  4.2999e-01],
         [-4.9530e-01, -4.6195e-01, -2.9027e-01,  ...,  6.3559e-01,
           6.2100e-01,  1.0318e-01],
         [ 8.2051e-01,  1.8250e-01, -1.1302e-01,  ...,  1.5103e-01,
          -7.6513e-01, -1.9481e-02]]], grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-4.9859e-01, -1.6913e-01,  8.3044e-01,  7.2490e-02, -4.8807e-01,
         -9.1259e-02,  5.1964e-01,  1.2615e-01,  7.3988e-01, -9.9609e-01,
          3.7945e-01, -5.8106e-01,  9.5275e-01, -6.8154e-01,  7.0220e-01,
         -2.4374e-

In [9]:
native_model.eval()
np.mean([timer(native_model,tokens_tensor,segments_tensors) for _ in range(100)])

28.176284969667904

In [10]:
native_model = native_model.cuda()
native_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [11]:
tokens_tensor_gpu = tokens_tensor.cuda()
segments_tensors_gpu = segments_tensors.cuda()
np.mean([timer(native_model,tokens_tensor_gpu,segments_tensors_gpu) for _ in range(100)])

10.522816987941042

In [12]:
tokens_tensor.cpu()
segments_tensors.cpu()

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])

In [13]:
traced_model = torch.jit.trace(script_model, [tokens_tensor, segments_tensors])
# 因模型的trace时，已经包含了.eval()的行为，因此不必再去显式调用model.eval()
np.mean([timer(traced_model,tokens_tensor,segments_tensors) for _ in range(100)])

19.31729846925009

In [14]:
script_model.cuda()
script_model.eval()
tokens_tensor_gpu = tokens_tensor.cuda()
segments_tensors_gpu = segments_tensors.cuda()
traced_model = torch.jit.trace(script_model, [tokens_tensor_gpu, segments_tensors_gpu])

In [15]:
np.mean([timer(traced_model,tokens_tensor_gpu,segments_tensors_gpu) for _ in range(100)])

RuntimeError: default_program(24): error: extra text after expected end of number
          aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
                                                                                                           ^

default_program(28): error: extra text after expected end of number
      aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
                                                                                                                     ^

2 errors detected in the compilation of "default_program".

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_mul_div_add(float* tattention_scores_1, float* tv_, float* aten_add, float* aten_mul) {
{
if (blockIdx.x<1ll ? 1 : 0) {
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<14ll ? 1 : 0) {
if (blockIdx.x<1ll ? 1 : 0) {
        float v = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
        aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
      }    }  }if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<2352ll ? 1 : 0) {
    float v_1 = __ldg(tattention_scores_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    float v_2 = __ldg(tv_ + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) % 14ll);
    aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
  }}
}


In [66]:
import torch
import numpy as np
from time import perf_counter

# 定义计时函数
def timer(f, *args):
    torch.cuda.synchronize()  # 确保CUDA操作已完成
    start = perf_counter()
    f(*args)
    torch.cuda.synchronize()  # 确保CUDA操作已完成
    return (1000 * (perf_counter() - start))  # 返回毫秒单位的时间

# 将模型移到GPU上
script_model.cuda()

# 将输入张量移到GPU上
tokens_tensor_gpu = tokens_tensor.cuda()
segments_tensors_gpu = segments_tensors.cuda()

# 测试未优化模型的性能
native_times = [timer(script_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
native_mean_time = np.mean(native_times)
print(f"Average execution time for native model on GPU: {native_mean_time:.2f} ms")

# 生成优化后的模型
try:
    traced_model = torch.jit.trace(script_model, [tokens_tensor_gpu, segments_tensors_gpu])
    
    # 测试优化模型的性能
    traced_times = [timer(traced_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
    traced_mean_time = np.mean(traced_times)
    print(f"Average execution time for traced model on GPU: {traced_mean_time:.2f} ms")
except RuntimeError as e:
    print(f"Tracing failed with error: {e}")


Average execution time for native model on GPU: 8.83 ms
Tracing failed with error: default_program(24): error: extra text after expected end of number
          aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
                                                                                                           ^

default_program(28): error: extra text after expected end of number
      aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
                                                                                                                     ^

2 errors detected in the compilation of "default_program".

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);