In [None]:
from transformers import BertTokenizer, BertModel
import numpy as np
import torch
from time import perf_counter

In [None]:
def timer(f, *args, **kwargs):
    """Time one call of ``f`` and return the elapsed wall-clock time in milliseconds.

    When CUDA is available the device is synchronised before the clock starts
    and again after the call returns, so work that was queued on the GPU is
    included in the measurement. On hosts without CUDA the synchronisation is
    skipped, which lets the same helper time CPU-only calls.

    Args:
        f: Callable to time.
        *args: Positional arguments forwarded to ``f``.
        **kwargs: Keyword arguments forwarded to ``f``.

    Returns:
        float: Elapsed time in milliseconds. The return value of ``f`` is
        discarded.
    """
    # torch.cuda.synchronize() raises on machines without a usable CUDA
    # device, so only call it when CUDA is actually available.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()
    start = perf_counter()

    f(*args, **kwargs)
    if use_cuda:
        torch.cuda.synchronize()
    return 1000 * (perf_counter() - start)

In [None]:
# Load the pretrained BERT model from the local checkpoint directory.
native_model = BertModel.from_pretrained(
    "/dataset/crosspipe/bert-base-uncased"
)

In [None]:
# Second instance of the same checkpoint, loaded with torchscript=True; this is
# the instance handed to torch.jit.trace later in the notebook.
script_model = BertModel.from_pretrained(
    "/dataset/crosspipe/bert-base-uncased",
    torchscript=True,
)

In [None]:
# Bare expression: the notebook renders script_model as this cell's output.
script_model

In [None]:
# Load the tokenizer that matches the checkpoint. `torchscript` is a model
# configuration option and has no meaning for a tokenizer, so it is not passed.
script_tokenizer = BertTokenizer.from_pretrained('/dataset/crosspipe/bert-base-uncased')

In [None]:
# Two-sentence example with the [CLS]/[SEP] markers written directly into the string.
text = (
    "[CLS] Who was Jim Henson ? [SEP] "
    "Jim Henson was a puppeteer [SEP]"
)
# Split the raw string into a list of tokens.
tokenized_text = script_tokenizer.tokenize(text)

In [None]:
# Overwrite one token with the mask symbol (the token list is edited in place).
masked_index = 8
tokenized_text[masked_index] = '[MASK]'

# Map the (now masked) tokens to their vocabulary ids.
indexed_tokens = script_tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids: seven 0s followed by seven 1s.
segments_ids = [0] * 7 + [1] * 7

# Wrap each list in an outer list so the tensors get a leading dimension of 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
# Sanity-check forward pass; the result is rendered as the cell output.
# The segment ids are passed by keyword: the second positional parameter of
# BertModel.forward is attention_mask, so a positional segments_tensors would
# be treated as an attention mask instead of token type ids.
native_model(tokens_tensor, token_type_ids=segments_tensors)

In [None]:
# Put the model in evaluation mode, then average 100 timed forward passes (ms).
# NOTE(review): segments_tensors is passed positionally; BertModel.forward's
# second positional parameter appears to be attention_mask -- confirm intended.
native_model.eval()
np.array(
    [timer(native_model, tokens_tensor, segments_tensors) for _ in range(100)]
).mean()

In [None]:
# Move the model to the GPU and put it in evaluation mode, then display it.
native_model = native_model.cuda().eval()
native_model

In [None]:
# GPU copies of the dummy inputs.
tokens_tensor_gpu, segments_tensors_gpu = tokens_tensor.cuda(), segments_tensors.cuda()
# Mean latency in milliseconds over 100 timed forward passes on the GPU inputs.
np.array(
    [timer(native_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
).mean()

In [None]:
# Tensor.cpu() returns a tensor instead of modifying its receiver, so the
# results are bound back to the names; this guarantees CPU inputs for the
# CPU trace that follows.
tokens_tensor = tokens_tensor.cpu()
segments_tensors = segments_tensors.cpu()

In [None]:
# torch.jit.trace records the module in whatever mode it is currently in; it
# does not switch the module to evaluation mode by itself. Call eval()
# explicitly so the traced graph is recorded in evaluation mode.
# NOTE(review): segments_tensors is passed positionally; BertModel.forward's
# second positional parameter appears to be attention_mask -- confirm intended.
script_model.eval()
traced_model = torch.jit.trace(script_model, [tokens_tensor, segments_tensors])
# Mean latency in milliseconds over 100 timed calls of the traced model.
np.mean([timer(traced_model, tokens_tensor, segments_tensors) for _ in range(100)])

In [None]:
# Move the model to the GPU and put it in evaluation mode.
script_model.cuda().eval()
# GPU copies of the example inputs.
tokens_tensor_gpu, segments_tensors_gpu = tokens_tensor.cuda(), segments_tensors.cuda()
# Trace again, this time with the CUDA tensors as example inputs.
traced_model = torch.jit.trace(
    script_model,
    [tokens_tensor_gpu, segments_tensors_gpu],
)

In [None]:
# Mean latency in milliseconds over 100 timed calls of the traced model on the GPU inputs.
np.array(
    [timer(traced_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
).mean()

In [None]:
import torch
import numpy as np
from time import perf_counter

# Timing helper
def timer(f, *args, **kwargs):
    """Time one call of ``f`` and return the elapsed wall-clock time in milliseconds.

    When CUDA is available the device is synchronised before the clock starts
    and again after the call returns, so work that was queued on the GPU is
    included in the measurement. On hosts without CUDA the synchronisation is
    skipped, which lets the same helper time CPU-only calls.

    Args:
        f: Callable to time.
        *args: Positional arguments forwarded to ``f``.
        **kwargs: Keyword arguments forwarded to ``f``.

    Returns:
        float: Elapsed time in milliseconds. The return value of ``f`` is
        discarded.
    """
    # torch.cuda.synchronize() raises on machines without a usable CUDA
    # device, so only call it when CUDA is actually available.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.synchronize()  # wait for previously queued CUDA work
    start = perf_counter()
    f(*args, **kwargs)
    if use_cuda:
        torch.cuda.synchronize()  # wait for the work launched by f
    return 1000 * (perf_counter() - start)  # seconds -> milliseconds

# Move the model to the GPU.
script_model.cuda()

# Move the input tensors to the GPU.
tokens_tensor_gpu = tokens_tensor.cuda()
segments_tensors_gpu = segments_tensors.cuda()

# Benchmark the untraced model: 100 timed calls, mean reported in milliseconds.
native_times = [timer(script_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
native_mean_time = np.mean(native_times)
print(f"Average execution time for native model on GPU: {native_mean_time:.2f} ms")

# Build the traced model. Only the trace call sits inside `try`, so the
# "Tracing failed" message is printed for tracing errors only; a RuntimeError
# raised while timing the traced model surfaces with its own traceback instead
# of being misreported as a tracing failure.
try:
    traced_model = torch.jit.trace(script_model, [tokens_tensor_gpu, segments_tensors_gpu])
except RuntimeError as e:
    print(f"Tracing failed with error: {e}")
else:
    # Benchmark the traced model the same way.
    traced_times = [timer(traced_model, tokens_tensor_gpu, segments_tensors_gpu) for _ in range(100)]
    traced_mean_time = np.mean(traced_times)
    print(f"Average execution time for traced model on GPU: {traced_mean_time:.2f} ms")
