Closed as not planned
Labels
model:transformer (issues related to a transformer model: BERT, GPT2, Hugging Face, Longformer, T5, etc.), stale (issues that have not been addressed in a while; categorized by a bot)
Description
I converted the RoBERTa base model to ONNX and its memory usage has increased, even with FP16.
Here is the code to replicate the problem (works with CUDA 11):
import torch
import time
from transformers import RobertaTokenizerFast, RobertaForMaskedLM

the_model_rb = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(the_model_rb, cache_dir="/Models/")

if 1 == 1:
    # load the FP16 model on the GPU
    model = RobertaForMaskedLM.from_pretrained(the_model_rb, cache_dir="/Models/")
    model.eval()
    model.half()
    model.to('cuda')
    print("the model is loaded")

# build a batch of two identical masked inputs
text = "<s> It uses a lot of memory. </s>"
tokenized_text = tokenizer.tokenize(text)
print("tokenized_text", tokenized_text)
maskedindex = 3
tokenized_text[maskedindex] = "<mask>"
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
attentions_mask = [1 for x in range(0, len(tokenized_text))]
input_mask = torch.tensor([attentions_mask, attentions_mask]).to("cuda")
input_ids = torch.tensor([indexed_tokens, indexed_tokens]).to("cuda")

dynamic_axes_d1 = {'input_ids': {0: 'batch', 1: 'sequence'},
                   'input_mask': {0: 'batch', 1: 'sequence'},
                   'output': {0: 'batch', 1: 'sequence'}}

if 1 == 1:
    # export with dynamic batch and sequence axes
    torch.onnx.export(model, (input_ids, input_mask), "/robertabase_new1.onnx",
                      input_names=["input_ids", "input_mask"],
                      output_names=["output"], verbose=False, opset_version=11,
                      do_constant_folding=True, dynamic_axes=dynamic_axes_d1)
    print("we have converted the model dynamic")

# load the model:
import onnxruntime

print(onnxruntime.get_device())
# ONNX_PROVIDERS = ["CUDAExecutionProvider"]
model_dir = "/robertabase_new1.onnx"
the_session = onnxruntime.InferenceSession(model_dir)  # , providers=ONNX_PROVIDERS)
print(the_session.get_providers())

import numpy

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# execute the model
beg = time.time()
for i in range(10):
    ort_inputs = {the_session.get_inputs()[0].name: to_numpy(input_ids),
                  the_session.get_inputs()[1].name: to_numpy(input_mask)}
    ort_outs = the_session.run(["output"], ort_inputs)
    torch_onnx_output = torch.tensor(ort_outs[0])
print("onnx took", time.time() - beg)
input("wait to measure memory usage")