[Performance] GPU inference is much slower than CPU #17489
Labels
ep:CUDA (issues related to the CUDA execution provider)
platform:windows (issues related to the Windows platform)
quantization (issues related to quantization)
Describe the issue
Hello, I'm trying to export a CRAFT text-detection model to an ONNX file using torch.onnx.export.
When I run inference on the exported model, I get the warning messages below, and inference on the CUDA execution provider takes much longer than on the CPU: about 2,000 ms versus about 50 ms.
2023-09-11 15:07:36.5009034 [W:onnxruntime:, session_state.cc:1169 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-09-11 15:07:36.5058247 [W:onnxruntime:, session_state.cc:1171 onnxruntime::VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
I suspect there is an issue with how the model was exported to ONNX.
Is there anything wrong with the exported model, or is there a step I missed?
The following is the relevant part of the inference code:
import onnxruntime

def to_numpy(tensor):
    # Detach (if needed) and move to host memory before feeding ONNX Runtime
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

prov = ['CUDAExecutionProvider'] if device == 'cuda' else ['CPUExecutionProvider']
ort_session = onnxruntime.InferenceSession(onnxModelFile, providers=prov)

ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x)}
ort_outs = ort_session.run(None, ort_inputs)
To reproduce
import os
from collections import OrderedDict, namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.backends.cudnn as cudnn  # needed for cudnn.benchmark below
from torchvision import models

import onnx
import craft
def init_weights(modules):
    for m in modules:
        if isinstance(m, nn.Conv2d):
            init.xavier_uniform_(m.weight.data)
            if m.bias is not None:
                m.bias.data.zero_()
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, nn.Linear):
            m.weight.data.normal_(0, 0.01)
            m.bias.data.zero_()
class vgg16_bn(torch.nn.Module):
    def __init__(self, pretrained=True, freeze=True):
        super(vgg16_bn, self).__init__()
        vgg_pretrained_features = models.vgg16_bn(pretrained=pretrained).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        for x in range(12):       # conv2_2
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(12, 19):   # conv3_3
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(19, 29):   # conv4_3
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(29, 39):   # conv5_3
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
class double_conv(nn.Module):
    def __init__(self, in_ch, mid_ch, out_ch):
        super(double_conv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch + mid_ch, mid_ch, kernel_size=1),
            nn.BatchNorm2d(mid_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )
class CRAFT(nn.Module):
    def __init__(self, pretrained=False, freeze=False):
        super(CRAFT, self).__init__()
def copyStateDict(state_dict):
    # Strip the "module." prefix that nn.DataParallel adds to checkpoint keys
    if list(state_dict.keys())[0].startswith("module"):
        start_idx = 1
    else:
        start_idx = 0
    newStateDict = OrderedDict()
    for k, v in state_dict.items():
        name = ".".join(k.split(".")[start_idx:])
        newStateDict[name] = v
    return newStateDict
def get_detector(trained_model, device='cpu', quantize=True, cudnn_benchmark=False):
    net = CRAFT()
    net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
    # if device == 'cpu' and quantize:
    #     try:
    #         torch.quantization.quantize_dynamic(net, dtype=torch.qint8, inplace=True)
    #         print('Quantized model')
    #     except:
    #         pass
    print('Loaded weights from {}'.format(trained_model))
    net = net.to(device)
    cudnn.benchmark = cudnn_benchmark
    return net  # missing in the original snippet; the export code below needs the model back
model = get_detector(trained_model=r'customDetect.pth', device='cuda:0', quantize=False)
input_shape = (1, 3, 480, 640)
inputs = torch.ones(*input_shape)
inputs = inputs.to('cuda:0')
input_names=['input']
output_names=['output']
dynamic_axes= {'input':{0:'batch_size', 2:'height', 3:'width'}, 'output':{0:'batch_size', 2:'height', 3:'width'}}
torch.onnx.export(model, inputs, r"D:\Sa\EasyOCR_ONNX\craft.onnx", dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names)
Urgency
No response
Platform
Windows
OS Version
Windows 11
ONNX Runtime Installation
Built from Source
ONNX Runtime Version or Commit ID
1.15.1
ONNX Runtime API
Python
Architecture
X64
Execution Provider
Default CPU, CUDA
Execution Provider Library Version
torch 2.0.1, CUDA 11.7
Model File
No response
Is this a quantized model?
No