In [33]:
import time

import torch
import torch_tensorrt
import time
import numpy as np

from removebg.basnet.model.u2net import U2NET
from torchvision import transforms

from PIL import Image

import torch

# U2NET

## Prepare inputs

In [42]:
# Sample images that are preprocessed into the example batch used below.
_IMAGE_DIR = "/home/g.racic/sync/cailimage/pix_images/images"
_IMAGE_FILES = (
    "526_95bd2ba559f6b06a26d34e6aa0730a12.jpg",
    "101_dd5783e2de4e0129850c5c347c00b6c4.jpg",
    "102_5673a8240902ac026c2b1faf553b4999.jpg",
    "103_85540ed2f1683e7a89cf5b729e7ae71b.jpg",
    "105_289028da5128a3e7c6882f37590978de.jpg",
)
img_paths = [_IMAGE_DIR + "/" + file_name for file_name in _IMAGE_FILES]

In [43]:
# Preprocessing pipeline: resize to the 352x352 model input, convert to a tensor,
# then normalise each channel with the given mean / std.
_resize_352 = transforms.Resize((352, 352))
_normalize_352 = transforms.Normalize(mean=(.485, .456, .406), std=(.229, .224, .225))
fts = transforms.Compose([_resize_352, transforms.ToTensor(), _normalize_352])
fts

Compose(
    Resize(size=(352, 352), interpolation=bilinear, max_size=None, antialias=None)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)

In [44]:
def preprocess_img(img_path: str, transform=None):
    """Load an image from disk and run it through a preprocessing pipeline.

    Args:
        img_path: Path of the image file to open.
        transform: Callable applied to the RGB PIL image. When omitted, the
            notebook-level ``fts`` pipeline bound at call time is used, which
            matches the previous behaviour.

    Returns:
        The value returned by ``transform`` for the RGB image (a tensor for
        the ``fts`` pipelines defined in this notebook, which end in
        ``ToTensor`` / ``Normalize``).
    """
    if transform is None:
        # Looked up at call time on purpose: the notebook rebinds ``fts`` per model.
        transform = fts
    # The context manager closes the underlying file handle once the pixels are consumed.
    with Image.open(img_path) as image:
        rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
        # Apply the transform inside the block: image data is read lazily from the open file.
        return transform(rgb_image)

In [45]:
# Preprocess every sample image, stack the results into one batch and move it to the GPU.
img_tensors = torch.stack([preprocess_img(path) for path in img_paths]).to("cuda")
img_tensors.shape

torch.Size([1, 3, 352, 352])

## Trace and compile model

In [38]:
# viewfs://preprod-am6/user/rat/imagelab/image_seg/removebg_u2net_352_v1.pth
model_path = "/home/g.racic/sync/removebg_u2net_352_v1.pth"

In [39]:
u2net = U2NET()

In [40]:
# Load the checkpoint onto the CPU so it can be read regardless of the device it was
# saved from; the model itself is moved to the GPU in a later cell.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
u2net.load_state_dict(torch.load(model_path, map_location="cpu"))

<All keys matched successfully>

In [41]:
# Switch the network to evaluation mode and move it to the GPU before it is traced and benchmarked.
u2net = u2net.eval().to("cuda")

In [10]:
# Trace the eager model with the preprocessed batch as the example input.
traced_u2net = torch.jit.trace(u2net, example_inputs=(img_tensors,))

In [11]:
# Caution: either give the compiler an input spec that declares a shape range, or make
# sure the example input passed here has exactly the shape that will be used at inference.
trt_u2net = torch_tensorrt.compile(
    traced_u2net,
    inputs=[img_tensors.to(torch.float16)],
    enabled_precisions={torch.float16},
)

## Benchmark

In [12]:
def benchmark(model, input_data, dtype='fp32', nwarmup=50, nruns=1000, device="cuda"):
    """Print the average forward-pass latency of ``model`` on ``input_data``.

    Args:
        model: Callable invoked as ``model(input_data)``.
        input_data: Tensor fed to the model; it is moved to ``device`` first.
        dtype: ``'fp16'`` casts the input to half precision; any other value
            leaves the input dtype unchanged.
        nwarmup: Number of untimed calls made before measuring.
        nruns: Number of timed calls. A running average is printed every
            100 iterations.
        device: Device the input is moved to. Defaults to ``"cuda"``, the
            previous hard-coded behaviour.

    Returns:
        None. Progress and the final average (in milliseconds) are printed.
    """
    input_data = input_data.to(device)
    if dtype == 'fp16':
        input_data = input_data.half()

    def _wait_for_device():
        # CUDA work is queued asynchronously; block until it is done so the
        # measured interval covers the actual computation.
        if input_data.is_cuda:
            torch.cuda.synchronize()

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(nwarmup):
            model(input_data)
    _wait_for_device()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
            start_time = time.perf_counter()
            model(input_data)
            _wait_for_device()
            timings.append(time.perf_counter() - start_time)
            if i % 100 == 0:
                print('Iteration %d/%d, ave batch time %.2f ms'%(i, nruns, np.mean(timings)*1000))

    print("Input shape:", input_data.size())
    print('Average batch time: %.2f ms'%(np.mean(timings)*1000))

In [13]:
input_data = torch.rand((5, 3, 352, 352))

In [14]:
benchmark(u2net, input_data, nwarmup=50, nruns=300)

Warm up ...
Start timing ...
Iteration 100/300, ave batch time 66.04 ms
Iteration 200/300, ave batch time 66.15 ms
Iteration 300/300, ave batch time 66.28 ms
Input shape: torch.Size([5, 3, 352, 352])
Average batch time: 66.28 ms


In [15]:
benchmark(traced_u2net, input_data, nwarmup=50, nruns=300)

Warm up ...
Start timing ...
Iteration 100/300, ave batch time 67.45 ms
Iteration 200/300, ave batch time 67.28 ms
Iteration 300/300, ave batch time 66.99 ms
Input shape: torch.Size([5, 3, 352, 352])
Average batch time: 66.99 ms


In [16]:
benchmark(trt_u2net, input_data, dtype="fp16",nwarmup=50, nruns=300)

Warm up ...
Start timing ...
Iteration 100/300, ave batch time 25.73 ms
Iteration 200/300, ave batch time 25.73 ms
Iteration 300/300, ave batch time 25.72 ms
Input shape: torch.Size([5, 3, 352, 352])
Average batch time: 25.72 ms


# U2NETp

## Prepare inputs

In [17]:
# Preprocessing pipeline for the smaller model: resize to 192x192, convert to a tensor,
# then normalise each channel with the given mean / std.
_resize_192 = transforms.Resize((192, 192))
_normalize_192 = transforms.Normalize(mean=(.485, .456, .406), std=(.229, .224, .225))
fts = transforms.Compose([_resize_192, transforms.ToTensor(), _normalize_192])
fts

Compose(
    Resize(size=(192, 192), interpolation=bilinear, max_size=None, antialias=None)
    ToTensor()
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
)

In [18]:
# Re-run preprocessing with the current pipeline, stack into one batch and move it to the GPU.
img_tensors = torch.stack([preprocess_img(path) for path in img_paths]).to("cuda")
img_tensors.shape

torch.Size([5, 3, 192, 192])

## Compile model

In [19]:
# viewfs://preprod-am6/user/rat/imagelab/image_seg/removebg_u2netp_192_v1_standalone.pth
model_path = "/home/g.racic/sync/removebg_u2netp_192_v1_standalone.pth"

In [20]:
# This checkpoint is already a TorchScript model, so it is loaded directly with
# torch.jit.load and no tracing step is needed.
u2netp = torch.jit.load(model_path)

In [21]:
# Switch the loaded model to evaluation mode and move it to the GPU before it is compiled and benchmarked.
u2netp = u2netp.eval().to("cuda")

In [22]:
# Caution: either give the compiler an input spec that declares a shape range, or make
# sure the example input passed here has exactly the shape that will be used at inference.
trt_u2netp = torch_tensorrt.compile(
    u2netp,
    inputs=[img_tensors.to(torch.float16)],
    enabled_precisions={torch.float16},
)

## Benchmark

In [23]:
input_data = torch.rand((5, 3, 192, 192))

In [24]:
benchmark(u2netp, input_data, nwarmup=50, nruns=300)

Warm up ...
Start timing ...
Iteration 100/300, ave batch time 13.75 ms
Iteration 200/300, ave batch time 13.76 ms
Iteration 300/300, ave batch time 13.76 ms
Input shape: torch.Size([5, 3, 192, 192])
Average batch time: 13.76 ms


In [25]:
benchmark(trt_u2netp, input_data, dtype="fp16",nwarmup=50, nruns=300)

Warm up ...
Start timing ...
Iteration 100/300, ave batch time 8.17 ms
Iteration 200/300, ave batch time 8.17 ms
Iteration 300/300, ave batch time 8.17 ms
Input shape: torch.Size([5, 3, 192, 192])
Average batch time: 8.17 ms
