### Adapted from trt_pose's live_demo.ipynb

Set up topology for the human pose estimation task

In [1]:
import json
import torch
import torch2trt
import trt_pose.coco

# Load the category description (keypoint names and skeleton links) for the pose task.
with open('human_pose.json', mode='r') as pose_file:
    human_pose = json.load(pose_file)

# Number of keypoints and of skeleton links listed in the JSON description.
num_parts, num_links = len(human_pose['keypoints']), len(human_pose['skeleton'])

# Topology derived from the category; consumed by ParseObjects / DrawObjects below.
topology = trt_pose.coco.coco_category_to_topology(human_pose)

Set up example data to measure framerate

In [35]:
# Network input resolution in pixels.
HEIGHT, WIDTH = 224, 224

# All-zero example batch of shape (1, 3, HEIGHT, WIDTH) on the GPU; it is only
# used as a stand-in input when measuring the framerate.
data = torch.zeros(1, 3, HEIGHT, WIDTH).cuda()

Load the previously optimized TensorRT model

In [37]:
from torch2trt import TRTModule

# Toggle between the two optimized checkpoints (True -> ResNet, False -> DenseNet).
useResNet = True
OPTIMIZED_MODEL = 'resnet_trt.pth' if useResNet else 'densenet_trt.pth'

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
trt_state = torch.load(OPTIMIZED_MODEL)

model_trt = TRTModule()
model_trt.load_state_dict(trt_state)

<All keys matched successfully>

We can benchmark the model in FPS with the following code. (ResNet: 9.97, DenseNet: 10.15)

In [38]:
import time

def fps(model, iterations=500, inputs=None):
    """Benchmark ``model`` and print its throughput in frames per second.

    Args:
        model: Callable invoked as ``model(inputs)`` once per iteration.
        iterations: Number of timed forward passes. Defaults to 500, the
            count that was previously hard-coded.
        inputs: Object passed to the model on every call. When omitted, the
            module-level example batch ``data`` is used.

    Raises:
        ValueError: If ``iterations`` is not a positive number.
    """
    if iterations <= 0:
        raise ValueError("iterations must be positive")
    if inputs is None:
        inputs = data

    use_cuda = torch.cuda.is_available()

    # Drain any previously queued GPU work *before* starting the clock so that
    # it is not attributed to the model being measured.
    if use_cuda:
        torch.cuda.current_stream().synchronize()
    t0 = time.perf_counter()

    for _ in range(iterations):
        model(inputs)

    # Kernel launches are asynchronous; wait for them to finish before
    # stopping the clock.
    if use_cuda:
        torch.cuda.current_stream().synchronize()
    t1 = time.perf_counter()

    print(iterations / (t1 - t0))
    
# Benchmark the loaded TensorRT module; fps() prints the measured frames per second.
fps(model_trt)

9.968327542821843


Next, let's define a function that will preprocess the image, which is originally in BGR8 / HWC format.

In [32]:
import cv2
import torchvision.transforms as transforms
import PIL.Image

# Device that holds the normalization constants and the preprocessed frames.
device = torch.device('cuda')

# Per-channel mean / std applied after the BGR->RGB conversion, so they are in
# RGB order (these match the commonly used ImageNet statistics -- TODO confirm
# against the training configuration of the model).
mean = torch.Tensor([0.485, 0.456, 0.406]).to(device)
std = torch.Tensor([0.229, 0.224, 0.225]).to(device)

def preprocess(image):
    """Convert a BGR8 / HWC frame into a normalized network input batch.

    Args:
        image: Frame in BGR channel order and HWC layout, as accepted by
            ``cv2.cvtColor``.

    Returns:
        Tensor on ``device`` with a leading batch dimension of 1, resized to
        (HEIGHT, WIDTH) and normalized per channel with ``mean`` and ``std``.
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = PIL.Image.fromarray(image)
    image = transforms.functional.to_tensor(image).to(device)
    # Use the notebook-wide input size instead of a second hard-coded 224;
    # resize takes the size as [height, width].
    image = transforms.functional.resize(image, [HEIGHT, WIDTH])
    # In-place normalization; the [:, None, None] index broadcasts the
    # per-channel statistics over the two spatial dimensions.
    image.sub_(mean[:, None, None]).div_(std[:, None, None])
    # Prepend the batch dimension.
    return image[None, ...]

#### Next, we'll instantiate two callable objects: one that parses the objects from the neural network output, and one that draws the parsed objects on an image.

In [28]:
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects

# Both helpers are built from the same topology and are called once per frame.
draw_objects = DrawObjects(topology)    # draws parsed objects onto an image
parse_objects = ParseObjects(topology)  # parses objects from the network output

Set up a video stream with OpenCV for input and a video writer for output

In [33]:
import cv2

cap = cv2.VideoCapture('example_video.mpg')
# Fail early with a clear message instead of silently processing zero frames.
if not cap.isOpened():
    raise IOError("Could not open input video 'example_video.mpg'")

# Deliberately not named `fps`: that name belongs to the benchmark function
# defined earlier and would otherwise be overwritten by this number.
video_fps = cap.get(cv2.CAP_PROP_FPS)
# Named property ids replace the magic numbers 3 and 4.
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Motion-JPEG writer with the same frame rate and frame size as the input.
out = cv2.VideoWriter('output.avi', cv2.VideoWriter_fourcc('M','J','P','G'), video_fps, (width, height))

Finally, we'll define the main execution function. For each frame, it performs the following steps:

1.  Preprocess the input frame
2.  Execute the neural network
3.  Parse the objects from the neural network output
4.  Draw the objects onto the input frame
5.  Write the annotated frame to the output video

In [30]:
from jetcam.utils import bgr8_to_jpeg

def execute(image):
    """Run pose estimation on one frame and append the result to the output video.

    The frame is preprocessed, passed through ``model_trt``, the two network
    outputs are moved to the CPU and parsed with ``parse_objects`` (called
    without threshold overrides), and the parsed objects are drawn onto
    ``image`` before it is written to ``out``.

    Args:
        image: Frame to process; it is handed to ``draw_objects`` and then
            written to the video writer.
    """
    batch = preprocess(image)
    cmap, paf = model_trt(batch)

    # Detach both outputs and copy them to host memory before parsing.
    cmap = cmap.detach().cpu()
    paf = paf.detach().cpu()

    counts, objects, peaks = parse_objects(cmap, paf)
    draw_objects(image, counts, objects, peaks)
    out.write(image)

Run through the input file and process each frame

In [34]:
# Number of frames processed so far.
frame_count = 0
try:
    ret, frame = cap.read()
    # cap.read() reports ret == False once no further frame can be read.
    while ret:
        frame_count += 1
        execute(frame)
        ret, frame = cap.read()
finally:
    # Always release the capture and finalize the output file, even if
    # processing a frame raised.
    cap.release()
    out.release()

print(frame_count)

249
