# Run MobileNet on a Camera 

* [ssdlite320_mobilenet_v3_large](https://pytorch.org/vision/stable/models/generated/torchvision.models.detection.ssdlite320_mobilenet_v3_large.html)
* [Real Time Inference on Raspberry Pi 4](https://pytorch.org/tutorials/intermediate/realtime_rpi.html)



In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
if ".." not in sys.path:
    sys.path.append("..")  # Append pieye module root to sys.path

import cv2
import numpy as np
from PIL import Image
import torch
from torchvision import models, transforms

# from pieye.video import play
from pieye.utils import show_bboxes


# Note, The aarch64 version of pytorch requires using the `qnnpack` engine.
torch.backends.quantized.engine = "qnnpack"
# torch.set_num_threads(2)

In [8]:
video = cv2.VideoCapture(0)
video.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
video.set(cv2.CAP_PROP_FRAME_HEIGHT, 320)
video.set(cv2.CAP_PROP_FPS, 30)

[ WARN:0@161.299] global cap_v4l.cpp:999 open VIDEOIO(V4L2:/dev/video0): can't open camera by index
[ERROR:0@161.299] global obsensor_uvc_stream_channel.cpp:158 getStreamChannelGroup Camera index out of range


< cv2.VideoCapture 0x7f5c3bc3d6b0>

In [None]:
_, frame = video.read()

In [None]:
image = Image.fromarray(frame[:, :, [2, 1, 0]])

In [21]:
preprocess = transforms.Compose([
    # convert the frame to a CHW torch tensor for training
    transforms.ToTensor(),
    # normalize the colors to the range that mobilenet_v2/3 expect
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

torch.Size([1, 3, 320, 320])

In [27]:
# Quantized (int8) model
# Quantized MobileNetV3-Large is about the same speed and size as MobileNetv3-Small, but it has higher accuracy.
model = models.quantization.mobilenet_v3_large(pretrained=True, quantize=True)

# Convert into jit (graph mode). In eager mode, operators in a model are immediately executed as they are encountered. In contrast, in graph mode, operators are first synthesized into a graph, which will then be compiled and executed as a whole
# This reduce overhead
model = torch.jit.script(model)

# Let batchnorm or dropout layers work in eval mode instead of training mode.
model = model.eval()

Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v3_large_qnnpack-5bcacf28.pth" to /home/junkimin/.cache/torch/hub/checkpoints/mobilenet_v3_large_qnnpack-5bcacf28.pth
100.0%


In [None]:
input_tensor = preprocess(image)
# The model can handle multiple images simultaneously so we need to add an
# empty dimension for the batch.
# [320, 320, 3] -> [1, 3, 320, 320]
input_batch = input_tensor.unsqueeze(0)
input_batch.shape

In [30]:
# deactivate autograd engine to not backprop
with torch.no_grad():
    output = model(input_batch)

In [32]:
output.shape

torch.Size([1, 1000])

In [33]:
with open("imagenet_class_labels.txt") as f:
    idx_to_label = eval(f.read())

In [35]:
display(image)

top = list(enumerate(output[0].softmax(dim=0)))
top.sort(key=lambda x: x[1], reverse=True)
for idx, val in top[:10]:
    print(f"{val.item()*100:.2f}% {idx_to_label[idx]}")

11.78% balloon
11.78% nipple
9.80% candle, taper, wax light
9.80% ping-pong ball
6.77% hair spray
6.77% water bottle
5.63% lighter, light, igniter, ignitor
5.63% sunscreen, sunblock, sun blocker
5.63% ice lolly, lolly, lollipop, popsicle
3.24% cocktail shaker


### Object Detection

In [None]:
detection_threshold = 0.5

In [None]:
preprocess_no_normalize = transforms.Compose([
    # convert the frame to a CHW torch tensor for training
    transforms.ToTensor(),
])

In [39]:
# Load object detection model. TODO: Need to quantize manually
model = models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
model = model.eval()

Downloading: "https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth" to /home/junkimin/.cache/torch/hub/checkpoints/ssdlite320_mobilenet_v3_large_coco-a79551df.pth
100.0%


In [45]:
input_tensor = preprocess_no_normalize(image)
input_batch = input_tensor.unsqueeze(0)

with torch.no_grad():
    outputs = model(input_batch)

In [52]:
show_bboxes(image=input_tensor, outputs=outputs)

In [None]:
video.release()