# ResNet-50 on webcam

## Import libraries

In [1]:
import torch
import torchvision
import cv2
import numpy as np
import time

## Download model

In [2]:
# Build ResNet-50 initialised with torchvision's default pretrained weights
# (the weights are fetched on first use and cached afterwards).
pretrained_weights = torchvision.models.ResNet50_Weights.DEFAULT
model = torchvision.models.resnet50(weights=pretrained_weights)

## Labels

In [3]:
# Load the ImageNet labels: the file holds a Python dict literal, which is
# used below as `labels[class_index]` to obtain a human-readable name.
# `ast.literal_eval` only accepts plain literals, so, unlike `eval`, a
# tampered labels file cannot execute arbitrary code.
import ast

with open('imagenet_labels.txt', encoding='utf-8') as f:
    labels = ast.literal_eval(f.read())

## Inference

In [6]:
# Open the default webcam and classify every frame with the model until the
# user presses 'q'. Each displayed frame is annotated with the inference time,
# the predicted class, the loop FPS and the shape of the network input.
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Per-channel statistics expected by torchvision's ImageNet-pretrained
# weights. Shaped (3, 1, 1) so they broadcast over a channels-first image.
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

# Give up after this many consecutive failed reads instead of spinning
# forever when the camera is missing or has been disconnected.
MAX_FAILED_READS = 30

OVERLAY_FONT = cv2.FONT_HERSHEY_SIMPLEX
OVERLAY_COLOR = (0, 0, 255)  # red (OpenCV uses BGR order)


def _draw_overlay(image, text, y):
    """Draw one line of status text on *image* at vertical offset *y*."""
    cv2.putText(image, text, (10, y), OVERLAY_FONT, 1, OVERLAY_COLOR, 2)


# Inference mode (affects dropout / batch-norm layers); set once, not per frame.
model.eval()
failed_reads = 0

try:
    while True:
        t0 = time.perf_counter()
        ret, frame = cap.read()
        if not ret:
            # Tolerate transient read failures, but never loop endlessly:
            # this branch skips waitKey, so 'q' could not stop the loop.
            failed_reads += 1
            if failed_reads >= MAX_FAILED_READS:
                print(f"No frame received after {failed_reads} attempts; stopping.")
                break
            continue
        failed_reads = 0

        # Preprocess: BGR -> RGB, HWC uint8 -> CHW float in [0, 1], then
        # ImageNet normalisation and a leading batch dimension.
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # img = cv2.resize(img, (224, 224))
        img = torch.from_numpy(img).permute(2, 0, 1).float().div(255.0)
        img = ((img - IMAGENET_MEAN) / IMAGENET_STD).unsqueeze(0)

        # Inference (no autograd bookkeeping needed)
        with torch.no_grad():
            start = time.perf_counter()
            outputs = model(img)
            end = time.perf_counter()
        _draw_overlay(frame, f"Inference time: {((end - start)*1000):.2f} ms", 30)

        # Postprocess: class probabilities, then the top-1 class index
        outputs = torch.nn.functional.softmax(outputs, dim=1).squeeze(0)
        idx = int(torch.argmax(outputs))
        _draw_overlay(frame, f"Predicted: {idx}-{labels[idx]}", 60)

        # FPS of the whole iteration (guarded against a zero-length interval)
        t = max(time.perf_counter() - t0, 1e-9)
        _draw_overlay(frame, f"FPS: {1/t:.2f}", 90)

        # Shape of the tensor that was fed to the network
        _draw_overlay(frame, f"Image shape: {img.shape}", 120)

        # Display; mask the key code because some platforms set high bits
        cv2.imshow("frame", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even if the loop raised
    # or the cell was interrupted.
    cap.release()
    cv2.destroyAllWindows()

In [9]:
# Manual cleanup cell: run it to free the webcam and close any OpenCV
# windows, e.g. after the inference cell above was interrupted.
for _teardown in (cap.release, cv2.destroyAllWindows):
    _teardown()