In [2]:
!pip install torch torchvision opencv-python
!pip install -U ultralytics "ray[tune]"



In [3]:
import cv2
import torch
from torchvision.transforms import v2
from torchvision import transforms
from ultralytics import YOLO
from torchvision import models
import torch.nn as nn

### Preprocessing functions


In [4]:
# Preprocessing for the ShuffleNet classifier: image -> float tensor resized
# to 224x224.
# ToTensor runs first on purpose: torchvision's v2 transforms only act on
# tensors, PIL images and TVTensors and pass every other input through
# untouched, so with Resize first a NumPy crop coming from OpenCV was never
# actually resized.
org_transform = v2.Compose([
    transforms.ToTensor(),
    # antialias is set explicitly so the result does not depend on the
    # torchvision version's default for tensor inputs.
    v2.Resize((224, 224), antialias=True),
])
def yolo_transform(frame):
    """Resize a frame to 640x640 and convert it to a tensor.

    Args:
        frame: Image array accepted by ``cv2.resize`` (for example a frame
            read from ``cv2.VideoCapture``).

    Returns:
        The tensor produced by ``transforms.ToTensor`` for the resized frame.
    """
    resized_image = cv2.resize(frame, (640, 640))
    # ToTensor is a transform class: build an instance, then call it on the
    # image. Passing the image to the constructor raises a TypeError.
    return transforms.ToTensor()(resized_image)

In [5]:
# Detector: YOLO model loaded from the local weights file.
yolo_model = YOLO('best.pt')

# Classifier: ShuffleNetV2 x1.0 with its final layer replaced by a 2-class head.
# The checkpoint loaded below overwrites every parameter (load_state_dict is
# strict by default), so the ImageNet weights are not downloaded first;
# `weights=None` also replaces the deprecated `pretrained=` argument.
shufflenet_model = models.shufflenet_v2_x1_0(weights=None)
shufflenet_model.fc = nn.Linear(shufflenet_model.fc.in_features, 2)
# map_location keeps the load working on CPU-only machines even when the
# checkpoint was saved from a GPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
shufflenet_model.load_state_dict(
    torch.load('shufflenet_finetuned.pth', map_location='cpu')
)
# Inference mode: disables dropout and uses the stored batch-norm statistics.
shufflenet_model.eval()



ShuffleNetV2(
  (conv1): Sequential(
    (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (stage2): Sequential(
    (0): InvertedResidual(
      (branch1): Sequential(
        (0): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=24, bias=False)
        (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Conv2d(24, 58, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (3): BatchNorm2d(58, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): ReLU(inplace=True)
      )
      (branch2): Sequential(
        (0): Conv2d(24, 58, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(58, eps=1e-05, momentum=0.1, affine=True, track_running_

In [6]:
def extract_shufflenet_class(shufflenet_output):
    """Return the index of the highest-scoring class as a plain Python int.

    The maximum is taken along dimension 1 and unwrapped with ``.item()``, so
    the result must reduce to a single element (one sample per call).
    """
    best_per_row = shufflenet_output.max(dim=1)
    return best_per_row.indices.item()


## Open `origin_video.mp4` and run detection

In [7]:
# Input clip to annotate (path relative to the notebook's working directory).
video_path = 'origin_video.mp4'
# Capture handle for the clip; the next cell checks that it opened and reads
# frames from it.
cap = cv2.VideoCapture(video_path)

In [8]:
# Annotate every frame of the input clip: detect objects with YOLO, classify
# each detected crop with ShuffleNet, draw the box and label, and write the
# frame to `demo_video.avi`.

if not cap.isOpened():
    # Raise instead of calling exit(): in a notebook exit() just stops the
    # kernel without saying what went wrong.
    raise OSError("Could not open the input video; check the path passed to cv2.VideoCapture.")

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the output
# play at a different speed than the source.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create a VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_path = 'demo_video.avi'
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

try:
    if not out.isOpened():
        raise OSError(f"Could not open '{output_path}' for writing")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_height, frame_width = frame.shape[:2]

        # Object detection
        with torch.no_grad():
            yolo_output = yolo_model(frame)

        # Box corners as [x1, y1, x2, y2]
        yolo_boxes = yolo_output[0].boxes.xyxy.tolist()

        for box in yolo_boxes:
            x, y, x2, y2 = (int(value) for value in box)

            # Clamp to the frame (a negative index would wrap around when
            # slicing) and skip boxes that collapse to an empty crop.
            x, y = max(x, 0), max(y, 0)
            x2, y2 = min(x2, frame_width), min(y2, frame_height)
            if x2 <= x or y2 <= y:
                continue

            object_image = frame[y:y2, x:x2]

            # Preprocess
            # NOTE(review): OpenCV frames are BGR; if the classifier was
            # fine-tuned on RGB images the crop needs a BGR->RGB conversion
            # here -- confirm against the training pipeline.
            shufflenet_input = org_transform(object_image)
            shufflenet_input = shufflenet_input.unsqueeze(0)  # Make batch of 1

            with torch.no_grad():
                shufflenet_output = shufflenet_model(shufflenet_input)

            class_index = extract_shufflenet_class(shufflenet_output)

            # Class 0: wearing a helmet -> green "ok".
            # Any other class: no helmet -> red "violate".
            is_violation = class_index != 0
            color = (0, 0, 255) if is_violation else (0, 255, 0)  # BGR order
            class_label = "violate" if is_violation else "ok"

            # Keep the label inside the frame for boxes touching the top edge.
            label_y = max(y - 10, 15)

            cv2.rectangle(frame, (x, y), (x2, y2), color, 2)
            cv2.putText(frame, class_label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        out.write(frame)
finally:
    # Always release both handles so the output file is finalised even when a
    # frame fails part-way through.
    cap.release()
    out.release()


0: 384x640 2 bikes, 112.2ms
Speed: 5.0ms preprocess, 112.2ms inference, 537.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 6.9ms
Speed: 3.8ms preprocess, 6.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 7.6ms
Speed: 2.6ms preprocess, 7.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 7.4ms
Speed: 3.7ms preprocess, 7.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 10.4ms
Speed: 2.1ms preprocess, 10.4ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 6.8ms
Speed: 2.3ms preprocess, 6.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 6.9ms
Speed: 2.7ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 7.5ms
Speed: 2.1ms preprocess, 7.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no