In [8]:
!pip install torch torchvision opencv-python
!pip install -U ultralytics "ray[tune]"

In [9]:
import cv2
import torch
from torchvision.transforms import v2
from torchvision import transforms
from ultralytics import YOLO
from torchvision import models
import torch.nn as nn

In [10]:
# Path of the input clip, and the OpenCV capture handle opened on it.
# `cap` is read frame by frame (and released) by the processing cell below.
video_path = 'original_video.mp4'
cap = cv2.VideoCapture(video_path)

### Preprocessing functions


In [11]:
# Preprocessing applied to each detected crop before it is classified.
#
# torchvision v2 transforms only operate on tensors and PIL images; any other
# input type (including the NumPy arrays produced by OpenCV slicing) is passed
# through unchanged. With Resize placed first, a NumPy crop was therefore never
# resized. Converting to a tensor first makes the resize take effect.
org_transform = v2.Compose([
    transforms.ToTensor(),                   # HWC uint8 ndarray / PIL -> CHW float32 in [0, 1]
    v2.Resize((224, 224), antialias=True),   # explicit antialias: the default differs across versions
])
def yolo_transform(frame):
    """Resize a frame to 640x640 and convert it to a tensor.

    Args:
        frame: Image as an H x W x C NumPy array (as returned by OpenCV).

    Returns:
        The resized image as a C x H x W ``torch.Tensor``.
    """
    resized_image = cv2.resize(frame, (640, 640))
    # ToTensor is a transform class: build it first, then call the instance on
    # the image. Passing the image to the constructor raises a TypeError.
    resized_image = transforms.ToTensor()(resized_image)
    return resized_image

In [12]:
# Detector: YOLO model loaded from the local weights file.
yolo_model = YOLO('best.pt')

# Classifier: ShuffleNet V2 x1.0 with a 2-class head.
# Only the architecture is needed here: load_state_dict (strict by default)
# overwrites every weight from the fine-tuned checkpoint, so downloading the
# ImageNet weights via the deprecated `pretrained=True` flag is unnecessary.
shufflenet_model = models.shufflenet_v2_x1_0(weights=None)
shufflenet_model.fc = nn.Linear(shufflenet_model.fc.in_features, 2)

# Modules are created in training mode. Switch to eval mode so BatchNorm uses
# its stored running statistics instead of per-batch statistics at inference
# time (torch.no_grad() alone does not change this).
shufflenet_model.eval()

# map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only machine.
shufflenet_model.load_state_dict(torch.load('shufflenet_finetuned.pth', map_location='cpu'))



<All keys matched successfully>

In [13]:
def extract_shufflenet_class(shufflenet_output):
    """Return the index of the most probable class as a Python int.

    Args:
        shufflenet_output: Logits tensor holding the scores of a single
            sample (the last dimension indexes the classes).

    Returns:
        int: Index of the class with the highest softmax probability.
    """
    # Logits -> probabilities over the class dimension.
    probabilities = torch.softmax(shufflenet_output, dim=-1)

    # Position of the largest probability, converted from a one-element
    # tensor to a plain integer.
    best = probabilities.max(dim=-1)
    return best.indices.item()


In [14]:
# Annotate every frame of the input video: detect objects with YOLO, classify
# each detected crop with ShuffleNet, draw the result, and write the frame out.

# Re-open the source if a previous run of this cell already released it, so
# the cell can be executed again without restarting the kernel.
if not cap.isOpened():
    cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Raise instead of calling exit(): exit() would shut the kernel down.
    raise RuntimeError(f"Could not open video: {video_path}")

# Keep the frame rate as a float; truncating e.g. 29.97 to 29 changes the
# playback speed of the written video.
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create a VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'XVID')
output_path = 'demo_video.avi'
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

# Class index -> (label text, BGR colour): 0 is drawn in blue, 1 in red.
class_styles = {
    0: ("ok", (255, 0, 0)),
    1: ("violate", (0, 0, 255)),
}

# Make sure the classifier runs in inference mode (idempotent).
shufflenet_model.eval()

try:
    if not out.isOpened():
        raise RuntimeError(f"Could not open video writer for: {output_path}")

    while True:
        ret, frame = cap.read()

        if not ret:
            break

        # Object detect
        with torch.no_grad():
            yolo_output = yolo_model(frame)

        # Box coordinates
        yolo_boxes = yolo_output[0].boxes.xyxy.tolist()

        frame_h, frame_w = frame.shape[:2]
        annotations = []

        for box in yolo_boxes:
            x, y, x2, y2 = (int(v) for v in box)

            # Clamp to the frame and skip boxes that collapse to zero area
            # after integer truncation; an empty crop cannot be classified.
            x, y = max(x, 0), max(y, 0)
            x2, y2 = min(x2, frame_w), min(y2, frame_h)
            if x2 <= x or y2 <= y:
                continue

            # NOTE(review): OpenCV frames are BGR; confirm the classifier was
            # fine-tuned on crops with the same channel order.
            object_image = frame[y:y2, x:x2]

            # Preprocess
            shufflenet_input = org_transform(object_image).unsqueeze(0)  # Make batch of 1

            with torch.no_grad():
                shufflenet_output = shufflenet_model(shufflenet_input)

            class_encode = extract_shufflenet_class(shufflenet_output)
            class_label, color = class_styles[class_encode]
            annotations.append((x, y, x2, y2, class_label, color))

        # Draw only after every crop has been classified: the crops are views
        # into `frame`, so drawing earlier would leak overlays into the crops
        # of overlapping boxes.
        for x, y, x2, y2, class_label, color in annotations:
            cv2.rectangle(frame, (x, y), (x2, y2), color, 2)
            # Keep the label inside the frame for boxes touching the top edge.
            cv2.putText(frame, class_label, (x, max(y - 10, 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        out.write(frame)
finally:
    # Always release the handles, even if a frame fails mid-way, so the
    # output file is finalized and the input is not left locked.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


0: 384x640 (no detections), 10.8ms
Speed: 1.6ms preprocess, 10.8ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 13.0ms
Speed: 1.2ms preprocess, 13.0ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 9.9ms
Speed: 1.4ms preprocess, 9.9ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 10.3ms
Speed: 1.4ms preprocess, 10.3ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 bike, 9.6ms
Speed: 2.6ms preprocess, 9.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 bikes, 10.3ms
Speed: 1.4ms preprocess, 10.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 bikes, 7.4ms
Speed: 2.2ms preprocess, 7.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 bikes, 6.7ms
Speed: 2.1ms preprocess, 6.7ms inference, 1.1ms postprocess per image at 