In [5]:
! pip3 install opencv-python numpy ultralytics


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import cv2
import numpy as np
from ultralytics import YOLO

## Function to extract object names

In [35]:
def get_list_of_objects(result, min_confidence):
    """Return the class names predicted with a confidence above a threshold.

    Args:
        result: Sequence of prediction results as returned by
            ``model.predict``; only the first element is inspected. That
            element must expose ``probs.top5`` (class indices),
            ``probs.top5conf`` (the matching confidences) and ``names``
            (a mapping from class index to class name).
        min_confidence: Exclusive lower bound; a class is kept only when its
            confidence is strictly greater than this value.

    Returns:
        list: Names of the qualifying classes, in the order given by
        ``probs.top5``. Empty when ``result`` is empty or nothing qualifies.
    """
    # An empty prediction list has nothing to report; avoid IndexError on result[0].
    if not result:
        return []

    prediction = result[0]
    probs = prediction.probs
    # Pair every class index with its confidence and keep the confident ones.
    return [
        prediction.names[class_index]
        for class_index, confidence in zip(probs.top5, probs.top5conf)
        if confidence > min_confidence
    ]

## Function to get objects from a single image

In [38]:
def get_objects_from_frame(model, frame, min_confidence):
    """Run ``model`` on one frame and return the confidently predicted class names.

    Args:
        model: Object exposing ``predict(source=...)``.
        frame: Image handed to ``model.predict`` as ``source``.
        min_confidence: Threshold forwarded to ``get_list_of_objects``.

    Returns:
        Whatever ``get_list_of_objects`` returns for the prediction.
    """
    return get_list_of_objects(model.predict(source=frame), min_confidence)

## Use a pretrained YOLO model to predict objects in an image

In [17]:
# Instantiate the model from the pretrained "yolov8x-cls.pt" checkpoint.
model = YOLO("yolov8x-cls.pt")

In [18]:
# Run the model on a sample image referenced by URL; `result` is reused by the next cell.
result = model.predict(source="https://ultralytics.com/images/bus.jpg")


Found https://ultralytics.com/images/bus.jpg locally at bus.jpg
image 1/1 /home/paul/Uni/SEP/ShortSearch/bus.jpg: 224x224 minibus 0.95, amphibian 0.02, trolleybus 0.01, recreational_vehicle 0.01, passenger_car 0.00, 85.6ms
Speed: 1.0ms preprocess, 85.6ms inference, 0.1ms postprocess per image at shape (1, 3, 224, 224)


In [30]:
# Keep only the class names whose confidence is strictly greater than 0.9.
get_list_of_objects(result, 0.9)

['minibus']

In [3]:
# Debugging aid: uncomment to inspect the class-index -> class-name mapping.
# result[0].names

## Sample every fifth frame from a video and find objects

In [33]:
def detect_objects_in_video(video_path, model, min_confidence, frame_interval=5):
    """Collect the class names found across sampled frames of a video.

    Every ``frame_interval``-th frame is read and passed to
    ``get_objects_from_frame``; the names it returns are merged into one set.
    The resulting set is also printed before it is returned.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        model: Model forwarded to ``get_objects_from_frame``.
        min_confidence: Confidence threshold forwarded to
            ``get_objects_from_frame``.
        frame_interval: Step between sampled frame numbers (default 5).
            Must be a positive integer.

    Returns:
        set: The distinct names found, or ``None`` if the video could not be
        opened. If a frame cannot be read, sampling stops and the names
        gathered so far are returned.

    Raises:
        ValueError: If ``frame_interval`` is less than 1.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be a positive integer")

    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error opening video file")
        return

    detected_objects = set()
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        for frame_number in range(0, frame_count, frame_interval):
            # Jump to the sampled frame number before reading it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()

            if not ret:
                print(f"Error reading frame {frame_number}")
                break

            # Merge this frame's names into the running set of distinct names.
            detected_objects.update(
                get_objects_from_frame(model, frame, min_confidence)
            )
    finally:
        # Release the capture handle even if prediction raises part-way through.
        cap.release()

    print(detected_objects)
    return detected_objects

## Run video classification

In [39]:
# Classify sampled frames of the local test clip, keeping classes with confidence above 0.9.
detect_objects_in_video("./test.mp4", model, 0.9)


0: 224x224 toilet_tissue 0.88, paper_towel 0.12, mortar 0.00, pot 0.00, oil_filter 0.00, 92.8ms
Speed: 1.4ms preprocess, 92.8ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 toilet_tissue 0.93, paper_towel 0.07, mortar 0.00, toilet_seat 0.00, oil_filter 0.00, 265.1ms
Speed: 32.9ms preprocess, 265.1ms inference, 0.1ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 toilet_tissue 0.88, paper_towel 0.12, mortar 0.00, toilet_seat 0.00, oil_filter 0.00, 106.9ms
Speed: 6.5ms preprocess, 106.9ms inference, 0.1ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 toilet_tissue 0.86, paper_towel 0.14, oil_filter 0.00, toilet_seat 0.00, mortar 0.00, 93.1ms
Speed: 1.4ms preprocess, 93.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 toilet_tissue 0.87, paper_towel 0.13, oil_filter 0.00, mortar 0.00, dough 0.00, 96.1ms
Speed: 1.8ms preprocess, 96.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

{'toilet_tissue'}


{'toilet_tissue'}