In [9]:
pip install opencv-contrib-python

Note: you may need to restart the kernel to use updated packages.


In [10]:
import cv2
import numpy as np
import tensorflow as tf
from ultralytics import YOLO  # Ultralytics YOLO wrapper (loads PyTorch .pt weights, not TensorFlow)

class ObjectDetection:
    """
    Run an Ultralytics YOLOv10 model on a webcam feed and display detections.

    The model is loaded through ``ultralytics.YOLO`` from a ``.pt`` weights
    file. TensorFlow/Keras is not used by this class.
    """

    def __init__(self, out_file="abc.mp4"):
        """
        Load the model and set up the class-name lookup table.

        :param out_file: Output file name. It is stored on the instance, but
            no method of this class currently writes to it.
        """
        self.model = self.load_model()
        self.out_file = out_file
        # Index -> display name for the 80 class ids the detector emits.
        # The position in this list is the class id.
        self.class_names = [
            "person", "bicycle", "car", "motorbike", "aeroplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant",
            "stop sign", "parking meter", "bench", "bird", "cat", "dog",
            "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
            "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
            "hot dog", "pizza", "donut", "cake", "chair", "sofa",
            "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
            "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier",
            "toothbrush",
        ]

    def get_video(self):
        """
        Open the default webcam (device index 0).

        :return: OpenCV video capture object.
        """
        return cv2.VideoCapture(0)

    def load_model(self):
        """
        Load the YOLOv10 "m" (medium) weights through Ultralytics.

        :return: Loaded YOLO model.
        """
        # 'yolov10m.pt' is the general-purpose, heavier variant; switch to
        # 'yolov10n.pt' when a lighter, more resource-friendly model is needed.
        model = YOLO('yolov10m.pt')
        return model

    def score_frame(self, frame):
        """
        Run the model on a single frame.

        :param frame: Input frame in numpy format.
        :return: Tuple ``(labels, cord)`` of numpy arrays: the class id of
            each detection and its ``x1, y1, x2, y2`` box corners.
        """
        results = self.model.predict(frame)
        predictions = results[0].boxes
        # Move tensors to host memory first: calling .numpy() directly on a
        # tensor that lives on a GPU raises an error.
        labels = predictions.cls.cpu().numpy()
        cord = predictions.xyxy.cpu().numpy()
        return labels, cord

    def _class_name(self, label):
        """
        Map a class id to its display name.

        :param label: Class id (int, float or numpy scalar).
        :return: The matching entry of ``self.class_names``, or the id as a
            string when it falls outside the table (instead of raising
            IndexError or wrapping around for negative ids).
        """
        index = int(label)
        if 0 <= index < len(self.class_names):
            return self.class_names[index]
        return str(index)

    def plot_boxes(self, results, frame):
        """
        Draw the bounding boxes and class labels on the frame (in place).

        :param results: Tuple ``(labels, cord)`` as returned by ``score_frame``.
        :param frame: Input frame in numpy format.
        :return: The same frame, with boxes and labels drawn on it.
        """
        labels, cord = results
        color = (0, 255, 0)
        for label, box in zip(labels, cord):
            x1, y1, x2, y2 = map(int, box)
            class_name = self._class_name(label)
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            # Keep the caption inside the image when the box touches the top
            # edge; otherwise y1 - 10 would place the text off-screen.
            text_y = max(y1 - 10, 20)
            cv2.putText(frame, class_name, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
        return frame

    def run(self):
        """
        Run object detection on the webcam feed until the stream ends or the
        user presses 'q' in the preview window.

        The capture device and preview window are released even when the loop
        is interrupted (for example by KeyboardInterrupt or an inference error).
        """
        cap = self.get_video()
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                results = self.score_frame(frame)
                frame = self.plot_boxes(results, frame)

                cv2.imshow('YOLOv10 Object Detection', frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera; otherwise it stays locked for the
            # lifetime of the process/kernel after an interrupted run.
            cap.release()
            cv2.destroyAllWindows()

# Example usage
# Runs only when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    # Constructing the detector loads the model with the default out_file.
    detector = ObjectDetection()
    # Blocks until the video stream ends or 'q' is pressed in the preview window.
    detector.run()


0: 480x640 1 person, 1 bed, 348.1ms
Speed: 2.0ms preprocess, 348.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 395.9ms
Speed: 2.0ms preprocess, 395.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 bed, 310.2ms
Speed: 2.0ms preprocess, 310.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 357.6ms
Speed: 2.0ms preprocess, 357.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 341.1ms
Speed: 3.0ms preprocess, 341.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 306.2ms
Speed: 2.0ms preprocess, 306.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 301.2ms
Speed: 2.0ms preprocess, 301.2ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 2 beds, 372.5ms
Speed: 3.0ms

KeyboardInterrupt: 

: 