In [7]:
import cv2
from ultralytics import YOLO  # Import YOLO from ultralytics

class ObjectDetection:
    """
    Runs a YOLOv10 model (via ultralytics) on webcam frames and displays
    the detections in an OpenCV window.
    """

    def __init__(self, out_file="abc.mp4"):
        """
        Loads the model and stores the configuration.
        :param out_file: Output file name. It is stored on the instance, but
            no method of this class writes to it.
        """
        self.model = self.load_model()
        self.out_file = out_file
        # Display names indexed by the class id returned by the model.
        self.class_names = [
            "person", "bicycle", "car", "motorbike", "aeroplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant",
            "stop sign", "parking meter", "bench", "bird", "cat", "dog",
            "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
            "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
            "hot dog", "pizza", "donut", "cake", "chair", "sofa",
            "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
            "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier",
            "toothbrush",
        ]

    def load_model(self):
        """
        Loads YOLOv10 model from ultralytics.
        :return: Loaded YOLOv10 model.
        """
        return YOLO('yolov10n.pt')  # Load YOLOv10 model (nano version)

    def get_video(self):
        """
        Get webcam video from the device.
        :return: OpenCV video capture object for device index 0.
        """
        return cv2.VideoCapture(0)

    def score_frame(self, frame):
        """
        Run YOLO model on a single frame to detect objects.
        :param frame: Input frame in numpy format.
        :return: Tuple (labels, cord): numpy arrays holding the class id and
            the xyxy bounding box of each detection in the first result.
        """
        results = self.model(frame)
        predictions = results[0].boxes
        # Move tensors to host memory first: calling .numpy() directly on a
        # tensor that lives on a GPU raises. .cpu() is a no-op on CPU.
        labels = predictions.cls.cpu().numpy()  # Get labels
        cord = predictions.xyxy.cpu().numpy()  # Get bounding box coordinates
        return labels, cord

    def plot_boxes(self, results, frame):
        """
        Plots the bounding boxes and labels on the frame (drawn in place).
        :param results: Tuple (labels, cord) as returned by score_frame.
        :param frame: Input frame in numpy format.
        :return: The same frame with bounding boxes and labels drawn on it.
        """
        labels, cord = results
        for label, box in zip(labels, cord):
            x1, y1, x2, y2 = map(int, box[:4])
            class_id = int(label)
            # Fall back to the numeric id when the model reports a class that
            # has no entry in class_names (e.g. load_model was overridden).
            if class_id < len(self.class_names):
                class_name = self.class_names[class_id]
            else:
                class_name = str(class_id)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, class_name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        return frame

    def run(self):
        """
        Runs the object detection on the webcam video feed.

        Loops until a frame cannot be read or 'q' is pressed in the window.
        The capture device and the display windows are released even when the
        loop is aborted by an exception such as KeyboardInterrupt.
        """
        cap = self.get_video()
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                results = self.score_frame(frame)
                frame = self.plot_boxes(results, frame)

                cv2.imshow('YOLOv10 Object Detection', frame)

                # Mask to the low byte so the comparison with ord('q') is
                # not affected by higher bits in the waitKey return value.
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the webcam and close the window; otherwise an
            # interrupted run leaves the device held by this process.
            cap.release()
            cv2.destroyAllWindows()

# Example usage: when executed as a script (or notebook cell), build a
# detector with the default output file name and start the webcam loop.
# run() returns when a frame cannot be read or 'q' is pressed in the window.
if __name__ == "__main__":
    detector = ObjectDetection()
    detector.run()


0: 480x640 1 person, 128.7ms
Speed: 3.0ms preprocess, 128.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 87.8ms
Speed: 2.0ms preprocess, 87.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 91.8ms
Speed: 1.0ms preprocess, 91.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 92.7ms
Speed: 2.0ms preprocess, 92.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 110.7ms
Speed: 1.0ms preprocess, 110.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 176.5ms
Speed: 1.0ms preprocess, 176.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 114.7ms
Speed: 2.0ms preprocess, 114.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 ce

KeyboardInterrupt: 