In [1]:
import cv2
from ultralytics import YOLO  # Import YOLO from ultralytics

class ObjectDetection:
    """
    Runs a YOLOv10 model (via ultralytics) on webcam frames and draws the
    detections that belong to the "bird" class.
    """

    def __init__(self):
        """
        Loads the detection model and sets up the class-name lookup table.
        """
        self.model = self.load_model()
        # Position in this list is the class id used to translate predicted
        # labels into names. NOTE(review): presumably mirrors the class order
        # of the pretrained weights -- verify against ``self.model.names``.
        self.class_names = [
            "person", "bicycle", "car", "motorbike", "aeroplane", "bus",
            "train", "truck", "boat", "traffic light", "fire hydrant",
            "stop sign", "parking meter", "bench", "bird", "cat", "dog",
            "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
            "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket",
            "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
            "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
            "hot dog", "pizza", "donut", "cake", "chair", "sofa",
            "pottedplant", "bed", "diningtable", "toilet", "tvmonitor",
            "laptop", "mouse", "remote", "keyboard", "cell phone",
            "microwave", "oven", "toaster", "sink", "refrigerator", "book",
            "clock", "vase", "scissors", "teddy bear", "hair drier",
            "toothbrush",
        ]

    def load_model(self):
        """
        Loads YOLOv10 model from ultralytics.
        :return: Loaded YOLOv10 model.
        """
        return YOLO('yolov10n.pt')  # Load YOLOv10 model (nano version)

    def get_video(self):
        """
        Get webcam video from the device.
        :return: OpenCV video capture object opened on device index 0.
        """
        return cv2.VideoCapture(0)

    def score_frame(self, frame):
        """
        Run YOLO model on a single frame and keep only the bird detections.
        :param frame: Input frame in numpy format.
        :return: Tuple ``(labels, cord)`` of numpy arrays holding the class
                 ids and the ``xyxy`` box coordinates of the detected birds.
                 Both are empty when no bird is detected.
        """
        results = self.model(frame)
        predictions = results[0].boxes
        # Move tensors to host memory first: calling .numpy() directly on a
        # tensor that lives on a GPU raises, while .cpu() is a no-op on CPU.
        labels = predictions.cls.cpu().numpy()  # Class id per detection
        cord = predictions.xyxy.cpu().numpy()  # One xyxy box per detection

        # Filter results to only include birds (boolean mask keeps array
        # shapes intact even when nothing matches).
        bird_class_index = self.class_names.index("bird")
        is_bird = labels == bird_class_index

        return labels[is_bird], cord[is_bird]

    def plot_boxes(self, results, frame):
        """
        Plots the bounding boxes and labels on the frame.
        :param results: Labels and coordinates of objects detected.
        :param frame: Input frame in numpy format (drawn on in place).
        :return: Frame with bounding boxes and labels.
        """
        labels, cord = results
        for label, box in zip(labels, cord):
            x1, y1, x2, y2 = map(int, box[:4])
            class_name = self.class_names[int(label)]
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Put the caption above the box; if the box touches the top edge
            # the caption would fall outside the frame, so draw it just
            # inside the box instead.
            text_y = y1 - 10 if y1 - 10 >= 10 else y1 + 25
            cv2.putText(frame, class_name, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        return frame

    def run(self):
        """
        Runs the object detection on the webcam video feed.

        Loops until a frame cannot be read or the user presses ``q`` in the
        preview window.
        """
        cap = self.get_video()
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                results = self.score_frame(frame)
                frame = self.plot_boxes(results, frame)

                cv2.imshow('(Nama Robot) Eyes', frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always give the camera back and close the preview window, even
            # when the loop is aborted by an exception or KeyboardInterrupt.
            cap.release()
            cv2.destroyAllWindows()




# Script entry point: build the detector (which loads the model) and start
# the webcam loop. Guarded so importing this module does not open the camera.
if __name__ == "__main__":
    detector = ObjectDetection()
    detector.run()


0: 480x640 (no detections), 127.7ms
Speed: 4.0ms preprocess, 127.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 108.7ms
Speed: 2.0ms preprocess, 108.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 87.8ms
Speed: 1.0ms preprocess, 87.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 89.8ms
Speed: 2.0ms preprocess, 89.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 81.8ms
Speed: 2.0ms preprocess, 81.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 keyboard, 87.8ms
Speed: 1.0ms preprocess, 87.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 82.8ms
Speed: 2.0ms preprocess, 82.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 keyboard, 86.8ms
Speed: 1.0ms preprocess, 86.8ms inference, 0.0ms pos

KeyboardInterrupt: 