In [None]:
import cv2
import cvzone
import math
import tkinter as tk
from tkinter import filedialog
from threading import Thread
from ultralytics import YOLO

class ObjectDetectionApp:
    """Tkinter front end that runs YOLO object detection on a webcam or video file.

    Two buttons choose the source. Frames are read, annotated and displayed in
    an OpenCV window by a background worker thread so the Tk event loop stays
    responsive. Pressing ``q`` in the OpenCV window stops the current run.
    """

    # Labels for the 80 COCO class ids emitted by the pretrained YOLOv8 weights,
    # in id order. The position in this tuple IS the class id, so entries must
    # never be dropped or inserted: a single missing name mislabels every id
    # after it.
    CLASS_NAMES = (
        "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
        "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
        "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "sofa", "potted plant", "bed", "dining table", "toilet", "TV monitor", "laptop", "mouse",
        "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
        "book", "clock", "vase", "scissors", "teddy bear", "hair dryer", "toothbrush",
    )

    def __init__(self, root):
        """Build the widgets on ``root`` and load the detection model.

        Args:
            root: The Tk root window that hosts the heading and the buttons.
        """
        self.root = root
        self.root.title("Object Detection GUI")
        self.root.configure(bg='black')  # Set background color to black

        self.font = ("Helvetica", 12, "bold")  # Font style for buttons, change as needed
        self.heading_font = ("Helvetica", 16, "bold")  # Font style for heading

        self.heading_label = tk.Label(root, text="Object Detection Application", bg='black', fg='orange', font=self.heading_font)
        self.heading_label.pack(pady=20)

        self.webcam_button = tk.Button(root, text="Use Webcam", command=self.use_webcam, bg='orange', fg='black', font=self.font)
        self.webcam_button.pack(pady=20)

        self.video_button = tk.Button(root, text="Select Video File", command=self.select_video, bg='orange', fg='black', font=self.font)
        self.video_button.pack(pady=20)

        self.cap = None          # cv2.VideoCapture of the current source, if any
        self.is_running = False  # True while a worker is (supposed to be) looping
        self._worker = None      # Thread running detect_objects, if any
        self.model = YOLO('yolov8l.pt')
        self.classNames = list(self.CLASS_NAMES)

    def use_webcam(self):
        """Stop any current run and start detection on camera index 0 at 640x480."""
        if self.is_running:
            self.stop_capture()
        self.cap = cv2.VideoCapture(0)
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
        self.start_detection()

    def select_video(self):
        """Stop any current run, ask for a video file and start detection on it.

        If the file dialog is cancelled nothing new is started.
        """
        if self.is_running:
            self.stop_capture()
        video_path = filedialog.askopenfilename()
        if video_path:
            self.cap = cv2.VideoCapture(video_path)
            self.start_detection()

    def start_detection(self):
        """Launch the detection worker thread unless one is already running."""
        if not self.is_running:
            self.is_running = True
            # Daemon thread: closing the Tk window must not leave the process
            # alive waiting for the video to end or for 'q' to be pressed.
            self._worker = Thread(target=self.detect_objects, daemon=True)
            self._worker.start()

    def stop_capture(self):
        """Stop the worker (if any), release the capture and close OpenCV windows.

        Intended to be called from the GUI thread. It waits for the worker to
        leave its loop before returning, so a subsequent ``start_detection``
        never has two threads reading from the same capture.
        """
        # Signal first, then wait: the worker checks this flag every frame.
        self.is_running = False
        worker = self._worker
        self._worker = None
        if worker is not None and worker.is_alive():
            worker.join()
        if self.cap is not None and self.cap.isOpened():
            self.cap.release()
        cv2.destroyAllWindows()

    def _class_label(self, cls):
        """Return the display name for class id ``cls``.

        Falls back to ``"class <id>"`` for ids outside ``self.classNames`` so
        an unexpected id cannot crash the worker thread with ``IndexError``.
        """
        names = self.classNames
        if 0 <= cls < len(names):
            return names[cls]
        return f"class {cls}"

    def _annotate(self, img, results):
        """Draw a corner box and a ``"<label> <confidence>"`` tag on ``img`` for each detection."""
        for r in results:
            for box in r.boxes:
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
                cvzone.cornerRect(img, (x1, y1, x2 - x1, y2 - y1))
                # Confidence rounded up to two decimals.
                conf = math.ceil(box.conf[0] * 100) / 100
                label = self._class_label(int(box.cls[0]))
                # Clamp the text origin so the tag stays inside the frame.
                cvzone.putTextRect(img, f'{label} {conf}', (max(0, x1), max(35, y1)), scale=1, thickness=1)

    def detect_objects(self):
        """Worker loop: read frames, run the model, annotate and display them.

        Runs until the source is exhausted, ``q`` is pressed in the OpenCV
        window, or ``is_running`` is cleared by ``stop_capture``. Expects
        ``is_running`` to be True on entry, as set by ``start_detection``.
        """
        # Bind the capture once: if the GUI thread swaps self.cap for a new
        # source, this worker must keep using (and finally release) its own.
        cap = self.cap
        try:
            while self.is_running and cap is not None and cap.isOpened():
                success, img = cap.read()
                if not success:
                    break

                # stream=True yields results lazily; _annotate consumes them.
                self._annotate(img, self.model(img, stream=True))

                cv2.imshow("Object Detection", img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            if cap is not None and cap.isOpened():
                cap.release()
            cv2.destroyAllWindows()
            # Cleared last so a GUI thread that observes False knows this
            # worker's cleanup has already happened.
            self.is_running = False

if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the application
    # to it, then hand control to Tk's event loop until the window is closed.
    app = ObjectDetectionApp(tk.Tk())
    root = app.root  # the same Tk instance that was passed to the constructor
    root.mainloop()



0: 480x640 1 person, 940.9ms
Speed: 15.4ms preprocess, 940.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 683.7ms
Speed: 5.6ms preprocess, 683.7ms inference, 15.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 675.1ms
Speed: 0.0ms preprocess, 675.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 659.6ms
Speed: 0.0ms preprocess, 659.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 691.0ms
Speed: 0.0ms preprocess, 691.0ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 674.9ms
Speed: 0.0ms preprocess, 674.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cell phone, 707.8ms
Speed: 0.0ms preprocess, 707.8ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cell phone, 756.6ms
Speed: 0.0ms preprocess, 756.