In [None]:
# %pip install ultralytics mss opencv-python matplotlib pygetwindow pyautogui torch torchvision Pillow

## RESNET MODEL

In [None]:
import webbrowser
import pygetwindow as gw
import mss
import numpy as np
import torch
import torch.nn as nn
import cv2
from PIL import Image
from torchvision import models, transforms
import time

In [4]:
# === Model & Device ===
IMG_SIZE = 224
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CLASSES = [
    "front_left_door_closed", "front_left_door_open",
    "front_right_door_closed", "front_right_door_open",
    "hood_closed", "hood_open",
    "rear_left_door_closed", "rear_left_door_open",
    "rear_right_door_closed", "rear_right_door_open"
]

transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

model = models.resnet18(pretrained=False)
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, len(CLASSES)),
    nn.Sigmoid()
)
model.load_state_dict(torch.load("multi_label_carparts_resnet18.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()

# === Open Browser ===
url = "http://103.233.100.26:8080/"
webbrowser.open(url)

print("⌛ Waiting for the browser window to appear...")

window_title = None
while window_title is None:
    windows = gw.getWindowsWithTitle(url)
    if not windows:
        windows = [w for w in gw.getWindowsWithTitle(" - Google Chrome") if "Car Control Simulation" in w.title]
    if windows:
        win = windows[0]
        window_title = win.title
    else:
        time.sleep(1)  # wait and try again

print(f"✅ Found window: {window_title}")
win = gw.getWindowsWithTitle(window_title)[0]

# === Inference Loop ===
with mss.mss() as sct:
    while True:
        bbox = {
            "top": win.top,
            "left": win.left,
            "width": win.width,
            "height": win.height
        }
        screen = sct.grab(bbox)
        img = Image.frombytes("RGB", screen.size, screen.rgb)

        # Preprocess
        input_tensor = transform(img).unsqueeze(0).to(DEVICE)

        # Inference
        with torch.no_grad():
            outputs = model(input_tensor)
            preds = (outputs > 0.5).float().squeeze()

        # Draw predictions on image
        frame = np.array(img)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        y = 30
        for i, cls in enumerate(CLASSES):
            if preds[i] == 1:
                # Determine status and symbol
                status = "open"
                color = (0, 255, 0)  # Green for open
                symbol = "O"  # Green check for open
                if "closed" in cls:
                    status = "closed"
                    color = (0, 0, 255)  # Red for closed
                    symbol = "X"  # Red cross for closed

                # Prepare the label to display (e.g., hood: open/closed)
                text = f"{' '.join(cls.split('_')[:-1])}: {status}"

                font = cv2.FONT_HERSHEY_SIMPLEX
                font_scale = 0.7
                thickness = 2

                # Text size to calculate the box width
                text_size, _ = cv2.getTextSize(text, font, font_scale, thickness)
                text_w, text_h = text_size
                x, y_pos = 10, y

                # Draw filled rectangle (background box)
                cv2.rectangle(frame, (x - 5, y_pos - text_h - 5), (x + text_w + 5, y_pos + 5), (0, 0, 0), -1)

                # Draw the status text
                cv2.putText(frame, text, (x, y_pos), font, font_scale, (255, 255, 255), thickness)

                # Draw the symbol (✓ or ✘) in front of the text
                cv2.putText(frame, symbol, (x + text_w + 10, y_pos), font, font_scale, color, thickness)

                # Adjust vertical position for the next line
                y += text_h + 15

        # Show result
        cv2.imshow("🚗 Predicted Car State", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cv2.destroyAllWindows()


  model.load_state_dict(torch.load("multi_label_carparts_resnet18.pt", map_location=DEVICE))


⌛ Waiting for the browser window to appear...
✅ Found window: Car Control Simulation - Google Chrome


## YOLO OBJECT DETECTION MODEL

In [30]:
import pygetwindow as gw
from ultralytics import YOLO
import mss
import numpy as np
import cv2
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
import cv2

In [31]:
# Debug monitor info
with mss.mss() as sct:
    for monitor in sct.monitors:
        print(monitor)

{'left': 0, 'top': 0, 'width': 3840, 'height': 1200}
{'left': 0, 'top': 0, 'width': 1920, 'height': 1200}
{'left': 1920, 'top': 116, 'width': 1920, 'height': 1080}


In [35]:
# Load your custom YOLO model
model = YOLO(r"D:\School\Semester 8\PENGANTAR DEEP LEARNING\UTS\runs\detect\car-object-detection\weights\best.pt")

# === Open Browser ===
url = "http://103.233.100.26:8080/"
driver = webdriver.Chrome()

print("⌛ Waiting for the browser window to appear...")

driver.get(url)

window_title = None
while window_title is None:
    windows = gw.getWindowsWithTitle(url)
    if not windows:
        windows = [w for w in gw.getWindowsWithTitle(" - Google Chrome") if "Car Control Simulation" in w.title]
    if windows:
        win = windows[0]
        window_title = win.title
    else:
        time.sleep(1)  # wait and try again

print(f"✅ Found window: {window_title}")
win = gw.getWindowsWithTitle(window_title)[0]

⌛ Waiting for the browser window to appear...
✅ Found window: Car Control Simulation - Google Chrome


In [36]:
# Get canvas position and size

def get_canvas_bbox(driver):
    # Get canvas position and size relative to browser viewport
    canvas = driver.find_element(By.TAG_NAME, "canvas")
    canvas_rect = driver.execute_script("""
        const rect = arguments[0].getBoundingClientRect();
        return {
            x: rect.left,
            y: rect.top,
            width: rect.width,
            height: rect.height
        };
    """, canvas)

    # Get the window's screen position
    import pygetwindow as gw
    win_title_part = "Car Control"
    win = next((w for w in gw.getWindowsWithTitle("") if win_title_part in w.title), None)
    if not win:
        raise Exception(f"Window with title containing '{win_title_part}' not found.")
    win_x, win_y = win.left, win.top
    
    offset_y = 180  # Browser offset y axis
    offset_x = 2  # Browser offset x axis

    # Handle DPI scaling
    dpi_scale = driver.execute_script("return window.devicePixelRatio")
    dpi_scale = dpi_scale * 1.01

    # Convert to absolute screen coordinates
    left = int(win_x + canvas_rect['x'] * dpi_scale + offset_x)
    top = int(win_y + canvas_rect['y'] * dpi_scale + offset_y)
    right = int(left + canvas_rect['width'] * dpi_scale)
    bottom = int(top + canvas_rect['height'] * dpi_scale)

    print(f"🖥️ Screen BBOX for mss: ({left}, {top}, {right}, {bottom})")
    return (left, top, right, bottom)

bbox = get_canvas_bbox(driver)


🖥️ Screen BBOX for mss: (186, 268, 1737, 1049)


In [37]:
with mss.mss() as sct:
    try:
        cv2.namedWindow("YOLO Detection", cv2.WINDOW_NORMAL)
        # cv2.resizeWindow("YOLO Detection", 1280, 720)

        while True:
            screenshot = sct.grab(bbox)
            frame = np.array(screenshot)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)

            results = model(frame)
            annotated = results[0].plot()

            cv2.imshow("YOLO Detection", annotated)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    except KeyboardInterrupt:
        print("🛑 Stopped by user.")

cv2.destroyAllWindows()



0: 352x640 (no detections), 78.5ms
Speed: 5.3ms preprocess, 78.5ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 18.9ms
Speed: 1.7ms preprocess, 18.9ms inference, 0.9ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 15.8ms
Speed: 2.0ms preprocess, 15.8ms inference, 0.9ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 15.7ms
Speed: 1.7ms preprocess, 15.7ms inference, 0.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 15.9ms
Speed: 1.6ms preprocess, 15.9ms inference, 0.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 17.9ms
Speed: 2.3ms preprocess, 17.9ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 17.2ms
Speed: 2.0ms preprocess, 17.2ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 24.8ms
Speed: 1.5ms preprocess, 24.8ms i