### Tool Detection with YOLOv10

In [1]:
from ultralytics import YOLOv10
import supervision as sv

# Load the YOLOv10 detector from a local checkpoint file. The resulting
# module-level object is what detect_tools() calls for inference.
model_yolo = YOLOv10("chkpts/YOLOv10/yolov10x.pt")


def detect_tools(image):
    """Run the YOLO detector on one input and return its boxes as plain rows.

    Args:
        image: Whatever ``model_yolo`` accepts; this file passes both an image
            path and frames read with OpenCV.

    Returns:
        A list with one ``[x1, y1, x2, y2, conf, cls]`` row per detected box.
        The first four values come from ``boxes.xyxy``, followed by the
        confidence and the class index. The list is empty when the detector
        reports no boxes.
    """
    # Only the first entry of the result list is read.
    boxes = model_yolo(image, verbose=False)[0].boxes

    # Convert each tensor once, outside the loop. The explicit ``.cpu()`` keeps
    # this working when inference ran on a GPU, where ``.numpy()`` alone raises.
    corners = boxes.xyxy.cpu().numpy()
    confidences = boxes.conf.cpu().numpy()
    class_ids = boxes.cls.cpu().numpy()

    return [
        [x1, y1, x2, y2, conf, cls]
        for (x1, y1, x2, y2), conf, cls in zip(corners, confidences, class_ids)
    ]

# Smoke-test the detector on a single training image. The cell output below
# shows the returned rows in [x1, y1, x2, y2, conf, cls] order.
detect_tools("data/ART-Net/Train/Train_Positive/Train_Pos_sample_0010.png")

[[0.9185486, 133.97276, 915.95306, 467.31335, 0.35837874, 43.0]]

### Dealing with SSL Issues

In [2]:
import os
import certifi

# Point SSL_CERT_FILE at the CA bundle shipped with certifi for this process.
# NOTE(review): presumably needed so HTTPS downloads (e.g. pretrained weights)
# can verify certificates on this machine — confirm it is still required.
os.environ["SSL_CERT_FILE"] = certifi.where()

### Direction Estimation with Siamese Network

In [3]:
import torch
import torch.nn as nn
import torchvision.models as models

class ResNetSiamese(nn.Module):
    """ResNet-50 backbone followed by self-attention and two linear heads.

    The backbone's final fully connected layer is replaced with an identity,
    so the pooled backbone features are used directly. They are split into
    512-wide tokens, passed through multi-head self-attention, averaged, and
    fed to:

    * a direction head, and
    * an operator head that additionally receives the one-hot tool category.

    Args:
        pretrained: Forwarded to ``torchvision.models.resnet50``.
        num_directions: Number of direction classes (default 4).
        num_categories: Number of tool categories used for the one-hot
            encoding (default 7).
        num_operators: Number of operator classes (default 4).
    """

    # Token width used for attention and as input size of both heads.
    EMBED_DIM = 512

    def __init__(
        self, pretrained=True, num_directions=4, num_categories=7, num_operators=4
    ):
        super().__init__()
        self.num_categories = num_categories

        # NOTE(review): newer torchvision releases deprecate ``pretrained=`` in
        # favour of ``weights=``; kept as-is so older installs keep working.
        self.resnet = models.resnet50(pretrained=pretrained)
        self.resnet.fc = nn.Identity()  # Remove the final fully connected layer

        # batch_first=True is required because forward() feeds tensors shaped
        # (batch, tokens, embed). With the default (batch_first=False) the
        # batch axis would be treated as the sequence axis and attention would
        # mix information between different samples of the same batch.
        self.attention = nn.MultiheadAttention(
            embed_dim=self.EMBED_DIM, num_heads=8, batch_first=True
        )
        self.fc_direction = nn.Linear(self.EMBED_DIM, num_directions)
        self.fc_operator = nn.Linear(self.EMBED_DIM + num_categories, num_operators)

    def forward(self, x, category):
        """Compute direction and operator logits for a batch.

        Args:
            x: Image batch accepted by the ResNet backbone; the first
                dimension is the batch size.
            category: Integer tensor of shape ``(batch,)`` holding the tool
                category index of each sample (must be valid for
                ``one_hot`` with ``num_categories`` classes).

        Returns:
            Tuple ``(direction_output, operator_output)`` of logits with
            shapes ``(batch, num_directions)`` and ``(batch, num_operators)``.
        """
        batch_size = x.size(0)
        features = self.resnet(x)
        # Split each sample's feature vector into EMBED_DIM-wide tokens.
        tokens = features.view(batch_size, -1, self.EMBED_DIM)
        attn_output, _ = self.attention(tokens, tokens, tokens)

        # Average over the token axis to get one vector per sample; both heads
        # share this pooled representation.
        pooled = attn_output.mean(dim=1)

        # Direction estimation
        direction_output = self.fc_direction(pooled)

        # Operator estimation: append the one-hot tool category to the features.
        category_one_hot = torch.nn.functional.one_hot(
            category, num_classes=self.num_categories
        ).float()
        operator_output = self.fc_operator(
            torch.cat((pooled, category_one_hot), dim=1)
        )

        return direction_output, operator_output


# Build the network with its default arguments (pretrained=True backbone).
model_siamese = ResNetSiamese()



In [4]:
def train_direction_estimator(model, dataloader, criterion, optimizer, epochs=10):
    """Train ``model`` with a basic supervised loop.

    Args:
        model: Module called as ``model(inputs)``; switched to training mode.
        dataloader: Iterable yielding ``(inputs, targets)`` pairs.
        criterion: Callable ``criterion(outputs, targets)`` returning a loss
            that supports ``.backward()``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        epochs: Number of full passes over ``dataloader``.

    Returns:
        None. The model's parameters are updated in place by the optimizer.

    NOTE(review): ``ResNetSiamese.forward`` in this file takes ``(x, category)``
    and returns a tuple, so it cannot be passed here unchanged — confirm which
    model this loop is meant for.
    """
    model.train()
    for _ in range(epochs):
        for batch in dataloader:
            inputs, targets = batch
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()

### Harmonizing Bipartite Graph Matching (HBGM)

In [5]:
import os
import numpy as np

def calculate_cost_matrix(tracklets, detections):
    """Build the tracklet-by-detection assignment cost matrix.

    Args:
        tracklets: Sequence of boxes currently being tracked.
        detections: Sequence of boxes detected in the current frame.

    Returns:
        A NumPy array of shape ``(len(tracklets), len(detections))`` whose
        entry ``[i, j]`` is ``1 - iou(tracklets[i], detections[j])``, so a
        higher overlap means a lower cost.
    """
    costs = np.zeros((len(tracklets), len(detections)))
    # Fill one row per tracklet; each ``cost_row`` is a view into ``costs``.
    for cost_row, track in zip(costs, tracklets):
        cost_row[:] = [1 - iou(track, det) for det in detections]
    return costs


def iou(box1, box2):
    """Return the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        box1: Sequence whose first four items are ``x1, y1, x2, y2``. Any
            further items (this file appends confidence and class) are ignored.
        box2: Second box, same layout as ``box1``.

    Returns:
        ``intersection_area / union_area``, or ``0.0`` when the union has no
        area (e.g. both boxes are degenerate), which would otherwise divide
        by zero.
    """
    x1, y1, x2, y2 = box1[:4]
    x1_, y1_, x2_, y2_ = box2[:4]

    # Clamp at 0 so boxes that do not overlap contribute no intersection.
    inter_width = max(0, min(x2, x2_) - max(x1, x1_))
    inter_height = max(0, min(y2, y2_) - max(y1, y1_))
    inter_area = inter_width * inter_height

    box1_area = (x2 - x1) * (y2 - y1)
    box2_area = (x2_ - x1_) * (y2_ - y1_)
    union_area = box1_area + box2_area - inter_area

    if union_area <= 0:
        return 0.0
    return inter_area / union_area


def bipartite_graph_matching(tracklets, detections):
    """Match tracklets to detections by minimising the total assignment cost.

    Args:
        tracklets: Sequence of boxes currently being tracked.
        detections: Sequence of boxes detected in the current frame.

    Returns:
        Tuple ``(row_ind, col_ind)`` from SciPy's ``linear_sum_assignment``:
        tracklet ``row_ind[k]`` is paired with detection ``col_ind[k]``.
    """
    assignment_costs = calculate_cost_matrix(tracklets, detections)

    # SciPy is imported lazily here, only when a matching is actually solved.
    from scipy.optimize import linear_sum_assignment

    matched_rows, matched_cols = linear_sum_assignment(assignment_costs)
    return matched_rows, matched_cols


def update_tracklets(tracklets, detections, row_ind, col_ind):
    """Refresh matched tracklets and start new ones for unmatched detections.

    Args:
        tracklets: List of tracked boxes; modified in place.
        detections: Boxes detected in the current frame.
        row_ind: Indices into ``tracklets`` of the matched pairs.
        col_ind: Indices into ``detections`` of the matched pairs, aligned
            with ``row_ind``.

    Returns:
        The same ``tracklets`` list. Matched entries are replaced by their
        detection; detections that were not matched are appended, so objects
        that first appear in a later frame get tracked too. Existing tracklet
        positions in the list are never shifted.
    """
    matched_detections = set()
    for r, c in zip(row_ind, col_ind):
        tracklets[r] = detections[c]
        # Normalise to int so NumPy index types compare cleanly below.
        matched_detections.add(int(c))

    tracklets.extend(
        det for j, det in enumerate(detections) if j not in matched_detections
    )
    return tracklets

### Full Pipeline Integration

In [6]:
import cv2


def process_video(video_path):
    """Detect and track tools frame by frame in a video, showing the result.

    Each frame is run through ``detect_tools``; detections are matched to the
    current tracklets, drawn onto the frame with their confidence, and the
    frame is displayed in an OpenCV window. Press ``q`` to stop early.

    Args:
        video_path: Path (or other source) passed to ``cv2.VideoCapture``.

    Returns:
        None.
    """
    cap = cv2.VideoCapture(video_path)
    tracklets = []  # Initialize empty tracklets

    # try/finally guarantees the capture and windows are released even when
    # detection or matching raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            detections = detect_tools(frame)  # Detect tools in this frame

            if tracklets:
                # Match existing tracklets with the new detections, then update.
                row_ind, col_ind = bipartite_graph_matching(tracklets, detections)
                tracklets = update_tracklets(tracklets, detections, row_ind, col_ind)
            else:
                # Copy so later in-place tracklet updates never touch a
                # detections list.
                tracklets = list(detections)

            # Draw every detection and its confidence on the frame.
            for x1, y1, x2, y2, conf, _ in detections:
                # OpenCV drawing functions need integer pixel coordinates.
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(
                    frame,
                    f"{conf:.2f}",
                    (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.9,
                    (36, 255, 12),
                    2,
                )

            cv2.imshow("Frame", frame)
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

### Running using a directory

In [9]:
from natsort import natsorted


def preprocess_image(image, bbox):
    """Crop a box out of an image, pad it, and resize it to 224x224.

    Args:
        image: Three-dimensional image array indexed as ``[row, column,
            channel]`` (the padding spec below has exactly three axes).
        bbox: ``(x, y, w, h)`` — left column, top row, width and height of
            the crop. Values are truncated to integers.

    Returns:
        The crop, padded with 10 pixels of constant border on each side of the
        two spatial axes and resized to ``(224, 224)`` with ``cv2.resize``.
    """
    # Slicing needs integers; detector output in this file is floating point.
    x, y, w, h = (int(v) for v in bbox)

    # Clamp at 0: a negative start index would otherwise wrap around and
    # silently select pixels from the opposite edge of the image.
    top, left = max(y, 0), max(x, 0)
    bottom, right = max(y + h, 0), max(x + w, 0)

    cropped_image = image[top:bottom, left:right]
    padded_image = np.pad(cropped_image, ((10, 10), (10, 10), (0, 0)), "constant")
    resized_image = cv2.resize(padded_image, (224, 224))
    return resized_image


def process_directory(directory_path, output_path="output.mp4", max_images=100, fps=30):
    """Detect and track tools over a directory of images and write a video.

    Image files are processed in natural sort order. Each one is run through
    ``detect_tools``, detections are matched to the current tracklets, boxes
    and confidences are drawn on the frame, and the frame is appended to the
    output video.

    Args:
        directory_path: Directory containing ``.png`` / ``.jpg`` / ``.jpeg``
            files (extension match is case-insensitive).
        output_path: Destination video file (default ``"output.mp4"``).
        max_images: Only the first ``max_images`` files are used (default 100).
        fps: Frame rate of the written video (default 30).

    Returns:
        None. No video is written when the directory holds no readable image.
    """
    image_files = natsorted(
        [
            name
            for name in os.listdir(directory_path)
            if name.lower().endswith((".png", ".jpg", ".jpeg"))
        ]
    )[:max_images]

    tracklets = []
    writer = None
    size = None

    # try/finally makes sure the output file is finalised even if detection or
    # matching raises on some frame.
    try:
        for image_file in image_files:
            frame = cv2.imread(os.path.join(directory_path, image_file))
            if frame is None:
                # cv2.imread signals an unreadable/corrupt file by returning None.
                continue

            detections = detect_tools(frame)

            if tracklets:
                row_ind, col_ind = bipartite_graph_matching(tracklets, detections)
                tracklets = update_tracklets(tracklets, detections, row_ind, col_ind)
            else:
                # Copy so later in-place tracklet updates never touch a
                # detections list.
                tracklets = list(detections)

            # Draw every detection and its confidence on the frame.
            for x1, y1, x2, y2, conf, _ in detections:
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(
                    frame,
                    f"{conf:.2f}",
                    (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.9,
                    (36, 255, 12),
                    2,
                )

            if writer is None:
                # The first readable frame fixes the video size.
                height, width = frame.shape[:2]
                size = (width, height)
                writer = cv2.VideoWriter(
                    output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, size
                )
            elif (frame.shape[1], frame.shape[0]) != size:
                # The writer expects every frame to match the size it was
                # opened with, so bring odd-sized images to that size.
                frame = cv2.resize(frame, size)

            # Stream frames straight to disk instead of buffering them all.
            writer.write(frame)
    finally:
        if writer is not None:
            writer.release()

In [10]:
def main_pipeline(input_path):
    """Dispatch ``input_path`` to the matching processing routine.

    Args:
        input_path: A directory is handled by ``process_directory``; anything
            else is handed to ``process_video``.

    Returns:
        None.
    """
    handler = process_directory if os.path.isdir(input_path) else process_video
    handler(input_path)


# Run the pipeline on the ART-Net positive training images. main_pipeline
# routes this to process_directory as long as the path is an existing directory.
main_pipeline("data/ART-Net/Train/Train_Positive")