In [None]:
import os
import xml.etree.ElementTree as elemTree
from typing import Dict

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pyflow.pyflow as pyflow
import torch
import tqdm
from ultralytics import YOLO

In [None]:
def detect_objects(model, image, target_class=2):
    """
    Run the detector on one frame and return the boxes of a single class.

    :param model: Detector, called as ``model(image, verbose=False)``. The first element
        of its result must expose ``.boxes``, an iterable of boxes with ``.xyxy`` and ``.cls``.
    :param image: Frame to run inference on. ``None`` (e.g. a failed ``cap.read()``)
        yields an empty list instead of being forwarded to the model.
    :param target_class: Class id to keep. Defaults to 2
        (NOTE(review): presumably the COCO "car" class - confirm against the model's label map).
    :return: List of ``[x1, y1, x2, y2]`` boxes with integer coordinates.
    """
    # A missing frame has no detections; never hand None to the model.
    if image is None:
        return []

    # Perform inference without tracking gradients
    with torch.no_grad():
        results = model(image, verbose=False)

    # Keep only the boxes of the requested class, as plain integer lists
    return [
        box.xyxy[0].to(int).tolist()
        for box in results[0].boxes
        if box.cls == target_class
    ]

In [None]:
def voc_iou(pred, gt):
    """
    Intersection-over-union of two boxes, using the inclusive-pixel (+1) convention.

    :param pred: First bounding box as [x1, y1, x2, y2].
    :param gt: Second bounding box as [x1, y1, x2, y2].
    :return: Intersection area divided by union area.
    """
    # Corners of the overlapping rectangle
    left = np.maximum(gt[0], pred[0])
    top = np.maximum(gt[1], pred[1])
    right = np.minimum(gt[2], pred[2])
    bottom = np.minimum(gt[3], pred[3])

    # Clamp at zero so that disjoint boxes yield an empty intersection
    overlap_w = np.maximum(right - left + 1.0, 0.0)
    overlap_h = np.maximum(bottom - top + 1.0, 0.0)
    intersection = overlap_w * overlap_h

    # Union = sum of both areas minus the part counted twice
    pred_area = (pred[2] - pred[0] + 1.0) * (pred[3] - pred[1] + 1.0)
    gt_area = (gt[2] - gt[0] + 1.0) * (gt[3] - gt[1] + 1.0)
    union = pred_area + gt_area - intersection

    return intersection / union

In [None]:
model = YOLO("yolov8m.pt")
start_frame = 0

# Forward slashes work on every OS (Windows included), unlike a backslash-separated path.
cap = cv2.VideoCapture('data/S03/c010/vdo.avi')
if not cap.isOpened():
    raise RuntimeError("Could not open video 'data/S03/c010/vdo.avi'")

try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

    # detections[k] holds the boxes found in frame start_frame + k
    detections = []
    for _ in tqdm.tqdm(range(start_frame, n_frames)):
        ret, frame = cap.read()
        if not ret:
            # The reported frame count may exceed what can actually be decoded;
            # stop instead of passing a missing frame to the detector.
            break
        detections.append(detect_objects(model, frame))
finally:
    cap.release()

In [None]:
def get_flow_region(prev_roi, current_roi):
    """
    Estimate how far a region moved between two frames using dense optical flow.

    Both crops are resized to 64x64 before the flow is computed. The returned
    displacement is the median of the flow field (robust to outliers and able to
    represent motion in either direction), rescaled from the 64x64 grid back to
    the pixel units of ``prev_roi`` so it can be added to box coordinates.

    :param prev_roi: Crop of the region in the previous frame (image array accepted by ``cv2.resize``).
    :param current_roi: Crop of the same region in the current frame.
    :return: ``(dx, dy)`` displacement in ``prev_roi`` pixels; ``(0, 0)`` if either crop is empty.
    """
    # Empty crops (zero height or width) cannot be resized; report no motion
    if 0 in prev_roi.shape[:2] or 0 in current_roi.shape[:2]:
        return 0, 0

    flow_size = 64
    roi_height, roi_width = prev_roi.shape[:2]

    prev_small = cv2.resize(prev_roi, (flow_size, flow_size)).astype(float) / 255
    current_small = cv2.resize(current_roi, (flow_size, flow_size)).astype(float) / 255

    u, v, _ = pyflow.coarse2fine_flow(prev_small, current_small, 0.01, 0.5, 30, 7, 1, 30)

    # Flow is expressed in pixels of the resized crop: scale each axis back to the ROI size
    dx = float(np.median(u)) * roi_width / flow_size
    dy = float(np.median(v)) * roi_height / flow_size

    return dx, dy


def track_objects(detections, start_frame=0, optical_flow=True):
    """
    Track objects across frames with greedy IoU association, optionally
    motion-compensated with optical flow.

    For every frame, each detection is compared against the boxes of the tracks
    that were alive in the previous frame. With ``optical_flow`` enabled, each
    previous box is first moved by the displacement estimated inside it, and the
    detection is compared against that predicted position. The box stored for a
    track is always the actual detection, never a shifted copy.

    :param detections: List (one entry per frame) of lists of ``[x1, y1, x2, y2]`` boxes.
    :param start_frame: Index of the video frame that ``detections[0]`` belongs to.
    :param optical_flow: If True, read the video and use optical flow to predict
        where each track moved before matching. If False, the video is not opened.
    :return: ``(tracking, tracking_video)``. ``tracking`` maps the 1-based frame number to a
        list of ``[x1, y1, x2, y2, track_id]``; ``tracking_video`` is a list (one entry per
        frame) of dicts mapping track id to box.
    """
    iou_threshold = 0.3  # Minimum IoU to consider a match

    tracking = dict()
    tracking_video = []

    # Nothing to track
    if not detections:
        return tracking, tracking_video

    # Initialize one track per detection of the first frame
    active_objects = {track_id: box for track_id, box in enumerate(detections[0])}
    next_track_id = len(active_objects)

    tracking_video.append(active_objects)
    tracking[start_frame + 1] = [value + [key] for key, value in active_objects.items()]

    # Frames are only needed for optical flow
    cap = None
    prev_frame = None
    if optical_flow:
        cap = cv2.VideoCapture('data/S03/c010/vdo.avi')
        # Align the video with detections[0]
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        ret, prev_frame = cap.read()
        if not ret:
            prev_frame = None

    try:
        # Iterate over each remaining frame
        for idx, current_detections in tqdm.tqdm(enumerate(detections[1:], start=1), total=len(detections) - 1):
            # Where each existing track is expected to be in this frame
            predicted_objects = active_objects

            if optical_flow:
                ret, current_frame = cap.read()
                if not ret:
                    current_frame = None

                # Flow needs two consecutive frames; otherwise fall back to "no motion"
                if prev_frame is not None and current_frame is not None:
                    predicted_objects = {}
                    for track_id, box in active_objects.items():
                        dx, dy = get_flow_region(
                            prev_frame[box[1]:box[3], box[0]:box[2]],
                            current_frame[box[1]:box[3], box[0]:box[2]],
                        )
                        predicted_objects[track_id] = [box[0] + dx, box[1] + dy, box[2] + dx, box[3] + dy]

                prev_frame = current_frame

            # Match current detections to the predicted track positions based on IoU
            current_objects = {}
            for bbox_curr in current_detections:
                best_id, max_iou = None, 0

                for track_id, bbox_pred in predicted_objects.items():
                    # A track can be continued by one detection only; otherwise a
                    # later detection would overwrite (and drop) an earlier one.
                    if track_id in current_objects:
                        continue
                    iou = voc_iou(bbox_curr, bbox_pred)
                    if iou > max_iou:
                        max_iou, best_id = iou, track_id

                if max_iou > iou_threshold:
                    current_objects[best_id] = bbox_curr
                else:
                    # No sufficiently overlapping track: start a new one
                    current_objects[next_track_id] = bbox_curr
                    next_track_id += 1

            # Update tracking information for the next frame
            active_objects = current_objects
            tracking_video.append(active_objects)
            tracking[idx + start_frame + 1] = [value + [key] for key, value in active_objects.items()]
    finally:
        if cap is not None:
            cap.release()

    return tracking, tracking_video  # Return the tracking information

# Baseline run: association without optical flow
tracking, tracking_video = track_objects(
    detections,
    start_frame=start_frame,
    optical_flow=False,
)
# Same detections, this time with optical flow enabled
tracking_of, tracking_video_of = track_objects(
    detections,
    start_frame=start_frame,
    optical_flow=True,
)


In [None]:
def _draw_tracks(frame, tracks, history, color, palette):
    """
    Draw the boxes, ids and centre trails of ``tracks`` on ``frame``.

    :param frame: Image to draw on.
    :param tracks: Dict mapping object id to ``[x1, y1, x2, y2]`` for this frame.
    :param history: Dict mapping object id to the list of past box centres; extended in place.
    :param color: Colour tuple used for everything drawn by this call.
    :param palette: Dict mapping object id to a random colour; filled in place for new ids
        (kept available for per-id colouring, not used for drawing here).
    :return: The frame with the drawings applied.
    """
    for obj_id, bbox in tracks.items():
        # Assign a unique color if new object
        if obj_id not in palette:
            palette[obj_id] = (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))

        # Draw the bounding box and its id
        start_point = (int(bbox[0]), int(bbox[1]))
        end_point = (int(bbox[2]), int(bbox[3]))
        frame = cv2.rectangle(frame, start_point, end_point, color, 2)
        frame = cv2.putText(frame, str(obj_id), start_point, cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA)

        # Update tracking history with the box centre
        center_position = ((start_point[0] + end_point[0]) // 2, (start_point[1] + end_point[1]) // 2)
        trail = history.setdefault(obj_id, [])
        trail.append(center_position)

        # Draw the trail as consecutive segments through all historical positions
        for point_a, point_b in zip(trail, trail[1:]):
            cv2.line(frame, point_a, point_b, color, 2)

    return frame


# The writer does not create missing folders
os.makedirs('tracking/trackerof_joint', exist_ok=True)

cap = cv2.VideoCapture('data/S03/c010/vdo.avi')

# Store tracking history for each object
tracking_history = {}
tracking_history_of = {}
# Store colors for each object ID
colors = {}

# Explicit codec instead of -1, whose behaviour depends on the platform/backend
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

try:
    # One output file per chunk of 100 frames
    for start in tqdm.tqdm(range(start_frame, n_frames, 100)):

        cap.set(cv2.CAP_PROP_POS_FRAMES, start)
        video = cv2.VideoWriter(f'tracking/trackerof_joint/tracking_{start}.mp4', fourcc, fps, (width, height), True)

        try:
            for i in range(start, min(start + 100, n_frames)):
                ret, frame = cap.read()
                if not ret:
                    break

                # Stop if the tracking results cover fewer frames than the video
                result_idx = i - start_frame
                if result_idx >= len(tracking_video) or result_idx >= len(tracking_video_of):
                    break

                # Tracker without optical flow: (0, 0, 255), i.e. red in OpenCV's BGR order
                frame = _draw_tracks(frame, tracking_video[result_idx], tracking_history, (0, 0, 255), colors)
                # Tracker with optical flow: (255, 0, 0), i.e. blue in OpenCV's BGR order
                frame = _draw_tracks(frame, tracking_video_of[result_idx], tracking_history_of, (255, 0, 0), colors)

                video.write(frame)
        finally:
            # Close every chunk's writer, not only the last one
            video.release()
finally:
    cap.release()

In [None]:
# Placeholder values for <conf>, <x>, <y>, <z> since these are not provided
conf, x, y, z = 1, -1, -1, -1  # Using -1 to indicate unknown or not applicable

# Convert data to the required gt.txt format
gt_content = []
for frame, bboxes in tracking_of.items():
    for bbox in bboxes:
        bb_left, bb_top, bb_right, bb_bottom, obj_id = map(int, bbox)
        bb_width = bb_right - bb_left
        bb_height = bb_bottom - bb_top
        gt_content.append(f"{frame}, {obj_id}, {bb_left}, {bb_top}, {bb_width}, {bb_height}, {conf}, {x}, {y}, {z}")

# Join all entries to form the final content for the gt.txt file
gt_text = "\n".join(gt_content)

file_path = 'TrackEval/data/trackers/mot_challenge/week3-train/yolotracker/data/week3-01.txt'  # Define the file path
with open(file_path, 'w') as f:
    f.write(gt_text)

!python TrackEval/scripts/run_mot_challenge.py --BENCHMARK week3 --SPLIT_TO_EVAL train --TRACKERS_TO_EVAL yolotracker --METRICS HOTA Identity --DO_PREPROC False