In [1]:
import cv2
import numpy as np
from ultralytics import YOLO
import tqdm
import matplotlib.pyplot as plt

In [2]:
def detect_objects(model, image):
    # Perform inference
    results = model(image, verbose=False)
    # Extract bounding box coordinates, labels, and confidence scores
    return [box.xyxy[0].to(int).tolist() for box in results[0].boxes if box.cls == 2]

In [3]:
def voc_iou(pred, gt):
    """
    Calculate IoU between detect box and gt boxes.
    :param pred: Predicted bounding box coordinates [x1, y1, x2, y2].
    :param gt: Ground truth bounding box coordinates [[x1, y1, x2, y2]].
    """
    # compute overlaps
    # intersection
    ixmin = np.maximum(gt[0], pred[0])
    iymin = np.maximum(gt[1], pred[1])
    ixmax = np.minimum(gt[2], pred[2])
    iymax = np.minimum(gt[3], pred[3])
    iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
    ih = np.maximum(iymax - iymin + 1.0, 0.0)
    inters = iw * ih

    # union
    uni = (
        (pred[2] - pred[0] + 1.0) * (pred[3] - pred[1] + 1.0)
        + (gt[2] - gt[0] + 1.0) * (gt[3] - gt[1] + 1.0)
        - inters
    )

    return inters / uni

In [4]:
model = YOLO("yolov8m.pt")
start_frame = 0
cap = cv2.VideoCapture(r'data\S03\c010\vdo.avi')
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
detections = [detect_objects(model, cap.read()[1]) for _ in tqdm.tqdm(range(start_frame, n_frames))]
cap.release()

  3%|▎         | 57/2141 [01:47<1:08:38,  1.98s/it]

KeyboardInterrupt: 

In [6]:
def get_flow_region(prev_roi, current_roi):

    # Convert frames to grayscale
    prev_gray = cv2.cvtColor(prev_roi, cv2.COLOR_BGR2GRAY)
    current_gray = cv2.cvtColor(current_roi, cv2.COLOR_BGR2GRAY)

    # Detect Harris corners in the ROI
    # prev_points = cv2.goodFeaturesToTrack(prev_gray, maxCorners=100, qualityLevel=0.2, minDistance=3)
    # if prev_points is None:
    #     return np.array([])

    h, w = prev_gray.shape
    y, x = np.mgrid[0:h, 0:w].astype(np.float32)
    prev_points = np.stack((x, y), axis=-1).reshape(-1, 1, 2)

    # Calculate optical flow (Lucas-Kanade)
    current_points, status, error = cv2.calcOpticalFlowPyrLK(prev_gray, current_gray, prev_points, None)
    # Select good points (where status == 1)
    good_new = current_points[status == 1]
    good_old = prev_points[status == 1]
    # Compute the flow vectors
    flow_vectors = good_new - good_old
    return flow_vectors
    

def track_objects(detections, start_frame=0, optical_flow=True):
    """
    Tracking of objects across frames using IoU and optical flow.
    detections: list of lists containing detected bounding boxes for each frame.
    """
    active_objects = {}  # Maps object ID to last seen bounding box
    next_track_id = 0
    iou_threshold = 0.3  # Minimum IoU to consider a match

    tracking = dict()
    tracking_video = []
    cap = cv2.VideoCapture(r'data\S03\c010\vdo.avi')
    _, prev_frame = cap.read()

    # Initialize tracks with the first frame detections
    for box in detections[0]:
        active_objects[next_track_id] = box
        next_track_id += 1

    tracking_video.append(active_objects)
    tracking[1] = [value + [key] for key, value in active_objects.items()]

    # Iterate over each frame
    for idx, current_detections in tqdm.tqdm(enumerate(detections[1:], start=1), total=len(detections[1:])):
        _, current_frame = cap.read()

        updated_tracks = {}
        for track_id, box in active_objects.items():
            # Estimate new position using optical flow (or any other means)
            flow_region = get_flow_region(prev_frame[box[1]:box[3], box[0]:box[2]], current_frame[box[1]:box[3], box[0]:box[2]])
            if flow_region.size > 0 and optical_flow:
                dx, dy = np.max(flow_region, axis=0)
                new_box = np.round([box[0] + dx, box[1] + dy, box[2] + dx, box[3] + dy]).astype(int)
            else:
                new_box = box  # No flow information, keep the old box

            updated_tracks[track_id] = new_box  # Update with new position

        # Match current detections to updated tracks based on IoU
        current_objects = {}
        for bbox_curr in current_detections:
            best_id, max_iou = None, 0
            for track_id, bbox_prev in updated_tracks.items():
                iou = voc_iou(bbox_curr, bbox_prev)
                if iou > max_iou:
                    max_iou, best_id = iou, track_id

            if max_iou > iou_threshold:
                current_objects[best_id] = bbox_curr
            else:
                current_objects[next_track_id] = bbox_curr
                next_track_id += 1

        # Update tracking information for the next frame
        active_objects = current_objects
        tracking_video.append(active_objects)
        tracking[idx+start_frame+1] = [value + [key] for key, value in active_objects.items()]

        prev_frame = current_frame.copy()  # Update the frame for the next iteration

    return tracking, tracking_video # Return the tracking information

tracking, tracking_video = track_objects(detections, start_frame, optical_flow=False)
tracking_of, tracking_video_of = track_objects(detections, start_frame, optical_flow=True)


  0%|          | 0/266 [00:00<?, ?it/s]

100%|██████████| 266/266 [09:13<00:00,  2.08s/it]
100%|██████████| 266/266 [08:35<00:00,  1.94s/it]


In [7]:
cap = cv2.VideoCapture('data/S03/c010/vdo.avi')

# Store tracking history for each object
tracking_history = {}
tracking_history_of = {}
# Store colors for each object ID
colors = {}

for start in tqdm.tqdm(range(start_frame, n_frames//8, 100)):

    cap.set(cv2.CAP_PROP_POS_FRAMES, start)
    video = cv2.VideoWriter(f'tracking/yolotrackerof/tracking_{start}.mp4', -1, fps, (width, height), True)

    for i in range(start, min(start + 100, n_frames//8)):
        ret, frame = cap.read()
        if not ret:
            break

        # Draw detected bounding boxes and tracking lines
        for obj_id, bbox in tracking_video[i-start_frame].items():
            # Assign a unique color if new object
            if obj_id not in colors:
                colors[obj_id] = (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))

            # Draw the bounding box
            start_point = (int(bbox[0]), int(bbox[1]))
            end_point = (int(bbox[2]), int(bbox[3]))
            frame = cv2.rectangle(frame, start_point, end_point, (0,255,0), 1)
            frame = cv2.putText(frame, str(obj_id), start_point, cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 1, cv2.LINE_AA)
            
            # Update tracking history
            center_position = ((start_point[0] + end_point[0]) // 2, (start_point[1] + end_point[1]) // 2)
            if obj_id not in tracking_history:
                tracking_history[obj_id] = [center_position]
            else:
                tracking_history[obj_id].append(center_position)
            
            # Draw tracking line (polyline for all historical positions)
            if len(tracking_history[obj_id]) > 1:
                for j in range(1, len(tracking_history[obj_id])):
                    cv2.line(frame, tracking_history[obj_id][j - 1], tracking_history[obj_id][j], (0,255,0), 2)

        # Draw detected bounding boxes and tracking lines
        for obj_id, bbox in tracking_video_of[i-start_frame].items():
            # Assign a unique color if new object
            if obj_id not in colors:
                colors[obj_id] = (np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))

            # Draw the bounding box
            start_point = (int(bbox[0]), int(bbox[1]))
            end_point = (int(bbox[2]), int(bbox[3]))
            frame = cv2.rectangle(frame, start_point, end_point, (255,0,0), 1)
            frame = cv2.putText(frame, str(obj_id), start_point, cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0), 1, cv2.LINE_AA)
            
            # Update tracking history
            center_position = ((start_point[0] + end_point[0]) // 2, (start_point[1] + end_point[1]) // 2)
            if obj_id not in tracking_history_of:
                tracking_history_of[obj_id] = [center_position]
            else:
                tracking_history_of[obj_id].append(center_position)
            
            # Draw tracking line (polyline for all historical positions)
            if len(tracking_history_of[obj_id]) > 1:
                for j in range(1, len(tracking_history_of[obj_id])):
                    cv2.line(frame, tracking_history_of[obj_id][j - 1], tracking_history_of[obj_id][j], (255,0,0), 2)

        video.write(frame)

video.release()
cap.release()

100%|██████████| 3/3 [00:13<00:00,  4.43s/it]
