In [2]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import motmetrics
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
import motmetrics

# import some common libraries
import numpy as np
import os, json, cv2, random

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

  from .autonotebook import tqdm as notebook_tqdm


Code to compute IOU

In [4]:
def compute_iou(bboxA, bboxB):
    # Code provided by teacher in M1 subject
    # compute the intersection over union of two bboxes
    
    # Format of the bboxes is [xtl, ytl, xbr, ybr, ...], where tl and br
    # indicate top-left and bottom-right corners of the bbox respectively.

    # determine the coordinates of the intersection rectangle
    xA = max(bboxA[0], bboxB[0])
    
    yA = max(bboxA[1], bboxB[1])
    xB = min(bboxA[2], bboxB[2])
    yB = min(bboxA[3], bboxB[3])
    
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
 
    # compute the area of both bboxes
    bboxAArea = (bboxA[3] - bboxA[1] + 1) * (bboxA[2] - bboxA[0] + 1)
    bboxBArea = (bboxB[3] - bboxB[1] + 1) * (bboxB[2] - bboxB[0] + 1)
    
    iou = interArea / float(bboxAArea + bboxBArea - interArea)
    
    # return the intersection over union value
    return iou




Convert annotations.xml to gt.txt

In [50]:

import xml.etree.ElementTree as ET
def parse_xml(file_path):
    """
    Parses an XML file and extracts bounding box information for each frame and track.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        tuple: A tuple containing two dictionaries, `tracks` and `frames`.
            `tracks` contains information for each track, with the track IDs as keys and the box information
            for each frame as values.
            `frames` contains information for each frame, with the frame numbers as keys and a list of boxes as
            values.
    """
    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    frames = {}

    # Iterate over the tracks and extract their bounding box information
    for track in root.findall(".//track[@label='car']"):
        track_id = track.get('id')
        for box in track.findall(".//box"):
            box_frame = int(box.get('frame'))
            xtl, ytl, xbr, ybr = map(float, [box.get('xtl'), box.get('ytl'), box.get('xbr'), box.get('ybr')])
            outside, occluded, keyframe = map(int, [box.get('outside'), box.get('occluded'), box.get('keyframe')])
            parked = box.find(".//attribute[@name='parked']").text == 'true'

            # Add the box to the list of boxes for this frame
            if box_frame not in frames:
                frames[box_frame] = []

            frames[box_frame].append({
                'xtl': xtl,
                'ytl': ytl,
                'xbr': xbr,
                'ybr': ybr,
                'track_id': track_id,
                'occluded': occluded
            })

    return frames


frames_gt = parse_xml('annotations.xml')
frames_gt = dict(sorted(frames_gt.items(), key=lambda x: x[0]))
# Open the output file for writing
output_file = "output_annotations.txt"
output_fp = open(output_file, "w")

for frame in frames_gt:
    for bbox in frames_gt[frame]:
        x, y, z = -1, -1, -1  # No information about x, y, z
        line = "{},{},{},{},{},{},{},{},{},{}\n".format(
            str(int(frame)+1),
            bbox['track_id'],
            bbox['xtl'],
            bbox['ytl'],
            bbox['xbr'] - bbox['xtl'],
            bbox['ybr'] - bbox['ytl'],
            1,
            x,
            y,
            z
        )

        output_fp.write(line)

#Release the output file
output_fp.close()

## Compute object tracking using Detectron2 (Mask R-CNN, Retinanet, Faster R-CNN)

In [None]:
# Load the video
video = cv2.VideoCapture('vdo.avi')


#Load configuration
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
# Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
cfg.MODEL.WEIGHTS = 'model_final.pth'
predictor = DefaultPredictor(cfg)


# Initialize variables

object_tracker = {}
current_id = 0
frame_num = 1

#Initialize video array
video_output = []
# Open the output file for writing
output_file = "output_faster-rcnn-finetuned.txt"
output_fp = open(output_file, "w")
while True:
    # Read the current frame from the video
    
    ret, frame = video.read()

    # Stop if there are no more frames
    if not ret:
        break

    # Detect objects in the current frame using the pretrained model
    outputs = predictor(frame) 
    current_objects = outputs['instances']

    instances = outputs["instances"].to("cpu")
    boxes = instances.pred_boxes.tensor.numpy()
    print("Boxes", boxes)
    classes = instances.pred_classes.numpy()
    print("Classes", classes)
    #print(classes)
    scores = instances.scores.numpy()
    num_boxes = boxes.shape[0]
    # Initialize a new dictionary to store the detected objects in the current frame
    new_object_tracker = {}

    for i in range(num_boxes):
        box = boxes[i]
        score = scores[i]
        class_object = classes[i]
        # Filter out low-scoring objects
        if score < 0.5:
            continue

        # if class_object != 2:
        #     continue

        # Assign a new ID to each new detected object
        current_id += 1
        # Try to match the detected object with a previously tracked object based on IoU
        best_match_id = None
        best_match_iou = 0
        for object_id, object_box in object_tracker.items():
            iou = compute_iou(box, object_box)
            if iou > best_match_iou:
                best_match_iou = iou
                best_match_id = object_id
        # If the best match has IoU > 0.4, assign the same ID to the detected object
        if best_match_id is not None and best_match_iou > 0.4:
            new_object_tracker[best_match_id] = box
            del object_tracker[best_match_id]
        else:
            new_object_tracker[current_id] = box

        
    # Update the object tracker for the current frame
    object_tracker = new_object_tracker
    # Visualize the tracked objects in the current frame
    tracked_boxes = []
    tracked_ids = []
    v = Visualizer(frame[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
    #out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    index_color = 0
    for object_id, object_box in object_tracker.items():
        tracked_boxes.append(object_box)
        tracked_ids.append(object_id)
        out = v.draw_text(f"{object_id}", (object_box[0], object_box[1]), font_size=8)
        out = v.draw_box(object_box, )
    
    result = out.get_image()[:, :, ::-1]
    video_output.append(Image.fromarray(result))
    # Display the current frame with the tracked objects
    cv2.imshow("Object tracking", result)
    if cv2.waitKey(50) & 0xFF == ord('q'):
        break

    # Write the tracker output to the output file in the MOT16 format
    
    for track, bbox in object_tracker.items():
    
        x, y, z = -1, -1, -1  # No information about x, y, z
        line = "{},{},{},{},{},{},{},{},{},{}\n".format(
            frame_num,
            track,
            bbox[0],
            bbox[1],
            bbox[2] - bbox[0],
            bbox[3] - bbox[1],
            1,
            x,
            y,
            z
        )
        output_fp.write(line)

    frame_num += 1

# Release the video and the output file and close all windows
video.release()
cv2.destroyAllWindows()
output_fp.close()

Create Video

In [45]:
from PIL import Image
# Set up the video parameters
video_name = 'my_video.mp4'
fps = 30

# Get the shape of the first image to set the size of the video
height, width, _ = cv2.cvtColor(np.array(video_output[0]), cv2.COLOR_RGB2BGR).shape
size = (width, height)

# Create the video writer object
video_writer = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)

# Write each frame to the video
for image in video_output:
    video_writer.write(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))

# Release the video writer
video_writer.release()


## Calculate metrics

Using py-motmetrics

In [33]:
def motMetricsEnhancedCalculator(gtSource, tSource):
  # import required packages
  import motmetrics as mm
  import numpy as np
  
  # load ground truth
  gt = np.loadtxt(gtSource, delimiter=',')

  # load tracking output
  t = np.loadtxt(tSource, delimiter=',')

  # Create an accumulator that will be updated during each frame
  acc = mm.MOTAccumulator(auto_id=True)

  # Max frame number maybe different for gt and t files
  for frame in range(int(gt[:,0].max())):
    frame += 1 # detection and frame numbers begin at 1

    # select id, x, y, width, height for current frame
    # required format for distance calculation is X, Y, Width, Height \
    # We already have this format
    gt_dets = gt[gt[:,0]==frame,1:6] # select all detections in gt
    t_dets = t[t[:,0]==frame,1:6] # select all detections in t

    C = mm.distances.iou_matrix(gt_dets[:,1:], t_dets[:,1:], \
                                max_iou=0.5) # format: gt, t

    # Call update once for per frame.
    # format: gt object ids, t object ids, distance
    acc.update(gt_dets[:,0].astype('int').tolist(), \
              t_dets[:,0].astype('int').tolist(), C)

  mh = mm.metrics.create()

  summary = mh.compute(acc, metrics=['num_frames', 'idf1', 'idp', 'idr', \
                                     'recall', 'precision', 'num_objects', \
                                     'mostly_tracked', 'partially_tracked', \
                                     'mostly_lost', 'num_false_positives', \
                                     'num_misses', 'num_switches', \
                                     'num_fragmentations', 'mota', 'motp' \
                                    ], \
                      name='acc')

  strsummary = mm.io.render_summary(
      summary,
      #formatters={'mota' : '{:.2%}'.format},
      namemap={'idf1': 'IDF1', 'idp': 'IDP', 'idr': 'IDR', 'recall': 'Rcll', \
               'precision': 'Prcn', 'num_objects': 'GT', \
               'mostly_tracked' : 'MT', 'partially_tracked': 'PT', \
               'mostly_lost' : 'ML', 'num_false_positives': 'FP', \
               'num_misses': 'FN', 'num_switches' : 'IDsw', \
               'num_fragmentations' : 'FM', 'mota': 'MOTA', 'motp' : 'MOTP',  \
              }
  )
  print(strsummary)

In [55]:
motMetricsEnhancedCalculator('output_annotations.txt', \
                             'output.txt')
motMetricsEnhancedCalculator('output_retinanet.txt', \
                             'output.txt')
motMetricsEnhancedCalculator('output_faster-rcnn.txt', \
                             'output.txt')

     num_frames      IDF1       IDP       IDR      Rcll     Prcn     GT  MT  PT  ML    FP    FN  IDsw   FM      MOTA      MOTP
acc        2141  0.526952  0.580971  0.482125  0.609706  0.73471  21594  16  33   7  4754  8428   234  422  0.378716  0.173958
     num_frames      IDF1       IDP       IDR      Rcll      Prcn     GT   MT  PT  ML    FP  FN  IDsw  FM      MOTA      MOTP
acc        2141  0.618829  0.503627  0.802365  0.994844  0.624442  11248  242   8  14  6730  58    27   9  0.394115  0.094197
     num_frames      IDF1      IDP       IDR      Rcll     Prcn     GT   MT  PT   ML    FP    FN  IDsw  FM      MOTA      MOTP
acc        2141  0.753288  0.75106  0.755529  0.936791  0.93125  17814  240  70  296  1232  1126   167  94  0.858258  0.124866


Using TrackEval

In [53]:
%run TrackEval/scripts/run_mot_challenge.py --BENCHMARK MASK-RCNN --DO_PREPROC False
%run TrackEval/scripts/run_mot_challenge.py --BENCHMARK RetinaNet --DO_PREPROC False


Eval Config:
USE_PARALLEL         : False                         
NUM_PARALLEL_CORES   : 8                             
BREAK_ON_ERROR       : True                          
RETURN_ON_ERROR      : False                         
LOG_ON_ERROR         : c:\Users\hicha\Desktop\M6_week3\mcv-m6-2023-team4\week3\TrackEval\error_log.txt
PRINT_RESULTS        : True                          
PRINT_ONLY_COMBINED  : False                         
PRINT_CONFIG         : True                          
TIME_PROGRESS        : True                          
DISPLAY_LESS_PROGRESS : False                         
OUTPUT_SUMMARY       : True                          
OUTPUT_EMPTY_CLASSES : True                          
OUTPUT_DETAILED      : True                          
PLOT_CURVES          : True                          

MotChallenge2DBox Config:
PRINT_CONFIG         : True                          
GT_FOLDER            : c:\Users\hicha\Desktop\M6_week3\mcv-m6-2023-team4\week3\TrackEval\data/gt/mo

<Figure size 640x480 with 0 Axes>