In [1]:
#import necessary modules
from src.sort import Sort
from dataset import *
from PIL import Image
import cv2
import pickle
import time
import sys
import io
import csv
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import xml.etree.ElementTree as ET
import motmetrics as mm
import numpy as np
  

  from .autonotebook import tqdm as notebook_tqdm


## Instantiate SORT and load necessary files

In [2]:
# Create the SORT tracker instance; the tracking loop calls its update() once per frame.
mot_kalman_tracker = Sort()
# Frame range to predict/visualise: used to slice dataset_dicts and to filter records by image_id.
frame_start = 0
frame_end = 2141

# Path of the source video sequence.
file_path = "../datasets/AICity_data/train/S03/c010/vdo.avi"
# Accumulators filled by the tracking loop: one tracker output per frame and the summed update time.
out = []
total_time = 0
# Per-frame records; the tracking loop reads the "image_id" and "file_name" keys of each one.
# NOTE(review): get_dicts comes from the wildcard `dataset` import; the meaning of its arguments is not visible here — confirm.
dataset_dicts = get_dicts('all', pretrained=True)

100%|██████████| 2141/2141 [03:38<00:00,  9.81it/s]


In [3]:
def read_pkl_bboxes(input_pkl_path):
  """Load predicted bounding boxes from a pkl file and convert them to the SORT format.

  Args:
      input_pkl_path: Input path of the pkl file. The file holds a two position
          array [bboxes_pred, bboxes_gt]; only position 0 (the predictions) is read.
          bboxes_pred maps a frame key to a list of dicts with the keys
          'xtl', 'ytl', 'xbr' and 'ybr'.
          This file can be generated saving bbox_pred and bbox_gt from task 1.1

  Returns:
      pred_bboxes_proper_format: Predicted bboxes in the SORT format. One entry per
          frame key, in the iteration order of bboxes_pred; each entry is a list of
          boxes defined as [xtl, ytl, xbr, ybr].
  """
  # NOTE(security): pickle.load can run arbitrary code, so only open trusted files.
  with open(input_pkl_path, 'rb') as handle:
      detections_dict = pickle.load(handle)

  bboxes_pred = detections_dict[0]

  # The ground truth stored at position 1 is deliberately not touched: the result
  # only depends on the predictions, so a frame missing from gt must not fail here.
  pred_bboxes_proper_format = []
  for detection_index in bboxes_pred:
      pred_bboxes_proper_format.append([
          [bbox['xtl'], bbox['ytl'], bbox['xbr'], bbox['ybr']]
          for bbox in bboxes_pred[detection_index]
      ])

  return pred_bboxes_proper_format

def read_csv_bboxes(input_txt_path):
  """Load bounding box predictions from a comma separated txt file in the MOT challenge format.

  Each row is read as: frame, id, left, top, width, height, ... (remaining columns
  and the tracking id are ignored). Rows of the same frame must be contiguous.
  This file can be generated with the outputs of 2.1.

  Args:
      input_txt_path: Input path of the txt file containing the predicted bboxes.

  Returns:
      pred_bboxes_proper_format: Predicted bboxes in the SORT format (List of lists
          where each box is defined as [xtl, ytl, xbr, ybr]). Position i holds the
          boxes of frame i+1 (MOT frames start at 1); frames without any row get an
          empty list so that the result can be indexed by frame position.
  """
  pred_bboxes_proper_format = []
  curr_bboxes = []
  prev_frame = None
  with open(input_txt_path, 'r', newline='') as file:
    for row in csv.reader(file):
      if not row:
        continue  # tolerate blank lines
      # The frame may be written as "12" or "12.0"; compare it as an integer.
      curr_frame = int(float(row[0]))
      if prev_frame is None:
        # Frames before the first detection have no boxes.
        pred_bboxes_proper_format.extend([] for _ in range(1, curr_frame))
      elif curr_frame != prev_frame:
        pred_bboxes_proper_format.append(curr_bboxes)
        curr_bboxes = []
        # Frames skipped between two detections have no boxes either.
        pred_bboxes_proper_format.extend([] for _ in range(prev_frame + 1, curr_frame))
      prev_frame = curr_frame
      left, top, width, height = (float(value) for value in row[2:6])
      # Convert [left, top, width, height] into corner coordinates.
      curr_bboxes.append([left, top, width + left, height + top])
  # Flush the boxes of the last frame, which no later row triggers.
  if prev_frame is not None:
    pred_bboxes_proper_format.append(curr_bboxes)
  return pred_bboxes_proper_format

In [7]:
# Examples of loading the detections with each of the two readers.
# Both calls assign to pred_bboxes_proper_format, so the txt result replaces the pkl one
# and is the one the following cells work with.
input_pkl_path = "faster05.pkl"
pred_bboxes_proper_format = read_pkl_bboxes(input_pkl_path)
input_txt_path = "output_faster-rcnn.txt"
pred_bboxes_proper_format = read_csv_bboxes(input_txt_path)


## Predict and visualise results

In [None]:

# Override the frame range defined earlier so that only a short clip is tracked and rendered.
frame_end=10
frame_start = 0
# File name of the animated GIF that the tracking loop saves once all frames are rendered.
output_gif_filename = 'retinanettest.gif'

In [9]:
def fig2img(fig):
    """Render a figure into memory and hand it back as an opened image.

    fig: object exposing savefig(file_like), e.g. a matplotlib figure.
    Returns whatever Image.open yields for the rendered bytes.
    """
    rendered = io.BytesIO()
    fig.savefig(rendered)
    # Rewind, otherwise the reader would start after the last byte written by savefig.
    rendered.seek(0)
    return Image.open(rendered)


bboxes_gt = {}
bboxes_pred = {}
video_frames_out =[]
index = frame_start

colours = np.random.rand(32,3) #used only for display

# Run the tracker frame by frame and render every processed frame into an animated GIF.
# `index` is the position inside pred_bboxes_proper_format that matches the current record.
# NOTE(review): mot_kalman_tracker, out and total_time are created in an earlier cell and are
# mutated here, so re-running only this cell keeps accumulating state — confirm this is intended.
for index, frame_record in enumerate(dataset_dicts[frame_start:frame_end], start=frame_start):
    image_id = frame_record["image_id"]
    if image_id<frame_start or image_id>frame_end:
        print("Skipping frame ", image_id)
        continue

    curr_detection = np.asarray(pred_bboxes_proper_format[index], dtype=float)
    if curr_detection.size == 0:
        # A frame without detections must still be a 2-D array so it can be indexed by column.
        # NOTE(review): assumes Sort.update accepts an empty (0, 4) array — confirm against src.sort.
        curr_detection = np.empty((0, 4))

    curr_frame = cv2.imread(frame_record["file_name"])
    if curr_frame is None:
        # cv2.imread returns None instead of raising when the file cannot be read.
        raise FileNotFoundError("Could not read frame image: {}".format(frame_record["file_name"]))
    curr_frame = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2RGB)

    # Only the tracker update is timed.
    start_time = time.time()
    trackers = mot_kalman_tracker.update(curr_detection)
    cycle_time = time.time() - start_time
    total_time += cycle_time

    out.append(trackers)

    #save gif
    fig, ax = plt.subplots(1, 1, dpi=150)

    # NOTE(review): the title is fixed and does not depend on which detector file was loaded.
    ax.set_title('Faster R-CNN detections')
    ax.imshow(curr_frame)
    ax.axis('off')

    # Raw detections are drawn in white.
    for det_xtl, det_ytl, det_xbr, det_ybr in curr_detection[:, :4]:
        ax.add_patch(patches.Rectangle((det_xtl, det_ytl), det_xbr-det_xtl, det_ybr-det_ytl, fill=False, lw=3, ec=(1,1,1)))

    # Tracks are drawn with one colour per id. Plain Python ints are used (not uint32)
    # so that `ytl - 30` can go negative and be clamped to 0 instead of wrapping around.
    for track in trackers:
        xtl, ytl, xbr, ybr, track_id = (int(value) for value in track[:5])
        ax.add_patch(patches.Rectangle((xtl, ytl), xbr-xtl, ybr-ytl, fill=False, lw=3, ec=colours[track_id % len(colours), :]))
        ax.text((xtl+xbr)/2, max(0, ytl-30), "ID: "+str(track_id), horizontalalignment='center', verticalalignment='center', fontsize=12, color='white')

    frame_out = fig2img(fig)
    video_frames_out.append(frame_out)
    # Close the figure to avoid keeping one open figure per frame in memory.
    plt.close(fig)

if video_frames_out:
    video_frames_out[0].save(output_gif_filename, save_all=True, append_images=video_frames_out[1:], duration=30, loop=0)
else:
    print("No frames were rendered; skipping", output_gif_filename)
    



In [13]:
#read gt
def parse_xml(file_path):
    """
    Parses an XML annotation file and groups the bounding boxes of the 'car' tracks by frame.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        dict: `frames`, with the integer frame numbers (as written in the file) as keys
            and a list of boxes as values. Each box is a dict with the keys
            'xtl', 'ytl', 'xbr', 'ybr' (floats), 'track_id' (the track's `id`
            attribute, kept as a string) and 'occluded' (int, 0 when the attribute
            is absent).
    """
    # Parse the XML file
    root = ET.parse(file_path).getroot()

    frames = {}

    # Iterate over the car tracks and extract their bounding box information.
    # Only the attributes that end up in the result are read, so boxes lacking
    # optional ones (e.g. the 'parked' attribute) are still accepted.
    for track in root.findall(".//track[@label='car']"):
        track_id = track.get('id')
        for box in track.findall(".//box"):
            box_frame = int(box.get('frame'))
            xtl, ytl, xbr, ybr = (float(box.get(name)) for name in ('xtl', 'ytl', 'xbr', 'ybr'))

            # Add the box to the list of boxes for this frame
            frames.setdefault(box_frame, []).append({
                'xtl': xtl,
                'ytl': ytl,
                'xbr': xbr,
                'ybr': ybr,
                'track_id': track_id,
                'occluded': int(box.get('occluded', 0))
            })

    return frames


# Read the ground truth and order it by frame number.
frames_gt = parse_xml('../datasets/ai_challenge_s03_c010-full_annotation.xml')
frames_gt = dict(sorted(frames_gt.items(), key=lambda x: x[0]))

# Export the ground truth as comma separated rows:
# frame, track id, left, top, width, height, 1, x, y, z
# NOTE(review): the metrics cell reads 'gt.txt', not this file name — confirm which file is intended.
output_file = "output_annotations.txt"
x, y, z = -1, -1, -1  # No information about x, y, z

# The `with` block closes the file even if writing a row fails.
with open(output_file, "w") as output_fp:
    for frame in frames_gt:
        for bbox in frames_gt[frame]:
            line = "{},{},{},{},{},{},{},{},{},{}\n".format(
                str(int(frame)+1),  # frame numbers are written 1-based
                bbox['track_id'],
                bbox['xtl'],
                bbox['ytl'],
                bbox['xbr'] - bbox['xtl'],  # width
                bbox['ybr'] - bbox['ytl'],  # height
                1,
                x,
                y,
                z
            )

            output_fp.write(line)


In [15]:
#simple way to compute metrics quick (the folder to do it with TrackEval is "data_kalman/")
#from https://github.com/cheind/py-motmetrics

def motMetricsEnhancedCalculator(gtSource, tSource, max_iou=0.5):
  """Compute and print MOT metrics of a tracking output against a ground truth file.

  Both files are comma separated and numeric, with the columns
  frame, id, x, y, width, height, ... (only the first six are used) and frame
  numbers starting at 1.

  Args:
      gtSource: Path of the ground truth file.
      tSource: Path of the tracking output file.
      max_iou: Value forwarded to mm.distances.iou_matrix as `max_iou`.

  Returns:
      None. The rendered summary table is printed.
  """

  def _load_rows(source):
    # ndmin=2 keeps a single-row file two-dimensional, so column slicing still works.
    rows = np.loadtxt(source, delimiter=',', ndmin=2)
    # An empty file is normalised to zero rows with enough columns for the slices below.
    return rows if rows.size else np.empty((0, 6))

  # load ground truth
  gt = _load_rows(gtSource)

  # load tracking output
  t = _load_rows(tSource)

  # Create an accumulator that will be updated during each frame
  acc = mm.MOTAccumulator(auto_id=True)

  # Max frame number may be different for gt and t files: iterate up to the larger
  # one so that hypotheses after the last ground truth frame are evaluated too.
  last_frame = int(max(gt[:,0].max() if len(gt) else 0,
                       t[:,0].max() if len(t) else 0))

  # detection and frame numbers begin at 1
  for frame in range(1, last_frame + 1):
    # select id, x, y, width, height for current frame
    # required format for distance calculation is X, Y, Width, Height
    # We already have this format
    gt_dets = gt[gt[:,0]==frame,1:6] # select all detections in gt
    t_dets = t[t[:,0]==frame,1:6] # select all detections in t

    C = mm.distances.iou_matrix(gt_dets[:,1:], t_dets[:,1:],
                                max_iou=max_iou) # format: gt, t

    # Call update once for per frame.
    # format: gt object ids, t object ids, distance
    acc.update(gt_dets[:,0].astype('int').tolist(),
               t_dets[:,0].astype('int').tolist(), C)

  mh = mm.metrics.create()

  summary = mh.compute(acc, metrics=['num_frames', 'idf1', 'idp', 'idr',
                                     'recall', 'precision', 'num_objects',
                                     'mostly_tracked', 'partially_tracked',
                                     'mostly_lost', 'num_false_positives',
                                     'num_misses', 'num_switches',
                                     'num_fragmentations', 'mota', 'motp'
                                    ],
                       name='acc')

  strsummary = mm.io.render_summary(
      summary,
      #formatters={'mota' : '{:.2%}'.format},
      namemap={'idf1': 'IDF1', 'idp': 'IDP', 'idr': 'IDR', 'recall': 'Rcll',
               'precision': 'Prcn', 'num_objects': 'GT',
               'mostly_tracked' : 'MT', 'partially_tracked': 'PT',
               'mostly_lost' : 'ML', 'num_false_positives': 'FP',
               'num_misses': 'FN', 'num_switches' : 'IDsw',
               'num_fragmentations' : 'FM', 'mota': 'MOTA', 'motp' : 'MOTP',
              }
  )
  print(strsummary)


In [16]:
# Due to memory issues, the tracking files scored below were generated with the .py file
# rather than inside this notebook.
# Every tracker output is evaluated against the same ground truth file, in this order.
for tracker_output in (
    './outputs/kalman_dets_fasterrcnn.txt',
    "./outputs/kalman_dets_maskrcnn.txt",
    './outputs/kalman_dets_retinanet.txt',
    "./outputs/kalman_dets_finetuned.txt",
):
    motMetricsEnhancedCalculator('gt.txt', tracker_output)

     num_frames      IDF1      IDP       IDR      Rcll      Prcn     GT  MT  PT  ML    FP     FN  IDsw  FM      MOTA     MOTP
acc        2141  0.460036  0.52739  0.407937  0.532694  0.688679  21594  14  29  13  5200  10091    49  83  0.289617  0.14509
     num_frames      IDF1       IDP       IDR     Rcll      Prcn     GT  MT  PT  ML    FP    FN  IDsw  FM      MOTA      MOTP
acc        2141  0.506023  0.585667  0.445448  0.54881  0.721566  21594  14  31  11  4573  9743    65  96  0.334028  0.165039
     num_frames     IDF1      IDP       IDR      Rcll      Prcn     GT  MT  PT  ML  FP     FN  IDsw   FM      MOTA      MOTP
acc        2141  0.48833  0.73637  0.365287  0.494119  0.996079  21594  10  24  22  42  10924    73  100  0.488793  0.144266
     num_frames      IDF1       IDP       IDR      Rcll      Prcn     GT  MT  PT  ML   FP   FN  IDsw  FM      MOTA      MOTP
acc        2141  0.795711  0.808352  0.783458  0.960406  0.990922  21594  43  13   0  190  855    40  52  0.949755  0.065