In [1]:
import torch
import torch.nn as nn
import numpy as np
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO
from tqdm import tqdm
from sort import Sort
import os, copy
import pickle

%matplotlib inline

In [2]:
# Model Definition remains the same
class Net(torch.nn.Module):
    """ResNet-50 backbone used to turn vehicle crops into embedding vectors.

    The final fully connected layer of the ResNet-50 is replaced by an identity
    layer, so the network returns the features that would normally feed the
    classifier.

    Parameters:
    sequence (str): Name of the sequence folder (e.g. 'S03'); used by
        `get_embedding` to build the path of the video frames are read from.
    """

    def __init__(self, sequence):
        super(Net, self).__init__()

        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', weights='ResNet50_Weights.DEFAULT')
        # Drop the classification head so forward() yields the backbone features.
        self.model.fc = nn.Identity()
        self.sequence = sequence

    def forward(self, x):
        """Return the backbone output for the input batch `x`."""
        return self.model(x)

    def get_embedding(self, bbox, frame_num, camera_id):
        """
        Compute the embedding of one bounding box of one video frame.

        Parameters:
        bbox (sequence): Box corners as (x1, y1, x2, y2) in pixel coordinates.
        frame_num (int): Index of the frame to read from the camera video.
        camera_id (str): Camera folder name (e.g. 'c010').

        Returns:
        numpy.ndarray: Embedding of the crop resized to 224x224.

        Raises:
        RuntimeError: If the requested frame cannot be read from the video.
        ValueError: If the bounding box selects an empty region of the frame.
        """
        # Camera c015 is read from the FPS-upscaled copy of its video.
        name = 'vdo10.avi' if camera_id == 'c015' else 'vdo.avi'
        path = f"aic19-track1-mtmc-train/train/{self.sequence}/{camera_id}/{name}"
        cap = cv2.VideoCapture(path)
        try:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
        finally:
            cap.release()
        if not ret or frame is None:
            raise RuntimeError(f"Could not read frame {frame_num} from {path}")

        # Clamp to zero: a negative index would silently wrap around the array.
        x1, y1, x2, y2 = (max(0, int(v)) for v in bbox[:4])
        crop = frame[y1:y2, x1:x2]
        if crop.size == 0:
            raise ValueError(f"Empty crop for bbox {list(bbox[:4])} in frame {frame_num} of {path}")

        crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        # HWC uint8 image -> 1xCxHxW float tensor scaled to [0, 1].
        pixels = (cv2.resize(crop, (224, 224)) / 255.0).transpose(2, 0, 1)

        # Run on whatever device the weights live on instead of assuming CUDA.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        batch = torch.tensor(pixels, dtype=torch.float32).unsqueeze(0).to(device)

        # No gradients are needed for inference; this also makes .numpy() legal
        # when the caller did not disable autograd.
        with torch.no_grad():
            embedding = self.model(batch)[0]
        return embedding.cpu().numpy()

In [3]:
# Every training video, ordered by sequence and then by camera number:
# S01 -> c001..c005, S03 -> c010..c015, S04 -> c016..c040.
# Camera c015 uses the FPS-upscaled file 'vdo10.avi' instead of 'vdo.avi'.
video_paths = [
    f"aic19-track1-mtmc-train/train/{seq}/c{cam:03d}/" + ("vdo10.avi" if cam == 15 else "vdo.avi")
    for seq, cams in (("S01", range(1, 6)), ("S03", range(10, 16)), ("S04", range(16, 41)))
    for cam in cams
]

In [4]:
def upscale_fps(input_path, output_path, input_fps=8, target_fps=10):
    """
    Resample a video from `input_fps` to `target_fps` by linear frame interpolation.

    Output frame k is placed at source position k * input_fps / target_fps. If that
    position falls between two source frames, the two are blended with weights
    proportional to their temporal distance; if it falls exactly on a source frame,
    that frame is copied. The clip duration is preserved, so the output has
    ceil(n_frames * target_fps / input_fps) frames.

    NOTE: the output frame count differs from the input frame count, so any
    per-frame data derived from an earlier output file has to be regenerated.

    Parameters:
    input_path (str): The path to the input video file.
    output_path (str): The path where the output video will be saved.
    input_fps (int): The original FPS of the video.
    target_fps (int): The target FPS to resample the video to.

    Returns:
    str: "Failed to open video file." if the input cannot be opened, otherwise
        "Video FPS upscaled successfully.".

    Raises:
    ValueError: If `input_fps` or `target_fps` is not positive.
    """
    if input_fps <= 0 or target_fps <= 0:
        raise ValueError("input_fps and target_fps must be positive.")

    # Capture video from input path
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        return "Failed to open video file."

    out = None
    try:
        # Get video properties
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Define the codec and create VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(output_path, fourcc, target_fps, (width, height))

        src_idx = 0  # index of `current` in the source video
        out_idx = 0  # index of the next frame to write
        success, current = cap.read()
        while success:
            has_next, upcoming = cap.read()

            # Emit every output frame whose source position lies in
            # [src_idx, src_idx + 1). The comparison is kept in
            # "frames * fps" units to avoid accumulating float error.
            while out_idx * input_fps < (src_idx + 1) * target_fps:
                alpha = (out_idx * input_fps - src_idx * target_fps) / target_fps
                if alpha <= 0 or not has_next:
                    # Exactly on a source frame, or past the last one: copy it.
                    out.write(current)
                else:
                    out.write(cv2.addWeighted(current, 1.0 - alpha, upcoming, alpha, 0))
                out_idx += 1

            current = upcoming
            src_idx += 1
            success = has_next
    finally:
        # Release everything, also when reading or writing fails midway
        cap.release()
        if out is not None:
            out.release()
    return "Video FPS upscaled successfully."

# upscale_fps('aic19-track1-mtmc-train/train/S03/c015/vdo.avi', 'aic19-track1-mtmc-train/train/S03/c015/vdo10.avi')

In [5]:
def create_detections(video_path, yolo, class_id=2):
    """
    Run a detector on every frame of a video and store the bounding boxes in a text file.

    One line per kept detection is written to `detections/<camera>.txt`, where
    `<camera>` is the name of the folder that contains the video. Each line has
    the form `frame_num,x1,y1,x2,y2` with a 0-based frame index.

    Parameters:
    video_path (str): '/'-separated path to the video; its parent folder names the output file.
    yolo (callable): Detector called as `yolo(frame, verbose=False)`; the first result
        must expose `.boxes`, each box having `.cls` and `.xyxy`.
    class_id (int): Only boxes of this class are kept. Defaults to 2
        (presumably the 'car' class of the pretrained model; confirm against the detector).

    Raises:
    IOError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video file: {video_path}")

    try:
        # Create the output folder on first use instead of assuming it exists.
        os.makedirs("detections", exist_ok=True)

        # Text file that stores the bounding boxes; closed even if detection fails.
        with open(f"detections/{video_path.split('/')[-2].split('.')[0]}.txt", "w") as f:
            frame_num = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Get the detections for the frame
                detections = yolo(frame, verbose=False)

                # Write the bounding boxes of the requested class to the text file
                for box in detections[0].boxes:
                    if box.cls == class_id:
                        x1, y1, x2, y2 = box.xyxy[0].int().tolist()
                        f.write(f"{frame_num},{x1},{y1},{x2},{y2}\n")

                frame_num += 1
    finally:
        cap.release()

# for video in tqdm(video_paths):
#     create_detections(video, yolo)

In [7]:
def convert_detections_to_sort_format(detections):
    """
    Convert detections to the format expected by the SORT algorithm.

    Groups the detections by frame and appends a constant confidence of 1 to
    every box.

    Parameters:
    detections (list): Rows of the form [frame_num, x1, y1, x2, y2].

    Returns:
    list: One entry per frame from 0 to the highest frame number present. Each
        entry is a list of [x1, y1, x2, y2, 1] rows, or an empty array of shape
        (0, 5) for frames without detections. An empty input yields an empty list.
    """
    if len(detections) == 0:
        return []

    # Single pass over the detections instead of rescanning them for every frame.
    by_frame = {}
    for det in detections:
        by_frame.setdefault(det[0], []).append(list(det[1:]) + [1])

    # Use the true maximum so the result does not depend on the input being sorted.
    max_frame = max(by_frame)
    return [
        by_frame[frame_num] if frame_num in by_frame else np.empty((0, 5))
        for frame_num in range(max_frame + 1)
    ]

def compute_similarity(embedding1, embedding2):
    """
    Return the Euclidean (L2) distance between two embeddings.

    Despite the name, a smaller value means the embeddings are more alike.
    """
    difference = embedding1 - embedding2
    return np.linalg.norm(difference)

# Sequence to process; selects the cameras, detections and videos used below.
sequence = 'S03'

# Cameras that belong to each supported sequence.
cameras_by_sequence = {
    'S01': ['c001', 'c002', 'c003', 'c004', 'c005'],
    'S03': ['c010', 'c011', 'c012', 'c013', 'c014', 'c015'],
    'S04': ['c016', 'c017', 'c018', 'c019', 'c020', 'c021', 'c022', 'c023', 'c024', 'c025', 'c026', 'c027', 'c028', 'c029', 'c030', 'c031', 'c032', 'c033', 'c034', 'c035', 'c036', 'c037', 'c038', 'c039', 'c040'],
}
# Fail fast on a typo instead of hitting a NameError on `camera_ids` further down.
if sequence not in cameras_by_sequence:
    raise ValueError(f"Unknown sequence {sequence!r}; expected one of {sorted(cameras_by_sequence)}")
camera_ids = cameras_by_sequence[sequence]

# Embedding network in inference mode, initialised from the fine-tuned weights.
siamese_net = Net(sequence).cuda().eval()
siamese_net.load_state_dict(torch.load('siamese_model_6.pth', map_location=torch.device('cuda')))
# yolo = YOLO("yolov8n.pt")

# Load detections from text files (one `frame,x1,y1,x2,y2` row per line)
detections = {}
for camera_id in camera_ids:
    with open(f'detections/{camera_id}.txt', 'r') as f:
        # Blank lines (e.g. a trailing newline) are skipped rather than crashing int('').
        detections[camera_id] = [[int(x) for x in line.strip().split(',')] for line in f if line.strip()]

# Compute single trackings for each camera: trackings[camera_id][frame] is the
# list of tracked boxes for that frame.
trackings = {c_id: [] for c_id in camera_ids}
for camera_id, dets in tqdm(detections.items(), desc="Computing trackings"):
    dets_sort = convert_detections_to_sort_format(dets)#[:150]
    # A fresh tracker per camera so tracks never span two videos.
    mot_tracker = Sort()
    for frame_dets in dets_sort:
        # Rows are later unpacked as (x1, y1, x2, y2, track_id).
        tracked = mot_tracker.update(np.array(frame_dets)).astype(int)
        # Clamp negative values to 0 so boxes can be used as array slices.
        trackings[camera_id].append([[max(0, value) for value in det] for det in tracked])

Using cache found in /home/lgudino/.cache/torch/hub/pytorch_vision_v0.10.0
Computing trackings: 100%|██████████| 6/6 [00:19<00:00,  3.23s/it]


In [None]:
# # Pre-compute all embeddings for each detection in every camera and frame
# precomputed_embeddings = {}
# for cam_id in camera_ids:
#     precomputed_embeddings[cam_id] = {}
#     for frame, detections in tqdm(enumerate(trackings[cam_id]), desc=f"Precomputing embeddings for {cam_id}", total=len(trackings[cam_id])):
#         with torch.no_grad():
#             precomputed_embeddings[cam_id][frame] = [
#                 siamese_net.get_embedding(det[:4], frame, cam_id) for det in detections
#             ]

# with open('precomputed_embeddings.pkl', 'wb') as f:
#     pickle.dump(precomputed_embeddings, f)

In [8]:
# Read the cached per-detection embeddings back from disk.
# NOTE: pickle can execute arbitrary code while loading; only open files produced by this notebook.
with open('precomputed_embeddings_s03.pkl', mode='rb') as embeddings_file:
    precomputed_embeddings = pickle.load(embeddings_file)

In [9]:
# Detections whose frame indices differ by more than this are never compared.
MAX_FRAME_GAP = 70
# A pair is accepted as a match only if its embedding distance is below this value.
MAX_EMBEDDING_DISTANCE = 7

matches = {}
# Best match found so far for every detection:
# best_matches[cam][frame][idx] = (other_cam, other_frame, distance, other_idx, other_det)
best_matches = {cam_id: {frame: {} for frame in range(len(trackings[cam_id]))} for cam_id in camera_ids}

for i, cam1 in enumerate(camera_ids):
    # Each unordered pair of cameras is visited once: cam2 always comes after cam1.
    for cam2 in tqdm(camera_ids[i+1:], desc=f"Comparing {cam1} with all other cameras", total=len(camera_ids) - i - 1):
        if cam1 == cam2:
            continue  # Only possible if camera_ids contains duplicates

        num_frames2 = len(trackings[cam2])
        for frame1, detections1 in enumerate(trackings[cam1]):
            # Visit only the frames of cam2 inside the temporal window around
            # frame1, instead of walking over every frame and skipping most.
            window_start = max(0, frame1 - MAX_FRAME_GAP)
            window_end = min(num_frames2, frame1 + MAX_FRAME_GAP + 1)
            for frame2 in range(window_start, window_end):
                detections2 = trackings[cam2][frame2]

                for idx1 in range(len(detections1)):
                    for idx2, det2 in enumerate(detections2):
                        embedding1 = precomputed_embeddings[cam1][frame1][idx1]
                        embedding2 = precomputed_embeddings[cam2][frame2][idx2]
                        similarity = compute_similarity(embedding1, embedding2)

                        current_best = best_matches[cam1][frame1].get(idx1, (None, None, float('inf'), None, None))
                        current_best_similarity = current_best[2]

                        # Lower distance is better; keep the closest candidate seen so far.
                        if similarity < current_best_similarity and similarity < MAX_EMBEDDING_DISTANCE:
                            best_matches[cam1][frame1][idx1] = (cam2, frame2, similarity, idx2, det2)

# Group the accepted matches by camera pair.
# Entries exist only for detections that found a match, so no None check is needed.
for cam1, frames in best_matches.items():
    for frame1, frame_matches in frames.items():
        for idx1, (cam2, frame2, _, idx2, det2) in frame_matches.items():
            det1 = trackings[cam1][frame1][idx1]
            matches.setdefault((cam1, cam2), []).append((frame1, frame2, det1, det2))

# Propagate identities: a matched detection in cam1 takes the track id of its match in cam2.
trackings_global = copy.deepcopy(trackings)
for (cam1, cam2), match_list in matches.items():
    for frame1, frame2, det1, det2 in tqdm(match_list, desc=f"Updating global trackings for {cam1} and {cam2}"):
        # Locate det1 in its frame by track id and overwrite that id in the copy.
        for position, bbox in enumerate(trackings[cam1][frame1]):
            if bbox[-1] == det1[-1]:
                trackings_global[cam1][frame1][position][-1] = det2[-1]
                break

Comparing c010 with all other cameras:   0%|          | 0/5 [00:00<?, ?it/s]

Comparing c010 with all other cameras: 100%|██████████| 5/5 [04:26<00:00, 53.30s/it]
Comparing c011 with all other cameras: 100%|██████████| 4/4 [04:55<00:00, 73.90s/it]
Comparing c012 with all other cameras: 100%|██████████| 3/3 [02:26<00:00, 48.81s/it]
Comparing c013 with all other cameras: 100%|██████████| 2/2 [00:29<00:00, 14.82s/it]
Comparing c014 with all other cameras: 100%|██████████| 1/1 [01:29<00:00, 89.86s/it]
Comparing c015 with all other cameras: 0it [00:00, ?it/s]
Updating global trackings for c010 and c011: 100%|██████████| 4736/4736 [00:01<00:00, 4604.72it/s]
Updating global trackings for c010 and c012: 100%|██████████| 397/397 [00:00<00:00, 4041.02it/s]
Updating global trackings for c010 and c014: 100%|██████████| 3687/3687 [00:00<00:00, 4732.31it/s]
Updating global trackings for c010 and c015: 100%|██████████| 3374/3374 [00:00<00:00, 5779.62it/s]
Updating global trackings for c010 and c013: 100%|██████████| 325/325 [00:00<00:00, 3449.10it/s]
Updating global trackings 

In [None]:
def visualize_tracking(video_path, camera_id, trackings_global, output_path, max_frames=200):
    """
    Visualize tracking by drawing bounding boxes and track IDs on video frames for a specific camera.

    Parameters:
    - video_path: Path to the video file for the camera.
    - camera_id: ID of the camera to visualize detections for.
    - trackings_global: Mapping camera_id -> list indexed by frame number, each entry
      being a list of (x1, y1, x2, y2, track_id) boxes.
    - output_path: Path to save the output video with tracking visualization.
    - max_frames: Maximum number of frames to render (default 200). Rendering also stops
      at the last frame that has a tracking entry or at the end of the video.

    Returns None. If the video cannot be opened, an error is printed and nothing is written.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        return

    out = None
    try:
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_rate = int(cap.get(cv2.CAP_PROP_FPS))

        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'MP4V'), frame_rate, (frame_width, frame_height))

        camera_tracks = trackings_global[camera_id]
        # Never index past the tracked frames, even if the video is longer.
        num_frames = min(max_frames, len(camera_tracks))

        for frame_number in tqdm(range(num_frames)):
            ret, frame = cap.read()
            if not ret:
                break

            # Draw every tracked box of this frame together with its track ID
            for bbox in camera_tracks[frame_number]:
                x1, y1, x2, y2, track_id = bbox
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 127, 0), 2)
                cv2.putText(frame, str(track_id), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 127, 0), 2)

            out.write(frame)
    finally:
        # Release the handles even if drawing or writing fails. No window is
        # opened here, so there is nothing to destroy.
        cap.release()
        if out is not None:
            out.release()

# Render a tracking video for every camera of the selected sequence.
# Videos are looked up by camera folder name rather than by a fixed offset into
# video_paths, so the loop stays correct for any sequence.
video_path_by_camera = {path.split('/')[-2]: path for path in video_paths}
for camera_id in camera_ids:
    video_path = video_path_by_camera[camera_id]
    output_path = f'tracking_output_{camera_id}.mp4'
    visualize_tracking(video_path, camera_id, trackings_global, output_path)


In [12]:
# Placeholder values for <conf>, <x>, <y>, <z> since these are not provided
conf, x, y, z = 1, -1, -1, -1  # Using -1 to indicate unknown or not applicable

for cam in camera_ids:
    # Convert data to the required gt.txt format:
    # frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z
    gt_content = []
    for frame, bboxes in enumerate(trackings_global[cam]):
        # Each object id may appear only once per frame; keep the first box and skip repeats.
        seen_ids = set()
        for bbox in bboxes:
            bb_left, bb_top, bb_right, bb_bottom, obj_id = map(int, bbox)
            if obj_id in seen_ids:
                continue
            seen_ids.add(obj_id)
            bb_width = bb_right - bb_left
            bb_height = bb_bottom - bb_top
            # Frame numbers are written 1-based.
            gt_content.append(f"{frame+1}, {obj_id}, {bb_left}, {bb_top}, {bb_width}, {bb_height}, {conf}, {x}, {y}, {z}")

    # Join all entries to form the final content for the gt.txt file
    gt_text = "\n".join(gt_content)

    # NOTE(review): the path hardcodes 's04' regardless of the `sequence` selected earlier; confirm this is intended.
    file_path = f'TrackEval/data/trackers/mot_challenge/parabellum-s04-train/metric_learning/data/s04_{cam}.txt'  # Define the file path
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as f:
        f.write(gt_text)

In [13]:
# Run TrackEval's MOTChallenge script on the 'metric_learning' tracker for the train split of
# the 'parabellum-s04' benchmark, reporting the HOTA and Identity metrics with preprocessing disabled.
!python TrackEval/scripts/run_mot_challenge.py --BENCHMARK parabellum-s04 --SPLIT_TO_EVAL train --TRACKERS_TO_EVAL metric_learning --METRICS HOTA Identity --DO_PREPROC False

Error importing BURST due to missing underlying dependency: No module named 'pycocotools'

Eval Config:
USE_PARALLEL         : False                         
NUM_PARALLEL_CORES   : 8                             
BREAK_ON_ERROR       : True                          
RETURN_ON_ERROR      : False                         
LOG_ON_ERROR         : /mnt/DATA/lgudino/C6/mcv-c6-2024-team4/Week 4/TrackEval/error_log.txt
PRINT_RESULTS        : True                          
PRINT_ONLY_COMBINED  : False                         
PRINT_CONFIG         : True                          
TIME_PROGRESS        : True                          
DISPLAY_LESS_PROGRESS : False                         
OUTPUT_SUMMARY       : True                          
OUTPUT_EMPTY_CLASSES : True                          
OUTPUT_DETAILED      : True                          
PLOT_CURVES          : True                          

MotChallenge2DBox Config:
PRINT_CONFIG         : True                          
GT_FOLDER          