In [None]:
import torch
import torch.nn as nn
import numpy as np
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO
from tqdm import tqdm
from sort import Sort
import os
import pickle

%matplotlib inline

In [None]:
# Model Definition remains the same
class Net(torch.nn.Module):
    """
    ResNet-50 backbone used as an embedding extractor.

    The classification head (`fc`) is replaced with an identity layer, so the
    network outputs the backbone's feature vector instead of class scores.

    Parameters:
    sequence (str): Name of the dataset sequence directory (e.g. 'S01'); it is
        used to build the video path in `get_embedding`.
    """

    def __init__(self, sequence):
        super(Net, self).__init__()

        # NOTE(review): the `weights=` keyword belongs to the newer torchvision
        # weights API; confirm the pinned 'v0.10.0' hub entrypoint accepts it.
        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', weights='ResNet50_Weights.DEFAULT')
        self.model.fc = nn.Identity()
        self.sequence = sequence

    def forward(self, x):
        """Run the backbone on a batch of images and return its features."""
        return self.model(x)

    def get_embedding(self, bbox, frame_num, camera_id):
        """
        Compute the embedding of one bounding-box crop of one video frame.

        Parameters:
        bbox (sequence): Box as (x1, y1, x2, y2) in pixel coordinates.
        frame_num (int): Index of the frame to read from the camera's video.
        camera_id (str): Camera directory name (e.g. 'c001').

        Returns:
        numpy.ndarray: The embedding of the crop (first item of the batch output).

        Raises:
        RuntimeError: If the video cannot be opened or the frame cannot be read.
        ValueError: If the bounding box selects an empty region of the frame.
        """
        # Camera c015 is read from the re-sampled file instead of the original one.
        name = 'vdo10.avi' if camera_id == 'c015' else 'vdo.avi'
        path = f"aic19-track1-mtmc-train/train/{self.sequence}/{camera_id}/{name}"

        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video file: {path}")
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
        finally:
            # Always free the capture, even when opening or reading fails.
            cap.release()

        if not ret or frame is None:
            raise RuntimeError(f"Could not read frame {frame_num} from {path}")

        x1, y1, x2, y2 = (int(v) for v in bbox[:4])
        # Clamp the lower bounds: a negative start would wrap around when slicing.
        crop = frame[max(0, y1):y2, max(0, x1):x2]
        if crop.size == 0:
            raise ValueError(f"Empty crop for bbox {list(bbox[:4])} in frame {frame_num} of {path}")

        crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        # Resize to 224x224, scale to [0, 1] and reorder HWC -> CHW.
        scaled = cv2.resize(crop, (224, 224)) / 255.0
        batch = torch.tensor(scaled.transpose(2, 0, 1), dtype=torch.float32).unsqueeze(0)

        # Run on whatever device the backbone lives on instead of assuming CUDA.
        device = next(self.model.parameters()).device
        # Without no_grad the output would require grad and `.numpy()` would fail.
        with torch.no_grad():
            return self.model(batch.to(device))[0].cpu().numpy()

In [None]:
# Every camera video of the training split, generated from the camera-number
# range of each sequence rather than listed by hand.
_TRAIN_ROOT = 'aic19-track1-mtmc-train/train'
_SEQUENCE_CAMERA_RANGES = (
    ('S01', range(1, 6)),    # c001 .. c005
    ('S03', range(10, 16)),  # c010 .. c015
    ('S04', range(16, 41)),  # c016 .. c040
)

video_paths = [
    # Camera c015 points at 'vdo10.avi'; every other camera uses 'vdo.avi'.
    f"{_TRAIN_ROOT}/{seq}/c{cam:03d}/{'vdo10.avi' if cam == 15 else 'vdo.avi'}"
    for seq, cams in _SEQUENCE_CAMERA_RANGES
    for cam in cams
]

In [None]:
def upscale_fps(input_path, output_path, input_fps=8, target_fps=10):
    """
    Re-sample a video from `input_fps` to `target_fps` by linear frame blending.

    Output frame k is placed at source position k * input_fps / target_fps.
    When that position falls exactly on a source frame the frame is copied;
    otherwise the two neighbouring source frames are blended with weights
    proportional to their distance from the position. The output therefore
    keeps the original duration while containing more (or fewer) frames.

    The previous implementation computed the number of extra frames as
    (target_fps // input_fps) - 1, which is 0 for the default 8 -> 10 case,
    so no frame was ever interpolated.

    Parameters:
    input_path (str): The path to the input video file.
    output_path (str): The path where the output video will be saved.
    input_fps (int): The original FPS of the video.
    target_fps (int): The target FPS to re-sample the video to.

    Returns:
    str: "Failed to open video file." if the input cannot be opened,
         otherwise "Video FPS upscaled successfully."

    Raises:
    ValueError: If either frame rate is not positive.
    """
    if input_fps <= 0 or target_fps <= 0:
        raise ValueError("input_fps and target_fps must be positive.")

    # Capture video from input path
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        return "Failed to open video file."

    # Get video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, fourcc, target_fps, (width, height))

    try:
        has_cur, cur_frame = cap.read()
        if has_cur:
            has_next, next_frame = cap.read()
            cur_idx = 0  # index of `cur_frame` in the source video
            out_idx = 0  # index of the output frame about to be written
            while True:
                # Slide the (cur, next) window until it brackets the output
                # position; compared as cross-products to stay exact for ints.
                while has_next and out_idx * input_fps >= (cur_idx + 1) * target_fps:
                    cur_frame, cur_idx = next_frame, cur_idx + 1
                    has_next, next_frame = cap.read()

                # Fractional distance of the output position past `cur_frame`.
                offset = (out_idx * input_fps - cur_idx * target_fps) / target_fps
                if offset == 0:
                    out.write(cur_frame)
                elif has_next:
                    out.write(cv2.addWeighted(cur_frame, 1.0 - offset, next_frame, offset, 0))
                else:
                    # The position lies beyond the last source frame: done.
                    break
                out_idx += 1
    finally:
        # Release everything, also when reading or writing raises
        cap.release()
        out.release()
    return "Video FPS upscaled successfully."

# upscale_fps('aic19-track1-mtmc-train/train/S03/c015/vdo.avi', 'aic19-track1-mtmc-train/train/S03/c015/vdo10.avi')

In [None]:
def create_detections(video_path, yolo, class_id=2):
    """
    Run a detector on every frame of a video and store the boxes in a text file.

    The output file is `detections/<name>.txt`, where <name> is the name of the
    directory that contains the video (the camera ID for the paths used in this
    notebook). Each line has the form `frame,x1,y1,x2,y2` with integer values.

    Parameters:
    video_path (str): '/'-separated path to the video file.
    yolo (callable): Detector called as `yolo(frame, verbose=False)`; the first
        element of its result must expose `.boxes`, each box having `.cls` and
        `.xyxy`.
    class_id (int): Only boxes whose class equals this value are written.
        Defaults to 2 (presumably the 'car' class of the model in use; confirm
        against the model's class table).
    """
    camera_name = video_path.split('/')[-2].split('.')[0]

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs("detections", exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    try:
        # `with` guarantees the file is flushed and closed even if detection fails.
        with open(f"detections/{camera_name}.txt", "w") as f:
            frame_num = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Get the detections for the frame
                detections = yolo(frame, verbose=False)

                # Write the bounding boxes of the requested class to the text file
                for box in detections[0].boxes:
                    if box.cls == class_id:
                        x1, y1, x2, y2 = box.xyxy[0].int().tolist()
                        f.write(f"{frame_num},{x1},{y1},{x2},{y2}\n")

                frame_num += 1
    finally:
        cap.release()

# for video in tqdm(video_paths):
#     create_detections(video, yolo)

In [None]:
def convert_detections_to_sort_format(detections):
    """
    Group detections by frame in the layout consumed by the SORT tracker.

    Parameters:
    detections (list): Rows of the form [frame, x1, y1, x2, y2]. The rows may
        be in any order.

    Returns:
    list: One entry per frame index from 0 to the highest frame number present.
        A frame with detections maps to a list of [x1, y1, x2, y2, 1] rows (a
        constant 1 is appended as the fifth column); a frame without detections
        maps to an empty array of shape (0, 5). An empty input yields [].
    """
    if len(detections) == 0:
        return []

    # Single pass: bucket the rows by frame number instead of rescanning the
    # whole detection list once per frame.
    rows_by_frame = {}
    for d in detections:
        rows_by_frame.setdefault(d[0], []).append(list(d[1:]) + [1])

    # Use the true maximum so the result does not depend on the input order.
    max_frame = max(rows_by_frame)

    all_detections = []
    for frame_num in range(max_frame + 1):
        if frame_num in rows_by_frame:
            all_detections.append(rows_by_frame[frame_num])
        else:
            all_detections.append(np.empty((0, 5)))
    return all_detections

def compute_similarity(embedding1, embedding2):
    """
    Return the norm of the difference between two embeddings.

    Despite the name this is a distance: smaller values mean the embeddings
    are closer to each other.
    """
    difference = embedding1 - embedding2
    return np.linalg.norm(difference)

# Sequence to process; the camera list is derived from it.
sequence = 'S01'
if sequence == 'S01':
    camera_ids = [f'c{n:03d}' for n in range(1, 6)]      # c001 .. c005
elif sequence == 'S03':
    camera_ids = [f'c{n:03d}' for n in range(10, 16)]    # c010 .. c015
elif sequence == 'S04':
    camera_ids = [f'c{n:03d}' for n in range(16, 41)]    # c016 .. c040

# Embedding network: moved to the GPU, switched to eval mode, then loaded
# with the stored weights.
siamese_net = Net(sequence).cuda().eval()
state_dict = torch.load('siamese_model_6.pth', map_location=torch.device('cuda'))
siamese_net.load_state_dict(state_dict)
# yolo = YOLO("yolov8n.pt")

# Read the per-camera detection files; each line is a comma-separated list of
# integers.
detections = {}
for camera_id in camera_ids:
    with open(f'detections/{camera_id}.txt', 'r') as f:
        lines = list(f)
        detections[camera_id] = [
            [int(token) for token in row.strip().split(',')]
            for row in lines
        ]

# Run an independent SORT tracker over every camera, frame by frame.
trackings = {c_id: [] for c_id in camera_ids}
for camera_id, dets in tqdm(detections.items(), desc="Computing trackings"):
    mot_tracker = Sort()
    per_frame_tracks = trackings[camera_id]
    for frame_num, frame_dets in enumerate(convert_detections_to_sort_format(dets)):
        tracked = mot_tracker.update(np.array(frame_dets)).astype(int)
        # Clamp every value of every tracker row to be non-negative.
        per_frame_tracks.append([[max(0, x) for x in row] for row in tracked])

In [None]:
# Pre-compute the embedding of every tracked box, per camera and per frame.
# Result layout: precomputed_embeddings[cam_id][frame] is a list with one
# embedding per entry of trackings[cam_id][frame], in the same order.
precomputed_embeddings = {}
with torch.no_grad():
    for cam_id in camera_ids:
        cam_tracks = trackings[cam_id]
        cam_embeddings = {}
        # The loop variable is deliberately NOT called `detections`: that name
        # holds the per-camera detection dict at module level and must not be
        # overwritten by this cell.
        for frame, frame_dets in tqdm(enumerate(cam_tracks), desc=f"Precomputing embeddings for {cam_id}", total=len(cam_tracks)):
            cam_embeddings[frame] = [
                siamese_net.get_embedding(det[:4], frame, cam_id) for det in frame_dets
            ]
        precomputed_embeddings[cam_id] = cam_embeddings

# Cache the result on disk so the expensive step above can be skipped later.
with open('precomputed_embeddings.pkl', 'wb') as f:
    pickle.dump(precomputed_embeddings, f)

In [None]:
# Restore the cached embeddings from disk.
# Security note: unpickling can execute arbitrary code, so only load a file
# that was produced by this notebook.
with open('precomputed_embeddings.pkl', 'rb') as cache_file:
    precomputed_embeddings = pickle.loads(cache_file.read())

In [None]:
# Two detections are compared only if their frame indices differ by at most
# MAX_FRAME_GAP, and they match when the embedding distance is below
# MAX_EMBEDDING_DISTANCE.
MAX_FRAME_GAP = 150
MAX_EMBEDDING_DISTANCE = 3.3

# matches[(cam1, cam2)] is a list of (frame1, frame2, det1, det2) tuples.
matches = {}

# Only iterate through the first two camera IDs for pairwise comparison
for i, cam1 in enumerate(camera_ids[:2]):
    for cam2 in camera_ids[i+1:2]:
        if cam1 == cam2:
            continue  # Guards against a duplicated camera ID in the list

        pair_matches = []
        matches[(cam1, cam2)] = pair_matches
        tracks2 = trackings[cam2]

        # Iterate through frames and detections for the first camera
        for frame1, detections1 in tqdm(enumerate(trackings[cam1]), desc=f"Matching {cam1} and {cam2}", total=len(trackings[cam1])):
            if len(detections1) == 0:
                continue  # Nothing to compare in this frame

            # Visit only the frames of cam2 inside the allowed window instead
            # of walking the whole video and skipping the far-away frames.
            first_frame2 = max(0, frame1 - MAX_FRAME_GAP)
            last_frame2 = min(len(tracks2), frame1 + MAX_FRAME_GAP + 1)
            embeddings1 = precomputed_embeddings[cam1][frame1]

            for frame2 in range(first_frame2, last_frame2):
                detections2 = tracks2[frame2]
                embeddings2 = precomputed_embeddings[cam2][frame2]

                # Compute matches using precomputed embeddings
                for idx1, det1 in enumerate(detections1):
                    embedding1 = embeddings1[idx1]  # invariant over the inner loop
                    for idx2, det2 in enumerate(detections2):
                        similarity = compute_similarity(embedding1, embeddings2[idx2])

                        if similarity < MAX_EMBEDDING_DISTANCE:
                            pair_matches.append((frame1, frame2, det1, det2))

In [None]:
from collections import defaultdict

# Step 1: Initialize Global Tracking
# global_tracking[global_id][camera] is a list of (frame, detection) tuples.
global_tracking = defaultdict(lambda: defaultdict(list))
global_id_counter = 0

# Index from (camera, frame, detection values) to the smallest global ID that
# contains that detection. It replaces a linear scan over every global ID and
# every stored detection, and avoids inserting empty per-camera lists into
# `global_tracking` as a side effect of merely looking something up.
detection_to_global_id = {}


def _detection_key(cam, frame, det):
    """Build a hashable key identifying one detection of one camera frame."""
    return (cam, frame, tuple(det))


def _register_detection(global_id, cam, frame, det):
    """Store a detection under `global_id` and keep the lookup index in sync."""
    global_tracking[global_id][cam].append((frame, det))
    key = _detection_key(cam, frame, det)
    # Keep the smallest ID: it is the one a scan in insertion order would find.
    detection_to_global_id[key] = min(global_id, detection_to_global_id.get(key, global_id))


# Step 2: Assign Global Identifiers to Matched Detections Across Videos
for (cam1, cam2), matched_detections in matches.items():
    for frame1, frame2, det1, det2 in tqdm(matched_detections, desc=f"Assigning global IDs for {cam1} and {cam2}", total=len(matched_detections)):
        # Reuse the global ID of whichever detection is already known (the
        # smallest one if both are known); otherwise allocate a new ID.
        known_ids = [
            detection_to_global_id[key]
            for key in (_detection_key(cam1, frame1, det1), _detection_key(cam2, frame2, det2))
            if key in detection_to_global_id
        ]
        if known_ids:
            existing_global_id = min(known_ids)
        else:
            existing_global_id = global_id_counter
            global_id_counter += 1

        # Add the detections to the global tracking under the found or new global ID
        _register_detection(existing_global_id, cam1, frame1, det1)
        _register_detection(existing_global_id, cam2, frame2, det2)

# Step 3: Merge Within-Video Trackings
# NOTE(review): every detection that was not matched across cameras receives
# its own new global ID; the values after the first four of each tracker row
# are not used to group detections here. Confirm that this is intended.
for cam, tracks in trackings.items():
    # The loop variable is not called `detections` so that the module-level
    # name of the same spelling is left untouched by this cell.
    for frame, frame_detections in tqdm(enumerate(tracks), desc=f"Merging within-video trackings for {cam}", total=len(tracks)):
        for det in frame_detections:
            # If not already tracked, assign a new global ID
            if _detection_key(cam, frame, det) not in detection_to_global_id:
                _register_detection(global_id_counter, cam, frame, det)
                global_id_counter += 1

In [None]:
# Cameras c001 .. c005.
camera_ids = [f'c{n:03d}' for n in range(1, 6)]

# Load detections from text files: one list of integer rows per camera.
detections = {}
for camera_id in camera_ids:
    with open(f'detections/{camera_id}.txt', 'r') as f:
        lines = list(f)
        detections[camera_id] = [
            [int(token) for token in row.strip().split(',')]
            for row in lines
        ]

# NOTE(review): `initialize_trackers` is not defined in any cell shown above;
# confirm it is defined or imported before this cell runs on a fresh kernel.
trackers = initialize_trackers(detections)

In [None]:
# Display the `trackers` attribute of the object stored for camera 'c005'
# (inspection only; the value is shown as the cell output).
trackers['c005'].trackers

In [None]:
def visualize_tracking(video_path, camera_id, global_tracks, output_path, max_frames=150):
    """
    Visualize tracking by drawing bounding boxes and track IDs on video frames for a specific camera.

    Parameters:
    - video_path: Path to the video file for the camera.
    - camera_id: ID of the camera to visualize detections for.
    - global_tracks: List of global tracks. Each track is a dict with the keys
      'track_id' and 'detections'; each detection is a dict with the keys
      'camera_id', 'frame_number' and 'bbox' (x1, y1, x2, y2).
    - output_path: Path to save the output video with tracking visualization.
    - max_frames: Maximum number of frames to render (default 150). Pass None
      to render as many frames as the container reports; that count may be
      approximate for some files.

    Returns None. If the video cannot be opened, an error is printed and
    nothing is written.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        cap.release()
        return

    out = None
    try:
        # Index this camera's boxes by frame once, instead of rescanning every
        # track for every frame. The drawing order within a frame is unchanged.
        boxes_by_frame = {}
        for track in global_tracks:
            for detection in track['detections']:
                if detection['camera_id'] == camera_id:
                    boxes_by_frame.setdefault(detection['frame_number'], []).append(
                        (track['track_id'], detection['bbox'])
                    )

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Some files report a frame rate of 0; keep the writer's rate valid.
        frame_rate = max(1, int(cap.get(cv2.CAP_PROP_FPS)))

        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'MP4V'), frame_rate, (frame_width, frame_height))

        if max_frames is None:
            frame_limit = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        else:
            frame_limit = max_frames

        for frame_number in tqdm(range(frame_limit)):
            ret, frame = cap.read()
            if not ret:
                break

            # Draw every box of this camera that belongs to the current frame
            for track_id, bbox in boxes_by_frame.get(frame_number, ()):
                x, y, x2, y2 = bbox
                cv2.rectangle(frame, (x, y), (x2, y2), (0, 127, 0), 2)
                cv2.putText(frame, f"ID: {track_id}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 127, 0), 2)

            out.write(frame)
    finally:
        # Release the capture and the writer even if drawing or writing fails.
        # No window is opened here, so there is no GUI state to tear down.
        cap.release()
        if out is not None:
            out.release()


# `visualize_tracking` reads a list of dicts with the keys 'track_id' and
# 'detections', while the cells above build `global_tracking` as
# {global_id: {camera: [(frame, detection), ...]}}. Convert between the two so
# this cell does not depend on a `global_tracks` variable defined elsewhere.
# The first four values of a detection are used as the box, cast to plain ints
# for the drawing calls.
global_tracks = [
    {
        'track_id': gid,
        'detections': [
            {'camera_id': cam, 'frame_number': frame, 'bbox': [int(v) for v in det[:4]]}
            for cam, cam_detections in per_camera.items()
            for frame, det in cam_detections
        ],
    }
    for gid, per_camera in global_tracking.items()
]

# Render the first two videos of `video_paths`.
# NOTE(review): these are S01 cameras; confirm they match the sequence the
# tracking cells were run for.
for video_path in video_paths[:2]:
    camera_id = video_path.split('/')[-2]
    output_path = f'tracking_output_{camera_id}.mp4'
    visualize_tracking(video_path, camera_id, global_tracks, output_path)