Task 3: Single-object tracking challenge. Given a target indicated in the first frame, the task is to estimate its state in each subsequent video frame.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import imageio.v2 as io
import cv2
import torch
import os
import sys
from utils.utils_funcitons import *
device= "cuda" if torch.cuda.is_available() else "cpu"
from ultralytics import YOLO

In [2]:
# Dataset and model locations. Paths are assembled with os.path.join so they
# resolve with the correct separator on any operating system.
dataset_path = os.path.join('.', 'dataset', 'Task3_Single_Object_Tracking')
sequences_folder = os.path.join(dataset_path, 'sequences')
annotations_folder = os.path.join(dataset_path, 'annotations')
model_path = os.path.join('.', 'models', 'best_yolov8n.pt')

### YOLOv8 Object Tracker

In [86]:
# Single-object tracking with YOLOv8: follow one pedestrian, selected by its
# tracker ID, through an image sequence and export the annotated frames as a
# video. Frames in which the target is not found are written unannotated.
model = YOLO(model_path)

# Sequence to process. Other sequences tried during experimentation:
#   'uav0000024_00000_s' (people), 'uav0000029_01102_s' (motor),
#   'uav0000120_04775_v', 'uav0000370_00001_v'
img_folder = os.path.join(sequences_folder, 'uav0000086_00870_s')  # people
img_names = sorted(os.listdir(img_folder))

# Tracker ID of the target to follow; constant for the whole sequence.
obj_id_to_track = 44

# Output video settings
width, height = 1280, 720
fps = 30
# NOTE(review): 'XVID' is usually paired with an .avi container; confirm that
# the XVID/.mp4 combination produces the intended file on the target machine.
codec = cv2.VideoWriter_fourcc(*'XVID')
output_path = os.path.join('.', 'outputs', 'Task3', 'output_yolo.mp4')
# Make sure the destination directory exists before the writer opens the file.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_video = cv2.VideoWriter(output_path, codec, fps, (width, height))

try:
    for img in img_names:
        frame = cv2.imread(os.path.join(img_folder, img))
        if frame is None:
            # cv2.imread returns None for unreadable / non-image files:
            # skip them instead of crashing in cvtColor.
            continue
        # NOTE(review): the frame is handed to track_objects in RGB order;
        # confirm that this is the channel order the helper expects.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        result_boxes = track_objects(model, frame, objects=['pedestrian'])
        boxes = result_boxes[0]
        track_id = boxes.id

        # Only look for the target when there are detections carrying IDs.
        if len(boxes.cls) > 0 and track_id is not None:
            matches = torch.nonzero(track_id == obj_id_to_track)
            if matches.nelement() != 0:
                pos = int(matches[0, 0])  # row of the first matching detection
                bboxes_xyxy = boxes.xyxy.detach().cpu().numpy()
                x1, y1, x2, y2 = (int(v) for v in bboxes_xyxy[pos][:4])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                cv2.putText(frame, f"People-{obj_id_to_track}", (x1 + 10, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

        # Annotated or not, every readable frame goes to the video in BGR at
        # the output resolution.
        frame_bgr = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR), (width, height))
        output_video.write(frame_bgr)
finally:
    # Release resources even if processing fails midway, so the video file
    # is properly finalized.
    output_video.release()
    cv2.destroyAllWindows()

### YOLOv8: track all objects of a class

In [85]:
# Track every object of the selected class(es) with YOLOv8's built-in tracker
# and export the frames annotated by Results.plot() as a video.
model = YOLO(model_path)

# Class names to track. ['all'] selects every entry of `classes`; None leaves
# the class filter list empty.
objects = ['motor']
if objects is None:
    number_class_list = []
elif objects == ['all']:
    number_class_list = list(range(len(classes)))
else:
    # A comprehension keeps the loop variable local, so the builtin `object`
    # is not shadowed in the notebook namespace.
    number_class_list = [classes.index(name) for name in objects]

# Sequence to process. Other sequences tried during experimentation:
#   'uav0000024_00000_s' (people), 'uav0000120_04775_v', 'uav0000370_00001_v'
img_folder = os.path.join(sequences_folder, 'uav0000029_01102_s')  # motor
img_names = sorted(os.listdir(img_folder))

# Output video settings
width, height = 1280, 720
fps = 30
# NOTE(review): 'XVID' is usually paired with an .avi container; confirm that
# the XVID/.mp4 combination produces the intended file on the target machine.
codec = cv2.VideoWriter_fourcc(*'XVID')
# NOTE(review): same output file name as the single-object YOLO cell, so
# running both overwrites the earlier result; confirm this is intended.
output_path = os.path.join('.', 'outputs', 'Task3', 'output_yolo.mp4')
# Make sure the destination directory exists before the writer opens the file.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_video = cv2.VideoWriter(output_path, codec, fps, (width, height))

try:
    for img in img_names:
        frame = cv2.imread(os.path.join(img_folder, img))
        if frame is None:
            # cv2.imread returns None for unreadable / non-image files:
            # skip them instead of crashing later on.
            continue

        # Ultralytics expects numpy images in OpenCV's BGR channel order, so
        # the frame is passed exactly as read; plot() then returns a BGR
        # image that can be written without any colour conversion.
        results = model.track(frame, persist=True, verbose=False, conf=0.3, iou=0.5,
                              classes=number_class_list, imgsz=704)
        annotated_frame = results[0].plot()
        output_video.write(cv2.resize(annotated_frame, (width, height)))
finally:
    # Release resources even if processing fails midway, so the video file
    # is properly finalized.
    output_video.release()
    cv2.destroyAllWindows()

### Deep-Sort for object tracking

In [43]:
from deep_sort.utils.parser import get_config
from deep_sort.deep_sort import DeepSort
from deep_sort.sort.tracker import Tracker

# Checkpoint file handed to DeepSort as its `model_path`.
deep_sort_weights = 'deep_sort/deep/checkpoint/ckpt.t7'

# max_age: maximum number of frames before the tracker discards a prediction.
tracker = DeepSort(
    model_path=deep_sort_weights,
    max_age=70,
)

In [65]:
# Pedestrian tracking with DeepSort: detect pedestrians in every frame, feed
# the detections to the DeepSort `tracker`, draw each track's box and ID, and
# export the result as a video.
# NOTE(review): `detector` is not defined in the cells shown here; it is
# presumably created elsewhere (e.g. by the utils star import) -- confirm.

# Sequence to process. Other sequences tried during experimentation:
#   'uav0000120_04775_v', 'uav0000370_00001_v'
img_folder = os.path.join(sequences_folder, 'uav0000024_00000_s')
img_names = sorted(os.listdir(img_folder))

# Output video settings
width, height = 1280, 720
fps = 30
# NOTE(review): 'XVID' is usually paired with an .avi container; confirm that
# the XVID/.mp4 combination produces the intended file on the target machine.
codec = cv2.VideoWriter_fourcc(*'XVID')
output_path = os.path.join('.', 'outputs', 'Task3', 'output.mp4')
# Make sure the destination directory exists before the writer opens the file.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output_video = cv2.VideoWriter(output_path, codec, fps, (width, height))

try:
    for img in img_names:
        frame = cv2.imread(os.path.join(img_folder, img))
        if frame is None:
            # cv2.imread returns None for unreadable / non-image files:
            # skip them instead of crashing in cvtColor.
            continue
        # NOTE(review): the frame is handed to the detector and to DeepSort in
        # RGB order; confirm that this is the channel order they expect.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        result_boxes = detector.detect(frame, objects=['pedestrian'], conf_thresh=0.6, imgsz=640)
        detections = result_boxes[0]

        # The tracker is only updated on frames with at least one detection.
        if len(detections.cls) > 0:
            conf = detections.conf.detach().cpu().numpy()
            bboxes_xywh = detections.xywh.detach().cpu().numpy()
            tracker.update(bboxes_xywh, conf, frame)

            # NOTE(review): this draws every track the tracker currently
            # holds, whatever its state; consider filtering if only
            # confirmed / recently updated tracks should be shown.
            for track in tracker.tracker.tracks:
                # to_tlbr() yields the box corners as (x1, y1, x2, y2).
                x1, y1, x2, y2 = (int(v) for v in track.to_tlbr())
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                cv2.putText(frame, f"Pedestrian-{track.track_id}", (x1 + 10, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

        # Convert and resize once per frame, after all drawing is done, so the
        # current frame is always the one written -- including frames where
        # the tracker holds no tracks.
        frame_bgr = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR), (width, height))
        output_video.write(frame_bgr)
finally:
    # Release resources even if processing fails midway, so the video file
    # is properly finalized.
    output_video.release()
    cv2.destroyAllWindows()