## HOMEWORK 10

We are going to use three different OpenCV trackers and compare their results. In addition, we try to use YOLO to detect the drone as an object of the *aeroplane* class.

In our script, we want to compare three types of trackers built into OpenCV (KCF, CSRT and MIL).
The diagram of the script is given below.



![SVG Image](hw_tracking_opencv_v10.svg)

As you can see, the main function of the script simply has three sections, each run over the drone videos:
1. Tracking, plus detection when tracking is lost.
2. Tracking only.
3. Detection only.

An initial drone bounding box is manually defined for each video.

As a detector, I took a simple example of using YOLO to classify objects in a picture and, for convenience, converted it into a module with a function that searches for a drone in a picture. In my case, it looks for the largest bounding box in the image that belongs to the *aeroplane* class.

### Code of detector

In [None]:
import cv2
import numpy as np
import time

# Paths to the YOLO files
config_path = 'yolo/yolov3.cfg'
weights_path = 'yolo/yolov3.weights'
names_path = 'yolo/coco.names'

# Load class names (one per line). The context manager closes the file
# handle as soon as the names are read instead of leaving it open.
with open(names_path) as names_file:
    classes = names_file.read().strip().split('\n')

# Check if the class "aeroplane" is in the list of classes
if "aeroplane" not in classes:
    print("Error: 'aeroplane' class not found in the provided names file.")
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and reports success (status 0); raise SystemExit with a failure status.
    raise SystemExit(1)

# Index of the "aeroplane" entry; compared against the arg-max class of each
# detection in detect_largest_aeroplane.
aeroplane_class_id = classes.index("aeroplane")

# Load YOLO network
net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)

# Get the output layer names
ln = net.getUnconnectedOutLayersNames()

def detect_largest_aeroplane(img):
    """
    Find the largest "aeroplane" detection in an image with the YOLO network.

    Args:
        img: Image array (its first two shape entries are read as height and
            width), or None.

    Returns:
        A tuple ``(x, y, width, height)`` of ints describing the top-left
        corner and size of the largest-area aeroplane box that survives
        non-maxima suppression, or None when ``img`` is None or no aeroplane
        box with a positive area is found.
    """
    if img is None:
        print("Error: Could not open or find the image.")
        return None

    img_h, img_w = img.shape[:2]

    # Prepare the image for detection
    blob = cv2.dnn.blobFromImage(img, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Perform detection; perf_counter is a monotonic clock meant for
    # measuring intervals, unlike the wall clock.
    t0 = time.perf_counter()
    outputs = net.forward(ln)
    elapsed = time.perf_counter() - t0
    print(f'It took {elapsed:.3f} seconds to process the image.')

    # Factor applied to the first four detection values (center x, center y,
    # width, height) to express them in pixels of `img`.
    scale = np.array([img_w, img_h, img_w, img_h])

    # Collect every confident aeroplane detection as [x, y, width, height].
    boxes = []
    confidences = []
    for output in outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if class_id == aeroplane_class_id and confidence > 0.5:
                center_x, center_y, width, height = (detection[:4] * scale).astype("int")
                x = int(center_x - (width / 2))
                y = int(center_y - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))

    # Nothing to suppress or compare when there are no candidates.
    if not boxes:
        return None

    # Apply non-maxima suppression to filter out weak detections
    indices = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    if len(indices) == 0:
        return None

    # Pick the surviving box with the largest area; max() keeps the first one
    # on ties, matching a strict "greater than" scan.
    kept_boxes = [tuple(boxes[i]) for i in np.asarray(indices).flatten()]
    largest_box = max(kept_boxes, key=lambda box: box[2] * box[3])
    if largest_box[2] * largest_box[3] <= 0:
        # Degenerate (zero-area) boxes are not reported.
        return None
    return largest_box

if __name__ == "__main__":
    # Quick manual check: run the detector on a single still image.
    img_path = 'drone.png'
    # Load the image (the detector itself handles a None image).
    img = cv2.imread(img_path)
    # The detector returns a bounding box (x, y, w, h) or None, not a cropped
    # image, so name it accordingly and report it instead of discarding it.
    largest_box = detect_largest_aeroplane(img)
    print(f'Largest aeroplane bounding box: {largest_box}')

### Code of main script

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import cv2
from yolodetect import detect_largest_aeroplane

# Maximum number of frames processed in each video run.
FRAMES = 65

# Appearance of the text overlays passed to cv2.putText.
COLOR = (0, 0, 255)
FONT_SCALE = 3.0
THICKNESS = 5

# Size of the preview window: 1920x1080 scaled by K.
K = 0.7
W = int(1920*K)
H = int(1080*K)
RESIZE_DIMS = (W, H)

# Position of the mode caption, written as offsets from 3840x2160
# (presumably the source videos are 4K -- TODO confirm).
MODE_TEXT_POS = (3840 - 1650, 2160 - 200)


def create_tracker(tracker_type):
    """
    Build the legacy OpenCV tracker matching ``tracker_type``.

    Supported values are 'MIL', 'KCF' and 'CSRT'; anything else raises
    ValueError.
    """
    factory_names = (
        ('MIL', 'TrackerMIL_create'),
        ('KCF', 'TrackerKCF_create'),
        ('CSRT', 'TrackerCSRT_create'),
    )
    for known_type, factory_name in factory_names:
        if tracker_type == known_type:
            # Look up only the requested factory on cv2.legacy, then call it.
            return getattr(cv2.legacy, factory_name)()
    raise ValueError(f"Unknown tracker type: {tracker_type}")

def initialize_tracker(cap, tracker, x1, y1, width, height):
    """
    Initialize the tracker with the first frame from the video capture.

    Args:
        cap: Capture object; one frame is consumed via ``cap.read()``.
        tracker: Tracker object; its ``init(frame, bbox)`` method is called.
        x1, y1: Top-left corner of the initial bounding box.
        width, height: Size of the initial bounding box.

    Returns:
        A tuple ``(img, bbox)`` with the frame that was read and the
        ``(x1, y1, width, height)`` box handed to the tracker.

    Raises:
        SystemExit: if no frame can be read from ``cap``.
    """
    ret, img = cap.read()
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        # `exit()` is an interactive-shell helper from the `site` module and
        # reports success (status 0); signal the failure explicitly instead.
        raise SystemExit(1)

    bbox = (x1, y1, width, height)
    tracker.init(img, bbox)

    return img, bbox

def draw_rectangle(img, bbox, color=(0, 255, 0), thickness=2):
    """
    Outline ``bbox`` on ``img`` with cv2.rectangle.

    ``bbox`` is indexed as (x, y, width, height); every value is converted
    to int before drawing.
    """
    left, top, box_w, box_h = map(int, (bbox[0], bbox[1], bbox[2], bbox[3]))
    top_left = (left, top)
    bottom_right = (left + box_w, top + box_h)
    cv2.rectangle(img, top_left, bottom_right, color, thickness)

def process_frame_with_tracker(tracker, img, tracker_type):
    """
    Update the tracker with ``img`` and annotate the frame on success.

    When the update succeeds, the tracked box is outlined and labelled with
    ``tracker_type`` at its bottom-right corner. Returns the success flag
    reported by ``tracker.update``.
    """
    ok, bbox = tracker.update(img)
    if not ok:
        # Nothing is drawn when the tracker reports a failure.
        return ok

    draw_rectangle(img, bbox)
    left, top, box_w, box_h = map(int, bbox)
    label_anchor = (left + box_w, top + box_h)
    cv2.putText(img, tracker_type, label_anchor, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
    return ok

def process_frame_with_detector(img):
    """
    Run the YOLO aeroplane detector on ``img`` and annotate any hit.

    A found box is outlined and labelled "AERO" at its bottom-right corner.
    Returns whatever detect_largest_aeroplane returned (a falsy value when
    nothing was found).
    """
    bbox = detect_largest_aeroplane(img)
    if not bbox:
        return bbox

    draw_rectangle(img, bbox)
    left, top, box_w, box_h = bbox
    label_anchor = (left + box_w, top + box_h)
    cv2.putText(img, "AERO", label_anchor, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
    return bbox

def display_frame(img, resize_dimensions=RESIZE_DIMS):
    """
    Show ``img`` in the 'frame' window, resized to ``resize_dimensions``.
    """
    cv2.imshow(
        'frame',
        cv2.resize(img, resize_dimensions, interpolation=cv2.INTER_LINEAR),
    )

def detecting(video_path):
    """
    Run only the YOLO detector on a video and display each annotated frame.

    Up to FRAMES frames are processed; pressing 'q' stops early.

    Args:
        video_path: Path of the video passed to cv2.VideoCapture.

    Raises:
        SystemExit: if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open video")
        # `exit()` is an interactive-shell helper and reports success;
        # signal the failure with a non-zero status instead.
        raise SystemExit(1)

    # try/finally guarantees the capture and the preview window are released
    # even if detection or drawing raises part-way through the video.
    try:
        for _ in range(FRAMES):
            ret, img = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            bbox = process_frame_with_detector(img)
            status = "Detecting SUCCESS" if bbox else "Detecting FAIL"
            cv2.putText(img, status, (100, 150), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            cv2.putText(img, "Working only DETECTING", MODE_TEXT_POS, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            display_frame(img)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

def tracking(video_path, tracker_type, x1, y1, x2, y2):
    """
    Run a single OpenCV tracker on a video and display each annotated frame.

    The tracker is initialized on the first frame with the box whose corners
    are (x1, y1) and (x2, y2); up to FRAMES further frames are then
    processed. Pressing 'q' stops early.

    Args:
        video_path: Path of the video passed to cv2.VideoCapture.
        tracker_type: Tracker name understood by create_tracker.
        x1, y1: Top-left corner of the initial bounding box.
        x2, y2: Bottom-right corner of the initial bounding box.

    Raises:
        SystemExit: if the video cannot be opened or its first frame cannot
            be read.
        ValueError: if ``tracker_type`` is not supported.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open video")
        # `exit()` is an interactive-shell helper and reports success;
        # signal the failure with a non-zero status instead.
        raise SystemExit(1)

    # try/finally guarantees the capture and the preview window are released
    # even if tracker creation, initialization or a frame update raises.
    try:
        width, height = x2 - x1, y2 - y1
        tracker = create_tracker(tracker_type)
        initialize_tracker(cap, tracker, x1, y1, width, height)

        for _ in range(FRAMES):
            ret, img = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            ok = process_frame_with_tracker(tracker, img, tracker_type)
            status = "Tracking SUCCESS" if ok else "Tracking FAIL"
            cv2.putText(img, status, (100, 100), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            cv2.putText(img, "Working only TRACKING", MODE_TEXT_POS, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            display_frame(img)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

def tracking_detecting(video_path, tracker_type, x1, y1, x2, y2):
    """
    Run a tracker on a video and fall back to YOLO detection when it fails.

    The tracker is initialized on the first frame with the box whose corners
    are (x1, y1) and (x2, y2). On every frame where the tracker reports a
    failure, the detector is run; if it finds a box, a fresh tracker is
    created and initialized with it. Up to FRAMES frames are processed;
    pressing 'q' stops early.

    Args:
        video_path: Path of the video passed to cv2.VideoCapture.
        tracker_type: Tracker name understood by create_tracker.
        x1, y1: Top-left corner of the initial bounding box.
        x2, y2: Bottom-right corner of the initial bounding box.

    Raises:
        SystemExit: if the video cannot be opened or its first frame cannot
            be read.
        ValueError: if ``tracker_type`` is not supported.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open video")
        # `exit()` is an interactive-shell helper and reports success;
        # signal the failure with a non-zero status instead.
        raise SystemExit(1)

    # try/finally guarantees the capture and the preview window are released
    # even if tracking, detection or drawing raises part-way through.
    try:
        width, height = x2 - x1, y2 - y1
        tracker = create_tracker(tracker_type)
        initialize_tracker(cap, tracker, x1, y1, width, height)

        for _ in range(FRAMES):
            ret, img = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            ok = process_frame_with_tracker(tracker, img, tracker_type)
            if ok:
                cv2.putText(img, "Tracking SUCCESS", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            else:
                # The tracker helper draws nothing on failure, so the frame is
                # still unannotated here. Keep a clean copy for re-initializing
                # the tracker, and run the detector BEFORE any status text is
                # drawn so that overlays never pollute detection or the
                # tracker's appearance model.
                clean_frame = img.copy()
                bbox = process_frame_with_detector(img)
                cv2.putText(img, "Tracking FAIL", (100, 100), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
                if not bbox:
                    cv2.putText(img, "Detecting FAIL", (100, 180), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
                else:
                    # A fresh tracker is created for the re-detected target.
                    tracker = create_tracker(tracker_type)
                    tracker.init(clean_frame, bbox)
                    cv2.putText(img, "Detecting SUCCESS", (100, 180), cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            cv2.putText(img, "Working only TRACKING+DETECTING", MODE_TEXT_POS, cv2.FONT_HERSHEY_SIMPLEX, FONT_SCALE, COLOR, THICKNESS)
            display_frame(img)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    video_paths = ["dron1.mov", "dron2.mp4", "dron3.mov", "dron4.mov"]
    tracker_types = ['KCF', 'CSRT', 'MIL']
    # Manually chosen initial drone boxes (x1, y1, x2, y2), one per video.
    rects = [
        (1377, 1499, 1479, 1557),
        (675, 1445, 918, 1572),    # alternative: (685, 1450, 908, 1562)
        (1347, 1817, 1443, 1876),  # alternative: (1357, 1827, 1433, 1866)
        (2439, 1455, 2579, 1510),
    ]
    # Index of the first video to process; earlier ones are skipped.
    start_from = 0

    selected_videos = video_paths[start_from:]
    selected_jobs = list(zip(selected_videos, rects[start_from:]))

    # First pass: tracking with detector fallback; second pass: tracking only.
    for run_mode in (tracking_detecting, tracking):
        for video_path, (x1, y1, x2, y2) in selected_jobs:
            for tracker_type in tracker_types:
                run_mode(video_path, tracker_type, x1, y1, x2, y2)

    # Final pass: just the YOLO aeroplane detector.
    for video_path in selected_videos:
        detecting(video_path)

### Conclusions

You can see the results in this video:
https://youtu.be/wTKhB4wHP2s
The main takeaway is that even the most stable tracker loses the target after 10-20 frames. Even worse, a tracker can keep reporting success long after it has lost the drone.
Therefore, the system must work together with a detector, and the detector must be trained on drones. The YOLO detector gave poor results even against a clear sky.

In my opinion, CSRT gave the best results, but again, without a reliable detector its tracking accuracy is not sufficient.