In [219]:
!pip install matplotlib filterpy lap

Defaulting to user installation because normal site-packages is not writeable


In [220]:
import os
import cv2
import sys
import numpy as np
from ultralytics import YOLO
import torch
import torchvision
import random
from deep_sort_realtime.deepsort_tracker import DeepSort
torch.backends.cudnn.enabled = True  # Enable cuDNN
torch.backends.cudnn.benchmark = True  # Use cuDNN's auto-tuner for the best performance
from sort.sort import *
# from deep_sort.deep_sort.tracker import Tracker
# from deep_sort.tools import generate_detections as gdet
# from deep_sort.deep_sort import nn_matching
# from deep_sort.deep_sort.detection import Detection

In [221]:
# device = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [222]:
# results = model.track(source="./darknet/Video1.MP4", show=True)  # Tracking with default tracker

SORT tracker with FasterRCNN detection

In [225]:
def tracker_sort_rcnn(input_path,output_path):
    video_path = input_path
    video_out_path = output_path 
    # os.path.join('.','out1.mp4')

    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()

    cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'MP4V'), cap.get(cv2.CAP_PROP_FPS),
                            (frame.shape[1], frame.shape[0]))

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).to(device)
    model.eval()
    tracker = Sort(iou_threshold=0.2,max_age=600)

    # colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for j in range(10)]

    detection_threshold = 0.97
    RED = (255,0,0)
    WHITE = (255, 255, 255)
    track_ids = []
    count = 0
    temp = 0
    bounding_boxes_ids = np.array([])
    while ret:
        transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
        frame_transformed = transform(frame).to(device)
        with torch.no_grad():
#       pred = model([img])
            results = model([frame_transformed])[0]
        detections = []
        for i in range(len(results["scores"])):
            confidence = results["scores"][i]
            if float(confidence) < detection_threshold:
                continue
            xmin, ymin, xmax, ymax = int(results["boxes"][i][0]), int(results["boxes"][i][1]), int(results["boxes"][i][2]), int(results["boxes"][i][3])
            class_id = int(results["labels"][i])
            if class_id not in [3]:
                continue
            # add the bounding box (x, y, w, h), confidence and class id to the results list
            detections.append([xmin, ymin, xmax, ymax])

        if(len(detections) != 0):
            detections = np.array(detections)
            detections = detections.reshape([detections.shape[0], 4])
            tracks = tracker.update(detections)

            for i in range(tracks.shape[0]):
                bounding_boxes_ids = np.append(bounding_boxes_ids, np.array([tracks[i][4]]))

            count = np.unique(bounding_boxes_ids).shape[0]
            temp = count
            # detections = np.array(detections)
            for i in range(detections.shape[0]):
                cv2.rectangle(frame, (int(detections[i][0]), int(detections[i][1])), (int(detections[i][2]), int(detections[i][3])), (0, 255, 0), 2)
                cv2.putText(frame, "Car", (int(detections[i][0])-10, int(detections[i][1])-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        else:
            count = temp
        cv2.putText(frame, f'Count: {int(count)}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
        cap_out.write(frame)
        ret, frame = cap.read()
    print(count)
    cap.release()
    cap_out.release()
    cv2.destroyAllWindows()

SORT tracker with YOLOV5 detection

In [226]:
def tracker_sort_yolo(input_path,output_path):
    video_path = input_path
    video_out_path = output_path 
    # os.path.join('.','out1.mp4')

    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()

    cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'MP4V'), cap.get(cv2.CAP_PROP_FPS),
                            (frame.shape[1], frame.shape[0]))

    model = YOLO("yolov5l.pt")
    # metric = nn_matching.NearestNeighborDistanceMetric("cosine",0.3,None)
    # tracker = Tracker(metric)
    tracker = Sort(iou_threshold=0.2,max_age=600)

    # colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for j in range(10)]

    detection_threshold = 0.75
    RED = (255,0,0)
    WHITE = (255, 255, 255)
    track_ids = []
    count = 0
    temp = 0
    bounding_boxes_ids = np.array([])
    while ret:
        results = model.predict(frame)[0]
        detections = []
        for result in results.boxes.data.tolist():
            confidence = result[4]
            if float(confidence) < detection_threshold:
                continue
            xmin, ymin, xmax, ymax = int(result[0]), int(result[1]), int(result[2]), int(result[3])
            class_id = int(result[5])
            if class_id not in [2]:
                continue
            # add the bounding box (x, y, w, h), confidence and class id to the results list
            detections.append([xmin, ymin, xmax , ymax])

        if(len(detections) != 0):
            detections = np.array(detections)
            detections = detections.reshape([detections.shape[0], 4])
            tracks = tracker.update(detections)

            for i in range(tracks.shape[0]):
                bounding_boxes_ids = np.append(bounding_boxes_ids, np.array([tracks[i][4]]))

            count = np.unique(bounding_boxes_ids).shape[0]
            temp = count
            # detections = np.array(detections)
            for i in range(detections.shape[0]):
                cv2.rectangle(frame, (int(detections[i][0]), int(detections[i][1])), (int(detections[i][2]), int(detections[i][3])), (0, 255, 0), 2)
                cv2.putText(frame, "Car", (int(detections[i][0])-10, int(detections[i][1])-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        else:
            count = temp
        cv2.putText(frame, f'Count: {int(count)}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
        cap_out.write(frame)
        ret, frame = cap.read()
    print(count)
    cap.release()
    cap_out.release()
    cv2.destroyAllWindows()

Deepsort Tracker with YOLOV5 detector

In [227]:
def tracker_deepsort_yolo(input_path,output_path):
    video_path = input_path
    video_out_path = output_path 
    # os.path.join('.','out1.mp4')

    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()

    cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'MP4V'), 25,
                            (frame.shape[1], frame.shape[0]))

    model = YOLO("yolov5l.pt")
    # metric = nn_matching.NearestNeighborDistanceMetric("cosine",0.3,None)
    # tracker = Tracker(metric)
    tracker = DeepSort(max_age=25,nms_max_overlap = 0.3)

    # colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for j in range(10)]

    detection_threshold = 0.81
    RED = (255,0,0)
    WHITE = (255, 255, 255)
    track_ids = []
    count = 0
    while ret:
        results = model.predict(frame)[0]
        detections = []
        for result in results.boxes.data.tolist():
            confidence = result[4]
            if float(confidence) < detection_threshold:
                continue
            xmin, ymin, xmax, ymax = int(result[0]), int(result[1]), int(result[2]), int(result[3])
            class_id = int(result[5])
            if class_id not in [2]:
                continue
            # add the bounding box (x, y, w, h), confidence and class id to the results list
            detections.append([[xmin, ymin, xmax - xmin, ymax - ymin], confidence, class_id])
            
        tracks = tracker.update_tracks(detections, frame=frame)

        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            if(track_id not in track_ids):
                count += 1
                track_ids.append(track_id)
            ltrb = track.to_ltrb()
            # print(ltrb)
            x1,y1,x2,y2 = int(ltrb[0]),int(ltrb[1]),int(ltrb[2]),int(ltrb[3])
            cv2.rectangle(frame, (x1, y1), (x2,y2), RED, 2)
            cv2.putText(frame, str(track_id), (xmin + 5, ymin - 8),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, WHITE, 2)
        cv2.putText(frame, f'Count: {int(count)}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
        cap_out.write(frame)
        ret, frame = cap.read()
    print(count)
    cap.release()
    cap_out.release()
    cv2.destroyAllWindows()

Deepsort tracker with FasterRCNN detector

In [228]:
def tracker_deepsort_rcnn(input_path,output_path):
    video_path = input_path
    video_out_path = output_path 
    # os.path.join('.','out1.mp4')

    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()

    cap_out = cv2.VideoWriter(video_out_path, cv2.VideoWriter_fourcc(*'MP4V'), cap.get(cv2.CAP_PROP_FPS),
                            (frame.shape[1], frame.shape[0]))

    # model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT)
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).to(device)
    model.eval()
    # metric = nn_matching.NearestNeighborDistanceMetric("cosine",0.3,None)
    # tracker = Tracker(metric)
    tracker = DeepSort(max_iou_distance=0.3,max_age=20)

    # colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for j in range(10)]

    detection_threshold = 0.993
    RED = (255,0,0)
    WHITE = (255, 255, 255)
    track_ids = []
    count = 0
    while ret:
        transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
        frame_transformed = transform(frame).to(device)
        with torch.no_grad():
#       pred = model([img])
            results = model([frame_transformed])[0]
        # print(results)
        detections = []
        for i in range(len(results["scores"])):
            confidence = results["scores"][i]
            if float(confidence) < detection_threshold:
                continue
            xmin, ymin, xmax, ymax = int(results["boxes"][i][0]), int(results["boxes"][i][1]), int(results["boxes"][i][2]), int(results["boxes"][i][3])
            class_id = int(results["labels"][i])
            if class_id not in [3]:
                continue
            # add the bounding box (x, y, w, h), confidence and class id to the results list
            detections.append([[xmin, ymin, xmax - xmin, ymax - ymin], confidence, class_id])

        tracks = tracker.update_tracks(detections, frame=frame)

        for track in tracks:
            if not track.is_confirmed():
                continue
            track_id = track.track_id
            if(track_id not in track_ids):
                count += 1
                track_ids.append(track_id)
            ltrb = track.to_ltrb()
            # print(ltrb)
            x1,y1,x2,y2 = int(ltrb[0]),int(ltrb[1]),int(ltrb[2]),int(ltrb[3])
            cv2.rectangle(frame, (x1, y1), (x2,y2), RED, 2)
            cv2.putText(frame, str(track_id), (xmin + 5, ymin - 8),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, WHITE, 2)
        # count = tracker.tracker._next_id-1
        cv2.putText(frame, f'Count: {int(count)}', (20, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 2)
        cap_out.write(frame)
        ret, frame = cap.read()
    print(count)
    cap.release()
    cap_out.release()
    cv2.destroyAllWindows()

In [229]:
input_path = os.path.join('.','Original.mp4')
output_path1 = os.path.join('.','out1.mp4')
output_path2 = os.path.join('.','out2.mp4')
output_path3 = os.path.join('.','out3.mp4')
output_path4 = os.path.join('.','out4.mp4')

In [233]:
tracker_deepsort_yolo(input_path,output_path1)


In [234]:
tracker_deepsort_rcnn(input_path,output_path2)

In [235]:
tracker_sort_yolo(input_path,output_path3)

PRO TIP 💡 Replace 'model=yolov5l.pt' with new 'model=yolov5lu.pt'.
YOLOv5 'u' models are trained with https://github.com/ultralytics/ultralytics and feature improved performance vs standard YOLOv5 models trained with https://github.com/ultralytics/yolov5.



OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 384x640 8 persons, 1 bicycle, 8 cars, 6 motorcycles, 1 bus, 2 trucks, 2 backpacks, 43.0ms
Speed: 1.7ms preprocess, 43.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 7 cars, 6 motorcycles, 1 bus, 2 trucks, 1 backpack, 42.4ms
Speed: 2.6ms preprocess, 42.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 8 cars, 6 motorcycles, 1 bus, 2 trucks, 42.4ms
Speed: 2.5ms preprocess, 42.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 7 cars, 6 motorcycles, 3 trucks, 42.3ms
Speed: 3.3ms preprocess, 42.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 7 cars, 5 motorcycles, 3 trucks, 38.4ms
Speed: 2.2ms preprocess, 38.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 9 cars, 5 motorcycles, 3 trucks, 38.1ms
Speed: 2.9ms preprocess, 38.1ms inference, 1.6ms postprocess per image at shape 

In [236]:
tracker_sort_rcnn(input_path,output_path4)