# Multiple object detection and tracking with the `motrackers` library

In [None]:
# Uncomment the next line to install the tracker library into the kernel's environment
# %pip install motrackers

Reference: `motrackers` (multi-object-tracker) YOLOv3 example notebook on GitHub: https://github.com/adipandas/multi-object-tracker/blob/master/examples/example_notebooks/mot_YOLOv3.ipynb

#### Download MobileNet SSD .prototxt and .caffemodel files

In [11]:
import urllib.request
from pathlib import Path

# Pretrained MobileNet SSD weights (.caffemodel)
# NOTE(review): this is a Google Drive link; confirm the saved file is the
# binary model and not an HTML interstitial page before using it.
url = "https://drive.google.com/u/0/uc?id=0B3gersZ2cHIxRm5PMWRoTkdHdHc&export=download"
filename = "../Modelos/MobileNetSSD/MobileNetSSD_deploy.caffemodel"

# urlretrieve does not create missing parent directories, so make sure the
# destination folder exists before the first download.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

urllib.request.urlretrieve(url, filename)

# Network definition (.prototxt), pinned to a specific commit of the repository
url = "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/daef68a6c2f5fbb8c88404266aa28180646d17e0/MobileNetSSD_deploy.prototxt"
filename = "../Modelos/MobileNetSSD/MobileNetSSD_deploy.prototxt"

Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Last expression of the cell: its (path, headers) result is shown as the cell output
urllib.request.urlretrieve(url, filename)

('MobileNetSSD_deploy.prototxt', <http.client.HTTPMessage at 0x19c98189d80>)

### Write video from a URL stream with YOLOv3 + the `motrackers` library

In [24]:
import cv2
import urllib.request
import json
import numpy as np
from motrackers import CentroidTracker

# Paths to the YOLOv3 model files (they must already exist on disk; nothing is
# downloaded by this cell)
yolov3_weights_filename = "../Modelos/yolov3/yolov3.weights"
yolov3_config_filename = "../Modelos/yolov3/yolov3.cfg"

# Path to the text file holding one class name per line
class_names_filename = "../Modelos/yolov3/coco.names"

# Detection thresholds
confidence_threshold = 0.5  # minimum class score for a detection to be kept
nms_threshold = 0.4  # overlap threshold used by non-maximum suppression

# Load the YOLOv3 model
net = cv2.dnn.readNetFromDarknet(yolov3_config_filename, yolov3_weights_filename)

# YOLOv3 predicts at several scales, each with its own output layer. Calling
# net.forward() without layer names returns only one of them, so request every
# unconnected output layer explicitly.
output_layer_names = net.getUnconnectedOutLayersNames()

# Load the class names
with open(class_names_filename, "r") as f:
    class_names = f.read().splitlines()

# URL of the video stream
video_url = "http://187.111.99.18:9004/?CODE=1646"

# Output video settings
output_filename = "output.avi"
fourcc = cv2.VideoWriter_fourcc(*"XVID")
fps = 3
width = 854
height = 480

# seconds to capture
seconds = 10

# Create a VideoCapture object and fail loudly if the stream cannot be opened
# (otherwise the cell would silently produce an empty video file)
cap = cv2.VideoCapture(video_url)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Failed to open the video stream: {video_url}")

# Define the output video writer
output = cv2.VideoWriter(output_filename, fourcc, fps, (width, height))

# Initialize the tracker (CentroidTracker in this case)
tracker = CentroidTracker()

try:
    # Process exactly fps * seconds frames (fewer if the stream ends early)
    for i in range(fps * seconds):
        ret, frame = cap.read()
        if not ret:
            break

        # The writer only accepts frames of the size it was created with, and
        # the boxes below are scaled by width/height, so normalize the frame.
        if frame.shape[1] != width or frame.shape[0] != height:
            frame = cv2.resize(frame, (width, height))

        # Perform object detection with YOLOv3
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        layer_outputs = net.forward(output_layer_names)

        # Extract bounding boxes, confidences, and class IDs from the detections
        detection_bboxes = []
        detection_confidences = []
        detection_class_ids = []

        for layer_output in layer_outputs:
            for detection in layer_output:
                # Row layout: cx, cy, w, h, objectness, then one score per class
                scores = detection[5:]
                class_id = int(scores.argmax())
                confidence = float(scores[class_id])

                if confidence > confidence_threshold:
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    bbox_width = int(detection[2] * width)
                    bbox_height = int(detection[3] * height)

                    # Convert center coordinates to top-left coordinates
                    bbox_left = int(center_x - bbox_width / 2)
                    bbox_top = int(center_y - bbox_height / 2)

                    detection_bboxes.append([bbox_left, bbox_top, bbox_width, bbox_height])
                    detection_confidences.append(confidence)
                    detection_class_ids.append(class_id)

        # Collapse overlapping boxes for the same object so the tracker does
        # not open several tracks for it. The index array shape returned by
        # NMSBoxes differs between OpenCV versions, hence the flatten.
        keep = cv2.dnn.NMSBoxes(detection_bboxes, detection_confidences, confidence_threshold, nms_threshold)
        keep = np.asarray(keep, dtype=int).reshape(-1)
        detection_bboxes = [detection_bboxes[k] for k in keep]
        detection_confidences = [detection_confidences[k] for k in keep]
        detection_class_ids = [detection_class_ids[k] for k in keep]

        # Update the tracker with the detection results
        output_tracks = tracker.update(detection_bboxes, detection_confidences, detection_class_ids)

        # The track tuples unpacked below carry no class ID, and track_id is
        # the tracker's own counter (not an index into class_names). Recover
        # the class by matching each track's box to this frame's detections.
        # NOTE(review): presumably a reported track carries the box of the
        # detection it was matched to; verify against the motrackers docs.
        class_id_by_bbox = {
            tuple(bbox): class_id
            for bbox, class_id in zip(detection_bboxes, detection_class_ids)
        }

        # Draw bounding boxes and annotations on the frame
        for track in output_tracks:
            frame_id, track_id, bbox_left, bbox_top, bbox_width, bbox_height, confidence, _, _, _ = track

            bbox_key = (int(bbox_left), int(bbox_top), int(bbox_width), int(bbox_height))
            class_id = class_id_by_bbox.get(bbox_key)
            if class_id is not None and 0 <= class_id < len(class_names):
                label = f"{class_names[class_id]} {track_id}"
            else:
                # No matching detection: fall back to the track ID alone
                label = f"ID {track_id}"

            # Draw bounding box
            cv2.rectangle(frame, (int(bbox_left), int(bbox_top)), (int(bbox_left + bbox_width), int(bbox_top + bbox_height)), (0, 255, 0), 2)

            # Annotate with class name and track ID
            cv2.putText(frame, label, (int(bbox_left), int(bbox_top - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Write the frame with annotations to the output video file
        output.write(frame)
finally:
    # Release resources even if detection or tracking raised, so the stream
    # is closed and the output file is finalized
    cap.release()
    output.release()
    cv2.destroyAllWindows()


### Write video from url stream with: MobileNet SSD from .prototxt and .caffemodel files

In [None]:
import cv2
import urllib.request
import json

def save_video(url, model_filename, prototxt_filename, json_filename, seconds,
               output_filename="output.avi", fps=3, width=854, height=480,
               confidence_threshold=0.5):
    """Record an annotated clip of a video stream using a MobileNet SSD Caffe model.

    Frames are read from ``url``, resized to ``(width, height)``, run through
    the detector, annotated with a box and class name per detection, and
    written to ``output_filename``.

    Args:
        url: Source passed to ``cv2.VideoCapture`` (stream URL or file path).
        model_filename: Path to the ``.caffemodel`` weights.
        prototxt_filename: Path to the ``.prototxt`` network definition.
        json_filename: Path to a JSON file with the class names. Either a
            list indexed by class ID or an object keyed by the class ID as a
            string is accepted (the file's schema is not visible from here).
        seconds: Length of the clip; ``int(fps * seconds)`` frames are read.
        output_filename: Destination video file.
        fps: Frame rate declared for the output video.
        width: Output frame width in pixels.
        height: Output frame height in pixels.
        confidence_threshold: Minimum detection confidence to draw.

    Raises:
        RuntimeError: If the video source cannot be opened.
    """
    # Load the model
    net = cv2.dnn.readNetFromCaffe(prototxt_filename, model_filename)

    # Load the class names from the JSON file
    with open(json_filename, 'r') as f:
        class_names = json.load(f)

    # Open the source before creating the writer so a bad URL does not leave
    # an empty output file behind
    cap = cv2.VideoCapture(url)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError(f"Failed to open the video stream: {url}")

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output = cv2.VideoWriter(output_filename, fourcc, fps, (width, height))

    # Calculate the number of frames to capture based on the desired seconds
    frame_count = int(fps * seconds)

    try:
        for _ in range(frame_count):
            ret, frame = cap.read()
            if not ret:
                break

            # The writer only accepts frames of the size it was created with;
            # blobFromImage does the 300x300 network resize on its own copy.
            frame = cv2.resize(frame, (width, height))

            # MobileNet SSD (Caffe) preprocessing: subtract 127.5, scale by
            # 1/127.5, keep OpenCV's BGR channel order.
            blob = cv2.dnn.blobFromImage(frame, 1 / 127.5, (300, 300), (127.5, 127.5, 127.5), swapRB=False, crop=False)
            net.setInput(blob)
            detections = net.forward()

            # Each detection row is
            # [image_id, class_id, confidence, x_min, y_min, x_max, y_max]
            # with coordinates normalized to [0, 1].
            for j in range(detections.shape[2]):
                confidence = detections[0, 0, j, 2]
                if confidence > confidence_threshold:
                    class_id = int(detections[0, 0, j, 1])
                    # JSON objects only have string keys, so look up by str
                    # when the name table is a dict
                    name_key = str(class_id) if isinstance(class_names, dict) else class_id
                    class_name = class_names[name_key]

                    x_min = int(detections[0, 0, j, 3] * width)
                    y_min = int(detections[0, 0, j, 4] * height)
                    x_max = int(detections[0, 0, j, 5] * width)
                    y_max = int(detections[0, 0, j, 6] * height)

                    cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    cv2.putText(frame, class_name, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            # Write the frame with annotations to the output video file
            output.write(frame)
    finally:
        # Release resources even if detection raised, so the stream is closed
        # and the output file is finalized
        cap.release()
        output.release()
        cv2.destroyAllWindows()

# MobileNet SSD assets: network definition, trained weights and class-name table
prototxt_filename = "../Modelos/MobileNetSSD/MobileNetSSD_deploy.prototxt"
model_filename = "../Modelos/MobileNetSSD/MobileNetSSD_deploy.caffemodel"
json_filename = "../Modelos/MobileNetSSD/ssd_mobilenet_caffe_names.json"

# Stream to record and how many seconds of it to process
video_url = "http://187.111.99.18:9004/?CODE=1646"
seconds = 10

# Record the annotated clip
save_video(video_url, model_filename, prototxt_filename, json_filename, seconds)


### Write video of Yolo v3 from URL cv2.VideoCapture video stream · Using CUDA backend

In [27]:
import cv2
import numpy as np

# Load the YOLOv3 network from its Darknet weights and configuration files
net = cv2.dnn.readNet("../Modelos/yolov3/yolov3.weights", "../Modelos/yolov3/yolov3.cfg")

# Request GPU acceleration
# NOTE(review): this needs an OpenCV build with CUDA support; confirm the
# installed package provides it.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

# URL for video stream
url = "http://187.111.99.18:9004/?CODE=1646"

# Create a VideoCapture object
cap = cv2.VideoCapture(url)

# Stop this cell with a clear error if the stream cannot be opened. Raising
# (rather than calling exit()) keeps the notebook kernel alive.
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Failed to open the video stream.")

# Function to draw bounding boxes and labels on the image
def draw_predictions(image, class_ids, confidences, boxes):
    """Annotate ``image`` in place with one box and caption per detection.

    ``class_ids``, ``confidences`` and ``boxes`` are walked in parallel; each
    box is ``(x, y, w, h)`` with ``(x, y)`` its top-left corner. Captions and
    colours come from the module-level ``class_names`` and ``colors`` tables,
    both indexed by class ID.
    """
    for cls, score, (left, top, box_w, box_h) in zip(class_ids, confidences, boxes):
        name = class_names[cls]
        box_color = colors[cls]

        # Rectangle from the top-left corner to the opposite corner
        top_left = (left, top)
        bottom_right = (left + box_w, top + box_h)
        cv2.rectangle(image, top_left, bottom_right, box_color, 2)

        # Caption "<class>: <confidence>" placed just above the box
        caption = f"{name}: {score:.2f}"
        caption_origin = (left, top - 5)
        cv2.putText(image, caption, caption_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

# Load class names for the labels
with open("../Modelos/yolov3/coco.names", "r") as f:
    class_names = [line.strip() for line in f.readlines()]

# One colour per class; seeded so the colours are the same on every run
colors = np.random.default_rng(42).uniform(0, 255, size=(len(class_names), 3))

# Parameters
seconds = 10  # Number of seconds to capture
fps = 3  # Number of frames per second
frame_size = (512, 512)  # (width, height) of the frames written to the video
confidence_threshold = 0.9  # Minimum class score for a detection to be kept
nms_threshold = 0.4  # Overlap threshold used by non-maximum suppression

# Define the codec and create a VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'XVID')
writer = cv2.VideoWriter('output.avi', fourcc, fps, frame_size)

# The output layer names do not change between frames, so resolve them once.
# getUnconnectedOutLayersNames() avoids indexing getLayerNames() by hand,
# which depends on the array shape returned by the installed OpenCV version.
output_layers = net.getUnconnectedOutLayersNames()

# Variable to keep track of elapsed frames
frame_count = 0

try:
    while frame_count < seconds * fps:  # stop after `seconds` worth of frames at `fps`
        # Read the next frame from the video stream
        ret, frame = cap.read()

        # Break the loop if no frame is captured
        if not ret:
            break

        # Resize the frame to the size the writer was created with
        frame = cv2.resize(frame, frame_size)

        # Perform object detection on the frame
        blob = cv2.dnn.blobFromImage(frame, 1/255.0, (416, 416), swapRB=True, crop=False)
        net.setInput(blob)
        outputs = net.forward(output_layers)

        # Process the output to get the object detections and bounding boxes
        class_ids = []
        confidences = []
        boxes = []
        for output in outputs:
            for detection in output:
                # Row layout: cx, cy, w, h, objectness, then one score per class
                scores = detection[5:]
                class_id = int(np.argmax(scores))
                confidence = float(scores[class_id])
                if confidence > confidence_threshold:
                    center_x = int(detection[0] * frame.shape[1])
                    center_y = int(detection[1] * frame.shape[0])
                    box_width = int(detection[2] * frame.shape[1])
                    box_height = int(detection[3] * frame.shape[0])
                    left = int(center_x - box_width / 2)
                    top = int(center_y - box_height / 2)
                    class_ids.append(class_id)
                    confidences.append(confidence)
                    boxes.append([left, top, box_width, box_height])

        # Drop overlapping boxes for the same object. The index array shape
        # returned by NMSBoxes differs between OpenCV versions, hence the flatten.
        keep = cv2.dnn.NMSBoxes(boxes, confidences, confidence_threshold, nms_threshold)
        keep = np.asarray(keep, dtype=int).reshape(-1)
        class_ids = [class_ids[k] for k in keep]
        confidences = [confidences[k] for k in keep]
        boxes = [boxes[k] for k in keep]

        # Draw bounding boxes and labels on the frame
        draw_predictions(frame, class_ids, confidences, boxes)

        # Write the frame to the output video file
        writer.write(frame)

        # Increment the frame count
        frame_count += 1
finally:
    # Release the capture and finalize the output file even if a frame failed
    cap.release()
    writer.release()
    cv2.destroyAllWindows()


error: OpenCV(4.7.0) D:\a\opencv-python\opencv-python\opencv\modules\dnn\src\layers\convolution_layer.cpp:392: error: (-215:Assertion failed) !blobs.empty() || inputs.size() > 1 in function 'cv::dnn::ConvolutionLayerImpl::getMemoryShapes'
