# Prerequisite

In [None]:
# Import libs
import numpy as np
import cv2

In [None]:
# Attach Google Drive to the Colab filesystem; the input video, the YOLO
# files and the output video are all read from / written to this mount.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
# All inputs and the output live in a single Drive folder, so every path is
# derived from that one location.
# NOTE(review): the folder name ends with a space before the slash -- confirm
# that this matches the real folder name on Drive.
drive_dir = '/content/gdrive/MyDrive/Video Object Detection '

# Input video and annotated output video
video_path = f'{drive_dir}/sample 1.mp4'
output_path = f'{drive_dir}/sample1_output2.mp4'

# YOLOv3 network definition, pretrained weights and class-name list
yolo_weight_path = f'{drive_dir}/yolov3.weights'
yolo_cfg_path = f'{drive_dir}/yolov3.cfg'
coco_path = f'{drive_dir}/coco.names'

# Load YOLO Pretrained Model

In [None]:
# Load YOLO.v3 config file and pretrained weights
network = cv2.dnn.readNetFromDarknet(yolo_cfg_path, yolo_weight_path)

# Names of the network's output layers, passed to network.forward() later.
# getUnconnectedOutLayersNames() is used instead of indexing getLayerNames()
# with getUnconnectedOutLayers(): the latter returns an Nx1 array on older
# OpenCV builds and a flat array on newer ones, so manual indexing only works
# on some versions.
layer_names = list(network.getUnconnectedOutLayersNames())

# Convert coco class labels to a list (one label per line; the line index is
# the class id used to look the label up later)
with open(coco_path) as f:
    labels = [line.strip() for line in f]

# Load Video

In [None]:
# Read video from file
video = cv2.VideoCapture(video_path)
if not video.isOpened():
    # Without this check a wrong path only shows up later as an empty output
    # video, because video.read() just returns ret == False.
    raise IOError(f"Could not open input video: {video_path}")

# Get frame properties
frame_w = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_h = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating a fractional rate (e.g. 29.97)
# to an int would make the output play at a slightly different speed.
fps = video.get(cv2.CAP_PROP_FPS)
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

# Prepare video writer to save the output video (same size and rate as input)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(output_path, fourcc, fps, (frame_w, frame_h), True)
if not writer.isOpened():
    # Release the reader before failing so the capture handle is not leaked.
    video.release()
    raise IOError(f"Could not open output video for writing: {output_path}")

# Process Video

In [None]:
# Minimum class score for a detection to be kept
conf_th = 0.3

# Overlap (IoU) threshold for non-maximum suppression: a box is dropped when
# it overlaps a higher-confidence box by more than this amount.
# The previous value of 0. suppressed every box that overlapped a stronger one
# at all, which removes distinct objects that merely touch or partially
# occlude each other. 0.3 is a common default.
# NOTE(review): tune this value for the footage if needed.
nms_th = 0.3

# Initialize colours for representing every detected object
# (one random colour triple per class label)
colours = np.random.randint(0, 255, size=(len(labels), 3), dtype='uint8')

# Initialize frame counter to keep track of frames
frame_cnt = 0

# Process video frame by frame with YOLO.
#
# The loop runs inside try/finally so the reader and writer are always
# released, even if a frame raises part-way through; otherwise the partially
# written output file would never be finalized.

# Factors that scale YOLO's (x_center, y_center, width, height) values, which
# the code treats as fractions of the frame size, up to pixel units.
box_scale = np.array([frame_w, frame_h, frame_w, frame_h])

try:
    while True:
        ret, frame = video.read()
        if not ret:
            break

        frame_cnt += 1

        # Preprocess each frame before passing it to YOLO
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)

        # Perform a forward-pass of the YOLO for each frame
        network.setInput(blob)
        output_from_network = network.forward(layer_names)

        # Prepare lists for detected bounding boxes, confidences and class numbers.
        bounding_boxes = []
        confidences = []
        class_numbers = []

        # Analyze YOLO output for each frame. Each output is handled as a 2-D
        # array with one row per candidate detection: columns 0-3 are the box
        # and columns 5+ are the per-class scores. The filtering is vectorized
        # per output layer instead of looping over every row in Python.
        for result in output_from_network:
            scores = result[:, 5:]
            best_classes = np.argmax(scores, axis=1)
            best_scores = scores[np.arange(scores.shape[0]), best_classes]

            # Eliminate detected objects with low confidence levels
            keep = best_scores > conf_th
            if not keep.any():
                continue

            # Convert centre-based boxes to top-left corner + size, in pixels
            boxes = result[keep, 0:4] * box_scale
            x_mins = (boxes[:, 0] - boxes[:, 2] / 2).astype(int)
            y_mins = (boxes[:, 1] - boxes[:, 3] / 2).astype(int)
            widths = boxes[:, 2].astype(int)
            heights = boxes[:, 3].astype(int)

            # Add box, class num and confidence of detected objects to the lists
            # (as plain Python ints/floats, which NMSBoxes accepts)
            bounding_boxes.extend(np.stack([x_mins, y_mins, widths, heights], axis=1).tolist())
            confidences.extend(best_scores[keep].astype(float).tolist())
            class_numbers.extend(best_classes[keep].tolist())

        # Perform non-maximum suppression to exclude boxes with low confidence
        # or those overlapping with boxes of higher confidence
        results = cv2.dnn.NMSBoxes(bounding_boxes, confidences, conf_th, nms_th)

        # At least one detection must remain after suppression
        if len(results) > 0:
            print(f"Detections found in frame {frame_cnt}/{total_frames}")
            # np.asarray(...).flatten() copes with both the flat and the Nx1
            # index layouts returned by different OpenCV versions.
            for i in np.asarray(results).flatten():
                # Getting current bounding box coordinates, its width and height
                x_min, y_min, box_width, box_height = bounding_boxes[i]

                # Preparing colour for current bounding box
                colour_box_current = colours[class_numbers[i]].tolist()

                # Drawing bounding box on the original image
                cv2.rectangle(frame, (x_min, y_min), (x_min + box_width, y_min + box_height), colour_box_current, 2)

                # Preparing text with label and confidence for current bounding box
                text_box_current = '{}: {:.4f}'.format(labels[int(class_numbers[i])], confidences[i])

                # Putting text with label and confidence on the original image.
                # The baseline is clamped so the label stays visible when the
                # box touches the top edge of the frame.
                text_y = max(y_min - 5, 15)
                cv2.putText(frame, text_box_current, (x_min, text_y), cv2.FONT_HERSHEY_COMPLEX, 0.7, colour_box_current, 2)
        else:
            print(f"no detection for frame {frame_cnt}/{total_frames}")

        # Write processed current frame to the file
        writer.write(frame)
finally:
    # Releasing video reader and writer
    video.release()
    writer.release()

Detections found in frame 1/199
Detections found in frame 2/199
Detections found in frame 3/199
no detection for frame4/199
Detections found in frame 5/199
Detections found in frame 6/199
no detection for frame7/199
Detections found in frame 8/199
Detections found in frame 9/199
Detections found in frame 10/199
Detections found in frame 11/199
Detections found in frame 12/199
Detections found in frame 13/199
Detections found in frame 14/199
no detection for frame15/199
Detections found in frame 16/199
Detections found in frame 17/199
Detections found in frame 18/199
Detections found in frame 19/199
Detections found in frame 20/199
Detections found in frame 21/199
Detections found in frame 22/199
Detections found in frame 23/199
Detections found in frame 24/199
Detections found in frame 25/199
Detections found in frame 26/199
no detection for frame27/199
Detections found in frame 28/199
Detections found in frame 29/199
no detection for frame30/199
no detection for frame31/199
no detecti