# In [8]:
import cv2
from IPython.display import YouTubeVideo, display, HTML
import matplotlib.pyplot as plt

# Detect faces in every frame of a video with an OpenCV DNN Caffe model,
# draw a box and confidence label for each detection, stamp the inference
# time on the frame, and write the annotated frames to 'processed_face.mp4'.
s = 'faces.mp4'
source = cv2.VideoCapture(s)
out_mp4 = None

try:
    # Fail fast: continuing with an unopened capture only produces a
    # confusing error later, when the first frame turns out to be None.
    if not source.isOpened():
        raise OSError('Error opening video stream or file')

    # The first frame supplies the output dimensions. It is kept and
    # processed by the loop below instead of being thrown away.
    ret, frame = source.read()
    if not ret:
        raise OSError('Error opening video stream or file')

    frame_height, frame_width = frame.shape[:2]

    # 'mp4v' is the FourCC that matches the .mp4 container ('XVID' targets
    # AVI). The output frame rate is fixed at 10 fps.
    out_mp4 = cv2.VideoWriter(
        'processed_face.mp4',
        cv2.VideoWriter_fourcc(*'mp4v'),
        10,
        (frame_width, frame_height),
    )
    if not out_mp4.isOpened():
        raise OSError('Error opening output video file')

    net = cv2.dnn.readNetFromCaffe("deploy.prototxt", "res10_300x300_ssd_iter_140000_fp16.caffemodel")

    # Model parameters, associated with the training model information
    in_width = 300
    in_height = 300
    mean = [104, 117, 123]

    # Set your own sensitivity to the detections
    conf_threshold = 0.9

    has_frame = ret
    while has_frame:
        # Create a 4D blob from the frame.
        # 1.0 is the scale factor and `mean` is subtracted from the image.
        # swapRB stays False because Caffe and OpenCV use the same colour
        # convention.
        blob = cv2.dnn.blobFromImage(frame, 1.0, (in_width, in_height), mean, swapRB=False, crop=False)
        # Run the model: a forward pass performs inference on the blob.
        net.setInput(blob)
        detections = net.forward()

        # Each entry along axis 2 is one detection: index 2 holds the
        # confidence, indices 3..6 the box corners, which are scaled by the
        # frame size to get pixel coordinates.
        for i in range(detections.shape[2]):
            confidence = detections[0, 0, i, 2]
            if confidence <= conf_threshold:
                continue

            # bounding box coordinates
            x_left_bottom = int(detections[0, 0, i, 3] * frame_width)
            y_left_bottom = int(detections[0, 0, i, 4] * frame_height)
            x_right_top = int(detections[0, 0, i, 5] * frame_width)
            y_right_top = int(detections[0, 0, i, 6] * frame_height)

            # draw rectangle around bounding box
            cv2.rectangle(frame, (x_left_bottom, y_left_bottom), (x_right_top, y_right_top), (0, 255, 0))
            label = "Confidence: %.4f" % confidence
            label_size, base_line = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

            # draw a filled white background, then the confidence label on it
            cv2.rectangle(
                frame,
                (x_left_bottom, y_left_bottom - label_size[1]),
                (x_left_bottom + label_size[0], y_left_bottom + base_line),
                (255, 255, 255),
                cv2.FILLED,
            )
            cv2.putText(frame, label, (x_left_bottom, y_left_bottom), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

        # getPerfProfile reports the inference time in ticks; convert to ms
        t, _ = net.getPerfProfile()
        # annotate the frame with the time it took to perform the inference
        label = "Inference time: %.2f ms" % (t * 1000.0 / cv2.getTickFrequency())
        cv2.putText(frame, label, (0, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))
        # write the processed frame onto the output video
        out_mp4.write(frame)

        has_frame, frame = source.read()
finally:
    # Always release the capture and writer, even when an error interrupts
    # processing, so the file handles are freed and the output is finalised.
    source.release()
    if out_mp4 is not None:
        out_mp4.release()
