# Ultra-lightweight face detection model with OpenVINO

- This tutorial demonstrates ultra-lightweight face detection with OpenVINO on both images and videos.
- Model information can be found in [Ultra-Light-Fast-Generic-Face-Detector-1MB](https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB).

<img src='https://github.com/Linzaer/Ultra-Light-Fast-Generic-Face-Detector-1MB/raw/master/readme_imgs/27.jpg' width="50%">


### What is Ultra-Light-Fast-Generic-Face-Detector-1MB?
This model is a lightweight face detection model designed for edge computing devices.

- In terms of model size, the default FP32 precision (.pth) file is **1.04~1.1MB**, and the int8-quantized model used by inference frameworks is about **300KB**.
- In terms of computation, the model needs about **90~109 MFlops** at an input resolution of 320x240.
- There are two versions of the model: version-slim (simplified network backbone, slightly faster) and version-RFB (with the modified RFB module, higher precision).
- Models pre-trained on the WIDER FACE dataset are provided for input resolutions of 320x240 and 640x480, to suit different application scenarios.


## Preparation

### Imports

In [None]:
import cv2
import numpy as np
import time
from pathlib import Path
from openvino.inference_engine import IECore
import matplotlib.cm
import matplotlib.pyplot as plt
from IPython.display import (
    Video,
    display,
    FileLink,
    HTML
)

### Settings

In [None]:
# Device name handed to the Inference Engine when the network is loaded.
DEVICE = "CPU"

# Paths to the IR files (topology .xml + weights .bin).
# Forward slashes work on Windows as well as on Linux/macOS, whereas
# backslash-separated paths only resolve on Windows.
#MODEL_XML = "models/version-RFB-640.xml"
#MODEL_BIN = "models/version-RFB-640.bin"

MODEL_XML = "models/version-RFB-320.xml"
MODEL_BIN = "models/version-RFB-320.bin"

## Functions

In [None]:
def to_rgb(image_data) -> np.ndarray:
    """
    Return image_data converted from BGR channel order to RGB.
    """
    rgb_image = cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
    return rgb_image

def area_of(left_top, right_bottom):
    """
    Area of axis-aligned rectangles described by two opposite corners.

    Args:
        left_top (N, 2): left top corner of each rectangle.
        right_bottom (N, 2): right bottom corner of each rectangle.

    Returns:
        area (N): area of each rectangle; 0 when the corners do not span
            a positive extent along either axis.
    """
    # Negative side lengths (corners in the wrong order) are clamped to 0.
    extent = np.clip(right_bottom - left_top, 0.0, None)
    first_side = extent[..., 0]
    second_side = extent[..., 1]
    return first_side * second_side

def iou_of(boxes0, boxes1, eps=1e-5):
    """
    Intersection-over-union (Jaccard index) between corner-form boxes.

    Args:
        boxes0 (N, 4): ground truth boxes.
        boxes1 (N or 1, 4): predicted boxes.
        eps: small constant added to the denominator so it is never 0.
    Returns:
        iou (N): IoU values.
    """
    # The intersection rectangle spans from the larger of the two left-top
    # corners to the smaller of the two right-bottom corners.
    inter_min = np.maximum(boxes0[..., :2], boxes1[..., :2])
    inter_max = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
    intersection = area_of(inter_min, inter_max)

    union = (
        area_of(boxes0[..., :2], boxes0[..., 2:])
        + area_of(boxes1[..., :2], boxes1[..., 2:])
        - intersection
    )
    return intersection / (union + eps)

def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200):
    """
    Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other remaining box whose IoU with it exceeds iou_threshold.

    Args:
        box_scores (N, 5): boxes in corner-form and probabilities.
        iou_threshold: intersection over union threshold.
        top_k: keep top_k results. If k <= 0, keep all the results.
        candidate_size: only consider the candidates with the highest scores.
    Returns:
         (M, 5): the rows of box_scores that were kept, in the order they
             were picked (highest score first).
    """
    scores = box_scores[:, -1]
    corners = box_scores[:, :-1]
    # argsort is ascending: keep the candidate_size best entries from the
    # tail, then flip so the best candidate sits at position 0.
    order = np.argsort(scores)[-candidate_size:][::-1]
    kept = []
    while len(order) > 0:
        best = order[0]
        kept.append(best)
        reached_top_k = top_k > 0 and len(kept) == top_k
        if reached_top_k or len(order) == 1:
            break
        remaining = order[1:]
        overlaps = iou_of(
            corners[remaining, :],
            np.expand_dims(corners[best, :], axis=0),
        )
        # Only boxes that do not overlap the kept one too much stay in play.
        order = remaining[overlaps <= iou_threshold]

    return box_scores[kept, :]

def predict(width, height, confidences, boxes, prob_threshold, iou_threshold=0.3, top_k=-1):
    """
    Turn raw network outputs into thresholded, NMS-filtered detections.

    Args:
        width: factor applied to the x coordinates (columns 0 and 2).
        height: factor applied to the y coordinates (columns 1 and 3).
        confidences (1, N, C): per-class scores; only batch item 0 is used
            and class column 0 is skipped.
        boxes (1, N, 4): corner-form boxes; only batch item 0 is used.
            Presumably normalized to [0, 1], since they are multiplied by
            width/height -- confirm against the model.
        prob_threshold: a candidate is kept only if its score is strictly
            greater than this value.
        iou_threshold: forwarded to hard_nms.
        top_k: forwarded to hard_nms.
    Returns:
        boxes (M, 4): int32 corner coordinates (fractions are truncated).
        labels (M): class index of each box.
        probs (M): score of each box.
        When nothing passes the threshold, M is 0 and the arrays keep the
        same trailing shape, so the boxes can still be indexed as (M, 4).
    """
    boxes = boxes[0]
    confidences = confidences[0]
    picked_box_probs = []
    picked_labels = []
    for class_index in range(1, confidences.shape[1]):
        probs = confidences[:, class_index]
        mask = probs > prob_threshold
        if not mask.any():
            continue
        box_probs = np.concatenate([boxes[mask, :], probs[mask].reshape(-1, 1)], axis=1)
        box_probs = hard_nms(box_probs,
                             iou_threshold=iou_threshold,
                             top_k=top_k,
                             )
        picked_box_probs.append(box_probs)
        picked_labels.extend([class_index] * box_probs.shape[0])
    if not picked_box_probs:
        # Mirror the shapes and dtypes of the non-empty return below.
        score_dtype = np.result_type(np.asarray(boxes).dtype, confidences.dtype)
        return (
            np.empty((0, 4), dtype=np.int32),
            np.empty((0,), dtype=np.int_),
            np.empty((0,), dtype=score_dtype),
        )
    picked_box_probs = np.concatenate(picked_box_probs)
    # Scale x1, y1, x2, y2 in one step; column 4 (the score) is left untouched.
    picked_box_probs[:, :4] *= np.array(
        [width, height, width, height], dtype=picked_box_probs.dtype
    )
    return picked_box_probs[:, :4].astype(np.int32), np.array(picked_labels), picked_box_probs[:, 4]

## Load the Model
Load the model in OpenVINO Runtime with `ie.read_network` and load it for the specified device with `ie.load_network`. Get input and output keys and the expected input shape for the model.

In [None]:
# Read the IR (topology + weights) and load it onto the selected device.
ie = IECore()
net = ie.read_network(model=MODEL_XML, weights=MODEL_BIN)
exec_net = ie.load_network(network=net, device_name=DEVICE, num_requests=1)

# Class index -> display name; predict() skips index 0.
class_names = ['BACKGROUND', 'face']
RESULT_PATH = "output/"
IMG_PATH = "../data/image/"

# Make sure the folder that receives the results exists.
output_directory = Path("output")
output_directory.mkdir(exist_ok=True)

# Minimum score for a detection in the single-image example.
threshold = 0.7

# Names of the first input and the first output, and their shapes.
input_blob = next(iter(net.input_info))
output_blob = next(iter(net.outputs))
input_shape = net.input_info[input_blob].input_data.shape
output_shape = net.outputs[output_blob].shape

print("net input shape: ", input_shape)
print("net output shape: ", output_shape)
print(class_names)

## Face Detect on Image

### Load, resize and reshape input image

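The input image is read with OpenCV, converted from BGR to RGB, resized to the network input size, normalized, and reshaped into a batch of one image in channels-first layout.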

In [None]:
# Load image and preprocess it
image_path = IMG_PATH + "241_face_img.jpg"
orig_image = cv2.imread(image_path)
if orig_image is None:
    # cv2.imread reports a missing/unreadable file by returning None.
    raise FileNotFoundError(f"Could not read image: {image_path}")

# The tensor built below is laid out (1, C, H, W), so the height and width
# the network expects are taken from input_shape[2] and input_shape[3]
# rather than being hard-coded to one model variant.
net_height, net_width = int(input_shape[2]), int(input_shape[3])

image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
image = cv2.resize(image, (net_width, net_height))  # dsize is (width, height)
image_mean = np.array([127, 127, 127])
image = (image - image_mean) / 128
image = np.transpose(image, [2, 0, 1])  # HWC -> CHW
image = np.expand_dims(image, axis=0)   # add the batch dimension
image = image.astype(np.float32)

### Do inference on the image
Run inference, filter the predicted boxes by score and non-maximum suppression, scale them to the original image size, draw them on the original image, and save the result.

In [None]:
# Inference
output = exec_net.infer(inputs={input_blob: image})
boxes = output['boxes']
confidences = output['scores']

# Post-process: score threshold + NMS, boxes scaled to the original image size
boxes, labels, probs = predict(orig_image.shape[1], orig_image.shape[0], confidences, boxes, threshold)
for box in boxes:
    # Plain Python ints for the drawing call.
    x1, y1, x2, y2 = (int(v) for v in box)
    cv2.rectangle(orig_image, (x1, y1), (x2, y2), (255, 255, 0), 4)
    # To annotate each box with its class and score, draw
    # f"{class_names[label]}: {prob:.2f}" with cv2.putText here, taking
    # label/prob from the matching entries of `labels` and `probs`.

# Save once, after all boxes are drawn, so the file also exists when no face
# was detected (the display cell reads it back from disk).
cv2.imwrite(RESULT_PATH + "output_img.jpg", orig_image)

### Display result image

In [None]:
# Re-read both images from disk: the in-memory orig_image has boxes drawn on it.
orig_image = cv2.imread(IMG_PATH + "241_face_img.jpg")
# RESULT_PATH already ends with "/", so no extra separator is needed.
result_image = cv2.imread(RESULT_PATH + "output_img.jpg")
if orig_image is None or result_image is None:
    # cv2.imread returns None instead of raising when a file cannot be read.
    raise FileNotFoundError("Could not read the input image or the saved result image")

# Left: original image, right: image with the detected faces marked.
fig, ax = plt.subplots(1, 2, figsize=(20, 15))
ax[0].imshow(to_rgb(orig_image))
ax[1].imshow(to_rgb(result_image))

## Face Detect on Video

### Video Settings

In [None]:
# Input video that is opened with cv2.VideoCapture.
VIDEO_FILE = "../data/video/241_face_video.mp4"

# Folder prefix (including the trailing slash) for the written output_video.mp4.
OUTPUT_PATH = "output/"

In [None]:
cap = cv2.VideoCapture(VIDEO_FILE)  # read frames from the video file
if not cap.isOpened():
    raise OSError(f"Could not open video: {VIDEO_FILE}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

print(width,height)

FACE_SUM = 0     # running total of detected faces over all frames
ZIP_RATE = 0.7   # scale factor applied to each frame before it is written
THRESHOLD = 0.7  # minimum score for a detection
SUMTIME = 0      # accumulated inference time in seconds

# Write at the source frame rate so playback speed is preserved; fall back to
# 30 fps when the container does not report a usable value.
OUTPUT_FPS = cap.get(cv2.CAP_PROP_FPS)
if not OUTPUT_FPS > 0:
    OUTPUT_FPS = 30.0

fourcc = cv2.VideoWriter_fourcc('X','2','6','4')
out = cv2.VideoWriter(
    OUTPUT_PATH + 'output_video.mp4',
    fourcc,
    OUTPUT_FPS,
    (int(ZIP_RATE * width), int(ZIP_RATE * height)),
    True,
)

In [None]:
# Frame size the VideoWriter was opened with; every written frame is resized
# to exactly this size so it cannot differ from it by a rounding pixel.
out_size = (int(ZIP_RATE * width), int(ZIP_RATE * height))
# The tensor built below is laid out (1, C, H, W), so the height and width the
# network expects are taken from input_shape[2] and input_shape[3].
net_height, net_width = int(input_shape[2]), int(input_shape[3])
image_mean = np.array([127, 127, 127])

try:
    while True:
        ret, orig_image = cap.read()
        if not ret or orig_image is None:
            print("no img , process ended")
            break

        # Same preprocessing as for the single image.
        image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (net_width, net_height))
        image = (image - image_mean) / 128
        image = np.transpose(image, [2, 0, 1])
        image = np.expand_dims(image, axis=0)
        image = image.astype(np.float32)

        # Only the inference call is timed.
        time_time = time.time()
        output = exec_net.infer(inputs={input_blob: image})
        boxes = output['boxes']
        confidences = output['scores']
        SUMTIME += time.time() - time_time

        boxes, labels, probs = predict(orig_image.shape[1], orig_image.shape[0], confidences, boxes, THRESHOLD)
        for box in boxes:
            x1, y1, x2, y2 = (int(v) for v in box)
            cv2.rectangle(orig_image, (x1, y1), (x2, y2), (255, 255, 0), 4)
            # To annotate each box with its class and score, draw
            # f"{class_names[label]}: {prob:.2f}" with cv2.putText here.

        FACE_SUM += boxes.shape[0]
        orig_image = cv2.resize(orig_image, out_size)
        out.write(orig_image)
finally:
    # Release the capture and finalize the output file even if a frame fails.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

print(f"cost time:{SUMTIME}")
print(f"face_sum:{FACE_SUM}")

In [None]:
# Location of the video written by the processing loop.
video_path = OUTPUT_PATH+'output_video.mp4'

video = Video(video_path, width=800, embed=True)

# Download link rendered above the embedded player.
video_link = FileLink(video_path)
video_link.html_link_str = "<a href='%s' download>%s</a>"
link_html = video_link._repr_html_()

display(HTML(link_html))
display(video)