# Pedestrian counting with OpenVINO™

This demo shows how to run pedestrian counting inference with OpenVINO, using [PP-YOLOE](https://gitee.com/paddlepaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe) and [OC-SORT](https://gitee.com/paddlepaddle/PaddleDetection/blob/release/2.5/configs/mot/ocsort) from [PaddleDetection Github](https://github.com/PaddlePaddle/PaddleDetection) or [PaddleDetection Gitee](https://gitee.com/paddlepaddle/PaddleDetection). 
The pre-trained PP-YOLOE model used in this demo is "mot_ppyoloe_s_36e_pipeline" (27M).

## OC-SORT tracker requirements

In [None]:
!pip install filterpy

## Imports

In [None]:
import copy
import os
import urllib
import zipfile
from collections import defaultdict, deque
from pathlib import Path

import cv2
import numpy as np
import openvino.runtime as ov
import yaml

## Models for Pedestrian counting

Pedestrian counting requires the "mot_ppyoloe_s_36e_pipeline" model (27M). The pre-trained model used in this demo is downloaded and stored in the "model" folder.

In [None]:
# Define the function to download mot_ppyoloe_s_36e_pipeline model from PaddleDetection resources.


def run_model_download(model_url, model_file_path):
    """
    Download a zipped pre-trained model and unpack it into the "model" folder.

    Nothing is downloaded when `model_file_path` already exists.

    Parameters:
        model_url: url link to the zipped pre-trained model
        model_file_path: path of the model file expected after extraction
    """
    # Local import: a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule has been loaded.
    import urllib.request

    model_file_path = Path(model_file_path)
    if model_file_path.is_file():
        print("Model already exists")
        return

    print("Downloading the pre-trained model... May take a while...")

    # The archive is stored next to the extracted files, under its own name
    # (the last component of the URL).
    model_dir = Path("model")
    model_dir.mkdir(parents=True, exist_ok=True)
    archive_path = model_dir / model_url.split("/")[-1]
    urllib.request.urlretrieve(model_url, str(archive_path))
    print("Model Downloaded")
    print(archive_path)

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(model_dir)

    # `extractall` returns nothing, so success is judged by whether the
    # expected model file is now present.
    if model_file_path.is_file():
        print(f"Model Extracted to {model_file_path}.")
    else:
        print("Error Extracting the model. Please check the network.")

### Download the Model

In [None]:
# Remote archive of the pre-trained model, and the local path at which the
# extracted `model.pdmodel` is expected.
model_url = "https://bj.bcebos.com/v1/paddledet/models/pipeline/mot_ppyoloe_s_36e_pipeline.zip"
model_file_path = Path("model") / "mot_ppyoloe_s_36e_pipeline" / "model.pdmodel"

run_model_download(model_url, model_file_path)

## Preprocessing Functions for Pedestrian counting

### Load the preprocessing config

Every pre-trained PaddlePaddle model downloaded from PaddleDetection contains an `infer_cfg.yml` file, in which all image preprocessing configurations are saved.

Using the code provided by PaddleDetection, we can load the `infer_cfg.yml` file and complete the preprocessing easily.

In [None]:
from ocsort_utils.ocsort_tracker import OCSORTTracker
from preprocess import Compose

# Set of architecture name fragments accepted by PredictConfig.check_model:
# a config is supported when its `arch` string contains one of these entries.
SUPPORT_MODELS = {
    "YOLO",
    "RCNN",
    "SSD",
    "Face",
    "FCOS",
    "SOLOv2",
    "TTFNet",
    "S2ANet",
    "JDE",
    "FairMOT",
    "DeepSORT",
    "GFL",
    "PicoDet",
    "CenterNet",
    "TOOD",
    "RetinaNet",
    "StrongBaseline",
    "STGCN",
    "YOLOX",
    "HRNet",
}


class PredictConfig:
    """Settings for preprocessing, postprocessing and visualization.

    The values are read from the `infer_cfg.yml` file of an exported model.

    Args:
        infer_config (str): path of infer_cfg.yml
    """

    def __init__(self, infer_config):
        with open(infer_config) as cfg_file:
            cfg = yaml.safe_load(cfg_file)
        # Reject unknown architectures before reading anything else.
        self.check_model(cfg)

        # Entries that must be present in the config file.
        self.arch = cfg["arch"]
        self.preprocess_infos = cfg["Preprocess"]
        self.min_subgraph_size = cfg["min_subgraph_size"]
        self.label_list = cfg["label_list"]
        self.use_dynamic_shape = cfg["use_dynamic_shape"]

        # Optional entries with their fallbacks.
        self.draw_threshold = cfg.get("draw_threshold", 0.5)
        self.mask = cfg.get("mask", False)
        self.tracker = cfg.get("tracker")
        self.nms = cfg.get("NMS")
        self.fpn_stride = cfg.get("fpn_stride")

        if self.arch == "RCNN" and cfg.get("export_onnx", False):
            print(
                "The RCNN export model is used for ONNX and it only supports batch_size = 1"
            )
        self.print_config()

    def check_model(self, yml_conf):
        """Return True when the configured architecture is supported.

        Raises:
            ValueError: loaded model not in supported model type
        """
        arch = yml_conf["arch"]
        # A substring match is enough: the arch only has to contain a known name.
        if any(known in arch for known in SUPPORT_MODELS):
            return True
        raise ValueError(f"Unsupported arch: {arch}, expect {SUPPORT_MODELS}")

    def print_config(self):
        """Print the architecture and the ordered list of preprocessing ops."""
        print("-----------  Model Configuration -----------")
        print(f"Model Arch: {self.arch}")
        print("Transform Order: ")
        for step in self.preprocess_infos:
            print(f"--transform op: {step['type']}")
        print("--------------------------------------------")

## Initialize the OpenVINO predictor for PP-YOLOE and OC-SORT
1. Initialize the runtime for inference. Then read the network architecture and model weights from the `.pdmodel` and `.pdiparams` files and load them to the CPU.
2. Initialize the OC-SORT tracker with default configs.

In [None]:
class label:
    """Holder for the class names used when drawing tracking results."""

    def __init__(self):
        # Single-class setup: the only label is "pedestrian".
        self.labels = ["pedestrian"]


class OpenvineDetector:
    """Detector run through OpenVINO on CPU, chained with an OCSORTTracker.

    Args:
        paddlefile: path of the `model.pdmodel` file
        infer_cfg: path of the matching `infer_cfg.yml` file
    """

    def __init__(self, paddlefile, infer_cfg):
        self.paddle_file = paddlefile
        self.infer_cfg = infer_cfg
        self.infer_config = PredictConfig(infer_cfg)
        self.pred_config = label()

        # The tracker is not a deep learning model, so it does not go through
        # OpenVINO inference; it is configured with fixed default settings.
        tracker_settings = dict(
            det_thresh=0.4,
            max_age=30,
            min_hits=3,
            iou_threshold=0.3,
            delta_t=3,
            inertia=0.2,
            min_box_area=0,
            vertical_ratio=0,
            use_byte=False,
        )
        self.tracker = OCSORTTracker(**tracker_settings)

        self.core = ov.Core()
        print("OpenVINO Runtime Core Created!")
        # The Paddle model is read directly from the model.pdmodel file.
        # OpenVINO will automatically search for the model.pdiparams file to
        # load the parameters, so make sure the download contains both files.
        self.model = self.core.read_model(paddlefile)
        self.compiled_model = self.core.compile_model(self.model, "CPU")
        self.predictor = self.compiled_model.create_infer_request()
        print(f"[OpenVINO]{paddlefile} infer request created")

    def det_predict(self, img_list):
        """Run the detector and return the boxes of the first processed image.

        For the "HRNet" architecture the first output is only printed and the
        loop moves on; nothing is returned in that case.
        """
        preprocess = Compose(self.infer_config.preprocess_infos)
        request = self.predictor
        for img_path in img_list:
            sample = preprocess(img_path)
            # Add a leading batch dimension to both model inputs.
            image_batch = np.array([sample["image"]], dtype="float32")
            scale_batch = np.array([sample["scale_factor"]])
            # The tensors share memory with the numpy arrays created above.
            image_tensor = ov.Tensor(array=image_batch, shared_memory=True)
            scale_factor_tensor = ov.Tensor(array=scale_batch, shared_memory=True)
            # The model has two inputs: index 1 takes the image and index 0
            # takes the scale factor.
            request.set_input_tensor(1, image_tensor)
            request.set_input_tensor(0, scale_factor_tensor)
            request.start_async()
            request.wait()
            # Two output tensors are read; only the first one is used below.
            outputs = [
                request.get_output_tensor(0).data,
                request.get_output_tensor(1).data,
            ]
            if self.infer_config.arch == "HRNet":
                print(np.array(outputs[0]))
            else:
                return np.array(outputs[0])

    def track(self, det_result):
        """Update the tracker with detections and collect the valid targets.

        Returns a dict with the keys "online_tlwhs", "online_scores" and
        "online_ids"; boxes are given as [x, y, width, height].
        """
        targets = self.tracker.update(det_result, None)
        kept_boxes, kept_scores, kept_ids = [], [], []
        for target in targets:
            # Targets carry corner coordinates; convert them to x, y, w, h.
            width = target[2] - target[0]
            height = target[3] - target[1]
            box = [target[0], target[1], width, height]
            score = float(target[4])
            track_id = int(target[5])
            area = width * height
            if area <= self.tracker.min_box_area:
                continue
            ratio_limit = self.tracker.vertical_ratio
            if ratio_limit > 0 and width / height > ratio_limit:
                continue
            if area > 0:
                kept_boxes.append(box)
                kept_ids.append(track_id)
                kept_scores.append(score)
        return {
            "online_tlwhs": kept_boxes,
            "online_scores": kept_scores,
            "online_ids": kept_ids,
        }

    def predict_image(self, im, visual=False):
        """Detect and track on one input; `visual` is accepted but not used.

        Returns [[[tlwhs], [scores], [ids]]], i.e. the tracking lists wrapped
        in two extra list levels.
        """
        detections = self.det_predict([im])
        tracked = self.track(detections)
        return [
            [
                [tracked["online_tlwhs"]],
                [tracked["online_scores"]],
                [tracked["online_ids"]],
            ]
        ]

## Postprocessing
1. Format the model's output (parse_mot_res)
2. Count pedestrians going in or out (flow_statistic)
3. Visualize the boxes (plot_tracking_dict)

### Pedestrian entrance counting

In [None]:
# Format the tracking input
def parse_mot_res(input):
    """Convert tracker output into one row per track.

    Parameters:
        input: nested list whose first element is (boxes, scores, ids); the
            first element of each of those holds the per-track values, with
            boxes given as (xmin, ymin, w, h)

    Returns:
        dict with a single key "boxes": an array of shape (N, 7) whose rows
        are [id, 0, score, xmin, ymin, xmax, ymax]
    """
    boxes, scores, ids = input[0]
    rows = [
        [track_id, 0, score, xmin, ymin, xmin + w, ymin + h]
        for (xmin, ymin, w, h), score, track_id in zip(boxes[0], scores[0], ids[0])
    ]
    # Without the reshape an empty frame would give a 1-D array of shape (0,),
    # and column indexing such as result["boxes"][:, 0] would raise.
    return {"boxes": np.array(rows).reshape(-1, 7)}


# pedestrian in or out count function
# returns in or out id list
def flow_statistic(
    result,
    secs_interval,
    do_entrance_counting,
    do_break_in_counting,
    region_type,
    video_fps,
    entrance,
    id_set,
    interval_id_set,
    in_id_list,
    out_id_list,
    prev_center,
    records,
    data_type="mot",
    num_classes=1,
):
    """Update the counting state with the tracks of one frame.

    Parameters:
        result: tuple (frame_id, tlwhs, scores, track_ids) for the frame
        secs_interval: length in seconds of the periodic counting interval
        do_entrance_counting: count crossings of a line; `region_type` must
            be 'horizontal' or 'vertical'
        do_break_in_counting: count entries into a region; `region_type` must
            be 'custom' and `entrance` holds the region points followed by
            the image (w, h)
        region_type: 'horizontal', 'vertical' or 'custom'
        video_fps: frames per second, used to find interval boundaries
        entrance: line coordinates (entrance counting) or region points
        id_set: set of every track id seen so far (updated in place)
        interval_id_set: set of track ids seen in the current interval
            (updated in place and emptied at an interval boundary)
        in_id_list, out_id_list: lists receiving the ids that went in / out
        prev_center: dict mapping a track id to its last reference point
        records: list receiving one text line per call
        data_type: 'kitti' shifts the frame id used by break-in counting
        num_classes: accepted but not used here

    Returns:
        dict exposing the same (mutated) state objects under the keys
        "id_set", "interval_id_set", "in_id_list", "out_id_list",
        "prev_center" and "records".
    """
    if do_entrance_counting:
        assert region_type in [
            "horizontal",
            "vertical",
        ], "region_type should be 'horizontal' or 'vertical' when do entrance counting."
        line_x, line_y = entrance[0], entrance[1]
        horizontal_line = region_type == "horizontal"
        _, boxes, confs, ids = result
        for box, _conf, tid in zip(boxes, confs, ids):
            if tid < 0:
                continue
            left, top, box_w, box_h = box
            cx = left + box_w / 2.0
            cy = top + box_h / 2.0
            if tid not in prev_center:
                # First sighting: only remember where the track is.
                prev_center[tid] = [cx, cy]
                continue
            last = prev_center[tid]
            # A horizontal line is crossed along y, a vertical one along x.
            if horizontal_line:
                before, now, line = last[1], cy, line_y
            else:
                before, now, line = last[0], cx, line_x
            if before <= line and now > line:
                in_id_list.append(tid)
            if before >= line and now < line:
                out_id_list.append(tid)
            last[0] = cx
            last[1] = cy

    if do_break_in_counting:
        assert region_type in [
            "custom"
        ], "region_type should be 'custom' when do break_in counting."
        assert (
            len(entrance) >= 4
        ), "entrance should be at least 3 points and (w,h) of image when do break_in counting."
        im_w, im_h = entrance[-1][:]
        entrance = np.array(entrance[:-1])

        current_frame, boxes, confs, ids = result
        for box, _conf, tid in zip(boxes, confs, ids):
            if tid < 0:
                continue
            if data_type == "kitti":
                current_frame -= 1
            left, top, box_w, box_h = box
            # Reference point: bottom centre of the box, clamped to the image.
            foot_x = min(left + box_w / 2.0, im_w - 1)
            foot_y = min(top + box_h, im_h - 1)

            if current_frame == 1:
                # Objects already inside the region on the first frame are
                # recorded with the placeholder id -1.
                if in_quadrangle([foot_x, foot_y], entrance, im_h, im_w):
                    in_id_list.append(-1)
                else:
                    prev_center[tid] = [foot_x, foot_y]
                continue

            # A break-in is a known track that was outside and is now inside.
            entered = (
                tid in prev_center
                and not in_quadrangle(prev_center[tid], entrance, im_h, im_w)
                and in_quadrangle([foot_x, foot_y], entrance, im_h, im_w)
            )
            if entered:
                in_id_list.append(tid)
            prev_center[tid] = [foot_x, foot_y]

    # Total count and count within the current interval.
    frame_id, boxes, confs, ids = result
    for _box, _conf, tid in zip(boxes, confs, ids):
        if not tid < 0:
            id_set.add(tid)
            interval_id_set.add(tid)

    interval_done = (
        frame_id % video_fps == 0 and frame_id / video_fps % secs_interval == 0
    )

    parts = [f"Frame id: {frame_id}, Total count: {len(id_set)}"]
    if do_entrance_counting:
        parts.append(
            f", In count: {len(in_id_list)}, Out count: {len(out_id_list)}"
        )
    if do_break_in_counting:
        parts.append(f", Break_in count: {len(in_id_list)}")
    if interval_done:
        # Report the interval that just ended, then start a new one.
        parts.append(
            f", Count during {secs_interval} secs: {len(interval_id_set)}"
        )
        interval_id_set.clear()
    info = "".join(parts)
    print(info)
    records.append(info + "\n")

    return {
        "id_set": id_set,
        "interval_id_set": interval_id_set,
        "in_id_list": in_id_list,
        "out_id_list": out_id_list,
        "prev_center": prev_center,
        "records": records,
    }

### Visualize

In [None]:
def get_color(idx):
    """Map an index to a deterministic 3-channel color tuple."""
    scaled = idx * 3
    # One multiplier per channel, each wrapped into the range 0..254.
    return tuple((factor * scaled) % 255 for factor in (37, 17, 29))


# visualize function visualize output to the video
def plot_tracking_dict(
    image,
    num_classes,
    tlwhs_dict,
    obj_ids_dict,
    scores_dict,
    frame_id=0,
    fps=0.0,
    ids2names=None,
    do_entrance_counting=False,
    do_break_in_counting=False,
    entrance=None,
    records=None,
    center_traj=None,
):
    """Draw tracking boxes, ids, scores and counting text on a copy of `image`.

    Parameters:
        image: input frame; it is copied, not modified
        num_classes: number of classes to draw; the counting overlays are
            only drawn when this is 1
        tlwhs_dict: per-class boxes as (x, y, w, h), indexed by class id
        obj_ids_dict: per-class track ids, indexed by class id
        scores_dict: per-class scores (an entry may be None), indexed by
            class id
        frame_id: frame number shown in the header text
        fps: fps value shown in the header text
        ids2names: optional class names indexed by class id; when missing or
            empty, "class<id>" is used instead
        do_entrance_counting: draw the entrance line and the in/out text
        do_break_in_counting: draw the region mask and the break-in text;
            `entrance` then ends with the image (w, h) pair
        entrance: line coordinates or region points
        records: list of text lines; the last one supplies the counting text
        center_traj: optional list of per-class dicts mapping a track id to
            a deque of recent box centers (updated in place)

    Returns:
        the annotated image
    """
    im = np.ascontiguousarray(np.copy(image))
    im_h, im_w = im.shape[:2]
    if do_break_in_counting:
        entrance = np.array(entrance[:-1])  # last pair is [im_w, im_h]

    text_scale = max(0.5, image.shape[1] / 3000.0)
    text_thickness = 2
    line_thickness = max(1, int(image.shape[1] / 500.0))

    # A missing or empty `records` simply disables the counting text instead
    # of failing on `records[-1]`.
    latest_record = records[-1] if records else None
    has_names = ids2names is not None and len(ids2names) > 0

    if num_classes == 1 and latest_record is not None:
        start = latest_record.find("Total")
        end = latest_record.find("In")
        cv2.putText(
            im,
            latest_record[start:end],
            (0, int(40 * text_scale) + 10),
            cv2.FONT_ITALIC,
            text_scale,
            (0, 0, 255),
            thickness=text_thickness,
        )

    if num_classes == 1 and do_entrance_counting:
        entrance_line = tuple(map(int, entrance))
        # The two corner points span the entrance line.
        cv2.rectangle(
            im,
            entrance_line[0:2],
            entrance_line[2:4],
            color=(0, 255, 255),
            thickness=line_thickness,
        )
        if latest_record is not None:
            # find start location for entrance counting data
            start = latest_record.find("In")
            cv2.putText(
                im,
                latest_record[start:-1],
                (0, int(60 * text_scale) + 10),
                cv2.FONT_ITALIC,
                text_scale,
                (0, 0, 255),
                thickness=text_thickness,
            )

    if num_classes == 1 and do_break_in_counting:
        np_masks = np.zeros((im_h, im_w, 1), np.uint8)
        cv2.fillPoly(np_masks, [entrance], 255)

        # Blend a semi-transparent color over the region.
        alpha = 0.3
        im = np.array(im).astype("float32")
        mask = np_masks[:, :, 0]
        color_mask = np.array([0, 0, 255])
        idx = np.nonzero(mask)
        im[idx[0], idx[1], :] *= 1.0 - alpha
        im[idx[0], idx[1], :] += alpha * color_mask
        im = np.array(im).astype("uint8")

        if latest_record is not None:
            # find start location for break in counting data
            start = latest_record.find("Break_in")
            cv2.putText(
                im,
                latest_record[start:-1],
                (entrance[0][0] - 10, entrance[0][1] - 10),
                cv2.FONT_ITALIC,
                text_scale,
                (0, 0, 255),
                thickness=text_thickness,
            )

    for cls_id in range(num_classes):
        tlwhs = tlwhs_dict[cls_id]
        obj_ids = obj_ids_dict[cls_id]
        scores = scores_dict[cls_id]
        cv2.putText(
            im,
            "frame: %d fps: %.2f num: %d" % (frame_id, fps, len(tlwhs)),
            (0, int(15 * text_scale) + 5),
            cv2.FONT_ITALIC,
            text_scale,
            (0, 0, 255),
            thickness=text_thickness,
        )

        # Ids drawn in this frame; only their trajectories are rendered below.
        record_id = set()
        for i, tlwh in enumerate(tlwhs):
            x1, y1, w, h = tlwh
            intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h)))
            center = tuple(map(int, (x1 + w / 2.0, y1 + h / 2.0)))
            obj_id = int(obj_ids[i])
            if center_traj is not None:
                record_id.add(obj_id)
                if obj_id not in center_traj[cls_id]:
                    center_traj[cls_id][obj_id] = deque(maxlen=30)
                center_traj[cls_id][obj_id].append(center)

            if has_names:
                id_text = "{}_{}".format(ids2names[cls_id], obj_id)
            else:
                id_text = "class{}_{}".format(cls_id, obj_id)

            in_region = False
            if do_break_in_counting:
                center_x = min(x1 + w / 2.0, im_w - 1)
                center_down_y = min(y1 + h, im_h - 1)
                # NOTE(review): `in_quadrangle` is not defined in the visible
                # part of this file; it must be available before break-in
                # counting is enabled.
                if in_quadrangle([center_x, center_down_y], entrance, im_h, im_w):
                    in_region = True

            color = get_color(abs(obj_id)) if not in_region else (0, 0, 255)
            cv2.rectangle(
                im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness
            )
            cv2.putText(
                im,
                id_text,
                (intbox[0], intbox[1] - 25),
                cv2.FONT_ITALIC,
                text_scale,
                color,
                thickness=text_thickness,
            )

            if do_break_in_counting and in_region:
                cv2.putText(
                    im,
                    "Break in now.",
                    (intbox[0], intbox[1] - 50),
                    cv2.FONT_ITALIC,
                    text_scale,
                    (0, 0, 255),
                    thickness=text_thickness,
                )

            if scores is not None:
                text = "score: {:.2f}".format(float(scores[i]))
                cv2.putText(
                    im,
                    text,
                    (intbox[0], intbox[1] - 6),
                    cv2.FONT_ITALIC,
                    text_scale,
                    color,
                    thickness=text_thickness,
                )
        if center_traj is not None:
            for traj in center_traj:
                for i in traj.keys():
                    if i not in record_id:
                        continue
                    for point in traj[i]:
                        cv2.circle(im, point, 3, (0, 0, 255), -1)
    return im

## Main Processing Function for Pedestrian counting
1. Create a videocapture to load frames
2. Prepare a set of frames for mot_predictor
3. Run AI inference for both PP-YOLOE and OC-Sort.
4. Save the results as `output.mp4`.

In [None]:
def predict_video(video_file, region_type, model_dir):
    """Run detection, tracking and entrance counting on a video file.

    The annotated frames are written to `output.mp4` in the working directory.

    Parameters:
        video_file: path of the input video
        region_type: 'horizontal' or 'vertical'; the entrance line is the
            corresponding center line of the frame
        model_dir: folder containing `model.pdmodel` and `infer_cfg.yml`

    Raises:
        ValueError: `region_type` is neither 'horizontal' nor 'vertical'
        RuntimeError: the video cannot be opened
    """
    # Fail before loading the model instead of at the first processed frame.
    if region_type not in ("horizontal", "vertical"):
        raise ValueError(
            "region_type should be 'horizontal' or 'vertical', got {!r}".format(
                region_type
            )
        )

    # Initialize the OpenvineDetector
    mot_predictor = OpenvineDetector(
        os.path.join(model_dir, "model.pdmodel"),
        os.path.join(model_dir, "infer_cfg.yml"),
    )
    # Load the video
    capture = cv2.VideoCapture(video_file)
    if not capture.isOpened():
        capture.release()
        raise RuntimeError("Cannot open video: {}".format(video_file))

    out_path = "output.mp4"
    writer = None
    try:
        # Get Video info : resolution, fps, frame count
        width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(capture.get(cv2.CAP_PROP_FPS))
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        print("video fps: %d, frame_count: %d" % (fps, frame_count))
        # Create video writer
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))

        # Counting state shared across frames.
        frame_id = 0
        center_traj = [{}]
        id_set = set()
        interval_id_set = set()
        in_id_list = []
        out_id_list = []
        prev_center = {}
        records = []
        if region_type == "horizontal":
            entrance = [0, height / 2.0, width, height / 2.0]
        else:
            entrance = [width / 2, 0.0, width / 2, height]

        while True:
            if frame_id % 10 == 0:
                print("frame id: ", frame_id)

            ret, frame = capture.read()
            if not ret:
                break
            # Frames are read as BGR; the predictor is given an RGB copy.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            res = mot_predictor.predict_image(
                [copy.deepcopy(frame_rgb)], visual=False
            )
            # mot output format: id, class, score, xmin, ymin, xmax, ymax
            mot_res = parse_mot_res(res)
            # flow_statistic only support single class MOT
            boxes, scores, ids = res[0]  # batch size = 1 in MOT
            mot_result = (frame_id + 1, boxes[0], scores[0], ids[0])  # single class
            statistic = flow_statistic(
                mot_result,
                10,
                True,
                False,
                region_type,
                fps,
                entrance,
                id_set,
                interval_id_set,
                in_id_list,
                out_id_list,
                prev_center,
                records,
            )
            records = statistic["records"]

            tracked = mot_res["boxes"]
            if tracked.ndim == 2 and len(tracked) > 0:
                ids = tracked[:, 0]
                scores = tracked[:, 2]
                boxes = tracked[:, 3:]
                # Convert corner coordinates back to width / height.
                boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
                boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
            else:
                # Frame without any track: draw nothing but still write it.
                boxes = np.zeros([0, 4])
                ids = np.zeros([0])
                scores = np.zeros([0])

            # single class, still need to be defaultdict type for ploting
            num_classes = 1
            online_tlwhs = defaultdict(list)
            online_scores = defaultdict(list)
            online_ids = defaultdict(list)
            online_tlwhs[0] = boxes
            online_scores[0] = scores
            online_ids[0] = ids

            image = plot_tracking_dict(
                frame,
                num_classes,
                online_tlwhs,
                online_ids,
                online_scores,
                frame_id=frame_id,
                fps=fps,
                ids2names=mot_predictor.pred_config.labels,
                do_entrance_counting=True,
                entrance=entrance,
                records=records,
                center_traj=center_traj,
            )
            writer.write(image)
            frame_id += 1
    finally:
        # Release the video handles even when a frame fails to process.
        capture.release()
        if writer is not None:
            writer.release()
    print("save result to {}".format(out_path))

In [None]:
# Run the whole pipeline on the sample video; the annotated result is written
# to `output.mp4`.
# Note that 'region_type' should be one of ['horizontal', 'vertical']
predict_video(
    video_file="data/test.mp4",
    region_type="vertical",
    model_dir="model/mot_ppyoloe_s_36e_pipeline",
)