# Pose estimation with OpenVINO

This notebook demonstrates live pose estimation in OpenVINO.

We use the [human-pose-estimation-0001](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/human-pose-estimation-0001) pose estimation model from [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo/).

### Imports

In [None]:
import time

import cv2
import numpy as np
from IPython import display
from numpy.lib.stride_tricks import as_strided
from openvino import inference_engine as ie

from decoder import OpenPoseDecoder

### Download model

In [None]:
# directory the model files are downloaded into
base_model_dir = "model"

# name and precision of the model requested from omz_downloader
model_name = "human-pose-estimation-0001"
precision = "FP16"

# build the omz_downloader command line, then run it as a shell command from the notebook
download_command = f"omz_downloader --name {model_name} --precision {precision} --output_dir {base_model_dir}"
! $download_command

### Load model

In [None]:
# Build the model paths from the settings used in the download step, so that
# changing the model name or precision there keeps this cell in sync.
# With the values above this yields "model/intel/human-pose-estimation-0001/FP16/...".
model_dir = f"{base_model_dir}/intel/{model_name}/{precision}"
model_path = f"{model_dir}/{model_name}.xml"
model_weights_path = f"{model_dir}/{model_name}.bin"

# initialize inference engine
ie_core = ie.IECore()
# read the network and corresponding weights from file
net = ie_core.read_network(model=model_path, weights=model_weights_path)
# load the model on the CPU (you can use GPU or MYRIAD as well)
exec_net = ie_core.load_network(net, "CPU")

# get input and output names of nodes
# (only the first input name is used; all output names are kept in order)
input_key = next(iter(exec_net.input_info))
output_keys = list(exec_net.outputs)

In [None]:
# show the input node name and the list of output node names collected above
input_key, output_keys

### OpenPoseDecoder

OpenPose decoder from [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo/blob/master/demos/common/python/models/open_pose.py)

In [None]:
# create the decoder with its default arguments; process_results calls it to turn network outputs into poses
decoder = OpenPoseDecoder()

### Result processing

In [None]:
# 2d pooling in numpy (from: https://stackoverflow.com/a/54966908/1624463)
def pool2d(A, kernel_size, stride, padding, pool_mode="max"):
    """
    2D Pooling

    Parameters:
        A: input 2D array
        kernel_size: int, the size of the (square) window
        stride: int, the stride of the window
        padding: int, implicit zero paddings on both sides of the input
        pool_mode: string, 'max' or 'avg'

    Returns:
        2D array with one pooled value per window position; its shape is
        ((H - kernel_size) // stride + 1, (W - kernel_size) // stride + 1),
        where (H, W) is the shape of A after padding.

    Raises:
        ValueError: if pool_mode is neither 'max' nor 'avg'.
    """
    # Fail loudly on an unsupported mode instead of silently returning None
    if pool_mode not in ("max", "avg"):
        raise ValueError(f"pool_mode must be 'max' or 'avg', got {pool_mode!r}")

    # Padding
    A = np.pad(A, padding, mode="constant")

    # Window view of A: axes 0-1 index the window position, axes 2-3 index
    # the elements inside each window (no data is copied)
    output_shape = (
        (A.shape[0] - kernel_size) // stride + 1,
        (A.shape[1] - kernel_size) // stride + 1,
    )
    window_shape = (kernel_size, kernel_size)
    A_w = as_strided(
        A,
        shape=output_shape + window_shape,
        strides=(stride * A.strides[0], stride * A.strides[1]) + A.strides,
    )

    # Reduce over the two window axes; the result already has output_shape
    if pool_mode == "max":
        return A_w.max(axis=(2, 3))
    return A_w.mean(axis=(2, 3))


def heatmap_nms(heatmaps, pooled_heatmaps):
    """
    Keep heatmap values only where they equal the pooled heatmap.

    Parameters:
        heatmaps: array of heatmap values
        pooled_heatmaps: array compared element-wise against heatmaps

    Returns:
        heatmaps multiplied by the element-wise equality mask, so positions
        where the two inputs differ are multiplied by False (i.e. by zero)
    """
    is_peak = heatmaps == pooled_heatmaps
    return heatmaps * is_peak


# get poses from results
def process_results(img, results):
    """
    Decode poses from the network outputs and scale them to the size of img.

    Parameters:
        img: image the poses are scaled to; only img.shape[0] and img.shape[1] are read
        results: mapping from output names to output arrays; the first name in
            output_keys is used as the PAFs and the second as the heatmaps

    Returns:
        (poses, scores) as returned by the decoder, with the first two values
        along the last axis of poses multiplied by the scaling factors
    """
    paf_key, heatmap_key = output_keys[0], output_keys[1]
    pafs = results[paf_key]
    heatmaps = results[heatmap_key]

    # max-pool every heatmap of the first batch item, then restore the leading batch axis
    pooled_channels = [
        pool2d(channel, kernel_size=3, stride=1, padding=1, pool_mode="max")
        for channel in heatmaps[0]
    ]
    pooled_heatmaps = np.array([pooled_channels])
    nms_heatmaps = heatmap_nms(heatmaps, pooled_heatmaps)

    # decode poses
    poses, scores = decoder(heatmaps, nms_heatmaps, pafs)

    # ratio between the image size and the network output size
    # (assumes the output shape ends with (height, width) - TODO confirm)
    net_output_shape = exec_net.outputs[paf_key].shape
    scale_x = img.shape[1] / net_output_shape[3]
    scale_y = img.shape[0] / net_output_shape[2]

    # multiply coordinates by scaling factor
    poses[:, :, :2] *= (scale_x, scale_y)

    return poses, scores

### Pose drawing
Code based on [Human Pose Estimation Demo](https://github.com/openvinotoolkit/open_model_zoo/tree/master/demos/human_pose_estimation_demo/python)

In [None]:
# one color per keypoint index; draw_poses uses colors[i] for keypoint i
colors = (
    (255, 0, 0), (255, 0, 255), (170, 0, 255), (255, 0, 85),
    (255, 0, 170), (85, 255, 0), (255, 170, 0), (0, 255, 0),
    (255, 255, 0), (0, 255, 85), (170, 255, 0), (0, 85, 255),
    (0, 255, 170), (0, 0, 255), (0, 255, 255), (85, 0, 255),
    (0, 170, 255),
)

# pairs of keypoint indices that are connected by a limb line
default_skeleton = (
    (15, 13), (13, 11), (16, 14), (14, 12), (11, 12),
    (5, 11), (6, 12), (5, 6), (5, 7), (6, 8),
    (7, 9), (8, 10), (1, 2), (0, 1), (0, 2),
    (1, 3), (2, 4), (3, 5), (4, 6),
)


def draw_poses(img, poses, point_score_threshold, skeleton=default_skeleton):
    """
    Draw joints and limbs of every pose onto img (in place) and return it.

    Parameters:
        img: image to draw on; it is modified in place
        poses: array of poses; for each pose, columns 0-1 are read as point
            coordinates and column 2 as the point score
        point_score_threshold: a joint is drawn only if its score is above this
            value; a limb only if both of its end points are above it
        skeleton: pairs of keypoint indices to connect with lines

    Returns:
        img, untouched when poses is empty, otherwise blended with the limb drawing
    """
    if poses.size == 0:
        return img

    # limbs go onto a separate copy so they can be blended with the image afterwards
    limb_layer = np.copy(img)
    for pose in poses:
        keypoints = pose[:, :2].astype(np.int32)
        confidences = pose[:, 2]
        visible = confidences > point_score_threshold

        # Draw joints.
        for idx, point in enumerate(keypoints):
            if visible[idx]:
                cv2.circle(img, tuple(point), 1, colors[idx], 2)

        # Draw limbs.
        for start, end in skeleton:
            if visible[start] and visible[end]:
                cv2.line(limb_layer, tuple(keypoints[start]), tuple(keypoints[end]), color=colors[end], thickness=4)

    # weighted blend of the joint image and the limb layer, written back into img
    cv2.addWeighted(img, 0.4, limb_layer, 0.6, 0, dst=img)
    return img

### Main processing

In [None]:
# main function to run pose estimation
def run_pose_estimation(source=0, flip=True):
    """
    Run pose estimation on a video source and display the annotated frames in the notebook.

    Frames are read until the source is exhausted, the kernel is interrupted
    (KeyboardInterrupt) or a RuntimeError is raised; the capture is always released.

    Parameters:
        source: value passed to cv2.VideoCapture (default 0)
        flip: if True, every frame is flipped with cv2.flip(frame, 1) before processing
    """
    # open video source
    cam = cv2.VideoCapture(source)
    if not cam.isOpened():
        print(f"Cannot open source {source}")
        return

    try:
        while True:
            # grab the frame
            ret, frame = cam.read()
            if not ret:
                print("Source is empty")
                break
            if flip:
                frame = cv2.flip(frame, 1)

            # resize image and change dims to fit neural network input (see https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/human-pose-estimation-0001)
            input_img = cv2.resize(frame, (456, 256), interpolation=cv2.INTER_AREA)
            input_img = input_img.transpose(2, 0, 1)[np.newaxis, ...]

            # get results
            # measure processing time with a monotonic, high-resolution clock;
            # time.time() can return the same value twice on coarse system clocks
            start_time = time.perf_counter()
            results = exec_net.infer(inputs={input_key: input_img})
            poses, scores = process_results(frame, results)
            elapsed = time.perf_counter() - start_time

            frame = draw_poses(frame, poses, 0.1)
            # calculate FPS (guard against a zero interval to avoid ZeroDivisionError)
            fps = 1 / elapsed if elapsed > 0 else float("inf")
            cv2.putText(frame, f"FPS: {fps:.2f}", (20, 40), cv2.FONT_HERSHEY_COMPLEX, 1.0, (0, 0, 255), 2, cv2.LINE_AA)

            # encode numpy array to jpg
            _, encoded_img = cv2.imencode(".jpg", frame, params=[cv2.IMWRITE_JPEG_QUALITY, 90])
            # create IPython image
            i = display.Image(data=encoded_img)

            # display the image in this notebook
            display.clear_output(wait=True)
            display.display(i)
    except KeyboardInterrupt:
        print("Interrupted")
    except RuntimeError as e:
        print(e)
    finally:
        # always release the capture, whichever way the loop ended
        cam.release()

### Run

Run with a webcam as the video source. Set `flip=True` if you're using a front-facing camera.

In [None]:
# use the default source (0) and flip every frame
run_pose_estimation(flip=True)

If you don't have a webcam, you can run pose estimation on a video file instead.

In [None]:
# a video file is not a front camera, so do not flip its frames
run_pose_estimation("data/Psy - Gangnam Style.mp4", flip=False)