In [1]:
# Path bootstrap: the project packages imported below (utils, models, val) are
# looked up through the sys.path entry added here.
# NOTE(review): this assumes the notebook is started from a directory one level
# below the repository root -- confirm against the actual layout.
import sys
from pathlib import Path
FILE = Path().resolve()  # absolute path of the current working directory
import os 
os.chdir(str(FILE.parent))  # move the working directory to the parent directory

sys.path.append(str(FILE.parent))  # make that parent directory importable
import argparse
from pytube import YouTube
import os.path as osp
from utils.torch_utils import select_device, time_sync
from utils.general import check_img_size
from utils.datasets import LoadImages
from models.experimental import attempt_load
import torch
import cv2
import numpy as np
import yaml
# from tqdm import tqdm
# The notebook flavour of tqdm is used instead of the plain one above.
from tqdm.notebook import tqdm
import imageio
from val import run_nms, post_process_batch

In [2]:
# Inference configuration, expressed as an argparse parser so the option names
# and defaults stay in one place.
parser = argparse.ArgumentParser()

# Model / data selection
parser.add_argument('--data', type=str, default='data/coco-kp.yaml')
parser.add_argument('--imgsz', type=int, default=1280)
parser.add_argument('--weights', default='kapao_s_coco.pt')
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or cpu')
parser.add_argument('--half', action='store_true')

# Detection / keypoint thresholds
parser.add_argument('--conf-thres', type=float, default=0.5, help='confidence threshold')
parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
parser.add_argument('--no-kp-dets', action='store_true', help='do not use keypoint objects')
parser.add_argument('--conf-thres-kp', type=float, default=0.5)
parser.add_argument('--conf-thres-kp-person', type=float, default=0.2)
parser.add_argument('--iou-thres-kp', type=float, default=0.45)
parser.add_argument('--overwrite-tol', type=int, default=50)

# Test-time augmentation (a flip value of -1 is mapped to None when the data dict is built)
parser.add_argument('--scales', type=float, nargs='+', default=[1])
parser.add_argument('--flips', type=int, nargs='+', default=[-1])

# Output options
parser.add_argument('--display', action='store_true', help='display inference results')
parser.add_argument('--fps', action='store_true', help='display fps')
parser.add_argument('--gif', action='store_true', help='create gif')  # help text previously read 'create fig'

# Clip boundaries inside the video
parser.add_argument('--start', type=int, default=20, help='start time (s)')
parser.add_argument('--end', type=int, default=80, help='end time (s)')

# An explicit argument list is passed so that sys.argv is not parsed.
args = parser.parse_args(args=["--start", "0", "--end", "120"])

In [3]:
# Read the dataset description (YAML) selected by --data.
with open(args.data) as f:
    data = yaml.safe_load(f)

# Merge the inference settings from the parsed arguments into the data dict.
# A flip value of -1 on the command line is stored as None.
data.update({
    'imgsz': args.imgsz,
    'conf_thres': args.conf_thres,
    'iou_thres': args.iou_thres,
    'use_kp_dets': not args.no_kp_dets,
    'conf_thres_kp': args.conf_thres_kp,
    'iou_thres_kp': args.iou_thres_kp,
    'conf_thres_kp_person': args.conf_thres_kp_person,
    'overwrite_tol': args.overwrite_tol,
    'scales': args.scales,
    'flips': [None if flip == -1 else flip for flip in args.flips],
})

In [4]:
# Input video, relative to the working directory.
VIDEO_NAME = 'movie/mp4/twitter_pingpong.mp4'
# Bare file name, derived from the path so the two cannot drift apart.
video_name = osp.basename(VIDEO_NAME)
# Fail early, with a readable message, when the video is missing.
assert osp.isfile(VIDEO_NAME), 'video file not found: {}'.format(VIDEO_NAME)


# Colour tuples are passed to cv2 drawing calls.
# NOTE(review): values look like (B, G, R) order, consistent with the names -- confirm.
GRAY = (200, 200, 200)
CROWD_THRES = 200  # max bbox size for crowd classification
CROWD_ALPHA = 0.5  # weight of the untouched frame when blending in the drawn overlay
CROWD_KP_SIZE = 2
CROWD_KP_THICK = 2
CROWD_SEG_THICK = 2

BLUE = (245, 140, 66)
ORANGE = (66, 140, 245)
PLAYER_ALPHA_BOX = 0.85
PLAYER_ALPHA_POSE = 0.3
PLAYER_KP_SIZE = 4
PLAYER_KP_THICK = 4
PLAYER_SEG_THICK = 4
FPS_TEXT_SIZE = 3


In [5]:
# Resolve the torch device requested through --device and report it.
device = select_device(args.device, batch_size=1)
print(f'Using device: {device}')

Using device: cuda:0


In [20]:
# Load the weights onto the selected device.
model = attempt_load(args.weights, map_location=device)  # load FP32 model
# Half precision is used only when requested AND the device is not the CPU.
half = args.half and device.type != 'cpu'
if half:  # half precision only supported on CUDA
    model.half()
stride = int(model.stride.max())  # model stride

imgsz = check_img_size(args.imgsz, s=stride)  # check image size
dataset = LoadImages('./{}'.format(VIDEO_NAME), img_size=imgsz, stride=stride, auto=True)

# Seek the underlying capture to the requested start time and read the clip geometry.
cap = dataset.cap
cap.set(cv2.CAP_PROP_POS_MSEC, args.start * 1000)
fps = cap.get(cv2.CAP_PROP_FPS)
n = int(fps * (args.end - args.start))  # number of frames between --start and --end
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
gif_frames = []
# Base name (without extension) of the rendered output files.
video_name = 'pingpong_inference_{}'.format(osp.splitext(args.weights)[0])

if device.type != 'cpu':
    # Warm-up forward pass with a dummy input of the inference size.
    model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once

# The inference cell iterates `progress_dataset` unconditionally, so it must be
# bound for every flag combination. It used to be assigned only when both
# --display and --fps were off, which raised NameError otherwise.
progress_dataset = dataset

if not args.display:
    # Results are written to an .mp4 with the source frame rate and frame size.
    writer = cv2.VideoWriter(video_name + '.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))

In [21]:
# Get the bboxes (plus poses, scores and fused detections) for every frame.
t0 = time_sync()

# One entry per processed frame, in frame order.
output_bboxes = []
output_poses = []
output_fused = []
output_scores = []

# Inference only: with autograd disabled the forward pass is not recorded.
with torch.no_grad():
    # NOTE: the loop variable stays named `i` because a later cell reads it.
    for i, (path, img, im0, _) in enumerate(progress_dataset):
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img = img / 255.0  # 0 - 255 to 0.0 - 1.0
        if len(img.shape) == 3:
            img = img[None]  # expand for batch dim

        out = model(img, augment=True, kp_flip=data['kp_flip'], scales=data['scales'], flips=data['flips'])[0]
        person_dets, kp_dets = run_nms(data, out)
        bboxes, poses, scores, ids, fused = post_process_batch(data, img, [], [[im0.shape[:2]]], person_dets, kp_dets)

        # Store the per-frame results as numpy arrays.
        output_bboxes.append(np.array(bboxes))
        output_poses.append(np.array(poses))
        output_scores.append(np.array(scores))
        output_fused.append(np.array(fused))


    

In [22]:
def convert_tldr2ltwh(bboxes,scores)->list:
    """Flatten per-frame corner boxes into MOT-style left/top/width/height rows.

    Args:
        bboxes: one sequence per frame; each element is a box [x1, y1, x2, y2].
        scores: one sequence per frame; each element is the confidence of the
            box at the same position in ``bboxes``.

    Returns:
        list: one row per box,
            [<frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>, <conf>, <x>, <y>, <z>].
            Frames are numbered from 1; <id>, <x>, <y> and <z> are filled with -1.
    """
    return [
        [frame_no, -1, left, top, abs(right - left), abs(top - bottom), conf, -1, -1, -1]
        for frame_no, (frame_boxes, frame_scores) in enumerate(zip(bboxes, scores), start=1)
        for (left, top, right, bottom), conf in zip(frame_boxes, frame_scores)
    ]

def convert_ltwh2tldr(bboxes)->list:
    """Turn MOT-style left/top/width/height rows back into corner rows.

    Args:
        bboxes (list): rows of
            [<frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>, <conf>, <x>, <y>, <z>]

    Returns:
        list: rows of [<frame>, <id>, <x1>, <y1>, <x2>, <y2>, <conf>, <x>, <y>, <z>],
            where x2 = left + width and y2 = top + height.
    """
    def _to_corners(entry):
        # Bottom-right corner = top-left corner + box size.
        right = entry[2] + entry[4]
        bottom = entry[3] + entry[5]
        corners = [
            entry[0], entry[1],
            entry[2], entry[3], right, bottom,
            entry[6], entry[7], entry[8], entry[9],
        ]
        # The output row must have as many fields as the input row.
        assert len(entry) == len(corners)
        return corners

    return [_to_corners(entry) for entry in bboxes]


def convert_poses_MOTformat(poses)->list:
    """Flatten per-frame pose collections into MOT-like rows.

    Args:
        poses: one sequence per frame, each holding one pose per person.

    Returns:
        list: rows of [<frame>, <id>, <pose>], where <frame> is counted from 1
            and <id> is the 0-based position of the pose within its frame.
    """
    return [
        [frame_no, person_idx, pose]
        # one frame at a time
        for frame_no, persons in enumerate(poses, start=1)
        for person_idx, pose in enumerate(persons)
    ]

def get_masked_data(data:list, mask_id:int=-1):
    """Collect the records whose first field equals ``mask_id``.

    The scan stops at the first record whose first field is greater than
    ``mask_id``, so ``data`` is expected to be ordered by that field; matching
    records located after such a record are not returned.
    """
    matched = []
    for record in data:
        key = record[0]
        if key > mask_id:
            # Past the requested id: nothing further is examined.
            break
        if key == mask_id:
            matched.append(record)
    return matched

def get_max_frame(data:list)->int:
    """Return the frame number stored in the last row.

    Args:
        data (list): MOT-like rows whose first field is the frame number.
            The last row is taken to carry the highest frame number.

    Returns:
        int: first field of the final row.
    """
    last_row = data[-1]
    return last_row[0]

In [23]:
# Poses: flatten the per-frame pose arrays into [<frame>, <id>, <pose>] rows.
mot_poses = convert_poses_MOTformat(output_poses)

# Boxes: corner boxes -> MOT-style left/top/width/height rows -> corner rows
# that carry the frame number and confidence alongside the coordinates.
ltwh_bboxes = convert_tldr2ltwh(output_bboxes, output_scores)
tldr_bboxes = convert_ltwh2tldr(ltwh_bboxes)

In [31]:
# Spot-check the converted boxes for an early and a late frame.
for _frame in (1, 1100):
    display(get_masked_data(tldr_bboxes, _frame))

[[1, -1, 281.0, 327.0, 449.0, 615.0, 0.9247974, -1, -1, -1],
 [1, -1, 724.0, 279.0, 948.0, 498.0, 0.87222314, -1, -1, -1],
 [1, -1, 1033.0, 225.0, 1126.0, 574.0, 0.8538191, -1, -1, -1],
 [1, -1, 149.0, 289.0, 208.0, 376.0, 0.7752365, -1, -1, -1],
 [1, -1, 57.0, 292.0, 120.0, 372.0, 0.6448778, -1, -1, -1],
 [1, -1, 960.0, 225.0, 1015.0, 310.0, 0.600439, -1, -1, -1],
 [1, -1, 1145.0, 231.0, 1172.0, 260.0, 0.59739846, -1, -1, -1]]

[[1100, -1, 12.0, 287.0, 268.0, 658.0, 0.9105787, -1, -1, -1],
 [1100, -1, 1023.0, 221.0, 1113.0, 574.0, 0.8576677, -1, -1, -1],
 [1100, -1, 708.0, 253.0, 891.0, 478.0, 0.854373, -1, -1, -1],
 [1100, -1, 463.0, 229.0, 508.0, 311.0, 0.62112606, -1, -1, -1],
 [1100, -1, 1143.0, 231.0, 1170.0, 263.0, 0.5603369, -1, -1, -1]]

In [33]:
# Re-open the video for rendering (the earlier dataset was already iterated).
dataset = LoadImages('./{}'.format(VIDEO_NAME), img_size=imgsz, stride=stride, auto=True)
render_cap = dataset.cap
# Seek to the same start offset that was applied before inference, so that
# frame_id k below refers to the same frame as the detections numbered k.
render_cap.set(cv2.CAP_PROP_POS_MSEC, args.start * 1000)
dataset = tqdm(dataset, total=n)
t0 = time_sync()

try:
    # frame_id is 1-based, matching the frame numbers stored in the MOT-style rows.
    # The preprocessed `img` is not needed here: only the original frame `im0` is drawn on.
    for frame_id, (path, img, im0, _) in enumerate(dataset, start=1):
        im0_copy = im0.copy()

        # Slice out the records that belong to the current frame.
        mask_poses = get_masked_data(mot_poses, frame_id)
        mask_bboxes = get_masked_data(tldr_bboxes, frame_id)

        for _poses, _bboxes in zip(mask_poses, mask_bboxes):
            x1, y1, x2, y2 = _bboxes[2:6]
            cv2.rectangle(im0_copy, (int(x1), int(y1)), (int(x2), int(y2)), GRAY, thickness=2)
        # Blend the drawn overlay with the untouched frame.
        im0 = cv2.addWeighted(im0, CROWD_ALPHA, im0_copy, 1 - CROWD_ALPHA, gamma=0)

        # Time spent on this frame (since the start for the first frame,
        # since the end of the previous iteration afterwards).
        if frame_id == 1:
            t = time_sync() - t0
        else:
            t = time_sync() - t1

        if args.gif:
            # Downscale and reverse the channel order before collecting the frame.
            gif_frames.append(cv2.resize(im0, dsize=None, fx=0.25, fy=0.25)[:, :, [2, 1, 0]])
        elif not args.display:
            writer.write(im0)
        else:
            cv2.imshow('', cv2.resize(im0, dsize=None, fx=0.5, fy=0.5))
            cv2.waitKey(1)

        t1 = time_sync()
        # Stop after n frames. The check previously read `i`, a leftover from
        # the inference cell, instead of this loop's own counter.
        if frame_id == n:
            break
finally:
    # Release windows, captures and the video writer even if the loop is
    # interrupted, so the output file gets finalized.
    cv2.destroyAllWindows()
    cap.release()
    render_cap.release()
    if not args.display:
        writer.release()

if args.gif:
    print('Saving GIF...')
    with imageio.get_writer(video_name + '.gif', mode="I", fps=fps) as gif_writer:
        for frame in tqdm(gif_frames):
            gif_writer.append_data(frame)

  0%|          | 0/3600 [00:00<?, ?it/s]

In [None]:
# t0 = time_sync()
# for i, (path, img, im0,_) in enumerate(progress_dataset):
#     img = torch.from_numpy(img).to(device)
#     img = img.half() if half else img.float()  # uint8 to fp16/32
#     img = img / 255.0  # 0 - 255 to 0.0 - 1.0
#     if len(img.shape) == 3:
#         img = img[None]  # expand for batch dim

#     out = model(img, augment=True, kp_flip=data['kp_flip'], scales=data['scales'], flips=data['flips'])[0]
#     person_dets, kp_dets = run_nms(data, out)
#     bboxes, poses, scores, ids, fused = post_process_batch(data, img, [], [[im0.shape[:2]]], person_dets, kp_dets)

#     bboxes = np.array(bboxes)
#     poses = np.array(poses)

#     im0_copy = im0.copy()
#     player_idx = []   
#     # DRAW CROWD POSES
#     for j, (bbox, pose) in enumerate(zip(bboxes, poses)):
#         x1, y1, x2, y2 = bbox
#         size = ((x2 - x1) ** 2 + (y2 - y1) ** 2) ** 0.5
#         if size < CROWD_THRES:
#             cv2.rectangle(im0_copy, (int(x1), int(y1)), (int(x2), int(y2)), GRAY, thickness=2)
#             for x, y, _ in pose[:5]:
#                 cv2.circle(im0_copy, (int(x), int(y)), CROWD_KP_SIZE, GRAY, CROWD_KP_THICK)
#             for seg in data['segments'].values():
#                 pt1 = (int(pose[seg[0], 0]), int(pose[seg[0], 1]))
#                 pt2 = (int(pose[seg[1], 0]), int(pose[seg[1], 1]))
#                 cv2.line(im0_copy, pt1, pt2, GRAY, CROWD_SEG_THICK)
#         else:
#             player_idx.append(j)
#     im0 = cv2.addWeighted(im0, CROWD_ALPHA, im0_copy, 1 - CROWD_ALPHA, gamma=0)
#     # DRAW PLAYER POSES
#     player_bboxes = bboxes[player_idx][:2]
#     player_poses = poses[player_idx][:2]
    
    

#     def draw_player_poses(im0, missing=-1):
#         for j, (bbox, pose, color) in enumerate(zip(
#                 player_bboxes[[orange_player, blue_player]],
#                 player_poses[[orange_player, blue_player]],
#                 [ORANGE, BLUE])):
#             if j == missing:
#                 continue
#             im0_copy = im0.copy()
#             x1, y1, x2, y2 = bbox
#             cv2.rectangle(im0_copy, (int(x1), int(y1)), (int(x2), int(y2)), color, thickness=-1)
#             im0 = cv2.addWeighted(im0, PLAYER_ALPHA_BOX, im0_copy, 1 - PLAYER_ALPHA_BOX, gamma=0)
#             im0_copy = im0.copy()
#             for x, y, _ in pose:
#                 cv2.circle(im0_copy, (int(x), int(y)), PLAYER_KP_SIZE, color, PLAYER_KP_THICK)
#             for seg in data['segments'].values():
#                 pt1 = (int(pose[seg[0], 0]), int(pose[seg[0], 1]))
#                 pt2 = (int(pose[seg[1], 0]), int(pose[seg[1], 1]))
#                 cv2.line(im0_copy, pt1, pt2, color, PLAYER_SEG_THICK)
#             im0 = cv2.addWeighted(im0, PLAYER_ALPHA_POSE, im0_copy, 1 - PLAYER_ALPHA_POSE, gamma=0)
#         return im0   

#     if i == 0:
#         # orange player on left at start
#         orange_player = np.argmin(player_bboxes[:, 0])
#         blue_player = int(not orange_player)
#         im0 = draw_player_poses(im0)
#     else:
#         # simple player tracking based on frame-to-frame pose difference
#         dist = []
#         for pose in poses_last:
#             dist.append(np.mean(np.linalg.norm(player_poses[0, :, :2] - pose[:, :2], axis=-1)))
#         if np.argmin(dist) == 0:
#             orange_player = 0
#         else:
#             orange_player = 1
#         blue_player = int(not orange_player)

#         # if only one player detected, find which player is missing
#         missing = -1
#         if len(player_poses) == 1:
#             if orange_player == 0:  # missing blue player
#                 player_poses = np.concatenate((player_poses, poses_last[1:]), axis=0)
#                 player_bboxes = np.concatenate((player_bboxes, bboxes_last[1:]), axis=0)
#                 missing = 1
#             else:  # missing orange player
#                 player_poses = np.concatenate((player_poses, poses_last[:1]), axis=0)
#                 player_bboxes = np.concatenate((player_bboxes, bboxes_last[:1]), axis=0)
#                 missing = 0
#         im0 = draw_player_poses(im0, missing)

#     bboxes_last = player_bboxes[[orange_player, blue_player]]
#     poses_last = player_poses[[orange_player, blue_player]]

#     if i == 0:
#         t = time_sync() - t0
#     else:
#         t = time_sync() - t1
#     if args.gif:
#         gif_frames.append(cv2.resize(im0, dsize=None, fx=0.25, fy=0.25)[:, :, [2, 1, 0]])
#     elif not args.display:
#         writer.write(im0)
#     else:
#         cv2.imshow('', cv2.resize(im0, dsize=None, fx=0.5, fy=0.5))
#         cv2.waitKey(1)

#     t1 = time_sync()
#     if i == n - 1:
#         break
    


# cv2.destroyAllWindows()
# cap.release()
# if not args.display:
#     writer.release()
# if args.gif:
#     print('Saving GIF...')
#     with imageio.get_writer(video_name + '.gif', mode="I", fps=fps) as writer:
#         for idx, frame in tqdm(enumerate(gif_frames)):
#             writer.append_data(frame)