In [1]:
import sys
from pathlib import Path
# Absolute path of the current working directory: Path() is '.', and
# resolve() makes it absolute. Despite the name, this is a directory, not a file.
FILE = Path().resolve()

import argparse
from pytube import YouTube
import os.path as osp
from utils.torch_utils import select_device, time_sync
from utils.general import check_img_size
from utils.datasets import LoadImages
from models.experimental import attempt_load
import torch
import cv2
import numpy as np
import yaml
from tqdm import tqdm
import imageio
from val import run_nms, post_process_batch

In [2]:
# Command-line style configuration for the demo. argparse is used so the
# notebook shares its option names and defaults with a script-style interface.
parser = argparse.ArgumentParser()

# Model / data configuration.
parser.add_argument('--data', type=str, default='data/coco-kp.yaml')
parser.add_argument('--imgsz', type=int, default=1280)
parser.add_argument('--weights', default='kapao_s_coco.pt')
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or cpu')
parser.add_argument('--half', action='store_true')

# Detection thresholds (person objects and keypoint objects).
parser.add_argument('--conf-thres', type=float, default=0.5, help='confidence threshold')
parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
parser.add_argument('--no-kp-dets', action='store_true', help='do not use keypoint objects')
parser.add_argument('--conf-thres-kp', type=float, default=0.5)
parser.add_argument('--conf-thres-kp-person', type=float, default=0.2)
parser.add_argument('--iou-thres-kp', type=float, default=0.45)
parser.add_argument('--overwrite-tol', type=int, default=50)

# Test-time augmentation settings; a flip value of -1 is translated to None
# when the settings are copied into the data dict.
parser.add_argument('--scales', type=float, nargs='+', default=[1])
parser.add_argument('--flips', type=int, nargs='+', default=[-1])

# Output / visualisation options.
parser.add_argument('--display', action='store_true', help='display inference results')
parser.add_argument('--fps', action='store_true', help='display fps')
parser.add_argument('--gif', action='store_true', help='create gif')  # help text previously read 'create fig'

# Portion of the video to process, in seconds.
parser.add_argument('--start', type=int, default=20, help='start time (s)')
parser.add_argument('--end', type=int, default=80, help='end time (s)')

# An explicit argument list is passed so argparse does not read sys.argv
# (which, inside a notebook kernel, holds the kernel's own arguments).
# Here it overrides the clip range to 0-150 s.
args = parser.parse_args(args=["--start", "0", "--end", "150"])

In [3]:
# Read the dataset configuration (a YAML mapping) from the path given by --data.
with open(args.data) as f:
    data = yaml.safe_load(f)

# Merge the inference settings from the parsed arguments into the data dict.
# A flip value of -1 on the command line is stored as None.
data.update(
    imgsz=args.imgsz,
    conf_thres=args.conf_thres,
    iou_thres=args.iou_thres,
    use_kp_dets=not args.no_kp_dets,
    conf_thres_kp=args.conf_thres_kp,
    iou_thres_kp=args.iou_thres_kp,
    conf_thres_kp_person=args.conf_thres_kp_person,
    overwrite_tol=args.overwrite_tol,
    scales=args.scales,
    flips=[None if flip == -1 else flip for flip in args.flips],
)

In [4]:
# Input video, relative to the working directory. Fail early with a clear
# message if it is missing; an explicit raise (unlike a bare assert) is not
# stripped when Python runs with -O.
VIDEO_NAME = 'movie/mp4/twitter_pingpong.mp4'
if not osp.isfile(VIDEO_NAME):
    raise FileNotFoundError('Video file not found: {}'.format(VIDEO_NAME))


# Drawing parameters for "crowd" detections.
# NOTE(review): colour tuples are presumably in OpenCV's BGR order - confirm
# against the drawing code, which is not part of this section.
GRAY = (200, 200, 200)
CROWD_THRES = 450  # max bbox size for crowd classification
CROWD_ALPHA = 0.5
CROWD_KP_SIZE = 2
CROWD_KP_THICK = 2
CROWD_SEG_THICK = 2

# Drawing parameters for "player" detections (two colours, heavier strokes
# than the crowd settings above).
BLUE = (245, 140, 66)
ORANGE = (66, 140, 245)
PLAYER_ALPHA_BOX = 0.85
PLAYER_ALPHA_POSE = 0.3
PLAYER_KP_SIZE = 4
PLAYER_KP_THICK = 4
PLAYER_SEG_THICK = 4
FPS_TEXT_SIZE = 3


In [5]:
# Resolve the --device argument to a device object for single-image batches,
# then report which device was chosen.
device = select_device(args.device, batch_size=1)
print(f'Using device: {device}')

Using device: cpu


In [6]:
# Load the checkpoint onto the selected device.
model = attempt_load(args.weights, map_location=device)  # load FP32 model

# Half precision is only used when requested AND the device is not the CPU.
# (Logical `and` instead of bitwise `&`: both operands are booleans.)
half = args.half and device.type != 'cpu'
if half:  # half precision only supported on CUDA
    model.half()
stride = int(model.stride.max())  # model stride

# Validate the requested image size against the model stride, then open the
# video through the project's image/video loader.
imgsz = check_img_size(args.imgsz, s=stride)  # check image size
dataset = LoadImages('./{}'.format(VIDEO_NAME), img_size=imgsz, stride=stride, auto=True)

if device.type != 'cpu':
    # Warm-up forward pass on a dummy input matching the model's dtype.
    # The result is discarded, so autograd tracking is disabled.
    with torch.no_grad():
        model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [7]:
# Reuse the capture object opened by the dataset loader and seek to the
# requested start time (CAP_PROP_POS_MSEC expects milliseconds).
cap = dataset.cap
cap.set(cv2.CAP_PROP_POS_MSEC, args.start * 1000)

# Clip geometry: frame rate, number of frames in [start, end), frame size.
fps = cap.get(cv2.CAP_PROP_FPS)
n = int(fps * (args.end - args.start))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
gif_frames = []

# Output name derived from the weights file name only: taking the basename
# keeps directory separators out of the name when --weights is a path such as
# 'weights/kapao_s_coco.pt'. For a bare file name the result is unchanged.
video_name = 'pingpong_inference_{}'.format(osp.splitext(osp.basename(args.weights))[0])


In [17]:
# Single-frame debug pass: run pose inference on the first frame yielded by
# the dataset and keep the results in notebook-level variables for inspection.
for i, (path, img, im0, _) in enumerate(dataset):
    # Move the preprocessed frame to the device and scale it to [0, 1].
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img = img / 255.0  # 0 - 255 to 0.0 - 1.0
    if len(img.shape) == 3:
        img = img[None]  # expand for batch dim

    # Pure inference: disable autograd so no graph is built for the forward
    # pass or the post-processing that follows.
    with torch.no_grad():
        # Forward pass with the scales/flips configured in the data dict.
        out = model(img, augment=True, kp_flip=data['kp_flip'], scales=data['scales'], flips=data['flips'])[0]
        person_dets, kp_dets = run_nms(data, out)
        bboxes, poses, scores, ids, fused = post_process_batch(data, img, [], [[im0.shape[:2]]], person_dets, kp_dets)

    bboxes = np.array(bboxes)
    poses = np.array(poses)

    # NOTE(review): im0_copy and player_idx are not used in this cell;
    # presumably placeholders for drawing code - confirm before removing.
    im0_copy = im0.copy()
    player_idx = []
    break  # stop after the first frame; remove to process the whole clip

In [27]:
# Inspect the pose stored at index 5 of the array built by the inference cell.
# The recorded output below has 17 rows and 3 columns.
# NOTE(review): columns look like (x, y, confidence) per keypoint - confirm.
# Raises IndexError if fewer than six poses were detected.
poses[5]

array([[     986.26,      237.57,           0],
       [     985.31,       235.2,           0],
       [     987.52,      235.33,           0],
       [     981.55,      235.79,           0],
       [     993.35,      235.92,           0],
       [     973.82,      245.88,     0.51308],
       [     997.62,      245.84,     0.52574],
       [     967.34,      263.12,           0],
       [     1008.5,      263.64,           0],
       [        971,      262.65,           0],
       [     1006.3,      264.44,           0],
       [     979.05,      279.72,           0],
       [     997.17,      279.85,           0],
       [     967.95,      295.76,           0],
       [     992.81,      296.69,           0],
       [     966.89,      312.46,           0],
       [      992.4,      313.26,           0]])