In [None]:
import torch, detectron2
!nvcc --version
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)
print("detectron2:", detectron2.__version__)

In [None]:
# Detectron2 entry points. NOTE(review): `get_cfg` and `detectron2` are
# imported again further down / in an earlier cell; the repeats are harmless.
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
import detectron2
from detectron2.utils.logger import setup_logger
# Initialise Detectron2's logger before any model code runs
# (presumably so that model-loading messages are shown -- confirm against the Detectron2 docs).
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
# Colab-only image display helper, intentionally disabled here:
#from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode

In [None]:
# Build the 2D pose detector from a pre-trained Detectron2 model-zoo entry.
# The same model-zoo path selects both the config file and the checkpoint.
keypoint_model_yaml = "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml"

cfg = get_cfg()
# Run inference on the GPU.
cfg.MODEL.DEVICE = "cuda"
# Overlay the model-zoo configuration on top of the defaults.
cfg.merge_from_file(model_zoo.get_config_file(keypoint_model_yaml))
# Confidence threshold applied to detections at test time.
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
# Pre-trained weights matching the config above.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(keypoint_model_yaml)

# Predictor used for pose estimation, built from the finished config.
pose_detector = DefaultPredictor(cfg)

In [None]:
import numpy as np

In [None]:
# Load the 'positions_3d' entry of the preprocessed archive. It is stored as a
# 0-d object array, so `.item()` unwraps it into the underlying Python object.
# The `with` block closes the .npz file handle once the entry has been read.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only use on trusted files.
with np.load('/human36m/preprocessed_data/data_3d_h36m.npz', allow_pickle=True) as _archive_3d:
    gt_3d = _archive_3d['positions_3d'].item()

In [None]:
# Load the 'positions_2d' entry of the preprocessed archive. It is stored as a
# 0-d object array, so `.item()` unwraps it into the underlying Python object.
# The `with` block closes the .npz file handle once the entry has been read.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only use on trusted files.
with np.load('/human36m/preprocessed_data/data_2d_h36m_gt.npz', allow_pickle=True) as _archive_2d:
    gt_2d = _archive_2d['positions_2d'].item()

In [None]:
# Show which keys are stored under the 'S11' entry of gt_3d (rendered as the cell output).
gt_3d['S11'].keys()

In [None]:
def coco2h36m(x):
    '''
        Re-order COCO keypoints into the 17-joint Human3.6M layout.

        Input: x (... x V x C)
            Any number of leading axes is accepted (e.g. M x V x C or
            M x T x V x C). V is the joint axis holding the 17 COCO keypoints
            and must be the second-to-last axis; C is the per-joint channel axis.

        Output: float64 array with the same shape as x, joints in H36M order.

        Joints that COCO does not provide are built as midpoints:
            root  = mid(Lhip, Rhip)
            neck  = mid(Lsho, Rsho)
            belly = mid(root, neck)
            head  = mid(Leye, Reye)
        The averaging is applied to every channel of the last axis alike.

        COCO: {0-nose 1-Leye 2-Reye 3-Lear 4Rear 5-Lsho 6-Rsho 7-Lelb 8-Relb 9-Lwri 10-Rwri 11-Lhip 12-Rhip 13-Lkne 14-Rkne 15-Lank 16-Rank}

        H36M:
        0: 'root',
        1: 'rhip',
        2: 'rkne',
        3: 'rank',
        4: 'lhip',
        5: 'lkne',
        6: 'lank',
        7: 'belly',
        8: 'neck',
        9: 'nose',
        10: 'head',
        11: 'lsho',
        12: 'lelb',
        13: 'lwri',
        14: 'rsho',
        15: 'relb',
        16: 'rwri'
    '''
    x = np.asarray(x)
    y = np.zeros(x.shape)
    # Indexing with `...` pins the joint axis to "second to last", so the same
    # code handles M x V x C as well as M x T x V x C inputs.
    y[..., 0, :] = (x[..., 11, :] + x[..., 12, :]) * 0.5
    y[..., 1, :] = x[..., 12, :]
    y[..., 2, :] = x[..., 14, :]
    y[..., 3, :] = x[..., 16, :]
    y[..., 4, :] = x[..., 11, :]
    y[..., 5, :] = x[..., 13, :]
    y[..., 6, :] = x[..., 15, :]
    # neck must be filled in before belly, which is derived from root and neck
    y[..., 8, :] = (x[..., 5, :] + x[..., 6, :]) * 0.5
    y[..., 7, :] = (y[..., 0, :] + y[..., 8, :]) * 0.5
    y[..., 9, :] = x[..., 0, :]
    y[..., 10, :] = (x[..., 1, :] + x[..., 2, :]) * 0.5
    y[..., 11, :] = x[..., 5, :]
    y[..., 12, :] = x[..., 7, :]
    y[..., 13, :] = x[..., 9, :]
    y[..., 14, :] = x[..., 6, :]
    y[..., 15, :] = x[..., 8, :]
    y[..., 16, :] = x[..., 10, :]
    return y

In [None]:
import glob

# Run the 2D pose detector over every frame of the selected S9 actions, for all
# four cameras, and save one compressed .npz of H36M-ordered keypoints per action.

# Reuse the predictor already built from `cfg` instead of constructing (and
# loading the weights of) a second identical DefaultPredictor.
predictor = pose_detector
# level = '11'
level = 'clear'
actions = ['Directions 1', 'Discussion 1', 'Discussion 2', 'Eating', 'Eating 1', 'Greeting', 'Greeting 2', 'Phoning 2', 'Phoning 3']

# actions = ['Discussion 2', 'Eating', 'Eating 1', 'Greeting', 'Greeting 1', 'Phoning', 'Phoning 1']
cams = {'54138969': 0, '55011271': 1, '58860488': 2, '60457274': 3}
# Not used in this cell; kept in case other cells reference these names.
detectron = {}
temp_dict = {}

image_root = '/human36m/Videos_S9/S9/synthetic_occlusion_images/'
keypoint_root = '/human36m/Videos_S9/S9/synthetic_occlusion_keypoints/'
# np.savez_compressed does not create missing directories.
os.makedirs(keypoint_root, exist_ok=True)

for action in actions:
    all_cams = []
    print(action)
    # One image is expected per ground-truth 3D frame of this action.
    num_frames = len(gt_3d['S9'][action])
    for cam_no in cams:
        print(cam_no)
        keypoints_per_cam = []
        for count, img_ind in enumerate(range(num_frames), start=1):
            img_path = image_root + action + '.' + cam_no + '_' + level + '/' + str(img_ind).zfill(4) + '.jpg'
            im = cv2.imread(img_path)
            if im is None:
                # cv2.imread signals a missing/unreadable file by returning None;
                # fail here with the path rather than deep inside the predictor.
                raise FileNotFoundError('Could not read image: ' + img_path)
            outputs = predictor(im)
            predicted = outputs["instances"].pred_keypoints.cpu().numpy()
            if len(predicted) > 0:
                # Keep only the first detected instance
                # (presumably the highest-scoring one -- confirm against Detectron2 docs).
                predicted = coco2h36m(predicted)[0]
            elif keypoints_per_cam:
                # No detection in this frame: repeat the previous frame's keypoints.
                predicted = keypoints_per_cam[-1]
            else:
                # No detection and no previous frame to copy from: use an all-zero
                # placeholder. Shape assumes 17 joints x (x, y, score), the layout
                # Detectron2 documents for pred_keypoints.
                print('no detection in first frame, using zeros:', img_path)
                predicted = np.zeros((17, 3), dtype='float32')
            if count % 1000 == 0:
                print(count)
            keypoints_per_cam.append(predicted)
        all_cams.append(np.array(keypoints_per_cam, dtype='float32'))
    np.savez_compressed(keypoint_root + action + '_' + level, positions_2d=all_cams)