In [1]:
from tqdm import tqdm
from collections import defaultdict
from copy import deepcopy
import os
from pprint import pprint

import matplotlib.pyplot as plt
from pycocotools.coco import COCO
import numpy as np
import cv2
from mmcv import imread
from mmengine import dump, load

from utils import box_xywh2xyxy, box_xyxy2xywh

In [15]:
# Image-info files; later cells pass these as `coco_img` to helpers that read
# coco_img['images'][k]['id'] and ['file_name'].
train_img = load('../../annotations/image_info_train.json')
test_img  = load('../../annotations/image_info_test.json')

# Image directories (not referenced by any code visible in this notebook).
train_img_root = '../../images/train/'
test_img_root = '../../images/test/'

# Instance category list; each entry provides at least 'id' and 'name'.
ins_categories = load('../../configs/ins_categories.json')
# label = position of a category inside ins_categories; cat = that category's 'id'.
coco_ins_label2cat = {i: cat['id'] for i, cat in enumerate(ins_categories)}
coco_ins_cat2label = {cat['id']: i for i, cat in enumerate(ins_categories)}

# 方法

## Instance

### watch

In [7]:
# OpenCV font constant; not used by any active code visible in this notebook.
font = cv2.FONT_HERSHEY_SIMPLEX
# 80 RGB colours given as 0-255 integers and scaled to floats in [0, 1].
# Rows are indexed by instance label when drawing boxes.
palette = np.asarray([
    (220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228),
    (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30),
    (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30),
    (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255),
    (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255),
    (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92),
    (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164),
    (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0),
    (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174),
    (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54),
    (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51),
    (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65),
    (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0),
    (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161),
    (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120),
    (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133),
    (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62),
    (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45),
    (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1),
    (246, 0, 122), (191, 162, 208)], dtype=float) / 255

In [8]:
def show_box(box, ax, info: dict = None):
    """Draw one bounding box, optionally labelled, on a matplotlib axes.

    Args:
        box: Four values unpacked as ``(x, y, w, h)``: the rectangle's anchor
            corner followed by its width and height.
        ax: Matplotlib axes to draw on.
        info: Optional drawing options. ``'edgecolor'`` (default ``'green'``)
            is the outline colour and ``'name'`` (default ``''``) is the text
            written at ``(x, y)``.
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    info = info or {}
    x, y, w, h = box

    edgecolor = info.get('edgecolor', 'green')
    name = info.get('name', '')

    ax.text(x, y, name)
    # Wider white outline underneath so the coloured outline stays visible.
    ax.add_patch(plt.Rectangle((x, y), w, h, edgecolor='white', facecolor=(0,0,0,0), lw=2))
    # Thinner outline in the category colour on top.
    ax.add_patch(plt.Rectangle((x, y), w, h, edgecolor=edgecolor, facecolor=(0,0,0,0), lw=1))

def watch_ins_proposal(proposal: dict, img_path: str, with_name: bool = True):
    """Show an image with its instance proposals drawn as coloured boxes.

    Args:
        proposal: Per-image proposal dict; ``proposal['instances']`` must hold
            ``'bboxes'`` (xyxy, as produced by the converters in this
            notebook) and ``'labels'`` (indices into ``ins_categories``).
        img_path: Path of the image to display.
        with_name: If True, write the category name next to each box.
    """
    image = imread(img_path, channel_order='rgb')
    _, ax = plt.subplots(1, 1, figsize=(6, 6))
    ax.imshow(image)
    ax.set_title(os.path.basename(img_path))
    ax.axis('off')

    instances = proposal['instances']
    for box, label in zip(instances['bboxes'], instances['labels']):
        info = dict(edgecolor=palette[label])
        if with_name:
            # `label` is the position inside ins_categories; indexing the list
            # with the category id would pick the wrong entry or overflow.
            info['name'] = ins_categories[label]['name']
        # Proposals store boxes as xyxy, while show_box expects x, y, w, h.
        show_box(box_xyxy2xywh(box), ax, info=info)

### convert

In [18]:
def cvt_mmdet_proposal(ins_pred: list, coco_img: dict, score_thr: float = 0.1):
    """Convert a flat list of mmdet-style detections into per-image proposals.

    Args:
        ins_pred: Detections, each a dict with 'image_id', 'bbox' (xywh),
            'score' and 'category_id'.
        coco_img: Dict whose 'images' list provides 'id' and 'file_name'.
        score_thr: Detections scoring below this value are dropped.

    Returns:
        Dict mapping file name to ``{'instances': {...}}`` holding the arrays
        'bboxes' (xyxy, rounded), 'scores', 'labels' and 'ids'. Images with no
        surviving detection do not appear in the result.
    """
    filename_of = {}
    for img in coco_img['images']:
        filename_of[img['id']] = img['file_name']

    # Group the surviving detections by image file name.
    grouped = {}
    kept = 0
    for det in tqdm(ins_pred):
        if det['score'] < score_thr:
            continue
        kept += 1
        name = filename_of[det['image_id']]
        bucket = grouped.setdefault(
            name, {'bboxes': [], 'labels': [], 'scores': []})
        bucket['bboxes'].append(box_xywh2xyxy(det['bbox']))
        bucket['labels'].append(det['category_id'])
        bucket['scores'].append(det['score'])
    print('num valid:', kept)

    def _pack(bucket):
        # Map category ids to contiguous labels, then freeze lists into arrays.
        label_list = bucket['labels']
        if coco_ins_cat2label is not None:
            label_list = [coco_ins_cat2label[cat_id] for cat_id in label_list]
        return {
            'bboxes': np.float32(bucket['bboxes']).round(0),
            'scores': np.float32(bucket['scores']).round(3),
            'labels': np.int64(label_list),
            'ids'   : np.arange(len(label_list))  # id within a single image
        }

    return {name: {'instances': _pack(bucket)}
            for name, bucket in grouped.items()}

In [19]:
def cvt_scg_proposal(root, ins_label_2_id_map: dict = None,
                     score_thr: float = 0.1, human_id: int = 0):
    """Convert per-image proposal files of the SCG family (SCG, ViPLO, UPT).

    Args:
        root: Directory with one proposal file per image; each file provides
            'boxes' (xyxy), 'scores' and 'labels', and optionally
            'human_joints' / 'human_joints_score'.
        ins_label_2_id_map: Optional mapping applied to every raw label.
        score_thr: Only detections with a rounded score strictly above this
            value are kept.
        human_id: Label value (after mapping) that marks a person.

    Returns:
        Dict mapping image name ('.json' replaced by '.jpg') to a dict with
        'instances' and, when joints are present, 'person_keypoints'.
    """
    converted = {}
    for fname in tqdm(os.listdir(root)):
        raw = load(os.path.join(root, fname))

        raw_labels = raw['labels']
        if ins_label_2_id_map is not None:
            raw_labels = [ins_label_2_id_map[lab] for lab in raw_labels]

        det_boxes = np.float32(raw['boxes']).round(0)  # xyxy
        det_scores = np.float32(raw['scores']).round(3)
        det_labels = np.int64(raw_labels)

        score_mask = det_scores > score_thr
        # Score mask restricted to person detections; it selects joint rows.
        human_mask = score_mask[det_labels == human_id]

        entry = {
            'instances': {
                'bboxes': det_boxes[score_mask],
                'scores': det_scores[score_mask],
                'labels': det_labels[score_mask],
                'ids'   : np.arange(score_mask.sum())  # id within a single image
            }
        }

        if 'human_joints' in raw:
            joints = np.float32(raw['human_joints']).round(0)
            joints = joints.reshape(-1, 17, 2)
            joint_scores = np.float32(raw['human_joints_score']).round(3)
            joint_scores = joint_scores.reshape(-1, 17)
            kept_labels = entry['instances']['labels']
            entry['person_keypoints'] = {
                'keypoints' : joints[human_mask],
                'scores'    : joint_scores[human_mask],
                'tgt_ins_id': np.where(kept_labels == human_id)[0]
            }

        converted[fname.replace('.json', '.jpg')] = entry
    return converted

In [20]:
def cvt_ins_ann2proposal(coco_ins: COCO):
    """Turn ground-truth instance annotations into proposal-format dicts.

    Args:
        coco_ins: COCO API object holding the instance annotations.

    Returns:
        Dict mapping file name to ``{'instances': {...}}`` with 'bboxes'
        (xyxy, shape (N, 4)), 'scores' (all ones), 'labels' and 'ids'.
    """
    proposals = dict()
    for img_id in tqdm(coco_ins.getImgIds()):
        img_info = coco_ins.loadImgs(img_id)[0]
        ins_anns = coco_ins.loadAnns(sorted(coco_ins.getAnnIds(img_id)))

        bboxes = [box_xywh2xyxy(ann['bbox']) for ann in ins_anns]
        labels = [coco_ins_cat2label[ann['category_id']] for ann in ins_anns]

        proposals[img_info['file_name']] = {
            'instances': {
                # reshape keeps the (N, 4) layout for images with no annotation,
                # where a bare np.float32([]) would have shape (0,).
                'bboxes': np.float32(bboxes).reshape(-1, 4),
                'scores': np.ones(len(labels), dtype=np.float32),
                'labels': np.int64(labels),
                'ids'   : np.arange(len(labels))  # id within a single image
            }
        }
    return proposals

## Person keypoints

### watch

In [12]:
from mmengine.structures import InstanceData
from mmpose.structures import PoseDataSample
from mmpose.visualization import PoseLocalVisualizer

  from .autonotebook import tqdm as notebook_tqdm
    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0)
    Python  3.9.18 (you have 3.9.18)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [13]:
pose_categories = load('../../configs/pose_categories.json')
pprint(pose_categories, compact=True)

# The COCO 'skeleton' pairs are 1-based (index 17 appears although there are
# only 17 keypoints), so shift them to 0-based keypoint indices before using
# them as skeleton links.
dataset_meta = {
    'skeleton_links': [[start - 1, end - 1]
                       for start, end in pose_categories[0]['skeleton']]
}
# One RGB colour per link. `tuple((255, 0, 0)) * n` would instead build a flat
# tuple of 3 * n integers.
pose_local_visualizer = PoseLocalVisualizer(
    link_color=((255, 0, 0),) * len(dataset_meta['skeleton_links']),
    line_width=4)
pose_local_visualizer.set_dataset_meta(dataset_meta)

[{'id': 1,
  'keypoints': ['nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear',
                'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
                'left_wrist', 'right_wrist', 'left_hip', 'right_hip',
                'left_knee', 'right_knee', 'left_ankle', 'right_ankle'],
  'name': 'person',
  'skeleton': [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
               [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
               [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]],
  'supercategory': 'person'}]


In [14]:
def watch_pose_proposal(proposal: dict, img_path: str):
    """Display the person keypoints of one proposal on top of its image.

    Args:
        proposal: Per-image proposal dict with
            ``proposal['person_keypoints']['keypoints']``. The converters in
            this notebook store it as (N, 17, 2) x/y coordinates with scores
            kept separately; an (N, 17, 3) array with a trailing score column
            is also accepted.
        img_path: Path of the image to display.
    """
    visualizer = deepcopy(pose_local_visualizer)
    image = imread(img_path, channel_order='rgb')

    keypoints = np.asarray(proposal['person_keypoints']['keypoints'])
    if keypoints.ndim == 3 and keypoints.shape[-1] == 3:
        # Drop the score column so only x/y remain.
        keypoints = keypoints[..., :2]
    else:
        # Reshaping to (-1, 17, 3) would fail or scramble x/y pairs here.
        keypoints = keypoints.reshape(-1, 17, 2)

    data = PoseDataSample()
    data.gt_instances = InstanceData(keypoints=keypoints)
    vis_result = visualizer.add_datasample('image', image, data, draw_pred=False)
    plt.imshow(vis_result)

### convert

In [12]:
def extract_person_from_proposal(ins_props: dict, coco_img: dict):
    """Pick out the person instances so they can be fed to pose estimation.

    Args:
        ins_props: Dict mapping file name to a proposal dict with 'instances'
            ('labels', 'bboxes' in xyxy, 'scores', 'ids').
        coco_img: Dict whose 'images' list provides 'file_name' and 'id'.

    Returns:
        Tuple ``(persons, image_ids_without_person)``. Each person is a dict
        with 'image_id', 'bbox' (xywh), 'score', 'category_id' (always 1) and
        'id' (the instance id within its image).
    """
    image_id_of = {img['file_name']: img['id'] for img in coco_img['images']}

    persons = []
    empty_image_ids = []

    for fname, entry in tqdm(ins_props.items()):
        inst = entry['instances']
        img_id = image_id_of[fname]
        count_before = len(persons)
        rows = zip(inst['labels'], inst['bboxes'], inst['scores'], inst['ids'])
        # Label 0 marks a person.
        persons.extend(
            {
                'image_id'   : img_id,
                'bbox'       : box_xyxy2xywh(box),
                'score'      : score,
                'category_id': 1,
                'id'         : ins_id
            }
            for label, box, score, ins_id in rows
            if label == 0
        )
        if len(persons) == count_before:
            empty_image_ids.append(img_id)

    print('num person:', len(persons))
    print('num image wo person:', len(empty_image_ids))
    return persons, empty_image_ids

## 合并各种类型的proposal

In [13]:
def merge_pose_to_prop(proposals: dict, pose_pred: list, coco_img: dict) -> dict:
    """Attach pose predictions to instance proposals as 'person_keypoints'.

    Args:
        proposals: Dict mapping file name to a proposal dict with 'instances'.
        pose_pred: Pose predictions, each a dict with 'image_id' and
            'keypoints' (17 * 3 values: x, y, score per keypoint).
        coco_img: Dict whose 'images' list provides 'id' and 'file_name'.

    Returns:
        Deep copy of ``proposals`` where every image additionally has
        'person_keypoints' with 'keypoints' (N, 17, 2), 'scores' (N, 17) and
        'tgt_ins_id' (indices of the label-0 instances of that image).
    """
    id2filename = {item['id']: item['file_name'] for item in coco_img['images']}

    pose_proposals = defaultdict(list)
    for pose in tqdm(pose_pred):
        pose_proposals[id2filename[pose['image_id']]].append(pose['keypoints'])
    pose_proposals = dict(pose_proposals)

    print('num image:', len(proposals))
    # Set difference, not symmetric difference: count only images that have a
    # proposal but no pose prediction.
    print('num image wo pose prop:',
          len(set(proposals) - set(pose_proposals)))

    total_proposals = deepcopy(proposals)
    for img_name, props in total_proposals.items():
        if img_name in pose_proposals:
            poses = np.array(pose_proposals[img_name], dtype=np.float32)
            poses = poses.reshape(-1, 17, 3)
        else:
            # No person prediction means no pose prediction; keep the dtype
            # consistent with the non-empty branch.
            poses = np.zeros((0, 17, 3), dtype=np.float32)
        # NOTE(review): assumes the poses of an image arrive in the same order
        # as its label-0 instances; nothing here verifies it. TODO confirm.
        props['person_keypoints'] = {
            'keypoints': poses[..., :-1],
            'scores'   : poses[...,  -1],
            'tgt_ins_id': np.where(props['instances']['labels'] == 0)[0]
        }
    return total_proposals

# 实际转换

### FasterRCNN(SCG)

In [25]:
# File locations for the SCG pipeline; everything lives under proposal_root.
proposal_root = '../../proposals/SCG/'
# Outputs: merged instance + pose proposals.
train_total_prop_path = os.path.join(proposal_root, 'proposal_scg_train.pkl')
test_total_prop_path  = os.path.join(proposal_root, 'proposal_scg_test.pkl')
# Inputs: directories of per-image instance proposal files.
train_ins_prop_root   = os.path.join(proposal_root, 'trainval')
test_ins_prop_root    = os.path.join(proposal_root, 'test')
# Inputs: pose estimation results, loaded further down.
train_pose_prop_path  = os.path.join(proposal_root, 'body2d_vcoco-train_scg.keypoints.json')
test_pose_prop_path   = os.path.join(proposal_root, 'body2d_vcoco-test_scg.keypoints.json')
# Outputs: extracted person boxes.
train_person_path     = os.path.join(proposal_root, 'train_person.json')
test_person_path      = os.path.join(proposal_root, 'test_person.json')

#### instance

In [21]:
# Convert the proposal format.
# scg_ins_map shifts the raw labels 1..80 down to 0..79.
scg_ins_map = {i: i-1 for i in range(1, 81)}
train_proposals = cvt_scg_proposal(train_ins_prop_root, scg_ins_map, score_thr=0.1)
test_proposals  = cvt_scg_proposal( test_ins_prop_root, scg_ins_map, score_thr=0.1)

100%|██████████| 4969/4969 [00:00<00:00, 5261.58it/s]
100%|██████████| 4532/4532 [00:00<00:00, 5567.31it/s]


In [34]:
# train_proposals = load(train_total_prop_path)
# test_proposals  = load( test_total_prop_path)
# len(train_proposals), len(test_proposals)

In [22]:
# Inspect the converted proposal of one training image.
train_proposals['COCO_train2014_000000000165.jpg']

{'instances': {'bboxes': array([[257., 106., 630., 519.],
         [ 12.,  22., 373., 535.],
         [151., 155., 262., 406.],
         [245.,   3., 510., 327.],
         [162., 154., 322., 386.],
         [  1., 320.,  36., 534.],
         [160., 141., 251., 397.],
         [385., 224., 443., 285.],
         [248., 376., 287., 402.],
         [171., 119., 426., 498.],
         [ 39.,   0., 447., 439.],
         [172., 136., 254., 276.],
         [ 47.,  29., 275., 386.],
         [240., 380., 323., 535.],
         [300., 107., 604., 353.],
         [  2.,   3., 115., 294.],
         [142.,   7., 496., 455.],
         [389., 374., 452., 475.]], dtype=float32),
  'scores': array([0.998, 0.98 , 0.946, 0.763, 0.489, 0.464, 0.412, 0.409, 0.29 ,
         0.207, 0.192, 0.172, 0.158, 0.151, 0.141, 0.132, 0.129, 0.102],
        dtype=float32),
  'labels': array([ 0,  0, 27, 25, 76,  0, 43, 56, 76, 27, 25, 27,  0, 26,  0, 56,  0,
         27]),
  'ids': array([ 0,  1,  2,  3,  4,  5,  6,  7,  

#### person_keypoints

In [23]:
# Extract only the "person" instances and save them as JSON.
train_person, train_imgs_wo_human = extract_person_from_proposal(train_proposals, train_img)
test_person ,  test_imgs_wo_human = extract_person_from_proposal( test_proposals,  test_img)
dump(train_person, train_person_path)
dump( test_person,  test_person_path)

100%|██████████| 4969/4969 [00:00<00:00, 16145.47it/s]


num person: 47187
num image wo person: 0


100%|██████████| 4532/4532 [00:00<00:00, 17777.86it/s]


num person: 44674
num image wo person: 2


In [None]:
# train_person = load(train_person_path)
# test_person  = load( test_person_path)
# len(train_person), len(test_person)

(59010, 55569)

In [24]:
# Inspect one extracted person record.
train_person[100]

{'image_id': 27562,
 'bbox': [577.0, 270.0, 8.0, 18.0],
 'score': 0.316,
 'category_id': 1,
 'id': 4}

In [28]:
# Load the pose estimation results.
train_pose = load(train_pose_prop_path)
test_pose  = load( test_pose_prop_path)
# Sanity check on total counts only: one pose per extracted person.
assert len(train_pose) == len(train_person), f'{len(train_pose)} != {len(train_person)}'
assert len( test_pose) == len( test_person), f'{len( test_pose)} != {len( test_person)}'

In [29]:
# Convert to keypoints proposals: attach 'person_keypoints' to every image.
# Note that train_proposals / test_proposals are rebound to the merged result.
train_proposals = merge_pose_to_prop(train_proposals, train_pose, train_img)
test_proposals  = merge_pose_to_prop( test_proposals,  test_pose,  test_img)

100%|██████████| 47187/47187 [00:00<00:00, 915920.23it/s]


num image: 4969
num image wo pose prop: 0


100%|██████████| 44674/44674 [00:00<00:00, 1176292.50it/s]


num image: 4532
num image wo pose prop: 2


In [30]:
# Save the merged SCG proposals.
dump(train_proposals, train_total_prop_path)
dump( test_proposals,  test_total_prop_path)

### UPT

In [31]:
# File locations for the UPT pipeline. These rebind the same global names
# used by the SCG section above.
proposal_root = '../../proposals/UPT/'
# Outputs: merged instance + pose proposals.
train_total_prop_path = os.path.join(proposal_root, 'proposal_upt_train.pkl')
test_total_prop_path  = os.path.join(proposal_root, 'proposal_upt_test.pkl')
# Inputs: flat detection lists, one file per split.
train_ins_prop_path   = os.path.join(proposal_root, 'ins_vcoco-train_upt.bbox.json')
test_ins_prop_path    = os.path.join(proposal_root, 'ins_vcoco-test_upt.bbox.json')
# Inputs: pose estimation results, loaded further down.
train_pose_prop_path  = os.path.join(proposal_root, 'body2d_vcoco-train_upt.keypoints.json')
test_pose_prop_path   = os.path.join(proposal_root, 'body2d_vcoco-test_upt.keypoints.json')
# Outputs: extracted person boxes.
train_person_path     = os.path.join(proposal_root, 'train_person.json')
test_person_path      = os.path.join(proposal_root, 'test_person.json')

#### instance

In [32]:
# Load the raw detection lists.
train_ins_props = load(train_ins_prop_path)
test_ins_props  = load( test_ins_prop_path)

In [34]:
# Inspect one raw detection record.
train_ins_props[10]

{'image_id': 165,
 'bbox': [598.669921875, 354.3212890625, 41.21551513671875, 112.9805908203125],
 'score': 0.01862606778740883,
 'category_id': 62}

In [37]:
# Convert the proposal format; detections scoring below 0.1 are dropped.
train_proposals = cvt_mmdet_proposal(train_ins_props, train_img, score_thr=0.1)
test_proposals  = cvt_mmdet_proposal( test_ins_props,  test_img, score_thr=0.1)

100%|██████████| 540000/540000 [00:00<00:00, 2733718.63it/s]


num valid: 47350


100%|██████████| 494600/494600 [00:00<00:00, 2075272.68it/s]

num valid: 43307





In [16]:
# train_proposals = load(train_total_prop_path)
# test_proposals  = load( test_total_prop_path)
# len(train_proposals), len(test_proposals)

(5400, 4946)

In [38]:
# Inspect the converted proposal of one training image.
train_proposals['COCO_train2014_000000000165.jpg']

{'instances': {'bboxes': array([[256., 108., 639., 530.],
         [  6.,  28., 365., 530.],
         [ 61., 122., 384., 407.],
         [154., 125., 365., 402.],
         [159., 124., 367., 320.],
         [163., 175., 256., 400.]], dtype=float32),
  'scores': array([0.994, 0.992, 0.97 , 0.784, 0.208, 0.206], dtype=float32),
  'labels': array([ 0,  0, 76, 76, 76, 27]),
  'ids': array([0, 1, 2, 3, 4, 5])}}

#### person_keypoints

In [39]:
# Extract only the "person" instances and save them as JSON.
train_person, train_imgs_wo_human = extract_person_from_proposal(train_proposals, train_img)
test_person ,  test_imgs_wo_human = extract_person_from_proposal( test_proposals,  test_img)
dump(train_person, train_person_path)
dump( test_person,  test_person_path)

100%|██████████| 5400/5400 [00:00<00:00, 86707.94it/s]


num person: 16318
num image wo person: 4


100%|██████████| 4946/4946 [00:00<00:00, 5611.05it/s]


num person: 15495
num image wo person: 3


In [None]:
# train_person = load(train_person_path)
# test_person  = load( test_person_path)
# len(train_person), len(test_person)

In [40]:
# Inspect the first extracted person record.
print(train_person[0])

{'image_id': 165,
 'bbox': [256.0, 108.0, 383.0, 422.0],
 'score': 0.994,
 'category_id': 1,
 'id': 0}

In [41]:
# Load the pose estimation results.
train_pose = load(train_pose_prop_path)
test_pose  = load( test_pose_prop_path)
# Sanity check on total counts only: one pose per extracted person.
assert len(train_pose) == len(train_person), f'{len(train_pose)} != {len(train_person)}'
assert len( test_pose) == len( test_person), f'{len( test_pose)} != {len( test_person)}'

In [42]:
# Convert to keypoints proposals: attach 'person_keypoints' to every image.
# Note that train_proposals / test_proposals are rebound to the merged result.
train_proposals = merge_pose_to_prop(train_proposals, train_pose, train_img)
test_proposals  = merge_pose_to_prop( test_proposals,  test_pose,  test_img)

100%|██████████| 16318/16318 [00:00<00:00, 615923.51it/s]


num image: 5400
num image wo pose prop: 4


100%|██████████| 15495/15495 [00:00<00:00, 1632687.04it/s]


num image: 4946
num image wo pose prop: 3


In [43]:
# Save the merged UPT proposals.
dump(train_proposals, train_total_prop_path)
dump( test_proposals,  test_total_prop_path)