In [3]:
import json
import os
import re
import fnmatch
from pathlib import Path
import numpy as np
import cv2
import copy
import random
import xml.etree.ElementTree as ET
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm
from pycocotools.coco import COCO

In [4]:
# Load the COCO train2014 instance annotations, located relative to the
# current working directory.
cwd = os.getcwd()
coco_json_path = cwd + '/data/coco/annotations/instances_train2014.json'
with open(coco_json_path, 'r') as coco_file:
    original_coco_data = json.loads(coco_file.read())
# Display the top-level sections of the annotation file.
original_coco_data.keys()

dict_keys(['info', 'images', 'licenses', 'annotations', 'categories'])

In [20]:
# Print the COCO category name -> category id mapping.
# The mapping is rebuilt here from the loaded annotation file rather than read from
# `name_to_cls_ind_in_coco`, because that variable is only assigned by a cell further
# down; this keeps the cell runnable in top-to-bottom order and prints the same dict.
print({_cat['name']: _cat['id'] for _cat in original_coco_data['categories']})

{'person': 1, 'bicycle': 2, 'car': 3, 'motorcycle': 4, 'airplane': 5, 'bus': 6, 'train': 7, 'truck': 8, 'boat': 9, 'traffic light': 10, 'fire hydrant': 11, 'stop sign': 13, 'parking meter': 14, 'bench': 15, 'bird': 16, 'cat': 17, 'dog': 18, 'horse': 19, 'sheep': 20, 'cow': 21, 'elephant': 22, 'bear': 23, 'zebra': 24, 'giraffe': 25, 'backpack': 27, 'umbrella': 28, 'handbag': 31, 'tie': 32, 'suitcase': 33, 'frisbee': 34, 'skis': 35, 'snowboard': 36, 'sports ball': 37, 'kite': 38, 'baseball bat': 39, 'baseball glove': 40, 'skateboard': 41, 'surfboard': 42, 'tennis racket': 43, 'bottle': 44, 'wine glass': 46, 'cup': 47, 'fork': 48, 'knife': 49, 'spoon': 50, 'bowl': 51, 'banana': 52, 'apple': 53, 'sandwich': 54, 'orange': 55, 'broccoli': 56, 'carrot': 57, 'hot dog': 58, 'pizza': 59, 'donut': 60, 'cake': 61, 'chair': 62, 'couch': 63, 'potted plant': 64, 'bed': 65, 'dining table': 67, 'toilet': 70, 'tv': 72, 'laptop': 73, 'mouse': 74, 'remote': 75, 'keyboard': 76, 'cell phone': 77, 'microwave

In [19]:
# --- Lookup tables between COCO category ids and category names ---
all_cls_ind_in_coco = [_cat['id'] for _cat in original_coco_data['categories']]
cls_ind_to_name_in_coco = {
    _cat['id']: _cat['name'] for _cat in original_coco_data['categories']
}
name_to_cls_ind_in_coco = {
    _cat['name']: _cat['id'] for _cat in original_coco_data['categories']
}

# Category names that make up the first split (variable name suggests the VOC classes).
cls_name_in_voc = [
    'person', 'bird', 'cat', 'cow', 'dog', 'horse', 'sheep',
    'airplane', 'bicycle', 'boat', 'bus', 'car', 'motorcycle',
    'train', 'bottle', 'chair', 'dining table', 'potted plant', 'couch', 'tv',
]

# Split 1: ids of every category whose name is listed above, in annotation-file order.
cls_ind_in_coco_set1 = []
for _cat in original_coco_data['categories']:
    if _cat['name'] not in cls_name_in_voc:
        continue
    cls_ind_in_coco_set1.append(_cat['id'])

# Pool of category ids that have not been assigned to a split yet.
cls_pool = list(all_cls_ind_in_coco)
for _cls in cls_ind_in_coco_set1:
    cls_pool.remove(_cls)

# Splits 2 and 3: draw 20 ids each from the remaining pool. The RNG is re-seeded
# with 0 before each draw; the pool shrinks after every draw, so the second draw
# samples from a different population than the first.
_random_splits = []
for _draw in range(2):
    random.seed(0)
    _picked = random.sample(cls_pool, k=20)
    for _cls in _picked:
        cls_pool.remove(_cls)
    _random_splits.append(_picked)
cls_ind_in_coco_set2, cls_ind_in_coco_set3 = _random_splits

# Split 4: whatever is left in the pool (same list object as cls_pool).
cls_ind_in_coco_set4 = cls_pool

cls_ind_all_set = [
    cls_ind_in_coco_set1,
    cls_ind_in_coco_set2,
    cls_ind_in_coco_set3,
    cls_ind_in_coco_set4,
]

In [12]:
# Show the category ids assigned to each of the four splits, one list per line.
for _split_cls_inds in (cls_ind_in_coco_set1, cls_ind_in_coco_set2,
                        cls_ind_in_coco_set3, cls_ind_in_coco_set4):
    print(_split_cls_inds)

[1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72]
[85, 43, 78, 87, 47, 11, 35, 53, 52, 46, 38, 51, 41, 58, 32, 82, 24, 37, 73, 22]
[60, 65, 13, 48, 79, 77, 61, 54, 76, 34, 50, 74, 25, 86, 15, 31, 80, 14, 84, 28]
[8, 10, 23, 27, 33, 36, 39, 40, 42, 49, 55, 56, 57, 59, 70, 75, 81, 88, 89, 90]


In [10]:
# Per-image index keyed by the image id rendered as a string. Each entry holds
# the image record, its non-crowd annotations, and the category ids they use.
im_summary = {
    str(im_d['id']): {'im_dict': im_d, 'annotations': [], 'categories': []}
    for im_d in original_coco_data['images']
}

# Attach annotations to their image; crowd annotations (iscrowd != 0) are skipped.
for a_d in original_coco_data['annotations']:
    if a_d['iscrowd'] != 0:
        continue
    _entry = im_summary[str(a_d['image_id'])]
    _entry['annotations'].append(a_d)
    _entry['categories'].append(a_d['category_id'])

# Collapse each image's category list to its unique ids.
for _entry in im_summary.values():
    _entry['categories'] = list(set(_entry['categories']))

### COCO-60 training data (one annotation file per held-out class set)

In [13]:
# For every class split, write a copy of the annotation file with that split's
# categories removed: the category records and annotations of those classes are
# dropped, and so is every image whose entry in im_summary lists no category
# outside the split (this includes images with an empty category list).
cwd = os.getcwd()
dump_dir = cwd + '/data/coco/annotations/coco60_train'
# Create the output folder up front so that open(..., 'w') below cannot fail
# because the directory is missing.
os.makedirs(dump_dir, exist_ok=True)
set_names = ['set1', 'set2', 'set3', 'set4',]
for current_novel_class_inds, set_name in zip(cls_ind_all_set, set_names):
    # Set copy of the held-out ids for constant-time membership tests.
    novel_class_inds = set(current_novel_class_inds)

    # Key order is kept as info / images / licenses / annotations / categories
    # so the dumped JSON has the same layout as before.
    new_data = {
        'info': original_coco_data['info'],
        'images': [],
        'licenses': original_coco_data['licenses'],
        'annotations': [],
        'categories': [
            cat for cat in original_coco_data['categories']
            if cat['id'] not in novel_class_inds
        ],
    }

    # Ids of images whose recorded categories all belong to the held-out split.
    im_id_to_filt = [
        summary['im_dict']['id']
        for summary in im_summary.values()
        if all(cls_id in novel_class_inds for cls_id in summary['categories'])
    ]
    # Looking ids up in a set avoids rescanning the whole list once per image.
    im_ids_to_drop = set(im_id_to_filt)

    for im in tqdm(original_coco_data['images']):
        if im['id'] not in im_ids_to_drop:
            new_data['images'].append(im)

    # NOTE(review): annotations are filtered by category only, not by whether their
    # image was kept above, so an annotation of a retained category can stay in the
    # file for a dropped image if im_summary did not record it — confirm intended.
    for ann in original_coco_data['annotations']:
        if ann['category_id'] not in novel_class_inds:
            new_data['annotations'].append(ann)

    dump_path = os.path.join(dump_dir, 'instances_' + set_name +'.json')
    with open(dump_path, 'w') as f:
        json.dump(new_data, f)

HBox(children=(IntProgress(value=0, max=82783), HTML(value='')))




HBox(children=(IntProgress(value=0, max=82783), HTML(value='')))




HBox(children=(IntProgress(value=0, max=82783), HTML(value='')))




HBox(children=(IntProgress(value=0, max=82783), HTML(value='')))




In [17]:
# Sanity check: load the generated set1 file with pycocotools and print the
# number of images, annotations and categories it indexes (one count per line).
_COCO = COCO(os.path.join(dump_dir, 'instances_set1.json'))
for _index in (_COCO.imgs, _COCO.anns, _COCO.cats):
    print(len(_index))

loading annotations into memory...
Done (t=2.82s)
creating index...
index created!
68937
258186
60


In [8]:
# Display the first image record.
# NOTE(review): `data` is not assigned in any cell shown here — presumably an earlier
# name for the loaded annotation dict; confirm before re-running on a fresh kernel.
data['images'][0]

{'license': 5,
 'file_name': 'COCO_train2014_000000057870.jpg',
 'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg',
 'height': 480,
 'width': 640,
 'date_captured': '2013-11-14 16:28:13',
 'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
 'id': 57870}

In [9]:
# Display the first annotation record.
# NOTE(review): `data` is not assigned in any cell shown here — presumably an earlier
# name for the loaded annotation dict; confirm before re-running on a fresh kernel.
data['annotations'][0]

{'segmentation': [[312.29,
   562.89,
   402.25,
   511.49,
   400.96,
   425.38,
   398.39,
   372.69,
   388.11,
   332.85,
   318.71,
   325.14,
   295.58,
   305.86,
   269.88,
   314.86,
   258.31,
   337.99,
   217.19,
   321.29,
   182.49,
   343.13,
   141.37,
   348.27,
   132.37,
   358.55,
   159.36,
   377.83,
   116.95,
   421.53,
   167.07,
   499.92,
   232.61,
   560.32,
   300.72,
   571.89]],
 'area': 54652.9556,
 'iscrowd': 0,
 'image_id': 480023,
 'bbox': [116.95, 305.86, 285.3, 266.03],
 'category_id': 58,
 'id': 86}

In [6]:
# Display the raw category records from the loaded annotation file.
original_coco_data['categories']

[{'supercategory': 'person', 'id': 1, 'name': 'person'},
 {'supercategory': 'vehicle', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'vehicle', 'id': 3, 'name': 'car'},
 {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'},
 {'supercategory': 'vehicle', 'id': 5, 'name': 'airplane'},
 {'supercategory': 'vehicle', 'id': 6, 'name': 'bus'},
 {'supercategory': 'vehicle', 'id': 7, 'name': 'train'},
 {'supercategory': 'vehicle', 'id': 8, 'name': 'truck'},
 {'supercategory': 'vehicle', 'id': 9, 'name': 'boat'},
 {'supercategory': 'outdoor', 'id': 10, 'name': 'traffic light'},
 {'supercategory': 'outdoor', 'id': 11, 'name': 'fire hydrant'},
 {'supercategory': 'outdoor', 'id': 13, 'name': 'stop sign'},
 {'supercategory': 'outdoor', 'id': 14, 'name': 'parking meter'},
 {'supercategory': 'outdoor', 'id': 15, 'name': 'bench'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'animal', 'id': 18, 'name': 'dog'},