In [3]:
import numpy as np
import cv2
import pandas as pd
import json
import copy
import os

In [4]:
# Directory holding the COCO-style annotation files; cleaned files are written back here.
data_dir = '/mmdetection/data/coco_datasets/datasets/scratch/annotations/'
# Derive every split path from data_dir so relocating the dataset is a one-line change.
train_data_path = os.path.join(data_dir, 'train.json')
valid_data_path = os.path.join(data_dir, 'valid.json')
test_data_path = os.path.join(data_dir, 'test.json')


def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of lingering until garbage collection.
    """
    with open(path) as json_file:
        return json.load(json_file)


train_data = _load_json(train_data_path)
valid_data = _load_json(valid_data_path)
test_data = _load_json(test_data_path)


In [10]:
def compute_area(seg):
    """Return the pixel area enclosed by a polygon segmentation.

    Args:
        seg: Either one flat coordinate list ``[x1, y1, x2, y2, ...]`` or a
            list of such lists (one per polygon part). Only polygon
            segmentations are handled here.

    Returns:
        The sum of ``cv2.contourArea`` over every polygon part, or ``1e-5``
        when that sum is zero so the value stays strictly positive.
    """
    # A nested list means several polygon parts. Each part must be measured on
    # its own: flattening them into a single contour joins unrelated outlines
    # (wrong area) or fails to reshape when the parts differ in length.
    is_nested = len(seg) > 0 and isinstance(seg[0], (list, tuple, np.ndarray))
    polygons = seg if is_nested else [seg]

    area = 0.0
    for polygon in polygons:
        # Coordinates are truncated to int32 before measuring.
        points = np.array(polygon).reshape(-1, 2).astype(np.int32)
        area += cv2.contourArea(points)

    if area == 0:
        # Degenerate polygons get a tiny positive area instead of zero.
        area = 1e-5
    return area

def double_check(data):
    """Drop images that no annotation refers to.

    Args:
        data: Dict with an ``'images'`` list (items carrying ``'id'``) and an
            ``'annotations'`` list (items carrying ``'image_id'``).

    Returns:
        The same ``data`` object, with ``data['images']`` replaced by a list
        holding only the images referenced by at least one annotation.

    A diagnostic line is printed: ``have positive differential quantity`` when
    there are more images than distinct annotated image ids, otherwise a hint
    to re-inspect the data.
    """
    annotated_img_ids = {ann['image_id'] for ann in data['annotations']}
    differential_quant = len(data['images']) - len(annotated_img_ids)
    if differential_quant <= 0:
        print("check 'double_check' function again")
    else:
        print('have positive differential quantity')

    # Prune unconditionally: comparing counts alone misses the case where
    # annotations pointing at missing images offset images that have no
    # annotations, which would leave unannotated images in place.
    data['images'] = [img for img in data['images']
                      if img['id'] in annotated_img_ids]
    return data

In [21]:
def clean_data(data, name=None, upper_limit=0.15, lower_limit=np.exp(-11)):
    """Remove outlier annotations from a COCO-style dict and save the result.

    For every annotation, the segmentation area and its ratio to the owning
    image's area (``width * height``) are computed and written to
    ``statistical_charts/old_<name>_df.csv``. Afterwards:

    * annotations whose ratio is ``<= lower_limit`` are dropped;
    * images owning any annotation whose ratio is ``>= upper_limit`` are
      dropped together with all of their annotations;
    * images left without annotations are pruned by ``double_check``.

    The cleaned dict is written to ``clean_<name>.json`` inside ``data_dir``.
    ``data`` itself is left unmodified.

    Args:
        data: Dict with ``'images'`` (items with ``id``, ``width``,
            ``height``) and ``'annotations'`` (items with ``id``,
            ``image_id``, ``segmentation``).
        name: Split name used in the output file names (e.g. ``'train'``).
        upper_limit: Area ratio at or above which the whole image is removed.
        lower_limit: Area ratio at or below which the annotation is removed.

    Returns:
        None. Results are written to disk and counts are printed.

    Raises:
        TypeError: If ``name`` is not given.
        ValueError: If an annotation refers to an image id that is not present
            in ``data['images']``.
    """
    if name is None:
        # Fail before the expensive pass instead of on string concatenation later.
        raise TypeError("clean_data() needs a split name, e.g. name='train'")

    # Index images by id once, replacing a linear scan per annotation.
    # setdefault keeps the first image for a repeated id, as a front-to-back
    # scan would.
    img_by_id = {}
    for img in data['images']:
        img_by_id.setdefault(img['id'], img)

    # Keyed by annotation id, so a repeated id keeps only its last record.
    rows = {}
    for ann in data['annotations']:
        owner = img_by_id.get(ann['image_id'])
        if owner is None:
            # Without this check the ratio would be computed against whatever
            # image area happened to be left over from a previous annotation.
            raise ValueError('annotation {} references unknown image id {}'.format(
                ann['id'], ann['image_id']))
        area = compute_area(ann['segmentation'])
        img_area = owner['width'] * owner['height']
        rows[ann['id']] = (ann['id'], ann['image_id'], area, area / img_area)

    df_name = name + '_df'
    df = pd.DataFrame(list(rows.values()),
                      columns=['anno_id', 'img_id', 'seg_area', 'seg_img_ratio'])
    # Make sure the report directory exists before writing into it.
    os.makedirs('statistical_charts', exist_ok=True)
    df.to_csv('statistical_charts/old_{}.csv'.format(df_name), index=False)

    under_lower_ratio_outlier = df[df.seg_img_ratio <= lower_limit]
    above_upper_ratio_outlier = df[df.seg_img_ratio >= upper_limit]

    # Sets give constant-time membership checks in the filters below.
    removed_ann_id = set(under_lower_ratio_outlier.anno_id.tolist())
    removed_img_id = set(above_upper_ratio_outlier.img_id.tolist())

    # Reported before annotations of removed images are added to the set, so
    # the first number counts only the below-threshold annotations.
    print('no removed ann id: ', len(removed_ann_id),
          'no removed img id: ', len(removed_img_id))

    # Labels now match the values they precede (images first, then annotations).
    print('original img id: ', len(data['images']),
          'original ann id: ', len(data['annotations']))

    # Every annotation that lives on a removed image goes as well.
    removed_ann_id.update(ann['id'] for ann in data['annotations']
                          if ann['image_id'] in removed_img_id)

    # A shallow copy is enough: only the top-level lists are replaced and the
    # individual image/annotation dicts are never mutated.
    new_json = dict(data)
    new_json['images'] = [img for img in data['images']
                          if img['id'] not in removed_img_id]
    new_json['annotations'] = [ann for ann in data['annotations']
                               if ann['id'] not in removed_ann_id]
    print(len(new_json['images']), len(new_json['annotations']))
    new_json = double_check(new_json)

    new_json_name = 'clean_' + name + '.json'
    path = os.path.join(data_dir, new_json_name)
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(new_json, json_file, ensure_ascii=False, indent=4)


In [22]:
# Clean the training split; the result is written to clean_train.json in data_dir.
clean_data(train_data, name='train')

no removed ann id:  3364 no removed img id:  154
original ann id:  16144 original img id:  81642
15990 77763
have positive differential quantity


In [23]:
# Clean the validation split; the result is written to clean_valid.json in data_dir.
clean_data(valid_data, name='valid')

no removed ann id:  752 no removed img id:  32
original ann id:  3775 original img id:  19024
3743 18164
have positive differential quantity


In [24]:
# Clean the test split; the result is written to clean_test.json in data_dir.
clean_data(test_data, name='test')

no removed ann id:  973 no removed img id:  35
original ann id:  3775 original img id:  19453
3740 18358
have positive differential quantity


In [25]:
# Total of the "no removed ann id" counts printed for train, valid and test.
3364+752+973

5089

In [26]:
# Total of the "no removed img id" counts printed for train, valid and test.
154+32+35

221