In [1]:
import json

# Load the annotation file into `data`.
# The encoding is pinned to UTF-8 because JSON text is UTF-8 by specification;
# without it, open() uses the platform locale encoding (a legacy code page on
# many Windows setups), which can raise UnicodeDecodeError or silently
# mis-decode any non-ASCII text in the file.
with open(r"D:\test\instances_attributes_train2020.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Quick structural check: top-level container type and its keys.
print(type(data))
print(data.keys())


<class 'dict'>
dict_keys(['annotations', 'images', 'info', 'licenses', 'categories', 'attributes'])


In [2]:
# Image records listed in the annotation file, and how many there are.
images = data["images"]
num_images = len(images)

print("Total images:", num_images)


Total images: 45623


In [5]:
import os

image_dir = r"D:\test\images\train"  # change this to your own image folder path

# Only keep files with common image extensions.
valid_exts = (".jpg", ".jpeg", ".png")

# Directory entries whose lower-cased name ends with one of `valid_exts`.
files_in_dir = []
for entry in os.listdir(image_dir):
    if entry.lower().endswith(valid_exts):
        files_in_dir.append(entry)

print("Total images in folder:", len(files_in_dir))

# File names referenced by the image records of the annotation file.
images = data["images"]
json_filenames = [record["file_name"] for record in images]

print("Total images in JSON:", len(json_filenames))


Total images in folder: 45623
Total images in JSON: 45623


In [6]:
# Deduplicate both name lists, then keep the names that appear in each of them.
set_dir, set_json = set(files_in_dir), set(json_filenames)
common_images = set_dir.intersection(set_json)

print("Images present in BOTH folder and JSON:", len(common_images))


Images present in BOTH folder and JSON: 45623


In [7]:
# Collect the "file_name" field of every image record.
image_filenames = [record["file_name"] for record in images]

# Peek at the first 10 file names.
print(image_filenames[:10])


['6ce91b2c047f7f441edd5dc17326ae17.jpg', 'da3618863549ca1db90583f33d1d0664.jpg', '23eb1448530add24409ed669467925c2.jpg', '2e49eb8a326e78b76b9d92e2ce68226c.jpg', '7272d5b783f8440b245e75a96990ff73.jpg', '739ab7b5ce5ecd4ebd0ff2c280e93d4e.jpg', 'd4c40a8fba2a27a6964b5a3bde178243.jpg', '661bba1b15d772e7ad438b6f4b449aec.jpg', 'c2cc67ff9334e06376ee42112ea48ea0.jpg', '35f0cd519c18e2880fb243739b722c89.jpg']


In [8]:
# Category records and their "name" fields; the printed value is the number of categories.
categories = data["categories"]
category_names = [entry["name"] for entry in categories]

print(len(category_names))


46


In [9]:
# Same count as the cell above, but for the "attributes" section of the file.
# NOTE(review): this rebinds `categories` and `category_names`, which the
# previous cell bound to data["categories"]; from here until they are
# reassigned, both names hold attribute data. Consider dedicated names
# (e.g. `attributes` / `attribute_names`) once no other cell relies on this.
categories = data["attributes"]
category_names = [entry["name"] for entry in categories]

print(len(category_names))

294


In [10]:
annotations = data["annotations"]
total_annotations = len(annotations)

# An annotation counts as "empty" when "attribute_ids" is missing or falsy
# (e.g. an empty list). Summing booleans counts the True values.
empty_attr_count = sum(not ann.get("attribute_ids") for ann in annotations)

# Last expression of the cell: displayed as (empty, total).
empty_attr_count, total_annotations

(126991, 333401)

In [11]:
# Percentage of annotations that carry no attribute ids.
ratio = (empty_attr_count / total_annotations) * 100

summary = f"Empty attribute annotations: {empty_attr_count}/{total_annotations} ({ratio:.2f}%)"
print(summary)


Empty attribute annotations: 126991/333401 (38.09%)


In [12]:
# Rebind `categories` to the category records of the annotation file
# (an earlier cell had pointed this name at data["attributes"]).
categories = data["categories"]

def clean_name(name: str) -> str:
    """Return ``name`` truncated at its first "(" with surrounding whitespace removed.

    Args:
        name: Raw label text, optionally followed by a parenthesised suffix.

    Returns:
        The text before the first "(" (the whole string when there is none),
        stripped of leading and trailing whitespace.
    """
    # partition() yields everything before the first "(" as its first element,
    # or the full string if no "(" is present.
    head, _, _ = name.partition("(")
    return head.strip()

# Lookup table: category id -> cleaned category name.
cat_id2name = dict(
    (cat["id"], clean_name(cat["name"]))
    for cat in categories
)


In [13]:
from collections import defaultdict

annotations = data["annotations"]

# category_id -> counters; a fresh zeroed counter dict is created the first
# time a category id is seen.
stats = defaultdict(lambda: dict(total=0, with_attr=0, empty_attr=0))

for ann in annotations:
    cid = ann["category_id"]
    counts = stats[cid]
    counts["total"] += 1

    # A missing or falsy "attribute_ids" value counts as "empty".
    bucket = "with_attr" if ann.get("attribute_ids") else "empty_attr"
    counts[bucket] += 1


In [14]:
# Flatten the per-category counters into a list of report rows.
results = []

for cid, s in stats.items():
    total, with_attr, empty_attr = s["total"], s["with_attr"], s["empty_attr"]

    # Percentage of this category's annotations that have attributes;
    # falls back to 0 when the category has no annotations.
    if total > 0:
        ratio = with_attr / total * 100
    else:
        ratio = 0

    row = dict(
        category_id=cid,
        # Fall back to the stringified id when the id has no known name.
        category_name=cat_id2name.get(cid, str(cid)),
        total=total,
        with_attr=with_attr,
        empty_attr=empty_attr,
        attr_ratio=ratio,
    )
    results.append(row)


In [15]:
# Order rows by ascending attribute coverage (stable sort, so ties keep
# their original relative order).
results_sorted = list(results)
results_sorted.sort(key=lambda row: row["attr_ratio"])

# Show the 25 categories with the lowest coverage.
for r in results_sorted[:25]:
    line = f'{r["category_name"]}: {r["with_attr"]}/{r["total"]} ({r["attr_ratio"]:.1f}%)'
    print(line)


sock: 0/2582 (0.0%)
shoe: 0/46374 (0.0%)
tights, stockings: 0/4326 (0.0%)
glasses: 0/4855 (0.0%)
belt: 0/6851 (0.0%)
bag, wallet: 0/7217 (0.0%)
epaulette: 0/874 (0.0%)
hat: 0/2518 (0.0%)
buckle: 0/3300 (0.0%)
scarf: 0/1374 (0.0%)
glove: 0/1385 (0.0%)
watch: 0/3389 (0.0%)
tie: 0/1457 (0.0%)
headband, head covering, hair accessory: 0/3470 (0.0%)
umbrella: 0/135 (0.0%)
rivet: 0/4893 (0.0%)
sequin: 0/929 (0.0%)
ribbon: 0/274 (0.0%)
bow: 0/528 (0.0%)
tassel: 0/335 (0.0%)
leg warmer: 0/112 (0.0%)
applique: 1/3529 (0.0%)
flower: 1/1367 (0.1%)
hood: 2/1226 (0.2%)
fringe: 1/588 (0.2%)
