## Exploration: Synsets

This notebook explores the image metadata of each synset, and finally selects the synsets we want to use.

In [4]:
## Find the synsets that have bounding box annotations and contain >= 600 annotated images, and count them

import os, os.path
from pprint import pprint
import json
import numpy as np

# Root folder holding one sub-folder of bounding-box annotation files per synset,
# and the minimum number of files a synset must have to be kept.
bbox_annotations_path = 'data/bbox/Annotation'
MIN_NO_OF_IMAGES = 600

# Every entry under the annotation root that is not a regular file is treated
# as a synset folder.
synsets = []
for entry in os.listdir(bbox_annotations_path):
    if os.path.isfile(os.path.join(bbox_annotations_path, entry)):
        continue
    synsets.append(entry)

# Map each synset ID to the number of regular files found in its folder.
counts = {}
for synset in synsets:
    path = bbox_annotations_path + '/' + synset
    counts[synset] = sum(
        1 for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))
    )

# Retain only the synsets that reach the minimum population.
populated_synsets = {}
for synset_id, file_count in counts.items():
    if file_count >= MIN_NO_OF_IMAGES:
        populated_synsets[synset_id] = file_count

print(len(populated_synsets))
print(populated_synsets)


265
{'n07874780': 611, 'n04067472': 663, 'n03360622': 728, 'n02948072': 621, 'n02002724': 654, 'n04317175': 602, 'n09436708': 3691, 'n07935504': 1261, 'n03207941': 678, 'n02802426': 1327, 'n03636649': 1095, 'n03042490': 602, 'n03017168': 653, 'n03089624': 600, 'n03649909': 610, 'n01641391': 642, 'n03814906': 757, 'n03773035': 683, 'n02231487': 615, 'n02958343': 1586, 'n04149813': 627, 'n02699494': 615, 'n03770439': 610, 'n04325704': 600, 'n02403003': 622, 'n04019541': 604, 'n03599486': 703, 'n02259212': 657, 'n03759954': 799, 'n09835506': 735, 'n01950731': 639, 'n03733281': 791, 'n04179913': 631, 'n02077923': 625, 'n04562935': 666, 'n02395406': 614, 'n02849154': 1229, 'n03761084': 689, 'n04275548': 669, 'n04532670': 617, 'n01986214': 630, 'n04465501': 699, 'n01774384': 652, 'n04370048': 608, 'n02841315': 616, 'n04435653': 627, 'n04102406': 660, 'n03497657': 603, 'n01644373': 660, 'n02013706': 744, 'n03992509': 612, 'n02730930': 604, 'n03001627': 1330, 'n07875152': 639, 'n02508021': 142

In [5]:
## Attach a human-readable name to every populated synset.
## Synsets with no entry in the names file are not interpretable, so they are
## dropped (and counted in `discarded_count`).

synset_names = None
NAMES_FILE_PATH = 'data/synset_names.json'
with open(NAMES_FILE_PATH) as names_file:
    synset_names = json.load(names_file)

# Only the first of the comma-separated alternative names is kept.
named_populated_synsets = {
    synset_id: synset_names[synset_id].partition(',')[0]
    for synset_id in populated_synsets
    if synset_id in synset_names
}

# Dict keys are unique, so every populated synset that did not make it into
# the named mapping was discarded.
discarded_count = len(populated_synsets) - len(named_populated_synsets)

print(named_populated_synsets)


{'n04067472': 'reel', 'n03360622': 'flat bench', 'n02948072': 'candle', 'n02002724': 'black stork', 'n04317175': 'stethoscope', 'n02374451': 'horse', 'n04265275': 'space heater', 'n03207941': 'dishwasher', 'n03179701': 'desk', 'n03042490': 'cliff dwelling', 'n03017168': 'chime', 'n03089624': 'confectionery', 'n03874599': 'padlock', 'n01641391': 'leopard frog', 'n03814906': 'necklace', 'n03773035': 'mirror', 'n03692522': 'loupe', 'n04149813': 'scoreboard', 'n02699494': 'altar', 'n04325704': 'stole', 'n02403003': 'ox', 'n04019541': 'puck', 'n03599486': 'jinrikisha', 'n02259212': 'leafhopper', 'n02231487': 'walking stick', 'n01950731': 'sea slug', 'n03733281': 'maze', 'n02841315': 'binoculars', 'n02077923': 'sea lion', 'n04562935': 'water tower', 'n02395406': 'hog', 'n03761084': 'microwave', 'n02990373': 'ceiling', 'n04532670': 'viaduct', 'n04465501': 'tractor', 'n01774384': 'black widow', 'n04370048': 'sweater', 'n07873807': 'pizza', 'n04435653': 'tile roof', 'n03498962': 'hatchet', 'n03

In [6]:
# Summary: how many synsets survived the name filter and how many were dropped.
remaining_count = len(named_populated_synsets)
print("Remaining: " + str(remaining_count))
print("Discarded: " + str(discarded_count))

Remaining: 227
Discarded: 38


In [10]:

# Persist the selected synsets as three JSON files:
#   data/named_populated_synsets.json : synset ID -> name
#   data/final_synsets_counts.json    : synset ID -> number of annotation files
#   data/final_synsets_names.json     : synset ID -> name
FILE_PATH = 'data/named_populated_synsets.json'
data = json.dumps(named_populated_synsets)
with open(FILE_PATH, 'w') as f:
    f.write(data)

# Rank the named synsets by how many annotation files they have, largest first.
# `named_populated_synsets` maps ID -> name, so the counts must be looked up in
# `populated_synsets`; sorting the name mapping by its values would order the
# synsets by name and record names where counts are expected.
synsets_sorted = sorted(
    ((synset, populated_synsets[synset]) for synset in named_populated_synsets),
    key=lambda x: x[1],
    reverse=True,
)
print(len(synsets_sorted))

# To keep only the N most populated synsets, iterate over synsets_sorted[:N].
final_synsets_counts = {}
final_synsets_names = {}
for synset, count in synsets_sorted:
    final_synsets_names[synset] = named_populated_synsets[synset]
    final_synsets_counts[synset] = count

FILE_PATH_1 = 'data/final_synsets_counts.json'
FILE_PATH_2 = 'data/final_synsets_names.json'
data1 = json.dumps(final_synsets_counts)
data2 = json.dumps(final_synsets_names)
with open(FILE_PATH_1, 'w') as f:
    f.write(data1)
with open(FILE_PATH_2, 'w') as f:
    f.write(data2)

print("All done.")

227
All done.
