In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import math
from pathlib import Path
import os
import random
import sys

# Make the project modules that live in ../code (relative to the notebook's
# working directory) importable; the `cifar10` and `utils` imports below rely
# on this.
repo_root = os.path.join(os.getcwd(), '../code')
# Guard the append so that re-running this cell does not keep adding the same
# directory to sys.path.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from IPython.display import display
from ipywidgets import widgets
from matplotlib import pyplot as plt
import numpy as np
import tqdm

import cifar10
import utils

# CIFAR-10 data object; later cells read images from `cifar.all_images`.
cifar = cifar10.CIFAR10Data(
    '../other_data/cifar10'
)

# Passed through to both loaders below; empty string here.
version_string = ''

# `all_new_imgs` is looked up by keyword and `img_data` by Tiny Images index
# in the cells that follow.
all_new_imgs, img_data = utils.load_tinyimage_subset(
    version_string=version_string,
)
# Maps a keyword to the indices of the CIFAR-10 images carrying that keyword
# (presumably de-duplicated per keyword because of unique_keywords=True --
# TODO confirm against utils).
cifar10_by_keyword = utils.load_cifar10_by_keyword(
    unique_keywords=True,
    version_string=version_string,
)

Loading indices from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/tinyimage_subset_indices.json
Loading image data from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/tinyimage_subset_data.pickle
Loading keywords from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/cifar10_keywords_unique.json


In [2]:
# Keyword whose images are reviewed in this notebook run.
keyword_name = 'rowboat'

# Images already in CIFAR-10 for this keyword, and the Tiny Images candidates
# for the same keyword.
existing_cifar10_indices = cifar10_by_keyword[keyword_name]
new_imgs = all_new_imgs[keyword_name]
keyword_summary = 'Keyword "{}" has {} images in CIFAR-10 and {} unused images remaining in Tiny Images.\n'
print(keyword_summary.format(keyword_name, len(existing_cifar10_indices), len(new_imgs)))

good_indices_filename = '../other_data/tinyimage_good_indices.json'

# The selection file must already exist; bail out loudly instead of silently
# starting an empty selection.
if not Path(good_indices_filename).is_file():
    print('Error: the file {} does not exist.'.format(good_indices_filename))
    print('Unless you want to start collecting images from scratch, this should not happen.')
    print('To create a new data structure for collectiong new images, modify the following three lines.')
    raise ValueError('No existing file of good indices.')
    # good_indices_dict = {}
    # cur_good_indices = set([])

with open(good_indices_filename, 'r') as f:
    good_indices_dict = json.load(f)
# Previously selected indices for this keyword (empty if the keyword has no
# entry in the file yet).
cur_good_indices = set(good_indices_dict.get(keyword_name, []))

# Union of the selected indices over all keywords; only used for the summary.
all_good_indices = list({
    selected_index
    for selected_indices in good_indices_dict.values()
    for selected_index in selected_indices
})
print('There are currently {} new selected images for this keyword.\n'.format(len(cur_good_indices)))
print('    ({} good indices selected overall)'.format(len(all_good_indices)))

Keyword "rowboat" has 23 images in CIFAR-10 and 1795 unused images remaining in Tiny Images.

There are currently 40 new selected images for this keyword.

    (9244 good indices selected overall)


## Existing images

In [3]:
# Show (at most 100 of) the CIFAR-10 images that already carry the current
# keyword, laid out as tabs of num_rows x num_cols image grids.
num_images_to_show = min(len(existing_cifar10_indices), 100)
num_cols = 8
num_rows = 13
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale=3

tab_contents = []
for tab_number in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Half-open range of image positions shown on this tab; the last tab is
    # cut off at num_images_to_show.
    tab_begin = tab_number * num_per_tab
    tab_end = min(tab_begin + num_per_tab, num_images_to_show)
    grid_rows = []
    for row_begin in range(tab_begin, tab_end, num_cols):
        # The final row of the final tab may hold fewer than num_cols images.
        row_end = min(row_begin + num_cols, tab_end)
        row_cells = []
        for position in range(row_begin, row_end):
            cifar10_index = existing_cifar10_indices[position]
            png_data = utils.np_to_png(cifar.all_images[cifar10_index,:,:,:], scale=scale)
            # Each cell is the image with its CIFAR-10 index underneath.
            cell = widgets.VBox([
                widgets.Image(value=png_data),
                widgets.Label(value=str(cifar10_index)),
            ])
            cell.layout.align_items = 'center'
            cell.layout.padding = '6px'
            row_cells.append(cell)
        grid_rows.append(widgets.HBox(row_cells))
    tab_contents.append(widgets.VBox(grid_rows))

tab = widgets.Tab()
tab.children = tab_contents
# Tabs are titled with their position: "0", "1", ...
for tab_position, _tab_child in enumerate(tab.children):
    tab.set_title(tab_position, str(tab_position))
display(tab)

Setting up image tabs: 100%|██████████| 1/1 [00:00<00:00,  3.05it/s]


## New images

In [4]:
# Show a window of the candidate Tiny Images for the current keyword as tabs
# of num_rows x num_cols grids. Every image gets a checkbox (pre-checked if the
# image is already in the saved selection); the checkbox states are read back
# by the save cell below via `checkboxes`.
num_images_to_show = 400  # upper bound on the number of candidates displayed
img_offset = 0            # position in new_imgs of the first image displayed
dst_threshold = 1000.0    # candidates with cifar10_nn_dst below this get a ' D' marker

assert 0 <= img_offset < len(new_imgs), \
    'img_offset={} is outside the {} available images'.format(img_offset, len(new_imgs))
num_images_to_show = min(num_images_to_show, len(new_imgs) - img_offset)
num_cols = 8
num_rows = 13
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale=3

# Maps Tiny Images index -> the Checkbox widget shown under that image.
checkboxes = {}

tab_contents = []
for kk in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Half-open range of window positions shown on this tab; the last tab is
    # cut off at num_images_to_show.
    tab_begin = kk * num_per_tab
    tab_end = min(tab_begin + num_per_tab, num_images_to_show)
    rows = []
    for row_begin in range(tab_begin, tab_end, num_cols):
        # The final row of the final tab may hold fewer than num_cols images.
        row_end = min(row_begin + num_cols, tab_end)
        cur_row = []
        for position in range(row_begin, row_end):
            cur_index = img_offset + position
            cur_ti_index = new_imgs[cur_index]['tinyimage_index']
            cur_dst = new_imgs[cur_index]['cifar10_nn_dst']
            cur_img = widgets.Image(value=utils.np_to_png(img_data[cur_ti_index], scale=scale))
            description = str(cur_ti_index)
            if cur_dst < dst_threshold:
                # NOTE(review): presumably flags a likely near-duplicate of an
                # existing CIFAR-10 image -- confirm the meaning of cifar10_nn_dst.
                description += ' D'
            # The layout height needs a CSS unit ('28px', not '28'). Sizing is
            # done through `layout` only; assigning `checkbox.width` directly
            # does not set a widget trait and had no effect.
            cur_checkbox = widgets.Checkbox(
                value=cur_ti_index in cur_good_indices,
                description=description,
                indent=False,
                layout=widgets.Layout(width='100px', height='28px'),
            )
            checkboxes[cur_ti_index] = cur_checkbox
            cur_box = widgets.VBox([cur_img, cur_checkbox])
            cur_box.layout.align_items = 'center'
            cur_box.layout.padding = '6px'
            cur_row.append(cur_box)
        rows.append(widgets.HBox(cur_row))
    tab_contents.append(widgets.VBox(rows))

tab = widgets.Tab()
tab.children = tab_contents
# Tabs are titled with their position: "0", "1", ...
for i in range(len(tab.children)):
    tab.set_title(i, str(i))
display(tab)

Setting up image tabs: 100%|██████████| 4/4 [00:05<00:00,  1.40s/it]


In [5]:
# Fold the checkbox states from the grid above into the selection for the
# current keyword: checked images are added, unchecked ones are removed.
# Indices that were not displayed (no checkbox) are left untouched.
for ti_index, box in checkboxes.items():
    if box.value:
        cur_good_indices.add(ti_index)
    else:
        cur_good_indices.discard(ti_index)
# Sorted so the saved JSON has a stable order and re-saving an unchanged
# selection does not reshuffle the file.
good_indices_dict[keyword_name] = sorted(cur_good_indices)
print('There are now {} selected images for keyword "{}".'.format(len(cur_good_indices), keyword_name))
# Serialize before opening the file: opening with 'w' truncates it, so a
# failure inside json.dump would otherwise wipe the selections of all keywords.
serialized_good_indices = json.dumps(good_indices_dict, indent=2)
with open(good_indices_filename, 'w') as f:
    f.write(serialized_good_indices)

Now there now 40 selected images for keyword "rowboat".
