# Subselect new labeled indices

This notebook allows a second person to confirm the initial labelings and subselect a pool of labeled TinyImage indices.


In [33]:
%load_ext autoreload
%autoreload 2

import io
import json
import math
import pickle
import random
import os
import sys

from IPython.display import display
from ipywidgets import widgets
from pathlib import Path
import numpy as np
import PIL.Image
import tqdm

repo_root = os.path.join(os.getcwd(), '../code')
sys.path.append(repo_root)
import utils

# Dataset version to work on. It selects the keyword file, the distance table
# and the file-name suffix of the "large distance" image files loaded below.
version = '7'

cifar10_by_keyword = utils.load_cifar10_by_keyword(True, 'v' + version)

# Versions 6 and 7 both read the v6.1 image files; version 4 has its own files
# and its own distance loader.
if version == '4':
    distances = utils.load_v4_distances_to_cifar10()
    version_for_large_filenames = '4'
elif version in ('6', '7'):
    distances = utils.load_distances_to_cifar10()
    version_for_large_filenames = '6.1'
else:
    # Fail here with a clear message rather than with a NameError on
    # `distances` / `version_for_large_filenames` further down.
    raise ValueError('Unsupported version {!r}; expected "4", "6" or "7".'.format(version))

with open('../other_data/tinyimage_large_dst_images_v{}.json'.format(version_for_large_filenames), 'r') as f:
    all_new_imgs = json.load(f)
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point this at files from a trusted source.
with open('../other_data/tinyimage_large_dst_image_data_v{}.pickle'.format(version_for_large_filenames), 'rb') as f:
    img_data = pickle.load(f)
with open('../other_data/keyword_counts_v{}.json'.format(version), 'r') as f:
    keyword_counts = json.load(f)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading keywords from file /Users/becca/git/CIFAR-10.1/other_data/cifar10_keywords_unique_v7.json


## New labeled images
Load the initial pool of labeled indices for the given label and keyword.

In [32]:
keyword_name = 'gamecock'
# Label under which `keyword_name` is counted (only used for versions 6 and 7).
# Leave as None to look it up in `keyword_counts`; set it explicitly if the
# keyword is listed under more than one label.
label = None

# Version 4 stores counts per keyword; versions 6 and 7 nest them per label.
if version == '4':
    count = keyword_counts[keyword_name]
elif version in ('6', '7'):
    if label is None:
        matching_labels = [name for name, counts in keyword_counts.items()
                           if keyword_name in counts]
        if len(matching_labels) != 1:
            raise KeyError('Keyword "{}" is listed under {} labels ({}); set `label` explicitly.'.format(
                keyword_name, len(matching_labels), matching_labels))
        label = matching_labels[0]
    count = keyword_counts[label][keyword_name]
else:
    raise ValueError('Unsupported version {!r}; expected "4", "6" or "7".'.format(version))

existing_cifar10_indices = cifar10_by_keyword[keyword_name]
new_imgs = all_new_imgs[keyword_name]

print('Keyword "{}" has {} images in CIFAR-10 and {} unused images remaining in TinyImages.'.format(
    keyword_name, len(existing_cifar10_indices), len(new_imgs)))
print('We need {} image(s).'.format(count))

# Use the subselected file if it already exists, otherwise start from the initial pool
subselected_indices_filename = '../other_data/tinyimage_good_indices_subselected_v{}.json'.format(version)
if Path(subselected_indices_filename).is_file():
    with open(subselected_indices_filename, 'r') as f:
        good_indices_dict = json.load(f)
else:
    with open('../other_data/tinyimage_good_indices.json', 'r') as f:
        good_indices_dict = json.load(f)

if keyword_name in good_indices_dict:
    initial_cur_good_indices = set(good_indices_dict[keyword_name])
else:
    print('Missing indices for keyword {}'.format(keyword_name))
    # Start from an empty pool so that a value left over from a previously
    # inspected keyword can never be reused by accident.
    initial_cur_good_indices = set()

# Remove indices that correspond to images that have l2 distance <= threshold
# with an existing image in CIFAR-10
threshold = 1000
cur_good_indices = {idx for idx in initial_cur_good_indices if distances[idx] > threshold}
print('There are currently {} new selected images for this keyword.'.format(len(cur_good_indices)))


KeyError: 'gamecock'

## Existing images

In [27]:
# Grid geometry: every tab shows up to num_rows x num_cols thumbnails of the
# CIFAR-10 images that already exist for the current keyword.
num_images_to_show = len(existing_cifar10_indices)
num_cols = 8
num_rows = 5
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale = 3

# NOTE(review): `cifar` is not defined in any cell visible in this notebook;
# it has to exist in the kernel before this cell runs -- confirm where it is
# loaded.
tab_contents = []
for tab_idx in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Positions (into existing_cifar10_indices) covered by this tab; the last
    # tab may be only partially filled.
    tab_start = tab_idx * num_per_tab
    tab_stop = min(tab_start + num_per_tab, num_images_to_show)
    rows = []
    for row_start in range(tab_start, tab_stop, num_cols):
        row_stop = min(row_start + num_cols, tab_stop)
        cur_row = []
        for cur_index in range(row_start, row_stop):
            cur_cifar10_index = existing_cifar10_indices[cur_index]
            png_bytes = utils.np_to_png(cifar.all_images[cur_cifar10_index, :, :, :], scale=scale)
            # Thumbnail with its CIFAR-10 index printed underneath.
            cur_box = widgets.VBox([
                widgets.Image(value=png_bytes),
                widgets.Label(value=str(cur_cifar10_index)),
            ])
            cur_box.layout.align_items = 'center'
            cur_box.layout.padding = '6px'
            cur_row.append(cur_box)
        rows.append(widgets.HBox(cur_row))
    tab_contents.append(widgets.VBox(rows))

tab = widgets.Tab()
tab.children = tab_contents
for i, _ in enumerate(tab.children):
    tab.set_title(i, str(i))
display(tab)

Setting up image tabs: 100%|██████████| 1/1 [00:00<00:00,  3.75it/s]


Tab(children=(VBox(children=(HBox(children=(VBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x…

## New images

In [28]:
# Freeze the candidates once, in a deterministic order. The grid below indexes
# into this list, so the bounds checks must be made against it (not against
# `new_imgs`), and it must not be rebuilt for every image.
candidate_ti_indices = sorted(cur_good_indices)
img_offset = 0

assert 0 <= img_offset <= len(candidate_ti_indices), \
    'img_offset must be between 0 and {}'.format(len(candidate_ti_indices))
num_images_to_show = len(candidate_ti_indices) - img_offset
num_cols = 8
num_rows = 13
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale = 3

# TinyImages index -> checkbox; read back by the cell that saves the selection.
checkboxes = {}

tab_contents = []
for tab_idx in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Positions (relative to img_offset) covered by this tab; the last tab may
    # be only partially filled.
    tab_start = tab_idx * num_per_tab
    tab_stop = min(tab_start + num_per_tab, num_images_to_show)
    rows = []
    for row_start in range(tab_start, tab_stop, num_cols):
        row_stop = min(row_start + num_cols, tab_stop)
        cur_row = []
        for position in range(row_start, row_stop):
            cur_ti_index = candidate_ti_indices[img_offset + position]
            cur_img = widgets.Image(value=utils.np_to_png(img_data[cur_ti_index], scale=scale))
            # Every candidate starts out ticked; the reviewer unticks the ones to drop.
            cur_checkbox = widgets.Checkbox(
                value=cur_ti_index in cur_good_indices,
                description=str(cur_ti_index),
                indent=False,
                layout=widgets.Layout(width='100px', height='28'))
            cur_checkbox.width = '90px'
            checkboxes[cur_ti_index] = cur_checkbox
            cur_box = widgets.VBox([cur_img, cur_checkbox])
            cur_box.layout.align_items = 'center'
            cur_box.layout.padding = '6px'
            cur_row.append(cur_box)
        rows.append(widgets.HBox(cur_row))
    tab_contents.append(widgets.VBox(rows))

tab = widgets.Tab()
tab.children = tab_contents
for i in range(len(tab.children)):
    tab.set_title(i, str(i))
display(tab)

Setting up image tabs: 100%|██████████| 1/1 [00:00<00:00,  4.62it/s]


Tab(children=(VBox(children=(HBox(children=(VBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x…

In [29]:
# Fold the reviewer's checkbox decisions back into the selection for this
# keyword: ticked indices are kept, unticked ones are dropped.
for ti_index, box in checkboxes.items():
    if box.value:
        cur_good_indices.add(ti_index)
    else:
        cur_good_indices.discard(ti_index)
# Store a sorted list so the JSON file is stable from run to run (a set has no
# defined order, which would produce arbitrary diffs).
good_indices_dict[keyword_name] = sorted(cur_good_indices)
print('Now there are {} selected images for keyword "{}".'.format(len(cur_good_indices), keyword_name))
# Persist the selections of all keywords to the subselected file.
with open(subselected_indices_filename, 'w') as f:
    json.dump(good_indices_dict, f, indent=2)

Now there now 8 selected images for keyword "gamecock".
