In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import math
import os
import pickle
import random
import sys

from IPython.display import display
from ipywidgets import Layout
from ipywidgets import widgets
from matplotlib import pyplot as plt
import numpy as np
import tqdm

repo_root = os.path.join(os.getcwd(), '../code')
sys.path.append(repo_root)

import cifar10
import utils

# Class names used to caption images; later cells look them up by label value.
cifar_label_names = utils.cifar10_label_names

# Load the new test set together with the Tiny Images index of every image.
version_string = 'v6'
images, labels, tinyimage_indices = utils.load_new_test_data(
    version_string, load_tinyimage_indices=True)
num_new_images = images.shape[0]

# Flatten every image into a single row; the flattened array feeds the
# near-duplicate search in a later cell.
flat_shape = (num_new_images, -1)
reshaped_images = images.reshape(flat_shape)
expected_flat_shape = (num_new_images, 32 * 32 * 3)
assert reshaped_images.shape == expected_flat_shape

print('\nLoaded version {} of the new dataset.'.format(version_string))
print('There are {} images in the dataset.'.format(num_new_images))

Loading labels from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v6_labels.npy
Loading image data from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v6_data.npy
Loading Tiny Image indices from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/cifar10.1_v6_ti_indices.json

Loaded version v6 of the new dataset.
There are 2000 images in the dataset.


# The new dataset

Execute the following cell to browse the new dataset.

In [2]:
# Set this to determine which image to start with
image_offset = 0

# Grid geometry: at most 400 images, num_cols per row and num_rows rows per tab.
num_images_to_show = min(len(labels) - image_offset, 400)
num_cols = 8
num_rows = 13
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale=3

tab_contents = []
for tab_idx in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Grid positions covered by this tab; the final tab may be only partly filled.
    tab_start = tab_idx * num_per_tab
    tab_stop = min(tab_start + num_per_tab, num_images_to_show)
    grid_rows = []
    for row_start in range(tab_start, tab_stop, num_cols):
        # The last row of the last tab may hold fewer than num_cols images.
        row_stop = min(row_start + num_cols, tab_stop)
        cells = []
        for position in range(row_start, row_stop):
            dataset_index = position + image_offset
            picture = widgets.Image(value=utils.np_to_png(images[dataset_index, :, :, :], scale=scale))
            # Caption: class name followed by the image's index in the dataset.
            caption_text = '{}  ({})'.format(cifar_label_names[labels[dataset_index]], dataset_index)
            caption = widgets.Label(value=caption_text)
            cell = widgets.VBox([picture, caption])
            cell.layout.align_items = 'center'
            cell.layout.padding = '6px'
            cells.append(cell)
        grid_rows.append(widgets.HBox(cells))
    tab_contents.append(widgets.VBox(grid_rows))

# One tab per page of the grid, titled with its page number.
tab = widgets.Tab()
tab.children = tab_contents
for tab_number in range(len(tab.children)):
    tab.set_title(tab_number, str(tab_number))
display(tab)

Setting up image tabs: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]


# New images compared to previous dataset

In [3]:
# Only the Tiny Images indices of the previous version are needed for the comparison.
prev_version_string = 'v4'
_, _, prev_tinyimage_indices = utils.load_new_test_data(version_string=prev_version_string, load_tinyimage_indices=True)

# Tiny Images indices that occur in the current version but not in the previous one.
new_tinyimage_indices = list(set(tinyimage_indices) - set(prev_tinyimage_indices))

print('\nThere are {} new images in version {} compared to version {}'.format(len(new_tinyimage_indices), version_string, prev_version_string))

# Translate each new Tiny Images index into its position in the current dataset.
# A lookup table built once replaces a linear `list.index` scan per image; keeping
# the first position seen for each index matches what `list.index` returns.
position_of_ti_index = {}
for position, ti_index in enumerate(tinyimage_indices):
    position_of_ti_index.setdefault(ti_index, position)
images_to_show = [position_of_ti_index[ti_index] for ti_index in new_tinyimage_indices]

# Set this to determine which image to start with
image_offset = 0

# Grid geometry: at most 400 images, num_cols per row and num_rows rows per tab.
num_images_to_show = min(len(images_to_show) - image_offset, 400)
num_cols = 8
num_rows = 13
num_per_tab = num_cols * num_rows
num_tabs = int(math.ceil(num_images_to_show / num_per_tab))
scale=3

tab_contents = []
for kk in tqdm.tqdm(range(num_tabs), desc='Setting up image tabs'):
    # Grid positions covered by this tab; the final tab may be only partly filled.
    tab_start = kk * num_per_tab
    tab_stop = min(tab_start + num_per_tab, num_images_to_show)
    rows = []
    for row_start in range(tab_start, tab_stop, num_cols):
        cur_row = []
        # The last row of the last tab may hold fewer than num_cols images.
        for grid_position in range(row_start, min(row_start + num_cols, tab_stop)):
            cur_index = images_to_show[grid_position + image_offset]
            cur_img = widgets.Image(value=utils.np_to_png(images[cur_index,:,:,:], scale=scale))
            cur_class = labels[cur_index]
            # First caption line: class name and dataset index.
            cur_label = widgets.Label(value=cifar_label_names[cur_class] + '  (' + str(cur_index) + ')', layout=Layout(height='20px'))
            # Second caption line: the image's Tiny Images index.
            label_text2 = str(tinyimage_indices[cur_index])
            cur_label2 = widgets.Label(value=label_text2, layout=Layout(height='20px'))
            cur_box = widgets.VBox([cur_img, cur_label, cur_label2])
            cur_box.layout.align_items = 'center'
            cur_box.layout.padding = '6px'
            cur_row.append(cur_box)
        rows.append(widgets.HBox(cur_row))
    tab_contents.append(widgets.VBox(rows))

# One tab per page of the grid, titled with its page number.
tab = widgets.Tab()
tab.children = tab_contents
for i in range(len(tab.children)):
    tab.set_title(i, str(i))
display(tab)

Setting up image tabs:   0%|          | 0/2 [00:00<?, ?it/s]

Loading labels from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v4_labels.npy
Loading image data from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v4_data.npy
Loading Tiny Image indices from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/cifar10.1_v4_ti_indices.json

There are 126 new images in version v6 compared to version v4


Setting up image tabs: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


# Check for duplicates within the new dataset

In [4]:
# Maps an image's dataset index to the matches reported for it within the new dataset.
near_duplicates = {}
threshold = 1500.0

for query_index in tqdm.tqdm_notebook(range(num_new_images)):
    # NOTE(review): 0.0 and `threshold` are presumably the lower and upper distance
    # bounds of the search -- confirm against utils.find_near_self_duplicates.
    matches = utils.find_near_self_duplicates(reshaped_images, query_index, 0.0, threshold)
    if len(matches) == 0:
        continue
    near_duplicates[query_index] = matches
    print('Index {}: {}'.format(query_index, matches))




In [5]:
image_offset = 0
dup_indices = sorted(near_duplicates)
num_images_to_show = len(dup_indices)
# Each row shows one query image followed by at most num_cols - 1 of its matches.
num_cols = 11
num_rows = 20
num_tabs = int(math.ceil(num_images_to_show / num_rows))
scale=3

tab_contents = []
for tab_idx in tqdm.tqdm_notebook(range(num_tabs), desc='Setting up image tabs'):
    first_row = tab_idx * num_rows
    grid_rows = []
    # Slicing hands the final tab whatever rows are left over.
    for cur_index in dup_indices[first_row:first_row + num_rows]:
        cur_candidates = near_duplicates[cur_index]
        shown_candidates = cur_candidates[:num_cols - 1]
        if len(shown_candidates) < len(cur_candidates):
            print('Warning: index {} has {} candidates'.format(cur_index, len(cur_candidates)))
        # (pixels, caption) pairs for the row: the query image first, then its matches.
        tiles = [(images[cur_index, :, :, :], '{} {}'.format(cur_index, tinyimage_indices[cur_index]))]
        tiles.extend(
            (images[ind, :, :, :], '{} {:.1f}'.format(ind, dst))
            for ind, dst in shown_candidates
        )
        cells = []
        for pixels, caption_text in tiles:
            picture = widgets.Image(value=utils.np_to_png(pixels, scale=scale))
            caption = widgets.Label(value=caption_text)
            cell = widgets.VBox([picture, caption])
            cell.layout.align_items = 'center'
            cell.layout.padding = '6px'
            cells.append(cell)
        grid_rows.append(widgets.HBox(cells))
    tab_contents.append(widgets.VBox(grid_rows))

# One tab per page of rows, titled with its page number.
tab = widgets.Tab()
tab.children = tab_contents
for tab_number in range(len(tab.children)):
    tab.set_title(tab_number, str(tab_number))
display(tab)




# Check for near-duplicates of the new images in CIFAR-10

Running this check requires some extra files from our S3 bucket.

In [6]:
# Distance data for the current dataset version; a later cell indexes it by
# Tiny Images index and iterates over (index, distance) pairs.
dsts_to_cifar10 = utils.load_distances_to_cifar10(version_string=version_string)

# The CIFAR-10 data is read from a path relative to the notebook's working directory.
cifar10_data_dir = '../other_data/cifar10'
cifar = cifar10.CIFAR10Data(cifar10_data_dir)

Loading distances from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/tinyimage_cifar10_distances_v6.json


In [7]:
# Only neighbors whose distance lies within [threshold_lower, threshold_upper] are kept.
threshold_lower = 1000.0
threshold_upper = 2000.0

# Maps a dataset index to its list of (neighbor index, distance) pairs inside the band.
candidates = {}

# Images that already appear in this earlier version are skipped.
# With an empty string nothing is loaded and every image is checked.
prior_version_string = 'v4'
ti_indices_in_prior_version = []
if prior_version_string != '':
    _, _, ti_indices_in_prior_version = utils.load_new_test_data(version_string=prior_version_string, load_tinyimage_indices=True)

# Membership is tested once per image, so build a set up front instead of
# scanning the whole list on every iteration.
prior_ti_index_set = set(ti_indices_in_prior_version)

for ii in range(num_new_images):
    cur_ti_index = tinyimage_indices[ii]
    if cur_ti_index in prior_ti_index_set:
        continue
    within_threshold_neighbors = [
        (ind, dst)
        for ind, dst in dsts_to_cifar10[cur_ti_index]
        if threshold_lower <= dst <= threshold_upper
    ]
    if len(within_threshold_neighbors) > 0:
        candidates[ii] = within_threshold_neighbors

print('\n{} images with candidate nearest neighbors'.format(len(candidates)))

Loading labels from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v4_labels.npy
Loading image data from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/datasets/cifar10.1_v4_data.npy
Loading Tiny Image indices from file /Users/ludwig/research/deep_learning/tinyimages/CIFAR-10.1/other_data/cifar10.1_v4_ti_indices.json

30 images with candidate nearest neighbors


In [8]:
image_offset = 0
candidate_indices = sorted(candidates)
num_images_to_show = len(candidate_indices)
# Each row shows one new image followed by at most num_cols - 1 CIFAR-10 neighbors.
num_cols = 11
num_rows = 20
num_tabs = int(math.ceil(num_images_to_show / num_rows))
scale=3

tab_contents = []
for tab_idx in tqdm.tqdm_notebook(range(num_tabs), desc='Setting up image tabs'):
    first_row = tab_idx * num_rows
    grid_rows = []
    # Slicing hands the final tab whatever rows are left over.
    for cur_index in candidate_indices[first_row:first_row + num_rows]:
        cur_candidates = candidates[cur_index]
        shown_candidates = cur_candidates[:num_cols - 1]
        if len(shown_candidates) < len(cur_candidates):
            print('Warning: index {} has {} candidates'.format(cur_index, len(cur_candidates)))
        # (pixels, caption) pairs for the row: the new image first, then its
        # candidate neighbors taken from the CIFAR-10 images.
        tiles = [(images[cur_index, :, :, :], '{} {}'.format(cur_index, tinyimage_indices[cur_index]))]
        tiles.extend(
            (cifar.all_images[ind, :, :, :], '{} {:.1f}'.format(ind, dst))
            for ind, dst in shown_candidates
        )
        cells = []
        for pixels, caption_text in tiles:
            picture = widgets.Image(value=utils.np_to_png(pixels, scale=scale))
            caption = widgets.Label(value=caption_text)
            cell = widgets.VBox([picture, caption])
            cell.layout.align_items = 'center'
            cell.layout.padding = '6px'
            cells.append(cell)
        grid_rows.append(widgets.HBox(cells))
    tab_contents.append(widgets.VBox(grid_rows))

# One tab per page of rows, titled with its page number.
tab = widgets.Tab()
tab.children = tab_contents
for tab_number in range(len(tab.children)):
    tab.set_title(tab_number, str(tab_number))
display(tab)


