In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook

from collections import Counter
import json
import math
import os
import pathlib
import random
import sys
from timeit import default_timer as timer

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# Append the ../code directory (relative to the current working directory)
# to the module search path.
# NOTE(review): presumably this is where the project modules imported in this
# notebook live — confirm; it also means the notebook depends on the cwd it is
# launched from.
repo_root = os.path.join(os.getcwd(), '../code')
sys.path.append(repo_root)

import candidate_data
import dataset_sampling
import final_dataset_inspection_notebook_code as notebook_code
import image_loader
import imagenet
import mturk_data
import mturk_utils
import near_duplicate_data
import utils

# Build the shared data objects used by every cell below. The construction
# order matters: later objects take the earlier ones as arguments.
imgnet = imagenet.ImageNetData()

# Blacklisted candidates are kept (exclude_blacklisted_candidates=False).
cds = candidate_data.CandidateData(
    load_metadata_from_s3=False,
    exclude_blacklisted_candidates=False,
)

loader = image_loader.ImageLoader(imgnet, cds)

mturk = mturk_data.MTurkData(
    live=True,
    load_assignments=True,
    source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore,
)

ndc = near_duplicate_data.NearDuplicateData(
    imgnet=imgnet,
    candidates=cds,
    mturk_data=mturk,
    load_review_thresholds=True,
)

# Show the object returned by get_dataset_inspection_css() so it is rendered
# in the notebook (presumably the stylesheet for the inspection UI).
display(mturk_utils.get_dataset_inspection_css())

Reading from local file /Users/ludwig/research/deep_learning/imagenet_2/data/cache/metadata/imagenet_metadata_2018-09-14_01-26-58_UTC.pickle ... done
Loaded 200418 unique candidates from 190 search result JSON file(s).
    /Users/ludwig/research/deep_learning/imagenet_2/data/search_results/...
        2018-07-31_flickr_search_result_vaishaal_class_1_153.json
        2018-08-20-16-10-18_becca.json
        2018-08-25-11-43-09_becca.json
        2018-08-27-22-53-45_becca.json
        2018-08-30-02-40-26_becca.json
        2018-08-30-18-46-35_becca.json
        2018-08-30-19-31-10_becca.json
        2018-09-04-17-03-01_becca.json
        2018-09-04-17-36-03_becca.json
        2018-09-05-16-16-14_becca.json
        ...
    There were 81754 duplicate occurences.
    Ignored 0 candidate entries because they are on the blacklist (blacklist size: 24363).
Loaded 7040 HITs from 64 hit data JSON file(s) in 0 seconds.
    /Users/ludwig/research/deep_learning/imagenet_2/data/mturk/hit_data_live/...


HTML(value='\n<style>\n.image_grid_caption {\n    font-size: 8px;\n    line-height: 12px;\n    height: 12px;\n…

In [2]:
# Name of the dataset file to inspect; later cells also derive the review
# file name from it.
dataset_name = 'imagenetv2-c-5.json'

# Load the dataset together with its review data and print a summary.
data, review_data, review_filepath = notebook_code.load_dataset_and_print_info(
    dataset_name, imgnet, cds, mturk, ndc)

Dataset imagenetv2-c-5.json

Generated by ludwig at 2018-11-26_02:30:16_UTC
Sampling function: sample_best (seed 285740942)

Futher parameters:
    min_num_annotations: 10
    near_duplicate_review_targets: {'l2': 120000000.0, 'dssim': 0.2205, 'fc7': 13200.0}
    is_valid: False
    starting_from: imagenetv2-c-4.json

9996 images for 1000 wnids

998 wnids have 10 images
2 wnids have 8 images

Number of unique image filenames: 9996

9996 images are candidate images
0 images are ImageNet images
    0 training images
    0 val images
    0 test images

Dataset wnids match the ImageNet wnids

The dataset contains 0 blacklisted candidates

The minimum number of assignments for an image in the dataset is 10

The minimum image selection frequency of an image in the dataset is 0.5
    (0.5 among wnids without a special threshold)
    (0 images do not have a selection frequency)
The average image selection frequency of the images in the dataset is 0.92

The dataset contains 0 near-duplicates
Re

In [7]:
# --- Review UI configuration -------------------------------------------------
num_wnids_to_show = 15
starting_wnid = 'n01669191'
# To start from the first wnid in sorted order instead, use:
#     starting_wnid = sorted(imgnet.class_info_by_wnid.keys())[0]

image_filter = 'all'
show_reviewed = False

# Build the review widgets. The checkbox / text-field collections returned
# here are read back by the cell that persists the review results.
(ui,
 review_checkboxes,
 problematic_checkboxes,
 blacklist_checkboxes,
 near_duplicate_text_fields) = notebook_code.generate_review_ui(
    num_wnids_to_show=num_wnids_to_show,
    starting_wnid=starting_wnid,
    data=data,
    review_data=review_data,
    imgnet=imgnet,
    cds=cds,
    loader=loader,
    ndc=ndc,
    num_val_images_per_wnid=10,
    image_filter=image_filter,
    show_reviewed=show_reviewed,
)

# Last expression of the cell: render the UI.
ui

Loading image data ... done, took 4.059256853000022 seconds


VBox(children=(Label(value='n01669191', _dom_classes=('wnid_heading',)), Label(value='box turtle, box tortoise…

In [8]:
# Persist the blacklist selections made in the review UI: every candidate
# whose blacklist checkbox is ticked and that is not yet in the blacklist
# file gets added with a fixed reason string.
with open('../data/metadata/candidate_blacklist.json', 'r') as f:
    blacklist = json.load(f)

# Per-wnid tallies for the summary printed below. Nothing in this cell ever
# removes entries, so num_removed_by_wnid stays empty.
num_added_by_wnid = {}
num_removed_by_wnid = {}
for cid, checkbox in blacklist_checkboxes.items():
    assert cid in cds.all_candidates
    cur_wnid = cds.all_candidates[cid]['wnid']
    if not checkbox.value or cid in blacklist:
        continue
    blacklist[cid] = 'invalid image (selected in the final review notebook)'
    num_added_by_wnid[cur_wnid] = num_added_by_wnid.get(cur_wnid, 0) + 1

# Report the changes, one line per affected wnid, in sorted wnid order.
relevant_wnids = sorted(set(num_added_by_wnid) | set(num_removed_by_wnid))
for cur_wnid in relevant_wnids:
    num_added = num_added_by_wnid.get(cur_wnid, 0)
    num_removed = num_removed_by_wnid.get(cur_wnid, 0)
    cur_synset = ', '.join(imgnet.class_info_by_wnid[cur_wnid].synset)
    print(f'Added {num_added} candidates to the blacklist (removed {num_removed}) for wnid {cur_wnid} ({cur_synset})')

# Write the (possibly unchanged) blacklist back to disk.
with open('../data/metadata/candidate_blacklist.json', 'w') as f:
    json.dump(blacklist, f, indent=2, sort_keys=True)

print()


# Merge the near-duplicate sets typed into the review UI into
# near_duplicates.json. Each text field holds a whitespace-separated list of
# candidate ids; the first id is the key under which the others are stored.
with open('../data/metadata/near_duplicates.json', 'r') as f:
    near_duplicates = json.load(f)
for wnid, text_field in near_duplicate_text_fields.items():
    # str.split() without an argument splits on any run of whitespace and
    # never yields empty tokens. Splitting on a single ' ' produced '' tokens
    # for leading/trailing/double spaces, which then failed the candidate-id
    # assertion below, and a blank-but-not-empty field was not skipped.
    # (assumes text_field.value is a str — TODO confirm against the widget type)
    cids = text_field.value.split()
    if not cids:
        # Nothing was entered for this wnid.
        continue
    assert len(cids) >= 2, \
        f'Near-duplicate set {cids} for wnid {wnid} needs at least two candidate ids'
    if len(cids) != len(set(cids)):
        print(f'Near-duplicate set {cids} has repeated elements')
    assert len(cids) == len(set(cids))
    for cid in cids:
        assert cid in cds.all_candidates, \
            f'Unknown candidate id {cid!r} in near-duplicate set for wnid {wnid}'
    root_cid = cids[0]
    # Union the existing entries (if any) with the new ones; stored sorted and
    # without repeats, exactly as before.
    near_duplicates[root_cid] = sorted(
        set(near_duplicates.get(root_cid, [])) | set(cids[1:]))
    print('Added {} near-duplicates for wnid {} ({})'.format(len(cids), wnid, ', '.join(imgnet.class_info_by_wnid[wnid].synset)))

# Write the (possibly unchanged) near-duplicate mapping back to disk.
with open('../data/metadata/near_duplicates.json', 'w') as f:
    json.dump(near_duplicates, f, indent=2, sort_keys=True)

print()

# Store the "reviewed" / "problematic" checkbox states in the review file that
# belongs to dataset_name (its last five characters are replaced by
# '_review.json').
review_filename = dataset_name[:-5] + '_review.json'
review_filepath = pathlib.Path('../data/dataset_reviews/' + review_filename).resolve()
assert review_filepath.is_file()

# Re-read the file so the update is applied to what is currently on disk.
with open(review_filepath, 'r') as f:
    review_data = json.load(f)

# Copy the checkbox states into the per-wnid entries: first all 'reviewed'
# flags, then all 'problematic' flags.
for field, checkboxes in (('reviewed', review_checkboxes),
                          ('problematic', problematic_checkboxes)):
    for wnid, checkbox in checkboxes.items():
        review_data[wnid][field] = checkbox.value

with open(review_filepath, 'w') as f:
    json.dump(review_data, f, indent=2, sort_keys=True)

print('Wrote updated review data to {}'.format(review_filepath))
print()

# Summary counts over all wnids in the review file.
num_reviewed = sum(1 for entry in review_data.values() if entry['reviewed'])
print('Number of reviewed wnids: {}'.format(num_reviewed))
num_problematic = sum(1 for entry in review_data.values() if entry['problematic'])
print('Number of problematic wnids: {}'.format(num_problematic))

Added 1 candidates to the blacklist (removed 0) for wnid n03697007 (lumbermill, sawmill)
Added 1 candidates to the blacklist (removed 0) for wnid n04328186 (stopwatch, stop watch)


Wrote updated review data to /Users/ludwig/research/deep_learning/imagenet_2/data/dataset_reviews/imagenetv2-c-4_review.json

Number of reviewed wnids: 1000
Number of problematic wnids: 19


In [3]:
# Materialize the result of compute_splitting_points(review_data, [1]) as a
# list so the notebook displays it.
# NOTE(review): the meaning of the [1] argument is not evident from this
# notebook — confirm against notebook_code.
list(notebook_code.compute_splitting_points(review_data, [1]))

[('n01669191', 1), ('n02066245', 28)]

In [3]:
# NOTE(review): ad-hoc scratch arithmetic; where 263 and 3 come from is not
# documented in this notebook — TODO document or remove.
263 / 3

87.66666666666667

In [7]:
# NOTE(review): ad-hoc scratch arithmetic; where 260, 76 and 2 come from is
# not documented in this notebook — TODO document or remove.
(260 - 76) / 2

92.0