# Step 1: Initializing the notebook
Run this cell only once to start reviewing. It is not necessary to reload this cell in order to pick up results from other reviewers (that happens in the next cell).

In [26]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import datetime
import getpass
from IPython.display import display
from ipywidgets import widgets
import json
import math
import matplotlib
import matplotlib.pyplot as plt
import os
import pickle
import sys
import tqdm

repo_root = os.path.join(os.getcwd(), '../code')
sys.path.append(repo_root)

import candidate_data
import imagenet
import image_loader
from near_duplicate_checker import print_nn_stats
import mturk_utils
import mturk_data
from review_near_duplicates_notebook_code import *
import utils

# Build the shared data objects; `loader` and `cds` are handed to the review
# cell further down.
imgnt = imagenet.ImageNetData()
cds = candidate_data.CandidateData()
mturk = mturk_data.MTurkData(
    live=True,
    load_assignments=True,
    source_filenames_to_ignore=mturk_data.main_collection_filenames_to_ignore,
)
loader = image_loader.ImageLoader(imgnt, cds)

# Show the CSS object returned by mturk_utils so it applies to the review widgets.
display(mturk_utils.get_nn_review_css())

# Read the pickled nearest neighbor results and print their per-metric statistics.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that come from this repository.
with open('../data/metadata/nearest_neighbor_results.pickle', 'rb') as nn_results_file:
    nn_results = pickle.load(nn_results_file)
print('Current nearest neighbor statistics: ')
print_nn_stats(nn_results)
print()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Reading from local file /Users/becca/git/imagenet_2/data/cache/metadata/imagenet_metadata_2018-09-14_01-26-58_UTC.pickle ... done
Loaded 178264 unique candidates from 146 search result JSON file(s).
    /Users/becca/git/imagenet_2/data/search_results/...
        2018-07-31_flickr_search_result_vaishaal_class_1_153.json
        2018-08-20-16-10-18_becca.json
        2018-08-25-11-43-09_becca.json
        2018-08-27-22-53-45_becca.json
        2018-08-30-02-40-26_becca.json
        2018-08-30-18-46-35_becca.json
        2018-08-30-19-31-10_becca.json
        2018-09-04-17-03-01_becca.json
        2018-09-04-17-36-03_becca.json
        2018-09-05-16-16-14_becca.json
        ...
    There were 71506 duplicate occurences.
    Ignored 15975 candidate entries because they are on the blacklist (blacklist size: 12860).
Loaded 6751 HITs from 55 hit data JSON file(s) in 1 seconds.
    /Users/becca/git/imagenet

HTML(value='\n<style>\n.image_name {\n    font-size: 8px;\n    line-height: 12px;\n    height: 12px;\n}\n</sty…

Current nearest neighbor statistics: 
104534 candidates for metric l2
90812 candidates for metric dssim
94891 candidates for metric fc7



# Step 2: Load existing near duplicates, reviews, and nearest neighbor information
Re-run this cell after the files on disk have changed, e.g., after a `git pull`.

In [27]:
# Load existing reviews.
# `reviews` maps a candidate image id to a dictionary keyed by metric name
# ('l2', 'dssim', 'fc7'). The entry inspected at the end of this notebook
# stores, per metric, the 'reviewer', the review 'date', and a list of
# 'references' (image names / candidate ids).
#
# `reviews` is intentionally NOT pre-initialized to {} here: if reading the
# file fails (e.g. a merge conflict after `git pull` makes the JSON invalid),
# an empty dictionary left behind in the kernel could later be written back
# over the review file by the save cell.
with open('../data/metadata/nearest_neighbor_reviews_v2.json', 'r') as f:
    reviews = json.load(f)
print('Current review statistics: ')
print_nn_stats(reviews)
print()

# Per-metric thresholds passed to get_cd_metric_pairs and to the review cells.
# The trailing comments keep the alternative values from the original cell.
thresholds = {
    'l2' : 1.2e8,  # 1.5e8
    'dssim' : 0.2205,
    'fc7' : 1.32e4  # 1e4
}
cd_metric_pairs = get_cd_metric_pairs(nn_results, reviews, thresholds, cds)
print('Number of candidate-metric pairs to review: {}'.format(len(cd_metric_pairs)))

# Load existing near duplicates
with open('../data/metadata/near_duplicates.json', 'r') as f:
    near_duplicates = json.load(f)
print('Number of candidates with duplicates: {}'.format(len(near_duplicates)))
print()

Current review statistics: 
1983 candidates for metric l2
2588 candidates for metric dssim
2370 candidates for metric fc7

Number of candidate-metric pairs to review: 623
Number of candidates with duplicates: 3654



# Review nearest neighbors

Go through the candidate pairs by changing `candidate_offset`.

TODO: add functionality to show only candidate / nearest neighbor pairs that are not reviewed yet.

In [22]:
# Change `candidate_offset` to step through the candidate pairs.
candidate_offset = 0
# Forwarded to review_near_duplicates; presumably the number of nearest
# neighbors shown per candidate (TODO confirm).
top_k = 10

# Build the review widgets; the returned checkbox and review-box collections
# are parsed by the save cell below.
checkboxes, review_boxes = review_near_duplicates(
    cd_metric_pairs,
    nn_results,
    reviews,
    near_duplicates,
    top_k,
    thresholds,
    candidate_offset,
    loader,
    cds,
    max_to_show=100,
)

Loading image data ... done, took 0.00027372699696570635 seconds


HBox(children=(IntProgress(value=1, bar_style='info', description='Setting up image tabs', max=1), HTML(value=…


The following images have candidates below the threshold that were omitted:



Tab()

In [20]:
# Record the current widget state under the local user name and persist it.
reviewer_name = getpass.getuser()
cur_selected, cur_unselected = parse_checkboxes(checkboxes)
near_duplicates = verify_checkboxes(checkboxes, cur_selected, cur_unselected, near_duplicates)
reviews = parse_review_boxes(review_boxes, reviews, reviewer_name, thresholds)

# Each output path is defined once and reused for both the write and the log
# message, so the printed location always matches the file actually written
# (the message previously named nearest_neighbor_reviews.json while the data
# went to nearest_neighbor_reviews_v2.json).
reviews_path = '../data/metadata/nearest_neighbor_reviews_v2.json'
near_duplicates_path = '../data/metadata/near_duplicates.json'

with open(reviews_path, 'w') as f:
    json.dump(reviews, f, indent=2)
print('There are now {} images with at least one review entry.'.format(len(reviews)))
print('    Wrote to {}'.format(reviews_path))

near_duplicates_to_save = get_near_duplicates_to_save(near_duplicates)
num_near_duplicates = get_num_near_duplicates(near_duplicates_to_save)
print('There are now {} candidates with near duplicates.'.format(num_near_duplicates))

# sort_keys=True keeps the key order of the saved file stable between runs.
with open(near_duplicates_path, 'w') as f:
    json.dump(near_duplicates_to_save, f, indent=2, sort_keys=True)
print('    Wrote to {}'.format(near_duplicates_path))


There are now 4442 images with at least one review entry.
    Wrote to ../data/metadata/nearest_neighbor_reviews.json
There are now 3654 candidates with near duplicates.
    Wrote to ../data/metadata/near_duplicates.json


In [25]:
# Spot-check: display the stored review entry for a single candidate id.
reviews["5cec40d17d6d478a66516d237f1702bb3b5d500b"]

{'l2': {'reviewer': 'becca',
  'date': '2018-10-12 15:01:34.232385',
  'references': ['n04592741_9186.JPEG',
   'ILSVRC2012_test_00075380.JPEG',
   '8c3d391beeda1ee272c20378fa067c21427c7051',
   '5abb22f22d7f066653f58355410fa307040c3912',
   'n04275548_10476.JPEG',
   'n03804744_34539.JPEG',
   'n02134084_829.JPEG',
   'n02058221_9499.JPEG',
   '11e59de908f000f41cd975aab8070605d21dc36f',
   'n02264363_1501.JPEG']},
 'dssim': {'reviewer': 'becca',
  'date': '2018-10-12 15:01:34.232402',
  'references': ['ILSVRC2012_test_00009498.JPEG',
   '1880cf9a3f28028066f77ad8044ef4f01ca78e4f',
   'ILSVRC2012_test_00075380.JPEG',
   'ILSVRC2012_val_00021872.JPEG',
   'ILSVRC2012_test_00039843.JPEG',
   '8c3d391beeda1ee272c20378fa067c21427c7051',
   '5abb22f22d7f066653f58355410fa307040c3912',
   '3d351fcd1cd706573034f1d4283500cc569ff48a',
   'ILSVRC2012_test_00016870.JPEG',
   '0b902bc90ea22cc91c7e89e310807010ca0ce1ed']},
 'fc7': {'reviewer': 'becca',
  'date': '2018-10-17 11:37:38.378699',
  'refere