First we create a balanced set of images for adapting the weights of naphash 
In order 16384 frames from each of these datasets:
* CelebA https://mmlab.ie.cuhk.edu.hk/projects/CelebA.html
* COCO https://cocodataset.org/#home
* Alternative ImageNet datasets  "ImageNetV2"( https://github.com/modestyachts/ImageNetV2 ) and ImageNet-Sketch (https://github.com/HaohanWang/ImageNet-Sketch)
* Fashionpedia Dataset https://fashionpedia.github.io/home/Fashionpedia_download.html
* iNaturalist dataset 2019 https://www.kaggle.com/c/inaturalist-2019-fgvc6
* Places365-Standard http://places2.csail.mit.edu/download.html
* ImageNet fall11 Release https://www.image-net.org/

This will also load the images, calculate the dct, and store it on disk for faster weight adaption training and testing

In [1]:
%load_ext autoreload
%autoreload 2
#There are three hyper-parameters for weight adaption session:
dct_dim = 32
min_img_dims = 128
trg_num_samples = 16384 #largest power of two which can fit in the individual dataset sizes (Places365 is 36500 but with the potential amount of large/small images we stay on the cautious side)

#Experimental switches:
use_pil_rz = False #uses PIL LANCZOS for downsampling instead of opencv INTER_AREA  
num_threads = 8 #use multi-core procedures with this many cores where possible

Each dataset as a different number of samples and some contain a different degree of images too small for use (128x128 is used as a minimum)

In [2]:
import gzip
import json
import glob
seekall = '/workspace/data/data/imagenet_fall11/fall11_whole.seekhelper.txt.gz'
def load_from_gz(filepath, tar_root_folder="", as_list_style=True):
    if as_list_style:
        info = []
    else:
        info = {}
    with gzip.open(filepath, 'rb') as f_in:
        for line in f_in:
            if as_list_style:
                info.append(tar_root_folder+line.decode('ascii'))
            else:
                l_sp = line.decode('ascii').split(':')
                info[l_sp[0]] = [int(l_sp[1]),int(l_sp[2])]
    return info

def load_from_seekhelper(dir0):
    if not dir0[-1] == '/':
        dir0+='/'
    return [dir0+p for p in load_from_gz(dir0+'.seekhelper.txt.gz')]
imgnet_alt_paths = load_from_seekhelper('/workspace/data/data/imagenet_alt/')
fashionpedia_paths = load_from_seekhelper('/workspace/data/data/fashionpedia/')
inat2019_paths = load_from_seekhelper('/workspace/data/inat2019/')
places365_paths = load_from_seekhelper('/workspace/data/places365/')
celeba_paths = load_from_seekhelper('/workspace/data/celeba/jpg256')

imgnet_root='/workspace/data/data/imagenet_fall11/'
imgnet_paths = load_from_seekhelper(imgnet_root)
coco_inp_dir = '/workspace/data/data/coco/images/train2017'
coco_paths = sorted(glob.glob(coco_inp_dir+'/*.jpg'))

In [3]:
import random
train_paths = [celeba_paths, coco_paths, imgnet_alt_paths, fashionpedia_paths, inat2019_paths, places365_paths, imgnet_paths]

In [4]:
print([len(p) for p in train_paths])

[70000, 118287, 53542, 48823, 303593, 36500, 14197087]


In [5]:
#normalize sets and take smaller subsample
trg_num_samples_use = int(trg_num_samples*1.25) #about 12% of imagenet frames have one dimension smaller than 128 -> add double for buffer; will be straightened later
all_paths_balanced = []
for t in train_paths:
    t0 = t[:]
    random.shuffle(t0)
    all_paths_balanced += t0[:trg_num_samples_use]

In [None]:
#Uncomment to recreate the original data paths 
#!gunzip ordered_paths_balanced.txt.gz
#with open('ordered_paths_balanced.txt', 'r') as f_out:
#    all_paths_balanced = [p for p in f_out]

In [9]:
from async_dct_loader import async_load_dct_paths, tqdm_nb
from build.naphash_cpp import naphash as nhcpp, rot_inv_type
nhcpp_objs = [nhcpp(dct_dim=dct_dim, rot_inv_mode=rot_inv_type.none, apply_center_crop=False, is_rgb=False) for _ in range(num_threads)] #no center crop
#all_dct = load_dct_paths(nhcpp_objs, all_paths_balanced, num_threads=8, dct_dim=dct_dim, min_img_dims=min_img_dims)
dcts = await async_load_dct_paths(nhcpp_objs, all_paths_balanced, num_threads, dct_dim, min_img_dims, tqdm_vers=tqdm_nb)

HBox(children=(FloatProgress(value=0.0, max=17920.0), HTML(value='')))




In [20]:
import numpy as np
def dataset_by_path(p):
    path_context = ['/celeba/','/coco/','/imagenet_alt/', '/fashionpedia/','/inat2019/','/places365/', '/imagenet_fall11/']
    for i,c in enumerate(path_context):
        if c in p: return i
    return -1
orig_dct, all_bu, count_ds = [], [], {}
for i in range(len(all_paths_balanced)):
    if dcts[i] is None:
        continue
    idx_dataset = dataset_by_path(all_paths_balanced[i])
    if count_ds.get(idx_dataset,0) > trg_num_samples:
        continue
    count_ds[idx_dataset] = count_ds.get(idx_dataset,0) + 1  
    all_bu.append(all_paths_balanced[i])
    orig_dct.append(dcts[i].reshape(1024))
orig_dct = np.vstack(orig_dct)
print(count_ds)

{0: 16385, 1: 16385, 2: 16385, 3: 16385, 4: 16385, 5: 16385, 6: 16385}


In [25]:
np.savez_compressed('ordered_dct_balanced.npz', dcts=orig_dct, paths=all_bu)

In [26]:
with open('ordered_paths_balanced.txt', 'wt') as f_out:
    for p in all_bu:
        f_out.write(p.replace('\n','')+'\n')

In [28]:
!gzip ordered_paths_balanced.txt

Below is code to pre-calculate nap hashes for the CIFAR10 dataset

In [8]:
from async_dct_loader import async_load_dct_paths, tqdm_nb
from build.naphash_cpp import naphash as nhcpp, rot_inv_type
from cifar10_trainer import files_in_subdirs

cifar10_train = list(sorted(files_in_subdirs('./cifar10/train')))
cifar10_val = list(sorted(files_in_subdirs('./cifar10/test')))

nhcpp_objs = [nhcpp(dct_dim=dct_dim, rot_inv_mode=rot_inv_type.none, apply_center_crop=False, is_rgb=False) for _ in range(num_threads)] #no center crop
#all_dct = load_dct_paths(nhcpp_objs, all_paths_balanced, num_threads=8, dct_dim=dct_dim, min_img_dims=min_img_dims)
dcts = await async_load_dct_paths(nhcpp_objs, cifar10_val+cifar10_train, num_threads, dct_dim, -1, tqdm_vers=tqdm_nb)


HBox(children=(FloatProgress(value=0.0, max=7500.0), HTML(value='')))




In [9]:
import numpy as np
np.savez_compressed('cifar10_dcts.npz', dcts=np.vstack([d.reshape(1024) for d in dcts]), paths=list(cifar10_val+cifar10_train))

In [1]:
import numpy as np
from async_dct_loader import async_hash_dcts, tqdm_nb
from build.naphash_cpp import naphash as nhcpp, rot_inv_type
from cifar10_trainer import files_in_subdirs
num_threads, dct_dim = 8, 32
precalc_dcts = np.load('cifar10_dcts.npz')
cifar_dcts, paths = precalc_dcts['dcts'],  precalc_dcts['paths']
nhcpp_objs = [nhcpp(dct_dim=dct_dim, rot_inv_mode=rot_inv_type.none, apply_center_crop=False, is_rgb=False) for _ in range(num_threads)] #no center crop
#all_dct = load_dct_paths(nhcpp_objs, all_paths_balanced, num_threads=8, dct_dim=dct_dim, min_img_dims=min_img_dims)
hashes = await async_hash_dcts(nhcpp_objs, cifar_dcts, num_threads, tqdm_vers=tqdm_nb)

HBox(children=(FloatProgress(value=0.0, max=7500.0), HTML(value='')))




In [2]:
from async_dct_loader import async_load_dct_paths
cifar10_train = list(sorted(files_in_subdirs('./cifar10/train')))
cifar10_val = list(sorted(files_in_subdirs('./cifar10/test')))
hashes2 = await async_load_dct_paths(nhcpp_objs, cifar10_val+cifar10_train, num_threads, dct_dim, -1, tqdm_vers=tqdm_nb, ret_hash = True)

HBox(children=(FloatProgress(value=0.0, max=7500.0), HTML(value='')))




In [6]:
all([(hashes[i] == hashes2[i]).all() for i in range(len(hashes))])

True

In [10]:
np.savez_compressed('cifar10_hashes.npz', hashes=np.vstack(hashes), paths=list(cifar10_val+cifar10_train))