## Image downloader

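This notebook downloads the relevant images from ImageNet directly, using the registered access key.
Running this notebook from start to finish creates train/val/test sets under `data/images/`. Note that the bounding-box filtering step partway through also needs the output of the separate bbox notebook.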

In [2]:
# Download images
# Mostly code from https://github.com/rezoo/imagenet-python

import sys
import os
from xml.etree import ElementTree
import requests
import json

BASE_URL = "http://www.image-net.org/download/synset"

# Credentials are read from the environment so that the access key is never
# stored in (or committed with) the notebook.
USERNAME = os.environ.get("IMAGENET_USERNAME")
ACCESS_KEY = os.environ.get("IMAGENET_ACCESS_KEY")
if not USERNAME or not ACCESS_KEY:
    raise RuntimeError(
        "Set the IMAGENET_USERNAME and IMAGENET_ACCESS_KEY environment "
        "variables before running this cell."
    )

BASE_PATH = "data/images/tar/"
REQUEST_TIMEOUT = 60      # seconds; applies to connecting and to each read
CHUNK_SIZE = 1024 * 1024  # stream archives to disk 1 MiB at a time

# Mapping of WordNet id -> human-readable synset name.
NAMES_FILE_PATH = 'data/named_populated_synsets.json'
with open(NAMES_FILE_PATH) as f:
    synsets = json.load(f)

print(len(synsets.keys()))

# The archives are written straight into BASE_PATH, so it has to exist.
if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)

for synset in synsets:
    params = {
        "wnid": synset,
        "username": USERNAME,
        "accesskey": ACCESS_KEY,
        "release": "latest",
        "src": "stanford"
    }

    print("Downloading images for " + synset + " (" + synsets[synset] + ")")
    write_path = BASE_PATH + synset + ".tar"

    # A non-empty archive on disk is treated as an already completed download.
    if os.path.exists(write_path) and os.path.getsize(write_path) > 0:
        print("  Images already downloaded. Moving on...")
        continue

    try:
        response = requests.get(BASE_URL, params=params, stream=True,
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("  WARNING: request failed for synset " + synset + ": " + str(err))
        continue

    try:
        content_type = response.headers.get("content-type", "")
        # The server answers with a text page instead of a tarball when the
        # synset cannot be downloaded.
        if not response.ok or content_type.startswith("text"):
            print("  WARNING: could not download synset " + synset
                  + " (HTTP " + str(response.status_code)
                  + ", content-type: " + content_type + ")")
            continue

        try:
            with open(write_path, "wb") as out_file:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    out_file.write(chunk)
        except BaseException:
            # Never leave a partial archive behind (this includes a kernel
            # interrupt): a non-empty file would be skipped on the next run.
            if os.path.exists(write_path):
                os.remove(write_path)
            raise
    finally:
        response.close()

print("All done.")

227
Downloading images for n03207941 (dishwasher)
  Images already downloaded. Moving on...
Downloading images for n02403003 (ox)
  Images already downloaded. Moving on...
Downloading images for n03530642 (honeycomb)
  Images already downloaded. Moving on...
Downloading images for n01986214 (hermit crab)
  Images already downloaded. Moving on...
Downloading images for n04326547 (stone wall)
  Images already downloaded. Moving on...
Downloading images for n02443484 (black-footed ferret)
  Images already downloaded. Moving on...
Downloading images for n02909870 (bucket)
  Images already downloaded. Moving on...
Downloading images for n03792972 (mountain tent)
  Images already downloaded. Moving on...
Downloading images for n02074367 (dugong)
  Images already downloaded. Moving on...
Downloading images for n03388043 (fountain)
  Images already downloaded. Moving on...
Downloading images for n04479046 (trench coat)
  Images already downloaded. Moving on...
Downloading images for n03017168 

In [7]:
# Extract images to /raw

import os
import tarfile
from os import listdir
from os.path import isfile, join
import json
import numpy as np

INPUT_DIR = "data/images/tar/"
OUTPUT_DIR = "data/images/raw/"

# load synsets
FILE_PATH_1 = 'data/final_synsets_counts.json'
with open(FILE_PATH_1) as f:
    synsets = json.load(f)


def safe_members(tar, dest_dir):
    """Yield only the members of `tar` whose path resolves inside `dest_dir`.

    The archives come from the network, so entries using absolute paths or
    `..` components are skipped (with a warning) instead of being extracted.
    """
    dest_root = os.path.realpath(dest_dir)
    for member in tar.getmembers():
        target = os.path.realpath(join(dest_dir, member.name))
        if target == dest_root or target.startswith(dest_root + os.sep):
            yield member
        else:
            print("  WARNING: skipping unsafe archive member " + member.name)


for synset in synsets:
    tar_path = join(INPUT_DIR, synset + ".tar")
    output_dir = join(OUTPUT_DIR, synset)

    # An archive can be missing if its download failed; skip it rather than
    # aborting the extraction of every remaining synset.
    if not isfile(tar_path):
        print("WARNING: no archive found for " + synset + ". Skipping.")
        continue

    # Extract to /raw
    print("Extracting " + synset)
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=output_dir, members=safe_members(tar, output_dir))

print("All done.")

Extracting n03360622
Extracting n03903868
Extracting n09288635
Extracting n01855672
Extracting n03837869
Extracting n04265275
Extracting n02963159
Extracting n02132136
Extracting n03657121
Extracting n04335886
Extracting n02843684
Extracting n03388043
Extracting n02395406
Extracting n03680355
Extracting n03793489
Extracting n02699494
Extracting n02226429
Extracting n02950826
Extracting n02841315
Extracting n03532672
Extracting n03844815
Extracting n04366367
Extracting n02948072
Extracting n02410509
Extracting n04592741
Extracting n02731629
Extracting n03207941
Extracting n04107743
Extracting n03085013
Extracting n04560804
Extracting n03814906
Extracting n02108089
Extracting n03759954
Extracting n02374451
Extracting n04133789
Extracting n04479046
Extracting n07711569
Extracting n04562935
Extracting n03329302
Extracting n04532670
Extracting n01632458
Extracting n02999410
Extracting n03447721
Extracting n04613696
Extracting n09874725
Extracting n02730930
Extracting n01605630
Extracting n0

In [16]:
# Filter. We take images which meet the following criteria:
#   at least 224x224 pixels
#   exactly 3 channels (e.g. RGB). We discard black & white, 4-channel CMYK
#      jpegs, or other formats that do not have the shape (>=224, >=224, 3)
# Dump these images to /filtered

import os
from os import listdir
from os.path import isfile, join
from shutil import copyfile
import numpy as np  # imported here so the cell does not rely on an earlier cell
from PIL import Image

INPUT_DIR = "data/images/raw/"
OUTPUT_DIR = "data/images/filtered/"
MIN_SIDE = 224  # minimum height and width, in pixels


def passes_filter(file_path):
    """Return True if the image at `file_path` decodes to (>=224, >=224, 3).

    Files that cannot be opened or decoded are reported and rejected, so a
    single broken download does not abort the whole run.
    """
    try:
        # The context manager releases the file handle as soon as we are done.
        with Image.open(file_path) as img:
            img.load()
            shape = np.asarray(img).shape
    except (IOError, OSError) as err:
        print("    Ignored file: could not be read. " + file_path + " (" + str(err) + ")")
        return False

    if len(shape) != 3:
        # Incorrect no. of dimensions (e.g. greyscale or palette images).
        return False
    if shape[2] != 3:
        # Incorrect no. of channels (e.g. CMYK or images with alpha).
        return False
    # Reject images smaller than MIN_SIDE in either dimension.
    return shape[0] >= MIN_SIDE and shape[1] >= MIN_SIDE


dirs = [d for d in listdir(INPUT_DIR) if not isfile(join(INPUT_DIR, d))]
for i, d in enumerate(dirs):
    print("Processing: " + d + " (" + str(i+1) + "/" + str(len(dirs)) + ")")

    dirpath = join(INPUT_DIR, d)
    files = [f for f in listdir(dirpath) if isfile(join(dirpath, f))]
    num_files = len(files)
    passed_files = [f for f in files if passes_filter(join(dirpath, f))]

    # The output directory is only created when at least one image passes.
    if passed_files:
        output_dirpath = join(OUTPUT_DIR, d)
        if not os.path.exists(output_dirpath):
            os.makedirs(output_dirpath)
        for f in passed_files:
            copyfile(join(dirpath, f), join(output_dirpath, f))

    print("  " + str(len(passed_files)) + " / " + str(num_files) + " passed filter.")

print("All done")


Processing: n03692522 (1/227)
  786 / 1406 passed filter.
Processing: n04179913 (2/227)
  1856 / 2078 passed filter.
Processing: n09217230 (3/227)
  1608 / 1713 passed filter.
Processing: n03314780 (4/227)
  783 / 1138 passed filter.
Processing: n04325704 (5/227)
  1015 / 1191 passed filter.
Processing: n03982430 (6/227)
  1437 / 1932 passed filter.
Processing: n02077923 (7/227)
  999 / 1297 passed filter.
Processing: n02321529 (8/227)
  1090 / 1167 passed filter.
Processing: n04317175 (9/227)
  1057 / 1402 passed filter.
Processing: n04194289 (10/227)
  1136 / 1261 passed filter.
Processing: n06277135 (11/227)
  1010 / 1211 passed filter.
Processing: n02443484 (12/227)
  729 / 999 passed filter.
Processing: n01950731 (13/227)
  2098 / 2211 passed filter.
Processing: n01774384 (14/227)
  1174 / 1272 passed filter.
Processing: n02395406 (15/227)
  981 / 1463 passed filter.
Processing: n03179701 (16/227)
  1292 / 1366 passed filter.
Processing: n04613696 (17/227)
  1102 / 1166 passed fil

In [21]:
# Count and sort synsets

import os
from os import listdir
from os.path import isfile, join
import json
from pprint import pprint

INPUT_DIR = "data/images/filtered/"

# Each sub-directory of the filtered tree holds the images of one synset.
dirs = [entry for entry in listdir(INPUT_DIR) if not isfile(join(INPUT_DIR, entry))]
total_dirs = len(dirs)

# Number of image files found per synset directory.
counts = {}
for position, synset_id in enumerate(dirs, 1):
    print("Processing: " + synset_id + " (" + str(position) + "/" + str(total_dirs) + ")")

    synset_path = join(INPUT_DIR, synset_id)
    counts[synset_id] = sum(
        1 for entry in listdir(synset_path) if isfile(join(synset_path, entry))
    )

# Rank synsets from most to fewest images and keep those with at least 600.
ranked_counts = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
sorted_counts = [pair for pair in ranked_counts if pair[1] >= 600]

synsets = dict(sorted_counts)
pprint(len(synsets))

# Persist the surviving synsets together with their image counts.
with open("data/filtered_synsets.json", "w") as out_file:
    json.dump(synsets, out_file)

Processing: n03692522 (1/227)
Processing: n04179913 (2/227)
Processing: n09217230 (3/227)
Processing: n03314780 (4/227)
Processing: n04325704 (5/227)
Processing: n03982430 (6/227)
Processing: n02077923 (7/227)
Processing: n02321529 (8/227)
Processing: n04317175 (9/227)
Processing: n04194289 (10/227)
Processing: n06277135 (11/227)
Processing: n02443484 (12/227)
Processing: n01950731 (13/227)
Processing: n01774384 (14/227)
Processing: n02395406 (15/227)
Processing: n03179701 (16/227)
Processing: n04613696 (17/227)
Processing: n04256520 (18/227)
Processing: n03838899 (19/227)
Processing: n02013706 (20/227)
Processing: n04209133 (21/227)
Processing: n04330267 (22/227)
Processing: n02788148 (23/227)
Processing: n02906734 (24/227)
Processing: n03874293 (25/227)
Processing: n03447721 (26/227)
Processing: n03017168 (27/227)
Processing: n03837869 (28/227)
Processing: n02256656 (29/227)
Processing: n07920052 (30/227)
Processing: n03657121 (31/227)
Processing: n03498962 (32/227)
Processing: n0243

Now use the bbox notebook to generate both TSV and JSON bounding boxes for the filtered list of synsets. The next cell expects the JSON files under `data/bbox/json/`.

In [35]:
# Now we need to ensure each image has a bounding box. We filter again.
import json
import os
from os import listdir
from os.path import isfile, join
from shutil import copyfile

INPUT_DIR = "data/images/filtered/"
OUTPUT_DIR = "data/images/bboxfiltered/"
JSON_BBOX_DIR = "data/bbox/json/"

# One "<synset>.json" file per synset; any other file in the directory is
# ignored so that stray files cannot produce a bogus synset id.
dirs = [
    os.path.splitext(name)[0]
    for name in listdir(JSON_BBOX_DIR)
    if isfile(join(JSON_BBOX_DIR, name)) and name.endswith(".json")
]
for i, d in enumerate(dirs):
    with open(join(JSON_BBOX_DIR, d + ".json")) as bbox_file:
        # Membership of an image file name in this dict is the filter criterion.
        bbox_dict = json.load(bbox_file)

    print("Processing: " + d + " (" + str(i+1) + "/" + str(len(dirs)) + ")")

    dirpath = join(INPUT_DIR, d)
    if not os.path.isdir(dirpath):
        print("  WARNING: no filtered images for this synset. Skipping.")
        continue

    # Create the output directory once per synset rather than once per file.
    output_dirpath = join(OUTPUT_DIR, d)
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)

    for f in listdir(dirpath):
        filepath = join(dirpath, f)
        if f in bbox_dict and isfile(filepath):
            copyfile(filepath, join(output_dirpath, f))

print("All done.")

Processing: n03770439 (1/218)
Processing: n03902125 (2/218)
Processing: n04256520 (3/218)
Processing: n03657121 (4/218)
Processing: n03197337 (5/218)
Processing: n04398044 (6/218)
Processing: n02280649 (7/218)
Processing: n02259212 (8/218)
Processing: n02132136 (9/218)
Processing: n09468604 (10/218)
Processing: n03814639 (11/218)
Processing: n02403003 (12/218)
Processing: n04311004 (13/218)
Processing: n09288635 (14/218)
Processing: n04350905 (15/218)
Processing: n03085013 (16/218)
Processing: n03513137 (17/218)
Processing: n03980874 (18/218)
Processing: n02236044 (19/218)
Processing: n03187595 (20/218)
Processing: n04008634 (21/218)
Processing: n01986214 (22/218)
Processing: n02484322 (23/218)
Processing: n01641391 (24/218)
Processing: n01605630 (25/218)
Processing: n03388043 (26/218)
Processing: n03649909 (27/218)
Processing: n04525038 (28/218)
Processing: n03314780 (29/218)
Processing: n04201297 (30/218)
Processing: n03042490 (31/218)
Processing: n03992509 (32/218)
Processing: n0402

In [4]:
# Count images again
import numpy as np
from os import listdir
from os.path import isfile, join
from pprint import pprint

INPUT_DIR = "data/images/bboxfiltered"

# Each sub-directory holds the bounding-box-filtered images of one synset.
dirs = [entry for entry in listdir(INPUT_DIR) if not isfile(join(INPUT_DIR, entry))]

# Number of image files per synset directory.
counts = {}
for synset_id in dirs:
    synset_path = join(INPUT_DIR, synset_id)
    counts[synset_id] = sum(
        1 for entry in listdir(synset_path) if isfile(join(synset_path, entry))
    )

pprint(counts)

# Summarise the synsets that have at least 400 images:
# how many there are, and how many images they hold in total.
large_counts = [c for c in counts.values() if c >= 400]
n = len(large_counts)
total = np.sum(np.array(large_counts))
print(n)
print(total)

{'n00017222': 303,
 'n01503061': 1344,
 'n01605630': 648,
 'n01629819': 521,
 'n01632458': 592,
 'n01641391': 525,
 'n01641577': 554,
 'n01644373': 573,
 'n01770393': 527,
 'n01774384': 569,
 'n01784675': 555,
 'n01853498': 580,
 'n01855032': 596,
 'n01855672': 665,
 'n01871265': 567,
 'n01917289': 634,
 'n01950731': 565,
 'n01983481': 485,
 'n01986214': 548,
 'n02002724': 529,
 'n02013706': 647,
 'n02077923': 532,
 'n02084071': 593,
 'n02108089': 836,
 'n02113799': 709,
 'n02132136': 618,
 'n02165105': 512,
 'n02165456': 641,
 'n02169497': 522,
 'n02226429': 617,
 'n02231487': 549,
 'n02236044': 637,
 'n02256656': 543,
 'n02259212': 572,
 'n02268853': 534,
 'n02277742': 560,
 'n02279972': 637,
 'n02280649': 615,
 'n02281406': 546,
 'n02281787': 584,
 'n02321529': 524,
 'n02374451': 553,
 'n02395406': 482,
 'n02403003': 544,
 'n02410509': 568,
 'n02415577': 531,
 'n02430045': 670,
 'n02443484': 447,
 'n02472293': 174,
 'n02484322': 649,
 'n02508021': 1175,
 'n02666196': 527,
 'n0269115

In [3]:
# Create training, test, and validation splits. 
# We pick out 160 synsets that contain the most images and make the splits on that set
# We hold out ALL instances in the remaining 38 synsets to test for generalization

import json
import os
from os import listdir
from os.path import isfile, join
from shutil import copyfile
import numpy as np

SYNSETS_FILE = "data/filtered_synsets.json"
INPUT_DIR = "data/images/bboxfiltered/"
OUTPUT_DIR = "data/images/"

MIN_IMAGES_PER_SYNSET = 400  # synsets with fewer images are dropped entirely
NUM_SPLIT_SYNSETS = 160      # the largest synsets form the main splits
VAL_SIZE = 50                # images per synset in the validation split
TEST_SIZE = 50               # images per synset in the test split


def list_files(dirpath):
    """Return the names of the regular files in `dirpath`, sorted.

    os.listdir returns entries in arbitrary order; sorting makes the
    resulting splits reproducible across runs and machines.
    """
    return sorted(f for f in listdir(dirpath) if isfile(join(dirpath, f)))


def split_files(files, val_size=VAL_SIZE, test_size=TEST_SIZE):
    """Partition `files` into (train, val, test) without dropping any entry.

    The first `val_size` files go to val, the next `test_size` to test and
    everything after that to train.
    """
    val = files[:val_size]
    test = files[val_size:val_size + test_size]
    train = files[val_size + test_size:]
    return train, val, test


def copy_synset_splits(synset, output_root):
    """Copy one synset's images into <output_root>/{train,val,test}/<synset>."""
    dirpath = join(INPUT_DIR, synset)
    train, val, test = split_files(list_files(dirpath))

    for split_name, split in (("train", train), ("val", val), ("test", test)):
        split_dir = join(output_root, split_name, synset)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        for f in split:
            copyfile(join(dirpath, f), join(split_dir, f))


with open(SYNSETS_FILE) as f:
    synsets = json.load(f)
print(len(synsets.items()))

# Count the images of every synset directory, keeping only the large ones.
counts = {}
dirs = [d for d in listdir(INPUT_DIR) if not isfile(join(INPUT_DIR, d))]
for d in dirs:
    num_files = len(list_files(join(INPUT_DIR, d)))
    if num_files >= MIN_IMAGES_PER_SYNSET:
        counts[d] = num_files

# Sort by no. of files in each dir (descending). Ties are broken by synset id
# so that the split / held-out boundary does not depend on directory order.
sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
print(len(sorted_counts))

synsets_to_be_split = [name for name, _ in sorted_counts[:NUM_SPLIT_SYNSETS]]
synsets_to_be_held_out = [name for name, _ in sorted_counts[NUM_SPLIT_SYNSETS:]]
print(len(synsets_to_be_split))
print(len(synsets_to_be_held_out))


# Held-out synsets get their own train/val/test tree under "holdout".
for d in synsets_to_be_held_out:
    print("Processing holdout: " + d)
    copy_synset_splits(d, join(OUTPUT_DIR, "holdout"))


for i, d in enumerate(synsets_to_be_split):
    print("Processing: " + d + " (" + str(i+1) + "/" + str(len(synsets_to_be_split)) + ")")
    # Only synsets listed in SYNSETS_FILE are split.
    if d not in synsets:
        print("  Insufficient no. of images. Ignoring synset.")
        continue

    copy_synset_splits(d, OUTPUT_DIR)


print("All done.")


218
198
160
38
Processing holdout: n03255030
Processing holdout: n03759954
Processing holdout: n01983481
Processing holdout: n04592741
Processing holdout: n02395406
Processing holdout: n04107743
Processing holdout: n02950826
Processing holdout: n04325704
Processing holdout: n03773035
Processing holdout: n03180011
Processing holdout: n03803284
Processing holdout: n04525038
Processing holdout: n03995372
Processing holdout: n04317175
Processing holdout: n03207941
Processing holdout: n03530642
Processing holdout: n03329302
Processing holdout: n04335886
Processing holdout: n04201297
Processing holdout: n03793489
Processing holdout: n03680355
Processing holdout: n03761084
Processing holdout: n02443484
Processing holdout: n03692522
Processing holdout: n03992509
Processing holdout: n03838899
Processing holdout: n04004767
Processing holdout: n03325088
Processing holdout: n04102406
Processing holdout: n03513137
Processing holdout: n04447861
Processing holdout: n03447721
Processing holdout: n0390