# Load Data - ImageNet

In [12]:
from os import listdir
from os.path import isfile, join

import keras
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split

# Paths to files (relative paths, resolved against the current working directory)
LABELS_PATH = '../../data/data_imagenet/labels.txt'  # Text table; column 1 is parsed as the integer class id
DATA_PATH = '../../data/data_imagenet/val/'  # Directory holding the validation images
# Per-channel means subtracted from channels 0, 1, 2 in load_data_imagenet_split
# (the original note says they are also used later, in the training script).
# NOTE(review): these values look like the Caffe-style ImageNet means in B, G, R order,
# while load_img produces RGB arrays -- confirm the channel order matches the model.
IMG_MEAN = [103.939, 116.779, 123.68]

# TODO find pre-calculated means and std

def load_img(path, size = (256, 256)):
    """Load an image file as an RGB array resized to ``size``.

    Args:
        path: Path of the image file to open.
        size: Target ``(width, height)`` handed to ``Image.resize``.

    Returns:
        ``np.ndarray`` built from the resized RGB image (three channels on
        the last axis).
    """
    # The context manager closes the underlying file as soon as we are done.
    with Image.open(path) as im:
        # Convert before resizing: some images are grayscale or palette-based,
        # and Pillow silently falls back to nearest-neighbour resampling for
        # "1"/"P" mode images, which would defeat the high-quality filter.
        rgb_im = im.convert('RGB')
        # LANCZOS is the same filter as the old ANTIALIAS alias, which was
        # removed in Pillow 10.
        rgb_im = rgb_im.resize(size, Image.LANCZOS)
    return np.array(rgb_im)

def center_crop(img_mat, size = (224, 224)):
    """Return the central window of an image array.

    Args:
        img_mat: Array whose first two axes are the spatial axes. Any trailing
            axes (e.g. channels) are kept whole, so 2-D arrays work as well.
        size: Extent of the crop along axis 0 and axis 1, in that order.

    Returns:
        Slice of ``img_mat`` with shape ``(size[0], size[1], ...)``.

    Raises:
        ValueError: If the requested crop is larger than the array along
            either spatial axis.
    """
    dim0, dim1 = img_mat.shape[:2]
    crop0, crop1 = size[0], size[1]
    # Without this guard the start index goes negative, wraps around, and the
    # slice silently comes back with the wrong shape.
    if crop0 > dim0 or crop1 > dim1:
        raise ValueError(
            "crop size %r does not fit inside array of shape %r"
            % ((crop0, crop1), (dim0, dim1))
        )
    start0 = dim0 // 2 - crop0 // 2
    start1 = dim1 // 2 - crop1 // 2
    return img_mat[start0:start0 + crop0, start1:start1 + crop1]

In [83]:
def load_data_imagenet(size = (256, 256), size_crop = (224, 224)):
    """Load the validation images and the labels file.

    Every regular file in ``DATA_PATH`` is loaded with ``load_img`` (resized
    to ``size``) and then center-cropped to ``size_crop``.

    Args:
        size: Resize target passed to ``load_img``.
        size_crop: Crop extent passed to ``center_crop``.

    Returns:
        Tuple ``(x_val, y_val)``. ``x_val`` is a float32 array of shape
        ``(n_images, *size_crop, 3)``. ``y_val`` holds every label found in
        ``LABELS_PATH``; nothing here trims it to ``n_images``, so the two
        lengths can differ.
    """
    # Column 1 of the labels file is the integer class id.
    label_rows = np.loadtxt(fname=LABELS_PATH, dtype="str")
    y_val = np.array([int(label) for label in label_rows[:, 1]])

    # listdir returns names in arbitrary order; sort so the image order is
    # deterministic across runs and machines.
    # NOTE(review): assumes the rows of the labels file follow sorted-filename
    # order -- confirm against how labels.txt was generated.
    val_imgs = sorted(f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)))

    # Pre-allocate the output array, then fill it one image at a time.
    x_val = np.empty((len(val_imgs), *size_crop, 3), dtype="float32")
    for i, img_name in enumerate(val_imgs):
        img_mat = load_img(join(DATA_PATH, img_name), size = size)
        x_val[i] = center_crop(img_mat, size = size_crop)  # Crop center of the image

    return (x_val, y_val)

In [84]:
def load_data_imagenet_split(size = (256, 256), size_crop = (224, 224), seed = 333):
    """Load the data, subtract the channel means and split it in half.

    Args:
        size: Resize target forwarded to ``load_data_imagenet``.
        size_crop: Crop extent forwarded to ``load_data_imagenet``.
        seed: ``random_state`` for ``train_test_split``.

    Returns:
        ``((x_val, y_val), (x_test, y_test))``. The "val" pair is the second
        half returned by ``train_test_split`` and the "test" pair the first.
    """
    x, y = load_data_imagenet(size, size_crop)  # Load in data

    # The labels file can list more entries than there are images on disk;
    # keep exactly one label per loaded image (previously hard-coded to 100).
    y = y[:len(x)]

    # Subtract the per-channel means along the last (channel) axis, in place.
    x -= np.asarray(IMG_MEAN, dtype=x.dtype)

    x_test, x_val, y_test, y_val = train_test_split(x, y, test_size=0.5, random_state=seed)

    return ((x_val, y_val), (x_test, y_test))

In [85]:
# Load the images, subtract the channel means and split them 50/50 using the default sizes and seed.
(x_val, y_val), (x_test, y_test) = load_data_imagenet_split()

In [86]:
# Sanity check: (n_images, crop, crop, 3) for the validation half of the split.
x_val.shape

(50, 224, 224, 3)

In [87]:
# Sanity check: one label per image in the validation half.
y_val.shape

(50,)

In [93]:

# Reload every label in the labels file (column 1 = class id), independent of the loaded image subset.
y_val_all = np.array([
    int(class_id)
    for class_id in np.loadtxt(fname=LABELS_PATH, dtype="str")[:, 1]
])
        

In [95]:
# Largest class id present in the labels file.
max(y_val_all)

999

In [98]:
# Total number of labels in the labels file.
y_val_all.shape

(50000,)

In [97]:
# One-hot encode the labels: one row per label, one column per class id (0..999).
# keras comes from the notebook's top import cell so this runs on a fresh kernel.
y_val_test = keras.utils.to_categorical(y_val_all, num_classes=1000)

In [19]:
import keras

Using TensorFlow backend.


In [33]:
# Sanity check: (n_labels, n_classes) after one-hot encoding.
y_val_test.shape

(50000, 1000)

In [None]:
# Recover integer class ids from one-hot rows: index of the first entry equal to 1 in each row.
# NOTE(review): y_true is not defined in the visible cells and this cell has no execution
# count -- presumably it was meant for a one-hot array such as y_val_test; confirm before running.
[ np.where(r==1)[0][0] for r in y_true]

In [44]:
# Scratch array for checking what np.where returns.
t = np.array([1,2,3])

In [46]:
# np.where on a boolean mask returns a tuple of index arrays, hence the [0][0] needed to pull out a single index.
np.where(t==1)

(array([0], dtype=int64),)

In [99]:
# Peek at the labels from index 49950 to the end.
y_val_all[49950:]

array([899, 879, 522,  49, 813, 239, 886, 347, 208, 294, 320,  87, 715,
       929, 212,  94, 533, 903, 812, 921, 583, 709, 295, 372,  67, 361,
       108, 447, 455,  49, 121, 919, 872, 277, 367, 430,  44,  81, 399,
        24, 120, 357, 826, 101, 644, 283,  26, 232, 982, 355])

In [49]:
# Label at index 49999.
# NOTE(review): the recorded output (185) disagrees with the last element shown by the
# y_val_all[49950:] cell (355); this cell has an earlier execution count, so its output
# looks stale -- re-run after a kernel restart.
y_val_all[49999]

185

In [100]:
from keras.applications.imagenet_utils import preprocess_input


In [102]:
# Re-display the shape of the validation images (same check as earlier in the notebook).
x_val.shape

(50, 224, 224, 3)