In [None]:
# 1. Prepare dataset

# Dataset CIFAR-10
# downloaded from: https://www.cs.toronto.edu/~kriz/cifar.html
# described in: Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.

import pickle
import os
import tarfile
import urllib

DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
DATA_TARFILE = 'cifar-10-python.tar.gz'
DATA_DIR = 'dataset'
DATA_EXTRACTED_DIR = 'dataset/cifar-10-batches-py'

def download_extract_dataset(dest_dir):
    """
    Download and extract CIFAR-10 dataset (if necessary) to a given directory.

    Each step is skipped when its result is already present on disk, so the
    function can be called repeatedly without re-downloading or re-extracting.

    :param dest_dir: directory that receives the downloaded archive and the
        extracted batches folder; it is created when it does not exist yet
    """
    # In Python 3 `urlretrieve` lives in `urllib.request`, and a bare
    # `import urllib` does not load that submodule.
    import urllib.request

    os.makedirs(dest_dir, exist_ok=True)

    dest_filename = os.path.join(dest_dir, DATA_TARFILE)
    if not os.path.exists(dest_filename):
        print('Downloading data from %s...' % DATA_URL)
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer is never mistaken for a complete archive.
        partial_filename = dest_filename + '.part'
        urllib.request.urlretrieve(DATA_URL, partial_filename)
        os.replace(partial_filename, dest_filename)
        print('Download finished')

    # Honour `dest_dir` for the extraction step as well; for the default
    # DATA_DIR this resolves to the same folder as DATA_EXTRACTED_DIR.
    extracted_dir = os.path.join(dest_dir, os.path.basename(DATA_EXTRACTED_DIR))
    if not os.path.exists(extracted_dir):
        print('Extracting archive...')
        with tarfile.open(dest_filename, "r:gz") as tar:
            # NOTE(review): extractall() trusts member paths inside the archive.
            # Acceptable for this fixed URL; consider an extraction filter on
            # Python versions that support one.
            tar.extractall(dest_dir)

    print('Dataset ready in directory: %s' % extracted_dir)
    
def unpickle(file):
    """
    Load one pickled object from disk.

    :param file: path of the pickle file to read
    :return: the unpickled object; `encoding='bytes'` makes string keys written
        by Python 2 come back as `bytes` (e.g. b'data')
    """
    # NOTE: pickle can execute arbitrary code, so only use this on trusted files.
    with open(file, 'rb') as stream:
        return pickle.load(stream, encoding='bytes')


# Make sure the dataset files are on disk before reading them.
download_extract_dataset(DATA_DIR)

# Training data is stored as files data_batch_1 .. data_batch_5; the test set is one file.
batch_train_dicts = [
    unpickle(os.path.join(DATA_EXTRACTED_DIR, 'data_batch_%d' % batch_no))
    for batch_no in range(1, 6)
]
batch_test_dict = unpickle(os.path.join(DATA_EXTRACTED_DIR, 'test_batch'))


In [None]:
# Join training set batches together, for simplified processing

import numpy as np
# Seed NumPy's global RNG so the random image sample drawn for the plot below
# is the same on every fresh run of the notebook.
np.random.seed(42)

def join_batches(batch_dicts):
    """
    Merge a sequence of CIFAR batch dictionaries into a single dictionary.

    :param batch_dicts: non-empty sequence of dicts, each holding an array under
        b'data' (one image per row) and a sequence of labels under b'labels'
    :return: a new dict with the rows of all b'data' arrays stacked in order and
        all b'labels' joined into one list; every other key is copied from the
        first batch. The input dictionaries are left unmodified.
    :raises IndexError: if `batch_dicts` is empty
    """
    # Shallow copy, so replacing the two keys below does not touch the first batch.
    result = dict(batch_dicts[0])
    # One concatenate call over every batch: a single allocation, and it works
    # for any number of batches rather than exactly five.
    result[b'data'] = np.concatenate([batch[b'data'] for batch in batch_dicts])
    result[b'labels'] = [label for batch in batch_dicts for label in batch[b'labels']]
    return result

# Only the training batches need merging; the test set already is a single batch.
train_dict = join_batches(batch_train_dicts)
test_dict = batch_test_dict

print('Training set size: %d' % len(train_dict[b'data']))

In [None]:
# Convert images to RGB format

def cifar_to_rgb_dataset(img_cifar):
    """
    Convert images from the flat CIFAR layout to the layout matplotlib expects.

    :param img_cifar: an array of images, each given as 3072 consecutive pixel
        values: first all red, then all green, then all blue; row-wise
    :return: an array of shape (N, 32, 32, 3), with values of type 'float32'
        scaled to the [0, 1] interval
    """
    # (N, 3072) -> (N, channel, row, column)
    channels_first = np.asarray(img_cifar).reshape(-1, 3, 32, 32)
    # move the channel axis last: (N, row, column, channel)
    channels_last = channels_first.transpose(0, 2, 3, 1)
    as_float = np.asarray(channels_last, dtype='float32')
    return as_float / 255.

# Labels become a compact uint8 vector; images become (N, 32, 32, 3) float arrays.
train_labels = np.asarray(train_dict[b'labels'], dtype='uint8')
train_data_rgb = cifar_to_rgb_dataset(train_dict[b'data'])

In [None]:
# 2. Plot images

import matplotlib.pyplot as plt
%matplotlib inline

# Grid layout: one row per class, one column per sampled image.
class_cnt = 10
# NOTE(review): assumes every class has at least this many training images;
# an index beyond a class's actual size would raise IndexError below — confirm.
class_size = 5000
class_sample_size = 10  # class images sample size

# Draw all within-class positions in one call: a (class_cnt, class_sample_size)
# table of distinct integers from [0, class_size); row `cls` is used for class `cls`.
rnd_indices = np.random.choice(np.arange(class_size), size=(class_cnt, class_sample_size), replace=False)

fig = plt.figure()
plt.subplots_adjust(wspace=0.1, hspace=0.1)

for cls in range(class_cnt):
    # select all images labelled `cls`, then pick this class's random positions among them
    class_imgs = train_data_rgb[train_labels == cls][rnd_indices[cls]]
    # plot them along row `cls` of the grid
    for x, img in enumerate(class_imgs):
        # subplot positions are 1-based and counted row by row
        fig.add_subplot(class_cnt, class_sample_size, cls * class_sample_size + x + 1)
        # the pyplot calls below act on the current axes, i.e. the subplot just added
        plt.imshow(img)
        plt.axis('off')

plt.show()