In [None]:
# 1. Prepare dataset

# Dataset CIFAR-10
# downloaded from: https://www.cs.toronto.edu/~kriz/cifar.html
# described in: Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.

import pickle
import os
import tarfile
import urllib

DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
DATA_TARFILE = 'cifar-10-python.tar.gz'
DATA_DIR = 'dataset'
DATA_EXTRACTED_DIR = 'dataset/cifar-10-batches-py'

def download_extract_dataset(dest_dir):
    """Download and extract the CIFAR-10 dataset into ``dest_dir`` if necessary.

    Every step is skipped when its result already exists on disk, so calling
    the function again after a successful run only prints the final message.

    Args:
        dest_dir: Directory that receives the downloaded archive
            (``DATA_TARFILE``) and the extracted batch directory. It is
            created when it does not exist yet.

    Raises:
        OSError: If the download or a file system operation fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        tarfile.TarError: If the archive cannot be read.
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``request`` submodule on Python 3, and ``urllib.urlretrieve`` only
    # existed on Python 2.
    from urllib.request import urlretrieve

    os.makedirs(dest_dir, exist_ok=True)

    dest_filename = os.path.join(dest_dir, DATA_TARFILE)
    if not os.path.exists(dest_filename):
        print('Downloading data from %s...' % DATA_URL)
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer is never mistaken for a complete archive.
        partial_filename = dest_filename + '.part'
        urlretrieve(DATA_URL, partial_filename)
        os.replace(partial_filename, dest_filename)
        print('Download finished')

    # The archive unpacks into a single top-level directory whose name is the
    # last component of DATA_EXTRACTED_DIR; locate it relative to ``dest_dir``
    # so that the ``dest_dir`` argument is honoured.
    extracted_dir = os.path.join(dest_dir, os.path.basename(DATA_EXTRACTED_DIR))
    if not os.path.exists(extracted_dir):
        print('Extracting archive...')
        with tarfile.open(dest_filename, "r:gz") as tar:
            tar.extractall(dest_dir)

    print('Dataset ready in directory: %s' % extracted_dir)
    
def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    The file is read in binary mode with ``encoding='bytes'``, which makes
    8-bit strings pickled by Python 2 come back as ``bytes`` objects.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    with open(file, 'rb') as stream:
        return pickle.load(stream, encoding='bytes')


# Make sure the dataset files are present on disk before reading them.
download_extract_dataset(DATA_DIR)

# Training data: the five files data_batch_1 ... data_batch_5, loaded in order.
batch_train_dicts = list(map(
    unpickle,
    (os.path.join(DATA_EXTRACTED_DIR, 'data_batch_%d' % batch_no)
     for batch_no in range(1, 6)),
))

# Test data: a single file.
batch_test_dict = unpickle(os.path.join(DATA_EXTRACTED_DIR, 'test_batch'))


In [None]:
# Join training set batches together, for simplified processing

import numpy as np
# Seed NumPy's global random number generator with a fixed value so that
# later random draws are repeatable from run to run.
np.random.seed(42)

def join_batches(batch_dicts):
    """Merge a sequence of batch dictionaries into one dictionary.

    Args:
        batch_dicts: Non-empty sequence of dictionaries. Each must hold an
            array under ``b'data'`` and a list of labels under ``b'labels'``.

    Returns:
        A new dictionary that starts as a shallow copy of the first batch, so
        its other keys keep the first batch's values. ``b'data'`` is the
        concatenation of all batches' data along the first axis, and
        ``b'labels'`` is a new list with all labels in the same order.
        The input dictionaries are not modified.

    Raises:
        IndexError: If ``batch_dicts`` is empty.
    """
    result = dict(batch_dicts[0])
    # One concatenation over every batch: works for any number of batches and
    # allocates the joined array only once.
    result[b'data'] = np.concatenate([batch[b'data'] for batch in batch_dicts])
    result[b'labels'] = [
        label for batch in batch_dicts for label in batch[b'labels']
    ]
    return result

# The test set is a single batch and is used as loaded.
test_dict = batch_test_dict

# The training set is the join of all training batches.
train_dict = join_batches(batch_train_dicts)

# Report the number of rows in the joined training data.
print('Training set size: %d' % train_dict[b'data'].shape[0])