In [None]:
# 1. Prepare dataset

# Dataset CIFAR-10
# downloaded from: https://www.cs.toronto.edu/~kriz/cifar.html
# described in: Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.

import pickle
import os
import tarfile
import urllib.request

DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
DATA_TARFILE = 'cifar-10-python.tar.gz'
DATA_DIR = 'dataset'
DATA_EXTRACTED_DIR = 'dataset/cifar-10-batches-py'

def download_extract_dataset(dest_dir):
    """
    Download and extract CIFAR-10 dataset (if necessary) to a given directory.

    The archive is stored as ``dest_dir/DATA_TARFILE`` and unpacked inside ``dest_dir``.
    Both steps are skipped when their result is already present on disk.

    :param dest_dir: directory that receives the archive and the extracted folder;
        created if it does not exist
    :return: None
    """
    os.makedirs(dest_dir, exist_ok=True)

    dest_filename = os.path.join(dest_dir, DATA_TARFILE)
    if not os.path.exists(dest_filename):
        print('Downloading data from %s...' % DATA_URL)
        # Download under a temporary name so that an interrupted transfer never
        # leaves a truncated file that later runs would mistake for the archive.
        partial_filename = dest_filename + '.part'
        try:
            urllib.request.urlretrieve(DATA_URL, partial_filename)
            os.replace(partial_filename, dest_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
        print('Download finished')

    # The archive unpacks into a single top-level folder; look for it (and extract it)
    # under dest_dir rather than under the module-level default directory.
    extracted_dir = os.path.join(dest_dir, os.path.basename(DATA_EXTRACTED_DIR))
    if not os.path.exists(extracted_dir):
        print('Extracting archive...')
        with tarfile.open(dest_filename, "r:gz") as tar:
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape dest_dir (available on newer Pythons).
                tar.extractall(dest_dir, filter='data')
            else:
                tar.extractall(dest_dir)

    print('Dataset ready in directory: %s' % extracted_dir)
    
def unpickle(file):
    """
    Load one pickled object from the file at path `file`.

    The file is read in binary mode and unpickled with encoding='bytes'.
    NOTE: unpickling can execute arbitrary code; only use this on files from a trusted source.
    """
    with open(file, 'rb') as stream:
        return pickle.load(stream, encoding='bytes')


# Make sure the dataset is on disk before reading it
download_extract_dataset(DATA_DIR)

# Training data: files data_batch_1 ... data_batch_5, one unpickled dict per file
batch_train_dicts = [
    unpickle(batch_path)
    for batch_path in (
        os.path.join(DATA_EXTRACTED_DIR, 'data_batch_%d' % batch_no)
        for batch_no in range(1, 6)
    )
]
# Test data: a single file
batch_test_dict = unpickle(os.path.join(DATA_EXTRACTED_DIR, 'test_batch'))

In [None]:
# Join training set batches together, for simplified processing

import numpy as np
np.random.seed(42)

BATCH_SIZE = 10000
BATCH_CNT = 5
# Derived rather than hard-coded, so it cannot drift from the two constants above
TRAIN_HEIGHT = BATCH_SIZE * BATCH_CNT
TRAIN_WIDTH = batch_train_dicts[0][b'data'].shape[1]

# Join batches together. Concatenating (instead of filling np.empty buffers slice by slice)
# guarantees that every row of the result comes from a loaded batch.
X_train = np.concatenate([batch[b'data'] for batch in batch_train_dicts]).astype('uint8', copy=False)
y_train = np.concatenate([batch[b'labels'] for batch in batch_train_dicts]).astype('uint8')

# Fail loudly if the files on disk do not match the expected layout
assert X_train.shape == (TRAIN_HEIGHT, TRAIN_WIDTH), 'unexpected training data shape: %r' % (X_train.shape,)
assert y_train.shape == (TRAIN_HEIGHT,), 'unexpected training labels shape: %r' % (y_train.shape,)

X_test = batch_test_dict[b'data']
y_test = np.asarray(batch_test_dict[b'labels'], dtype='uint8')

print('Full training set size: %d' % len(X_train))
print('Full test set size: %d' % len(X_test))

In [None]:
# Convert images to RGB format

def cifar_to_rgb_dataset(imgs):
    """
    Convert all given images from the flat CIFAR layout to the channel-last layout used by matplotlib

    :param imgs: an array of images, each given as 3072 consecutive pixel values:
        the red plane first, then green, then blue; every plane stored row-wise
    :return: an array of shape (..., 32, 32, 3), with values of type 'float32',
        equal to the input values divided by 255
    """
    # (N, 3072) -> (N, channel, row, column)
    planes_first = np.asarray(imgs).reshape(-1, 3, 32, 32)
    # move the channel axis to the end: (N, row, column, channel)
    channels_last = planes_first.transpose(0, 2, 3, 1)
    return channels_last.astype('float32') / 255.

In [None]:
# 2. Plot images

import matplotlib.pyplot as plt
%matplotlib inline

CLASS_CNT = 10  # number of classes plotted (labels 0 .. CLASS_CNT - 1)
CLASS_SAMPLE_SIZE = 10  # images drawn per class; each class fills one row of the grid

fig = plt.figure()
fig.subplots_adjust(wspace=0.1, hspace=0.1)

for row in range(CLASS_CNT):
    # all training images labelled with this class
    class_imgs = X_train[y_train == row]
    # draw distinct random images of the class
    picks = np.random.choice(len(class_imgs), CLASS_SAMPLE_SIZE, replace=False)
    # subplot numbering is 1-based and runs row by row
    first_cell = row * CLASS_SAMPLE_SIZE + 1
    for col, rgb_img in enumerate(cifar_to_rgb_dataset(class_imgs[picks])):
        ax = fig.add_subplot(CLASS_CNT, CLASS_SAMPLE_SIZE, first_cell + col)
        ax.imshow(rgb_img)
        ax.axis('off')

plt.show()

In [None]:
# Extract HOG features

from skimage.feature import hog
from skimage import color

def rgb_to_hog(img):
    """
    Calculate HOG features of a single RGB image.

    The image is converted to grayscale first; blocks are normalised with 'L2-Hys'.

    :param img: one RGB image, as accepted by skimage.color.rgb2gray
    :return: the result of skimage.feature.hog for the grayscale image
    """
    img_gray = color.rgb2gray(img)
    # No visualisation image is requested: that is hog()'s default, so the flag is simply
    # omitted. The old `visualise=` spelling is rejected by current scikit-image releases.
    return hog(img_gray, block_norm='L2-Hys')

def rgb_to_hog_dataset(imgs):
    """ Compute the HOG features of each image in turn and stack them into one 'float32' array """
    return np.asarray([rgb_to_hog(single_img) for single_img in imgs], dtype='float32')

def cifar_to_hog(imgs):
    """ Extract HOG features for all images, starting from CIFAR format (converted to RGB first) """
    rgb_imgs = cifar_to_rgb_dataset(imgs)
    return rgb_to_hog_dataset(rgb_imgs)

In [None]:
# 3. Shallow classifier

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from shutil import rmtree
from tempfile import mkdtemp
from time import time

# Temporary directory handed to the pipeline as its `memory` cache location
cachedir = mkdtemp()

# Pipeline: raw CIFAR rows -> HOG features -> standardisation -> SVM classifier
hogs = FunctionTransformer(cifar_to_hog)
pipe = Pipeline(
    steps=[
        ('hog', hogs),
        ('norm', StandardScaler()),
        ('svc', SVC()),
    ],
    memory=cachedir,
)

# Parameter grid: five log-spaced C values (10^-1 .. 10^3) for each of the two kernels
Cs = np.logspace(start=-1, stop=3, num=5)
grid_params = dict(svc__kernel=['rbf', 'linear'], svc__C=Cs)

clf = GridSearchCV(estimator=pipe, param_grid=grid_params, cv=3, n_jobs=-1)

In [None]:
# Fit estimator using a smaller subset of images

# Number of training images randomly drawn from the full training set for the grid search below.
# TODO: set appropriate value / use full dataset:
SUBSET_SIZE = 1000

def choose_random_subset(X, y, subset_size):
    """
    Pick the same random rows from X and y.

    :param X: array of samples
    :param y: array of labels, indexed like X
    :param subset_size: number of rows to keep
    :return: tuple (X subset, y subset), both selected with the same indices
    """
    shuffled = np.random.permutation(len(X))
    picked = shuffled[:subset_size]
    return X[picked], y[picked]

# Draw the reduced training set used for the grid search
X, y = choose_random_subset(X_train, y_train, SUBSET_SIZE)

print('Reduced training set size: %d' % len(X))
print()
print('Fitting ...')
start = time()
try:
    clf.fit(X, y)
    end = time()
finally:
    # Remove the pipeline's temporary cache directory even when fit() raises, so a failed
    # run does not leave it behind. Removal errors are ignored so they cannot mask an
    # exception coming from fit().
    rmtree(cachedir, ignore_errors=True)
print('Fitting done.')
print('Time elapsed: %0.03fs' % (end - start,))

In [None]:
# Grid search summary

# Winning parameter combination
print('Best parameters set found on training set:')
print(clf.best_params_)
print()

# One line per tried combination: mean cross-validation score and twice its standard deviation
print('Grid scores on training set:')
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for idx, params in enumerate(clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (means[idx], stds[idx] * 2, params))
print()

# Score of the fitted search object on the held-out test set
print('Score on test set:')
print(clf.score(X_test, y_test))