[View in Colaboratory](https://colab.research.google.com/github/pawelos88/cifar10/blob/master/Cifar10.ipynb)

### Download CIFAR10 dataset

In [5]:
from urllib.request import urlretrieve
from urllib.parse import urlparse
import os

cifar10_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
cifar10_archive_file = os.path.basename(urlparse(cifar10_url).path)

urlretrieve(cifar10_url, cifar10_archive_file)

('cifar-10-python.tar.gz', <http.client.HTTPMessage at 0x7f30971d6e10>)

In [1]:
print("test")

test


###Extract archive file

In [0]:
import tarfile

def extract_archive_file(archive_file):
  with tarfile.open(archive_file) as file:
    file.extractall()
    
extract_archive_file(cifar10_archive_file)

### Load data from archive file

In [0]:
import numpy as np

cifar_path = "cifar-10-batches-py"

def load_data(cifar_path):
  data = load_batches(cifar_path)
  test_data = load_test_batch(cifar_path)
  return data, test_data 

def unpickle(file):
  import pickle
  with open(file, 'rb') as fo:
      dict = pickle.load(fo, encoding='bytes')
  return dict

def load_batches(cifar_path):
  batch_names = [ "data_batch_{}".format(i) for i in range(1, 6) ]
  data_batches = [ load_batch(cifar_path, batch_name) for batch_name in batch_names ]
  data = [ data for (data, label) in data_batches ]
  labels = [ label for (data, label) in data_batches ]
  return np.concatenate(data), np.concatenate(labels)

def load_test_batch(cifar_path):
  return load_batch(cifar_path, "test_batch");

def load_batch(cifar_path, batch_name):
  data_batch = unpickle(os.path.join(cifar_path, batch_name))
  return data_batch[b'data'], data_batch[b'labels']

(X, y), (X_test, y_test) = load_data(cifar_path)


### Split data to train and dev set

In [10]:
from sklearn.cross_validation import train_test_split

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1)




### Show random images from each category

In [0]:
def reshape(image):
  red_channel = image[0:1024].reshape(32, 32)
  green_channel = image[1024:2048].reshape(32, 32)
  blu_cannel = image[2048:3072].reshape(32, 32)
  return np.dstack((red_channel, green_channel, blu_cannel))

### Benchmark using hog feature extractor

In [0]:
from skimage.feature import hog
from skimage import color
from matplotlib.pyplot import imshow

def get_hog(image):
  grey_image = color.rgb2gray(image)
  return hog(grey_image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys', feature_vector=True)
  
X_dev_hog = [ get_hog(reshape(x)) for x in X_dev ]
X_test_hog = [ get_hog(reshape(x)) for x in X_test ]

In [0]:
X_train_hog = [ get_hog(reshape(x)) for x in X_train ]

In [0]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(kernel='linear', C=100)
clf.fit(X_train_hog, y_train) 

y_pred = clf.predict(X_test_hog)
accuracy_score(y_test, y_pred)


