In [1]:
import tensorflow as tf
import numpy as np
from urllib.request import urlretrieve

In [2]:
# URL of the CIFAR-10 "python version" archive that the next cell downloads.
data_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

In [3]:
# Download the archive into the current working directory. The call is
# unconditional, so the file is fetched again every time this cell runs.
# The displayed value is urlretrieve's (filename, headers) return tuple.
urlretrieve(data_url, 'cifar-10-python.tar.gz')

('cifar-10-python.tar.gz', <http.client.HTTPMessage at 0x7fd8d4a0bbd0>)

In [4]:
# Unpack the downloaded archive into the current working directory;
# -v makes tar list every extracted path, which is the output shown below.
!tar -xvf cifar-10-python.tar.gz

cifar-10-batches-py/
cifar-10-batches-py/data_batch_4
cifar-10-batches-py/readme.html
cifar-10-batches-py/test_batch
cifar-10-batches-py/data_batch_3
cifar-10-batches-py/batches.meta
cifar-10-batches-py/data_batch_2
cifar-10-batches-py/data_batch_5
cifar-10-batches-py/data_batch_1


In [5]:
def unpickle(file):
  """Load and return the object stored in a pickle file.

  Args:
    file: Path of the pickle file to read (opened in binary mode).

  Returns:
    The unpickled object. Because the file is read with ``encoding='bytes'``,
    strings pickled by Python 2 come back as ``bytes`` (e.g. ``b'data'``).

  Raises:
    OSError: If the file cannot be opened.
    pickle.UnpicklingError: If the content is not a valid pickle.
  """
  import pickle
  # SECURITY: pickle.load can execute arbitrary code while loading. Only call
  # this on files from a source you trust.
  with open(file, 'rb') as fo:
    # Use a name that does not shadow the builtin `dict`.
    contents = pickle.load(fo, encoding='bytes')
  return contents

In [11]:
# Load the five training batches. The paths are relative to the working
# directory, which is where the archive was downloaded and extracted, rather
# than assuming the Colab-only '/content' prefix.
batch1, batch2, batch3, batch4, batch5 = (
    unpickle('cifar-10-batches-py/data_batch_{}'.format(i))
    for i in range(1, 6)
)

# see what's in batch1
batch1.keys()

dict_keys([b'batch_label', b'labels', b'data', b'filenames'])

In [12]:
# Merge the five training batches into a single training set.
_train_batches = (batch1, batch2, batch3, batch4, batch5)

# Image rows are stacked along the sample axis.
train_data = np.concatenate([b[b'data'] for b in _train_batches], axis=0)

# Labels are flattened, in batch order, into one plain Python list.
train_labels = [label for b in _train_batches for label in b[b'labels']]

In [13]:
# Load the test batch from the extracted archive directory. The path is
# relative to the working directory instead of the Colab-only '/content'.
test = unpickle('cifar-10-batches-py/test_batch')

In [14]:
# Training split: the concatenated batches.
x_train = train_data
y_train = train_labels

# Test split: taken directly from the test batch dict.
x_test = test[b'data']
y_test = test[b'labels']

In [15]:
def data_desc(x_train, y_train, x_test, y_test, index=12):
  """Print a short description of a train/test split.

  Reports the Python types of the training inputs and labels, the shapes of
  all four collections, and one example label.

  Args:
    x_train: Training inputs (anything ``np.shape`` accepts).
    y_train: Training labels; must support integer indexing.
    x_test: Test inputs.
    y_test: Test labels.
    index: Position of the training label to print as an example. Defaults
      to 12, the value that was previously hard-coded.

  Raises:
    IndexError: If ``index`` is out of range for ``y_train``.
  """
  # data type
  print('\ttype(x_train): {}'.format(type(x_train)))
  print('\ttype(y_train): {}'.format(type(y_train)))
  # data shape
  named_parts = (('x_train', x_train), ('y_train', y_train),
                 ('x_test', x_test), ('y_test', y_test))
  for name, part in named_parts:
    print('\t{}.shape: {}'.format(name, np.shape(part)))

  # an instance
  print('\ty_train[{}]: {}'.format(index, y_train[index]))

In [17]:
# Inputs: take ndarray copies of both splits; the values are not changed.
xtrain, xtest = np.array(x_train), np.array(x_test)

# Labels: one-hot encode the integer class ids (only y is encoded).
one_hot = tf.keras.utils.to_categorical
ytrain = one_hot(y_train, num_classes=10)
ytest = one_hot(y_test, num_classes=10)

In [18]:
# Describe the data before and after preprocessing, one section each.
for heading, split in (
    ('before preprocessing:', (x_train, y_train, x_test, y_test)),
    ('after preprocessing:', (xtrain, ytrain, xtest, ytest)),
):
  print('--------------------------')
  print(heading)
  data_desc(*split)

--------------------------
before preprocessing:
	type(x_train): <class 'numpy.ndarray'>
	type(y_train): <class 'list'>
	x_train.shape: (50000, 3072)
	y_train.shape: (50000,)
	x_test.shape: (10000, 3072)
	y_test.shape: (10000,)
	y_train[12]: 7
--------------------------
after preprocessing:
	type(x_train): <class 'numpy.ndarray'>
	type(y_train): <class 'numpy.ndarray'>
	x_train.shape: (50000, 3072)
	y_train.shape: (50000, 10)
	x_test.shape: (10000, 3072)
	y_test.shape: (10000, 10)
	y_train[12]: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


In [19]:
# Load CIFAR-10 through Keras. This rebinds x_train/y_train/x_test/y_test,
# replacing the manually unpickled arrays assigned earlier.
train_split, test_split = tf.keras.datasets.cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

In [22]:
# Sanity-check the shape of every array in both splits.
expected_shapes = (
    (x_train, (50000, 32, 32, 3)),
    (x_test, (10000, 32, 32, 3)),
    (y_train, (50000, 1)),
    (y_test, (10000, 1)),
)
for array, shape in expected_shapes:
  assert array.shape == shape

In [23]:
# Print types, shapes and one sample label for the current
# x_train/y_train/x_test/y_test bindings.
data_desc(x_train, y_train, x_test, y_test)

	type(x_train): <class 'numpy.ndarray'>
	type(y_train): <class 'numpy.ndarray'>
	x_train.shape: (50000, 32, 32, 3)
	y_train.shape: (50000, 1)
	x_test.shape: (10000, 32, 32, 3)
	y_test.shape: (10000, 1)
	y_train[12]: [7]


In [34]:
# NOTE(review): this cell is placed above the astype cell, but its execution
# count (34) is later than that cell's (33), so the saved output appears to
# reflect the already-converted array. Run top-to-bottom, it shows whatever
# dtype x_train has at this point instead - confirm and reorder if needed.
x_train.dtype

dtype('float32')

In [36]:
# First pixel (row 0, column 0) of the first training image.
# NOTE(review): this cell is placed above the normalization cell, but its
# execution count (36) is later than that cell's (35), so the saved output
# appears to show already-scaled values - confirm and reorder if needed.
x_train[0][0][0]

array([0.23137255, 0.24313726, 0.24705882], dtype=float32)

In [33]:
# Cast both input arrays to float32.
x_train, x_test = (split.astype('float32') for split in (x_train, x_test))

In [35]:
# normalization: scale the inputs by 1/255 (assumes raw values are in 0-255).
# Each array is scaled only while its maximum is above 1, so re-running this
# cell does not divide already-normalized data a second time. The explicit
# astype lets the division also work if the arrays are still integer-typed.
if x_train.max() > 1:
  x_train = x_train.astype('float32') / 255
if x_test.max() > 1:
  x_test = x_test.astype('float32') / 255