In [1]:
from struct import unpack
import gzip
from numpy import zeros, uint8, float32

def get_labeled_data(images_path='t10k-images-idx3-ubyte.gz',
                     labels_path='t10k-labels-idx1-ubyte.gz'):
    """Read gzip-compressed IDX image and label files into numpy arrays.

    Args:
        images_path: Path of the gzip-compressed image file. Its header is
            four big-endian unsigned 32-bit ints (magic number, image count,
            rows, cols) followed by one unsigned byte per pixel.
        labels_path: Path of the gzip-compressed label file. Its header is
            two big-endian unsigned 32-bit ints (magic number, label count)
            followed by one unsigned byte per label.

    Returns:
        A tuple ``(x, y)``. ``x`` is a float32 array of shape
        ``(N, rows, cols)`` holding the raw pixel byte values (they are not
        rescaled). ``y`` is a uint8 array of shape ``(N, 1)`` holding the
        labels.

    Raises:
        Exception: If the two headers announce different item counts.
        struct.error: If either file ends before its header or the announced
            amount of data has been read.
    """
    # Local imports: these names are not part of the file-level import list.
    from numpy import frombuffer
    from struct import error as struct_error

    # Images read per bulk read; also the interval of the progress output.
    chunk_size = 1000

    # The context managers close both files even if parsing fails.
    with gzip.open(images_path, 'rb') as images, \
            gzip.open(labels_path, 'rb') as labels:
        # Header fields are big-endian unsigned ints, hence '>I'.
        images.read(4)  # skip the magic_number
        number_of_images, rows, cols = unpack('>III', images.read(12))

        labels.read(4)  # skip the magic_number
        N = unpack('>I', labels.read(4))[0]

        if number_of_images != N:
            raise Exception('number of labels did not match the number of images')

        x = zeros((N, rows, cols), dtype=float32)
        y = zeros((N, 1), dtype=uint8)

        # Read the pixels in bulk instead of one byte at a time.
        image_size = rows * cols
        for start in range(0, N, chunk_size):
            print("i: %i" % start)
            stop = min(start + chunk_size, N)
            expected = (stop - start) * image_size
            raw = images.read(expected)
            if len(raw) != expected:
                raise struct_error('image file ended before all pixels were read')
            # Assigning the uint8 view into x converts the values to float32.
            x[start:stop] = frombuffer(raw, dtype=uint8).reshape(
                stop - start, rows, cols)

        raw_labels = labels.read(N)
        if len(raw_labels) != N:
            raise struct_error('label file ended before all labels were read')
        y[:, 0] = frombuffer(raw_labels, dtype=uint8)

    return (x, y)
# Load every image/label pair once at notebook level. Despite the *_train
# names, get_labeled_data() reads the 't10k-*' files; the cell further down
# slices these arrays into the training and test portions.
x_train, y_train = get_labeled_data()

i: 0
i: 1000
i: 2000
i: 3000
i: 4000
i: 5000
i: 6000
i: 7000
i: 8000
i: 9000


In [10]:
import numpy as np

def vectorized_result(j, num_classes=10):
    """Return a one-hot column vector for class index ``j``.

    Args:
        j: Index of the entry to set. May be a plain int or an integer
            array (for example a one-element label row).
        num_classes: Length of the returned vector. Defaults to 10.

    Returns:
        A float array of shape ``(num_classes, 1)`` that is 0.0 everywhere
        except for 1.0 at position ``j``.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e


# First 8000 samples form the training portion. np.reshape to (784, 1)
# requires every image to hold exactly 784 pixels (e.g. 28x28).
training_inputs = [np.reshape(image, (784, 1)) for image in x_train[0:8000]]
training_results = [vectorized_result(label) for label in y_train[0:8000]]
# Materialize the pairs: under Python 3 a bare zip() is a one-shot iterator,
# so it would be empty after the first pass and support neither len() nor
# indexing. list() keeps the data reusable.
training_data = list(zip(training_inputs, training_results))

# Last 2000 samples form the test portion, prepared the same way.
test_inputs = [np.reshape(image, (784, 1)) for image in x_train[-2000:]]
test_results = [vectorized_result(label) for label in y_train[-2000:]]
test_data = list(zip(test_inputs, test_results))