In [1]:
#function to transform the images into workable numpy arrays
import gzip
from struct import unpack

import cv2
import numpy as np
import pandas as pd
from numpy import zeros, uint8, float32

def get_labeled_data(imagefile, labelfile):
    """Read a gzip-compressed image file and its matching label file.

    Layout expected in both files (all integers big-endian, hence '>I'):
      * image file: 4-byte magic number, then three unsigned 32-bit ints
        (number of images, rows, cols), then one unsigned byte per pixel;
      * label file: 4-byte magic number, then one unsigned 32-bit int
        (number of labels), then one unsigned byte per label.

    Parameters
    ----------
    imagefile : path of the gzip-compressed image file.
    labelfile : path of the gzip-compressed label file.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        x has shape (N, rows, cols) and dtype float32 (raw pixel values);
        y has shape (N, 1) and dtype uint8 (one label per image).

    Raises
    ------
    Exception
        If the image count and the label count in the headers differ.
    ValueError
        If either file holds fewer payload bytes than its header announces.
    """
    # Context managers guarantee both gzip handles are closed, even on error.
    with gzip.open(imagefile, 'rb') as images, gzip.open(labelfile, 'rb') as labels:
        # Get metadata for images
        images.read(4)  # skip the magic number
        number_of_images, rows, cols = unpack('>III', images.read(12))

        # Get metadata for labels
        labels.read(4)  # skip the magic number
        N = unpack('>I', labels.read(4))[0]

        if number_of_images != N:
            raise Exception('number of labels did not match the number of images')

        # Pull each payload in a single read instead of one byte per call.
        pixel_bytes = images.read(N * rows * cols)
        label_bytes = labels.read(N)

    if len(pixel_bytes) != N * rows * cols:
        raise ValueError('image file is truncated: expected %i pixel bytes, got %i'
                         % (N * rows * cols, len(pixel_bytes)))
    if len(label_bytes) != N:
        raise ValueError('label file is truncated: expected %i label bytes, got %i'
                         % (N, len(label_bytes)))

    # Same progress lines as the historical per-image loop (one per 1000 images).
    for i in range(0, N, 1000):
        print("i: %i" % i)

    # frombuffer gives a read-only view over the bytes; astype/copy produce
    # independent, writable arrays for the caller.
    x = np.frombuffer(pixel_bytes, dtype=uint8).reshape(N, rows, cols).astype(float32)
    y = np.frombuffer(label_bytes, dtype=uint8).reshape(N, 1).copy()
    return (x, y)

In [2]:
# load the training images together with their labels
train_images, train_labels = get_labeled_data(
    imagefile='files/train-images-idx3-ubyte.gz',
    labelfile='files/train-labels-idx1-ubyte.gz',
)

i: 0
i: 1000
i: 2000
i: 3000
i: 4000
i: 5000
i: 6000
i: 7000
i: 8000
i: 9000
i: 10000
i: 11000
i: 12000
i: 13000
i: 14000
i: 15000
i: 16000
i: 17000
i: 18000
i: 19000
i: 20000
i: 21000
i: 22000
i: 23000
i: 24000
i: 25000
i: 26000
i: 27000
i: 28000
i: 29000
i: 30000
i: 31000
i: 32000
i: 33000
i: 34000
i: 35000
i: 36000
i: 37000
i: 38000
i: 39000
i: 40000
i: 41000
i: 42000
i: 43000
i: 44000
i: 45000
i: 46000
i: 47000
i: 48000
i: 49000
i: 50000
i: 51000
i: 52000
i: 53000
i: 54000
i: 55000
i: 56000
i: 57000
i: 58000
i: 59000


In [3]:
# load the test images together with their labels
test_images, test_labels = get_labeled_data(
    imagefile='files/t10k-images-idx3-ubyte.gz',
    labelfile='files/t10k-labels-idx1-ubyte.gz',
)

i: 0
i: 1000
i: 2000
i: 3000
i: 4000
i: 5000
i: 6000
i: 7000
i: 8000
i: 9000


In [62]:
# persist both label arrays to disk (np.save adds the .npy extension)
np.save(file='train_labels', arr=train_labels)
np.save(file='test_labels', arr=test_labels)

In [4]:
# binarize the images: cv2.threshold returns (retval, image), so [1] keeps the image
# single-image trial at threshold 127; `tim` holds the whole (retval, image) tuple
tim = cv2.threshold(train_images[0], 127, 255, cv2.THRESH_BINARY)

# every image in both sets is thresholded at 140 with a max value of 255
train_imagesT = [
    cv2.threshold(img, 140, 255, cv2.THRESH_BINARY)[1] for img in train_images
]
test_imagesT = [
    cv2.threshold(img, 140, 255, cv2.THRESH_BINARY)[1] for img in test_images
]



In [55]:
# reference features: the thresholded images, flattened without cropping or resizing
train_imagesFlat_untouched = list(map(np.ravel, train_imagesT))

test_imagesFlat_untouched = list(map(np.ravel, test_imagesT))

In [56]:
# wrap the flattened reference images in DataFrames (one row per image)
df_train_untouched = pd.DataFrame(data=train_imagesFlat_untouched)
df_test_untouched = pd.DataFrame(data=test_imagesFlat_untouched)

In [59]:
# write the reference feature tables to disk as pickles
df_train_untouched.to_pickle(path='train_images_untouched.pkl')
df_test_untouched.to_pickle(path='test_images_untouched.pkl')

In [5]:
#crop the images to contain numbers as tightly as possible
def crop(x):
    """Return the tightest rectangular slice of `x` holding every pixel > 0.

    Parameters
    ----------
    x : 2-D numpy array (rows x cols). It does not need to be square.

    Returns
    -------
    numpy array
        The slice x[top:bottom + 1, left:right + 1], where top/bottom are the
        first/last rows and left/right the first/last columns that contain a
        value greater than 0. This is a slice of `x`, not a copy.
        If no value is greater than 0 there is nothing to crop to, and `x`
        itself is returned unchanged.
    """
    foreground = x > 0

    # Indices of the rows / columns that contain at least one foreground pixel.
    rows_hit = np.flatnonzero(foreground.any(axis=1))
    cols_hit = np.flatnonzero(foreground.any(axis=0))

    # Blank image: no bounding box exists, so hand the input back as-is
    # instead of failing on undefined bounds.
    if rows_hit.size == 0:
        return x

    top, bottom = rows_hit[0], rows_hit[-1]
    left, right = cols_hit[0], cols_hit[-1]

    return x[top:bottom + 1, left:right + 1]

In [30]:
# crop every thresholded image down to the bounding box found by crop()
train_imagesS = list(map(crop, train_imagesT))

test_imagesS = list(map(crop, test_imagesT))

In [36]:
# bring every cropped image to a uniform 20 x 20 pixel size (nearest-neighbour interpolation)
train_imagesF = [
    cv2.resize(img, (20, 20), interpolation=cv2.INTER_NEAREST) for img in train_imagesS
]

test_imagesF = [
    cv2.resize(img, (20, 20), interpolation=cv2.INTER_NEAREST) for img in test_imagesS
]


    


In [43]:
# turn each resized image into a 1-D feature vector for the naive bayes classifier
train_imagesFlat = list(map(np.ravel, train_imagesF))

test_imagesFlat = list(map(np.ravel, test_imagesF))


In [51]:
# wrap the processed feature vectors in DataFrames (one row per image)
df_train_touched = pd.DataFrame(data=train_imagesFlat)
df_test_touched = pd.DataFrame(data=test_imagesFlat)

In [60]:
# write the processed feature tables to disk as pickles
df_train_touched.to_pickle(path='train_images_touched.pkl')
df_test_touched.to_pickle(path='test_images_touched.pkl')