In [None]:
# https://www.kaggle.com/code/hojjatk/read-mnist-dataset/notebook

#
# This is a sample Notebook to demonstrate how to read "MNIST Dataset"
#
import os
import struct
from array import array
from os.path import join

import numpy as np # linear algebra

#
# MNIST Data Loader Class
#
class MnistDataloader(object):
    """Loads the MNIST training and test sets from IDX-format files.

    The four file paths are stored at construction time; nothing is read
    from disk until `load_data` (or `read_images_labels`) is called.
    """

    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Remember where the image/label files of both splits live."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one image file and its matching label file.

        Args:
            images_filepath: path to an IDX image file (magic number 2051).
            labels_filepath: path to an IDX label file (magic number 2049).

        Returns:
            A tuple `(images, labels)`. `images` is a list with one entry per
            image; each entry is a list of `rows` one-dimensional numpy
            arrays of length `cols`. `labels` is an `array('B')` holding
            every byte that follows the label header.

        Raises:
            ValueError: if either magic number is wrong, or if the image file
                holds fewer pixel bytes than its header announces.
        """
        with open(labels_filepath, 'rb') as file:
            # Header: two big-endian uint32 values (magic number, item count).
            # The count is not needed because all remaining bytes are labels.
            magic, _label_count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())

        with open(images_filepath, 'rb') as file:
            # Header: magic number, image count, rows and columns per image.
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())

        pixels_per_image = rows * cols
        expected_bytes = size * pixels_per_image
        if len(image_data) < expected_bytes:
            # Fail early with a clear message instead of an obscure reshape error.
            raise ValueError('Image data truncated, expected {} bytes, got {}'.format(
                expected_bytes, len(image_data)))

        images = []
        for i in range(size):
            start = i * pixels_per_image
            # Use the dimensions from the header rather than assuming 28x28.
            img = np.array(image_data[start:start + pixels_per_image]).reshape(rows, cols)
            # Stored as a list of row arrays so callers see the same structure as before.
            images.append(list(img))

        return images, labels

    def load_data(self):
        """Read both splits and return `(x_train, y_train), (x_test, y_test)`."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [None]:
# https://www.kaggle.com/code/hojjatk/read-mnist-dataset/notebook

#
# Verify Reading Dataset via MnistDataloader class
#
%matplotlib inline
import random
import matplotlib.pyplot as plt
import pickle

#
# Set file paths based on added MNIST Datasets
#
# Every MNIST file sits inside a directory that carries the same name as the
# file itself, all below `input_path`.
input_path = 'dataset/'
(training_images_filepath,
 training_labels_filepath,
 test_images_filepath,
 test_labels_filepath) = [
    join(input_path, relative_path)
    for relative_path in (
        'train-images-idx3-ubyte/train-images-idx3-ubyte',
        'train-labels-idx1-ubyte/train-labels-idx1-ubyte',
        't10k-images-idx3-ubyte/t10k-images-idx3-ubyte',
        't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte',
    )
]

#
# Helper function to show a list of images with their relating titles
#
def show_images(images, title_texts):
    """Draw the images on a 5-column grid in a new 30x20 figure.

    Args:
        images: sequence of 2-D image data accepted by `plt.imshow`.
        title_texts: one title per image; an empty string leaves that
            subplot untitled.

    Images and titles are paired with `zip`, so any surplus entries in the
    longer of the two are ignored.
    """
    cols = 5
    # Ceiling division: just enough rows for every image, without the empty
    # extra row the grid would get when the count is a multiple of `cols`.
    rows = (len(images) + cols - 1) // cols
    plt.figure(figsize=(30,20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize = 15)

#
# Load MNIST dataset
#
# Read both MNIST splits from the configured IDX files.
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(data_train, label_train), (data_test, label_test) = mnist_dataloader.load_data()

# The first ten training examples are also stored separately (and reused for
# the preview further down). The labels are converted to a plain list.
first_10_data_train = data_train[:10]
first_10_label_train = list(label_train[:10])

# Make sure the output directory exists; otherwise every open(..., "wb")
# below fails with FileNotFoundError on a fresh checkout.
os.makedirs("dataset/pickled", exist_ok=True)

for pickle_path, payload in (
    ("dataset/pickled/data_train.pickle", data_train),
    ("dataset/pickled/label_train.pickle", label_train),
    ("dataset/pickled/data_test.pickle", data_test),
    ("dataset/pickled/label_test.pickle", label_test),
    ("dataset/pickled/first_10_data_train.pickle", first_10_data_train),
    ("dataset/pickled/first_10_label_train.pickle", first_10_label_train),
):
    with open(pickle_path, "wb") as outfile:
        pickle.dump(payload, outfile)

# Split the training set into NUM_BATCHES consecutive, equally sized batches.
# With the 60000 MNIST training examples this gives 6000 per batch; examples
# beyond NUM_BATCHES * BATCH_SIZE (a non-divisible remainder) are left out.
NUM_BATCHES = 10
BATCH_SIZE = len(data_train) // NUM_BATCHES

data_batches = [
    data_train[k * BATCH_SIZE:(k + 1) * BATCH_SIZE]
    for k in range(NUM_BATCHES)
]
# list(...) turns each label slice into a plain list of ints.
label_batches = [
    list(label_train[k * BATCH_SIZE:(k + 1) * BATCH_SIZE])
    for k in range(NUM_BATCHES)
]

# Keep the individual batch names available for any code that refers to them.
(data_batch_01, data_batch_02, data_batch_03, data_batch_04, data_batch_05,
 data_batch_06, data_batch_07, data_batch_08, data_batch_09, data_batch_10) = data_batches
(label_batch_01, label_batch_02, label_batch_03, label_batch_04, label_batch_05,
 label_batch_06, label_batch_07, label_batch_08, label_batch_09, label_batch_10) = label_batches

# Files are numbered 01..10, data batches first and label batches afterwards.
for batch_number, batch in enumerate(data_batches, start=1):
    with open("dataset/pickled/data_batch_{:02d}.pickle".format(batch_number), "wb") as outfile:
        pickle.dump(batch, outfile)

for batch_number, batch in enumerate(label_batches, start=1):
    with open("dataset/pickled/label_batch_{:02d}.pickle".format(batch_number), "wb") as outfile:
        pickle.dump(batch, outfile)

# Collect the first 100 training examples and their labels, then pickle both.
data_batch_first_100 = [data_train[idx] for idx in range(100)]
label_batch_first_100 = [label_train[idx] for idx in range(100)]

for pickle_path, payload in (
    ("dataset/pickled/data_batch_first_100.pickle", data_batch_first_100),
    ("dataset/pickled/label_batch_first_100.pickle", label_batch_first_100),
):
    with open(pickle_path, "wb") as outfile:
        pickle.dump(payload, outfile)

#
# Show the first 10 training images and 5 random test images
#
# Preview: the first ten training images followed by five randomly chosen
# test images, each titled with its index and label.
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    images_2_show.append(first_10_data_train[i])
    titles_2_show.append('training image [' + str(i) + '] = ' + str(first_10_label_train[i]))

for i in range(0, 5):
    # randrange excludes its upper bound, so r is always a valid index and
    # index 0 can be drawn too. randint(1, 10000) could return 10000, which
    # is out of range for a 10000-item test set.
    r = random.randrange(len(data_test))
    images_2_show.append(data_test[r])
    titles_2_show.append('test image [' + str(r) + '] = ' + str(label_test[r]))

show_images(images_2_show, titles_2_show)