## Dependencies

In [1]:
import gzip
import os
import pathlib
import zipfile

import numpy as np

## Data Loader

In [2]:
# Unpack both dataset archives into the current working directory so the
# loader defined below can read the .gz files from disk.
# The loop variable has a real name on purpose: `_` conventionally marks an
# unused value, and in IPython it is also the "last output" shortcut, which
# should not be rebound by ordinary code.
for archive_name in ['training-data', 'testing-data']:
    with zipfile.ZipFile(f'{archive_name}.zip', 'r') as zip_ref:
        zip_ref.extractall()

In [3]:
# By zalandoresearch

def load_mnist(path, kind='train'):
    """Load MNIST data from `path`.

    Reads the gzip-compressed files ``<kind>-labels-idx1-ubyte.gz`` and
    ``<kind>-images-idx3-ubyte.gz`` found in ``path``, prints the shape of
    each decoded array, and returns both arrays.

    Parameters
    ----------
    path : str
        Directory containing the two ``.gz`` files.
    kind : str, default 'train'
        File-name prefix selecting the split (this notebook uses
        ``'train'`` and ``'t10k'``).

    Returns
    -------
    images : numpy.ndarray
        ``uint8`` array of shape ``(len(labels), 784)``; one flattened
        image per row.
    labels : numpy.ndarray
        One-dimensional ``uint8`` array with one entry per image.

    Raises
    ------
    ValueError
        If the image payload cannot be reshaped to ``(len(labels), 784)``.

    Notes
    -----
    Both arrays are created with ``np.frombuffer`` over the decompressed
    bytes, so they are read-only; copy them before modifying in place.
    """
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    # offset=8 skips the label-file header that precedes the label bytes.
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)
    print(labels.shape)

    # offset=16 skips the image-file header; every image is then flattened
    # into a row of 784 pixel values.
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)
    print(images.shape)

    return images, labels

In [4]:
# Read the training split and the test split from their extracted folders.
X_train_full, y_train_full = load_mnist(path='training-data', kind='train')
X_test, y_test = load_mnist(path='testing-data', kind='t10k')

(60000,)
(60000, 784)
(10000,)
(10000, 784)


In [5]:
# Shape of the raw training images: (number of images, 784 pixel values each).
X_train_full.shape

(60000, 784)

In [6]:
# Shape of the training labels: one entry per training image.
y_train_full.shape

(60000,)

In [7]:
# Shape of the raw test images: (number of images, 784 pixel values each).
X_test.shape

(10000, 784)

In [8]:
# Shape of the test labels: one entry per test image.
y_test.shape

(10000,)

## Main Pre-processing Step for the Sample Image Classification Model

In [9]:
# Largest value a uint8 pixel can hold; dividing by it rescales pixels to [0, 1].
maximum_pixel_intensity = 255.0

In [10]:
# Rescale pixel intensities from [0, 255] to [0.0, 1.0].
X_train = X_train_full / maximum_pixel_intensity

# `X_test` is rebound from itself, so an unguarded division would scale an
# already-scaled array again whenever this cell is re-run. Only divide while
# it still holds the raw uint8 pixels returned by `load_mnist`.
if X_test.dtype == np.uint8:
    X_test = X_test / maximum_pixel_intensity

In [11]:
# Dividing by a scalar is element-wise, so the shape should match X_train_full.
X_train.shape

(60000, 784)

In [12]:
# Dividing by a scalar is element-wise, so the shape should be unchanged.
X_test.shape

(10000, 784)

## Post-processing: Step 1

In [13]:
# Overwrite the extracted training-images file with the scaled pixel array.
# NOTE(review): only the array's raw bytes are written. No header precedes
# them, and `X_train` is floating point after the division by
# `maximum_pixel_intensity`, not uint8. `load_mnist` (which skips 16 header
# bytes and decodes uint8) would therefore not round-trip this file; confirm
# the downstream consumer expects this layout.
with gzip.open('training-data/train-images-idx3-ubyte.gz','wb') as f:
    f.write(X_train)

In [14]:
# Overwrite the extracted training-labels file with the label array's raw bytes.
# NOTE(review): no header is written, whereas `load_mnist` skips the first
# 8 bytes when reading this file name; reading it back that way would drop
# the first 8 labels. Confirm the downstream consumer expects header-less data.
with gzip.open('training-data/train-labels-idx1-ubyte.gz','wb') as f:
    f.write(y_train_full)

In [15]:
# Overwrite the extracted test-images file with the scaled pixel array.
# NOTE(review): only the array's raw bytes are written. No header precedes
# them, and `X_test` is floating point after the division by
# `maximum_pixel_intensity`, not uint8. `load_mnist` (which skips 16 header
# bytes and decodes uint8) would therefore not round-trip this file; confirm
# the downstream consumer expects this layout.
with gzip.open('testing-data/t10k-images-idx3-ubyte.gz','wb') as f:
    f.write(X_test)

In [16]:
# Overwrite the extracted test-labels file with the label array's raw bytes.
# NOTE(review): no header is written, whereas `load_mnist` skips the first
# 8 bytes when reading this file name; reading it back that way would drop
# the first 8 labels. Confirm the downstream consumer expects header-less data.
with gzip.open('testing-data/t10k-labels-idx1-ubyte.gz','wb') as f:
    f.write(y_test)

## Post-processing: Step 2

In [20]:
# NOTE(review): this helper and its two calls in the following cells are
# commented out, yet the saved outputs under those calls match what it prints
# (each .gz path, then the archive listing), so they appear to come from an
# earlier run with the code enabled.
# If re-enabled, it rewrites `<dirname>.zip` (the same archive names that are
# extracted at the top of the notebook) with the .gz files stored at the
# archive root. The name `f` is also used as the file handle in the
# `with gzip.open(...) as f` cells, so running those afterwards rebinds it.
# def f(dirname):
    
#     working_directory = pathlib.Path(f'{dirname}/')

#     with zipfile.ZipFile(f'{dirname}.zip', mode='w') as archive:
#         for gz_path in working_directory.iterdir():
#             print(gz_path)
#             archive.write(gz_path, arcname=gz_path.name)

#     with zipfile.ZipFile(f'{dirname}.zip', mode='r') as archive:
#         archive.printdir()

In [21]:
# f('training-data')

training-data/train-images-idx3-ubyte.gz
training-data/train-labels-idx1-ubyte.gz
File Name                                             Modified             Size
train-images-idx3-ubyte.gz                     2022-04-05 10:13:08     42729786
train-labels-idx1-ubyte.gz                     2022-04-05 10:13:08        29522


In [22]:
# f('testing-data')

testing-data/t10k-images-idx3-ubyte.gz
testing-data/t10k-labels-idx1-ubyte.gz
File Name                                             Modified             Size
t10k-images-idx3-ubyte.gz                      2022-04-05 10:13:14      7150101
t10k-labels-idx1-ubyte.gz                      2022-04-05 10:13:14         5138
