# MNIST Dataset Loading

This script downloads, unpacks, merges, and saves the MNIST datasets (both train and test).

## 0) Initialization

In [21]:
# Import the required modules:
import gzip
import shutil
import os
import numpy as np
from urllib.request import urlretrieve
from mnist import MNIST

# Define MNIST path:
# (a relative path, so it is resolved against the current working directory;
#  the trailing '/' matters because the download/unpack helpers build file
#  paths by plain string concatenation: MNIST_folder + file_name)
MNIST_folder = '../../Data/MNIST/'


## 1) Download and Unpack Data

In [22]:
### Defining required functions:

# downloadData() - downloads a file and saves it to desired path
def downloadData(file_url, gz_file_name):
    """Download ``file_url`` and store it as ``MNIST_folder + gz_file_name``.

    The destination directory is created first when it does not exist yet,
    because ``urlretrieve`` opens the target file for writing and fails with
    ``FileNotFoundError`` if its parent directory is missing.

    Args:
        file_url: URL of the file to fetch.
        gz_file_name: Name of the file to write, relative to ``MNIST_folder``.

    Returns:
        None.
    """
    print ('Downloading ' + file_url)
    destination = MNIST_folder + gz_file_name

    # Make sure the folder we are about to write into exists (no-op if it does).
    target_dir = os.path.dirname(destination)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The (filename, headers) pair returned by urlretrieve is not needed here.
    urlretrieve(file_url, destination)

# unpackData()   - unpacks a gz file and deletes the gz after
def unpackData(gz_file_name, file_name):
    """Decompress a gzip archive located in ``MNIST_folder`` and delete it.

    Args:
        gz_file_name: Name of the ``.gz`` archive, relative to ``MNIST_folder``.
        file_name: Name of the decompressed file to write, relative to
            ``MNIST_folder``.

    Returns:
        None.
    """
    print ('Unpacking ' + gz_file_name)
    archive_path = MNIST_folder+gz_file_name

    # Stream the decompressed bytes straight into the output file.
    with gzip.open(archive_path, 'rb') as packed:
        with open(MNIST_folder+file_name, 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)

    # Both files are closed at this point; the archive is no longer needed.
    os.remove(archive_path)

In [23]:
### Download and unpack MNIST dataset files

# Base URL under which the four MNIST archives are published:
MNIST_url = 'http://yann.lecun.com/exdb/mnist/'

# Names of the unpacked files (train/test images and labels).
# This is the single source of truth: archive names and URLs are derived
# from it, so the three lists can never get out of sync.
names   = ['train-images-idx3-ubyte',
           'train-labels-idx1-ubyte',
           't10k-images-idx3-ubyte',
           't10k-labels-idx1-ubyte']

gznames = [name + '.gz' for name in names]
urls    = [MNIST_url + gzname for gzname in gznames]

# Fetch each archive, then unpack it (unpackData removes the .gz afterwards).
# zip() walks the lists in lockstep, so no hard-coded element count is needed.
for url, gzname, name in zip(urls, gznames, names):
    downloadData(url, gzname)
    unpackData(gzname, name)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Unpacking train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Unpacking train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Unpacking t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Unpacking t10k-labels-idx1-ubyte.gz


## 2) Data Loading Examples

The MNIST module automatically reads the images and labels (into lists and arrays) and checks data validity (magic number signature).

    - images -> list of images (each image -> list of unsigned bytes)
    - labels -> array of unsigned bytes

In [24]:
# MNIST module checks the data validity (magic number) and reads it

# Point the loader at the folder that the download step unpacked the files into.
mndata = MNIST(MNIST_folder)

# Each loader call is unpacked into an (images, labels) pair;
# the *_images / *_labels names are converted to numpy arrays in the next cell.
train_images, train_labels = mndata.load_training()
test_images,  test_labels  = mndata.load_testing()

In [32]:
# Turn the loaded images and labels into numpy arrays (train pair, then test pair):
train_imgs, train_lbls = np.array(train_images), np.array(train_labels)
test_imgs, test_lbls = np.array(test_images), np.array(test_labels)

# Optional clean-up (left disabled): drop the references to the original
# Python containers once the numpy copies exist.
# train_images = None
# train_labels = None
# test_images  = None
# test_labels  = None


numpy.ndarray

In [31]:
# Sparsity check (80% are white pixels -> makes sense to use SPARSE matrices)

# Percentage of zero-valued pixels in the whole training set.
# count_nonzero on the boolean mask counts the zeros in one vectorised pass,
# and .size is the total pixel count, so no image size is hard-coded.
np.count_nonzero(train_imgs == 0) / train_imgs.size * 100


80.879770408163267

In [51]:
# Checking contrast scale (254-255) - almost perfect

# myMin = the smallest "brightest pixel" found over all training images,
# capped at 255 (the value it keeps when every image reaches 255 or when
# there are no images at all).
myMin = 255

if train_imgs.size:
    # One row per image, then the maximum pixel value of every image at once
    # (replaces a Python loop that called np.max several times per image).
    per_image_max = train_imgs.reshape(len(train_imgs), -1).max(axis=1)
    myMin = min(myMin, int(per_image_max.min()))

myMin


254