# Loading MNIST Dataset

This script downloads, unpacks, merges and saves MNIST datasets (both train and test)

http://yann.lecun.com/exdb/mnist/index.html

## 0) Initialization

In [1]:
# Import the required modules:
import gzip
import shutil
import os
import numpy as np
from urllib.request import urlretrieve
from mnist import MNIST

# Define DATA paths.
# Both paths keep their trailing slash because the rest of the notebook builds
# file paths by plain string concatenation (e.g. MNIST_folder + file_name).
DATA_folder  = '../../Data/'
MNIST_folder = DATA_folder+'MNIST/'

# Create the target folder (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(MNIST_folder, exist_ok=True)

## 1) Download and Unpack Data

In [2]:
### Defining required functions:

# downloadData() - downloads a file and saves it to desired path
def downloadData(file_url, gz_file_name):
    """Download `file_url` and store it inside MNIST_folder.

    Args:
        file_url: URL of the file to fetch.
        gz_file_name: File name (relative to MNIST_folder) to save the
            download under.

    Returns:
        None. The downloaded file on disk is the only result.
    """
    print ('Downloading ' + file_url)
    # urlretrieve returns (local_path, headers); neither is needed here.
    urlretrieve(file_url, MNIST_folder+gz_file_name)

# unpackData()   - unpacks a gz file and deletes the gz after
def unpackData(gz_file_name, file_name):
    """Decompress a gzip archive in MNIST_folder, then delete the archive.

    Args:
        gz_file_name: Name of the .gz archive inside MNIST_folder.
        file_name: Name of the decompressed file to write inside MNIST_folder.

    Returns:
        None. The archive is removed only after the copy finished without
        raising.
    """
    print ('Unpacking ' + gz_file_name)
    archive_path = MNIST_folder+gz_file_name
    with gzip.open(archive_path, 'rb') as compressed:
        with open(MNIST_folder+file_name, 'wb') as extracted:
            # Stream the decompressed bytes straight into the output file.
            shutil.copyfileobj(compressed, extracted)
    os.remove(archive_path)

In [3]:
### Download and unpack MNIST dataset files

# Base location of the gzipped MNIST archives:
MNIST_base_url = 'http://yann.lecun.com/exdb/mnist/'

# Names of the unpacked idx files (train/test images and labels).
# This is the single source of truth: archive names and URLs are derived from
# it so the three lists can never get out of sync.
names   = ['train-images-idx3-ubyte',
           'train-labels-idx1-ubyte',
           't10k-images-idx3-ubyte',
           't10k-labels-idx1-ubyte']

# Each archive is the unpacked file name plus '.gz', hosted under the base URL:
gznames = [name + '.gz' for name in names]
urls    = [MNIST_base_url + gzname for gzname in gznames]

# Fetch each archive, then unpack it (unpackData deletes the .gz afterwards):
for url, gzname, name in zip(urls, gznames, names):
    downloadData(url, gzname)
    unpackData(gzname, name)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Unpacking train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Unpacking train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Unpacking t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Unpacking t10k-labels-idx1-ubyte.gz


## 2) Data Loading Examples

The MNIST module automatically reads the images and labels (a list of images and an array of labels) and checks data validity (magic-number signature).

    - images -> list of images (each image -> list of unsigned bytes)
    - labels -> array of unsigned bytes

In [6]:
# MNIST module data loading:
# Point the loader at the folder that holds the unpacked idx files.
mndata = MNIST(MNIST_folder)
# Each call yields an (images, labels) pair. Per the notebook's description,
# images is a list of per-image byte lists and labels is an array of unsigned
# bytes -- TODO confirm against the installed mnist module version.
train_images, train_labels = mndata.load_training()
test_images,  test_labels  = mndata.load_testing()

In [7]:
# Convert to numpy arrays.
# Images are built as float64 in one step; this gives the same result as the
# former `np.array(...)/1.0` but without first materialising a full integer
# array and then a second float copy.
# NOTE(review): assumes the loader returns images as lists of integer bytes,
# as described in the section text -- confirm if the loader's return type changes.
train_imgs = np.array(train_images, dtype=np.float64)
train_lbls = np.array(train_labels)
test_imgs = np.array(test_images, dtype=np.float64)
test_lbls = np.array(test_labels)

# Clear regular arrays (drop the references so the Python containers can be
# garbage-collected; the names stay defined as None):
train_images = train_labels = test_images = test_labels = None

# Save numPy versions (written to DATA_folder, next to the MNIST/ subfolder):
np.save(DATA_folder+'train_imgs.npy',train_imgs)
np.save(DATA_folder+'train_lbls.npy',train_lbls)
np.save(DATA_folder+'test_imgs.npy',test_imgs)
np.save(DATA_folder+'test_lbls.npy',test_lbls)