# Prepare the notMNIST dataset #

We thank the authors of this [Udacity tutorial](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/udacity), which was the main inspiration for this tutorial. We have reused some of their code snippets.

**Import** all packages that will be used.

In [None]:
import os, sys, tarfile, pickle
from six.moves.urllib.request import urlretrieve
import numpy as np
from scipy import ndimage
from IPython.display import display, Image
import matplotlib.pyplot as plt

**Download** the dataset.

In [None]:
# Remote location of the notMNIST archives and the local directory they are stored in.
url = 'https://commondatastorage.googleapis.com/books1000/'
data_root = 'data/notMNIST'

# Percentage most recently handled by the download progress hook (None before the first call).
last_percent_reported = None

# Create the dataset directory, including missing parents, on the first run.
if not os.path.isdir(data_root):
  os.makedirs(data_root)

def download_progress_hook(count, blockSize, totalSize):
  """Report the progress of a download on stdout.

  Intended to be passed as ``reporthook`` to ``urlretrieve``. Each time the
  integer percentage changes, the percentage itself is written when it is a
  multiple of 5 and a single "." otherwise. Nothing is written when the total
  size is unknown.

  Args:
    count: number of blocks transferred so far.
    blockSize: size of one block in bytes.
    totalSize: total size of the file in bytes; zero or negative when the
      size is not known.
  """
  global last_percent_reported

  # urlretrieve may report -1 when the server does not announce a size, and an
  # empty file has size 0: no meaningful percentage exists in either case
  # (and dividing by 0 would raise ZeroDivisionError).
  if totalSize <= 0:
    return

  # The last block is usually only partially filled, so count * blockSize can
  # exceed totalSize; never report more than 100%.
  percent = min(100, int(count * blockSize * 100 / totalSize))

  if last_percent_reported == percent:
    return

  if percent % 5 == 0:
    sys.stdout.write("%s%%" % percent)
  else:
    sys.stdout.write(".")
  sys.stdout.flush()

  last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Fetch `filename` into `data_root` unless it is already there, then check its size.

  Args:
    filename: name of the file, appended to `url` and joined onto `data_root`.
    expected_bytes: size in bytes that the local file must have.
    force: download even when the local file already exists.

  Returns:
    The path of the local file.

  Raises:
    Exception: if the size of the local file differs from `expected_bytes`.
  """
  dest_filename = os.path.join(data_root, filename)

  needs_download = force or not os.path.exists(dest_filename)
  if needs_download:
    print('Attempting to download:', filename)
    urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')

  # Verify the file on disk, whether it was just downloaded or already present.
  actual_bytes = os.stat(dest_filename).st_size
  if actual_bytes != expected_bytes:
    raise Exception(
      'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')

  print('Found and verified', dest_filename)
  return dest_filename

# Fetch both archives (skipped when already on disk) and verify their sizes in bytes.
train_filename = maybe_download('notMNIST_large.tar.gz', expected_bytes=247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', expected_bytes=8458043)

The dataset was downloaded as two tarballs. **Extract** both of them.

In [None]:
# Seed NumPy's global random generator so that later random choices are reproducible.
np.random.seed(133)

# Number of class folders each extracted archive is expected to contain.
num_classes = 10

def maybe_extract(filename, force=False):
  """Extract a ``.tar.gz`` archive into `data_root` and list its class folders.

  Extraction is skipped when the target directory (the archive path without
  its ``.tar.gz`` suffix) already exists, unless `force` is set.

  Args:
    filename: path of the archive.
    force: extract even when the target directory is already present.

  Returns:
    Sorted list of the paths of the sub-directories found in the target
    directory, one per class.

  Raises:
    Exception: if the number of sub-directories differs from `num_classes`.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Make the message visible before the long-running extraction starts.
    sys.stdout.flush()
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(filename) as tar:
      if hasattr(tarfile, 'data_filter'):
        # The archive was downloaded from the network: where the interpreter
        # supports extraction filters, reject members that would be written
        # outside data_root or that are special files.
        tar.extractall(data_root, filter='data')
      else:
        tar.extractall(data_root)

  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
# Unpack both archives (skipped when already extracted) and collect their class folders.
train_folders = maybe_extract(train_filename, force=False)
test_folders = maybe_extract(test_filename, force=False)

**Load all images** and create a single Tensor for each letter. For example, there are about 53000 pictures of the letter A in the dataset, from which we will choose 5000 (4500 for training, 250 for validation and 250 for testing) - the script will create a single Tensor of dimensions (5000, 28, 28), where 28 is both the width and height of each image. Due to memory constraints, we only load the letters A to D; the resulting dataset is saved into a [pickle](https://docs.python.org/3/library/pickle.html) at the end of this notebook.

In [None]:
# Images are square: this is both their width and their height in pixels.
image_size = 28

# Number of images taken from each class for the three subsets.
train_per_class = 4500
valid_per_class = 250
test_per_class = 250

# Images to load per class: exactly enough to fill all three subsets.
total_per_class = sum((train_per_class, valid_per_class, test_per_class))

# Letters (classes) to load.
letters = list("ABCD")

def load_letter(folder, max_images):
  """Load up to `max_images` images from `folder` into a single float32 array.

  Files that cannot be read are reported on stdout and skipped; the next
  files in the listing are used instead.

  Args:
    folder: directory containing the image files of one letter.
    max_images: maximum number of images to load.

  Returns:
    Array of dtype float32 and shape (n, image_size, image_size), where n is
    the number of images actually loaded (at most `max_images`).

  Raises:
    Exception: if an image does not have shape (image_size, image_size).
  """
  image_files = os.listdir(folder)
  max_images = min(max_images, len(image_files))
  # Allocated uninitialized; only the filled prefix is returned.
  dataset = np.ndarray(shape=(max_images, image_size, image_size), dtype=np.float32)

  num_images = 0
  for image in image_files:
    if num_images >= max_images:
      break

    image_file = os.path.join(folder, image)
    try:
      # NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and
      # removed in SciPy 1.2; this call needs an older SciPy or a replacement
      # image reader.
      image_data = ndimage.imread(image_file)
    except IOError as e:
      # Skipping unreadable files is deliberate, but say so instead of
      # dropping them silently.
      print('Could not read: %s (%s) - skipping.' % (image_file, e))
      continue

    if image_data.shape != (image_size, image_size):
      raise Exception('Unexpected image shape: %s' % str(image_data.shape))

    dataset[num_images, :, :] = image_data
    num_images += 1

  # Drop the slots left unfilled because of skipped files.
  return dataset[:num_images, :, :]
        
def maybe_pickle(data_folders, total_per_class, letters, force=False):
  """Load the images of the requested letters into memory.

  Despite its name, this function does not write any pickle file; it only
  calls `load_letter` for every folder whose name is one of `letters`.

  Args:
    data_folders: paths of the class folders; the last path component of
      each one is taken as the letter name.
    total_per_class: maximum number of images to load for each letter.
    letters: letters to load; folders of other letters are ignored.
    force: unused; kept so that existing calls keep working.

  Returns:
    Dict mapping each loaded letter to the value returned by `load_letter`.
  """
  dataset = {}

  for folder in data_folders:
    # basename(normpath(...)) copes with the platform's path separator and
    # with a trailing separator, which a plain split("/") does not.
    letter_name = os.path.basename(os.path.normpath(folder))

    if letter_name in letters:
      dataset[letter_name] = load_letter(folder, total_per_class)

  return dataset

dataset = maybe_pickle(train_folders, total_per_class, letters)

# Report how many images were loaded for each letter; every letter must provide a full set.
for letter_name, samples in dataset.items():
    num_samples = samples.shape[0]
    print("letter {:s}: {:d} samples".format(letter_name, num_samples))
    assert num_samples == total_per_class

**Create a subset** for training, validation and testing.

In [None]:
# Integer class id of each letter, following the order of `letters`.
labels = dict(zip(letters, range(len(letters))))

# Boundaries of the three consecutive slices taken from each letter's samples.
valid_start = train_per_class
test_start = valid_start + valid_per_class
test_stop = test_start + test_per_class

splits = {}
for letter in letters:
    samples = dataset[letter]
    splits[letter] = {
        "train": samples[:valid_start],
        "valid": samples[valid_start:test_start],
        "test": samples[test_start:test_stop],
    }


def _merge_split(split_name):
    """Stack one split of every letter and build the matching int32 label vector."""
    parts = [splits[letter][split_name] for letter in letters]
    data = np.concatenate(parts)
    # One label per sample: the class id of the letter the sample belongs to.
    targets = np.concatenate(
        [np.zeros(part.shape[0]) + labels[letter]
         for letter, part in zip(letters, parts)]).astype(np.int32)
    return data, targets


train_data, train_labels = _merge_split("train")
valid_data, valid_labels = _merge_split("valid")
test_data, test_labels = _merge_split("test")

Make sure the letters were loaded correctly.

In [None]:
print("left to right:", letters)
print("run again for different images")

from mpl_toolkits.axes_grid1 import ImageGrid

# Draw one sample position that is valid for every letter. Bounding the index
# by len(letters) would only ever show the first few images of each letter.
index = np.random.randint(0, min(dataset[letter].shape[0] for letter in letters))

fig = plt.figure(1, figsize=(10, 10))
# One row with one cell per letter.
grid = ImageGrid(fig, 111, nrows_ncols=(1, len(letters)), axes_pad=0.2)

for i, letter in enumerate(letters):
    # NOTE(review): the division assumes pixel values in the 0-255 range - confirm
    # against the image reader used by load_letter.
    grid[i].imshow(dataset[letter][index] / 255, interpolation="bilinear", cmap="gray")
    # Hide all ticks and tick labels. Booleans are used because the string 'off'
    # is non-empty and therefore truthy wherever a plain boolean is expected.
    grid[i].tick_params(axis='both', which='both', bottom=False, top=False,
                        labelbottom=False, right=False, left=False, labelleft=False)

plt.show()

**Save** the preprocessed dataset.

In [None]:
# save the dataset as a pickle
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

try:
  save = {
    'train_dataset': train_data,
    'train_labels': train_labels,
    'valid_dataset': valid_data,
    'valid_labels': valid_labels,
    'test_dataset': test_data,
    'test_labels': test_labels,
    'labels': labels
    }
  # The context manager closes the file even when pickling fails.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  # Report which file could not be written, then let the error propagate.
  print('Unable to save data to', pickle_file, ':', e)
  raise

# Report the size in bytes of the file that was written.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)