In [None]:
### Load packages

import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

In [None]:
### Define environment variables

# Project root. Export TENSORFLOW_UDACITY_BASEDIR to run the notebook from a
# different checkout; the original hard-coded location is kept as the default
# so existing setups behave exactly as before.
BASEDIR = os.environ.get(
    "TENSORFLOW_UDACITY_BASEDIR",
    "/Users/theodore/workspace/pycharm/TensorFlowUdacity",
)

# Data root and the two notMNIST image trees derived from it
DATADIR = os.path.join(BASEDIR, "data")
TRAINDATADIR = os.path.join(DATADIR, "notmnist", "notmnist_large")
TESTDATADIR = os.path.join(DATADIR, "notmnist", "notmnist_small")

In [None]:
### Functions for getting array of directory paths and array of file paths

def get_dir_paths(root):
  """Return the full paths of the immediate subdirectories of `root`, sorted by name."""
  dir_paths = []
  for entry in sorted(os.listdir(root)):
    candidate = os.path.join(root, entry)
    # Keep directories only; plain files and other entries are ignored
    if os.path.isdir(candidate):
      dir_paths.append(candidate)
  return dir_paths

def get_file_paths(root):
  """Return the full paths of the regular files directly inside `root`, sorted by name."""
  # Build each candidate path once, then keep only those that are files
  candidates = (os.path.join(root, entry) for entry in sorted(os.listdir(root)))
  return [path for path in candidates if os.path.isfile(path)]

In [None]:
### Get the per-class directory paths of the training and test sets

# Each entry is one subdirectory of the corresponding image tree, sorted by name
train_data_paths = get_dir_paths(root=TRAINDATADIR)
test_data_paths = get_dir_paths(root=TESTDATADIR)

In [None]:
### Problem 1

In [None]:
## Display a sample of 5 images in their initial png format

nsamples = 5

for i in range(nsamples):
    # Draw a random test-set directory first, then a random image file inside it
    sample_dir = np.random.choice(test_data_paths)
    sample_file = np.random.choice(get_file_paths(sample_dir))
    display(Image(filename=sample_file))

In [None]:
## Set image properties

# Pixel width and height
image_size = 28
# Number of levels per pixel; also the divisor used when scaling pixel values
pixel_depth = 255.0

In [None]:
## Read a sample image

# Pick a random test-set directory, then a random image file inside it
sample_letter_dir = np.random.choice(test_data_paths)
image_file = np.random.choice(get_file_paths(sample_letter_dir))

# NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and removed in
# 1.2, so this cell requires an older SciPy release.
raw_pixels = ndimage.imread(image_file)
image_data = raw_pixels.astype(float)

In [None]:
## Show numeric representation of image (the pixel array, cast to float when it was read)

image_data

In [None]:
## Show type of image object (the container returned by imread(...).astype(float))

type(image_data)

In [None]:
## Show dimensions of image object
## (load_letter below only accepts images of shape (image_size, image_size))

image_data.shape

In [None]:
## Plot image using imshow

# Explicit figure/axes pair instead of the implicit pyplot "current axes"
fig, ax = plt.subplots()
ax.imshow(image_data)
plt.show()

In [None]:
## Plot image using a scatterplot

# One grayscale color string per pixel (intensity / pixel_depth), in row-major order
colors = [str(level) for level in np.ravel(image_data) / pixel_depth]

# Marker positions: x cycles through the columns within each row, while y is
# reversed so that the first image row is drawn at the top of the plot
pixel_axis = np.arange(image_size)
x_coords = np.tile(pixel_axis, image_size)
y_coords = np.repeat(pixel_axis[::-1], image_size)

plt.scatter(x_coords, y_coords, s=150, c=colors, marker='s')
plt.show()

In [None]:
## Plot image using a scatterplot by setting cmap option

# scatter() applies `cmap` only when `c` holds numeric values. The grayscale
# color strings used in the plain scatterplot are explicit colors, so passing
# them here made matplotlib ignore the colormap. Pass the scaled pixel
# intensities as numbers instead (in [0, 1] assuming 8-bit pixel values).
intensities = np.ravel(image_data) / pixel_depth
plt.scatter(
    np.tile(np.arange(image_size), image_size),
    np.repeat(np.flipud(np.arange(image_size)), image_size),
    s=150,
    c=intensities,
    marker='s',
    cmap=plt.cm.viridis
)
plt.show()

In [None]:
## Function for loading data for a single letter

def load_letter(root, image_size, pixel_depth, verbose=True, min_nimages=1):
  """Load the images of a single letter into one normalised tensor.

  Args:
    root: Directory containing the image files of one letter.
    image_size: Expected pixel width and height of every image.
    pixel_depth: Value used to centre and scale the raw pixels, as
      (pixel - pixel_depth / 2) / pixel_depth.
    verbose: When True, print the directory and summary statistics.
    min_nimages: Minimum number of images that must load successfully.

  Returns:
    A float32 array of shape (n_loaded, image_size, image_size) holding the
    images that could be read, in sorted file-name order.

  Raises:
    Exception: If an image does not have shape (image_size, image_size), or
      if fewer than `min_nimages` images were loaded.
  """
  if verbose:
    print(root)

  image_files = get_file_paths(root)
  # Allocate for the best case (every file readable); trimmed after the loop
  dataset = np.empty(shape=(len(image_files), image_size, image_size), dtype=np.float32)

  image_index = 0
  for image_path in image_files:
    try:
      # NOTE(review): scipy.ndimage.imread was deprecated in SciPy 1.0 and
      # removed in 1.2, so this requires an older SciPy release.
      image_data = (ndimage.imread(image_path).astype(float) - pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        # Not an IOError, so this deliberately propagates to the caller
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[image_index, :, :] = image_data
      image_index += 1
    except IOError as e:
      # Report the file that actually failed (previously this printed an
      # unrelated notebook-level variable) together with the reason
      print('Skipping because of not being able to read: ', image_path, ':', e)

  # Drop the unused tail left by files that were skipped
  dataset = dataset[0:image_index, :, :]
  if image_index < min_nimages:
    raise Exception('Fewer images than expected: %d < %d' % (image_index, min_nimages))

  if verbose:
    print('Full dataset tensor: ', dataset.shape)
    print('Mean: ', np.mean(dataset))
    print('Standard deviation: ', np.std(dataset))

  return dataset

In [None]:
## Load the images of one test-set class (index 2 of the sorted directory list)
letter_data = load_letter(test_data_paths[2], image_size, pixel_depth)

In [None]:
## Show dimensions of the loaded tensor: (images loaded, image_size, image_size)
letter_data.shape

In [None]:
## Show dimensions of the first image in the tensor
letter_data[0, :, :].shape

In [None]:
## Show the centred and scaled pixel values of the first image
letter_data[0, :, :]

In [None]:
## Function for pickling data of all letters

def pickle_letters(root, image_size, pixel_depth, verbose=True, min_nimages=1, force=False):
  """Pickle the image tensor of each letter directory and return the pickle paths.

  Args:
    root: Iterable of letter directory paths (one directory per letter).
    image_size: Passed through to `load_letter`.
    pixel_depth: Passed through to `load_letter`.
    verbose: Passed through to `load_letter`.
    min_nimages: Passed through to `load_letter`.
    force: When True, rebuild a pickle even if its file already exists.

  Returns:
    List with one '<directory>.pickle' path per input directory, in input
    order. A path is listed even when writing its file failed.
  """
  dataset_files = []
  for letter_dir in root:
    target = letter_dir + '.pickle'
    dataset_files.append(target)

    # Reuse an existing pickle unless the caller asked for a rebuild
    if not force and os.path.exists(target):
      print('%s already present, skipping pickling' % target)
      continue

    print('Pickling %s' % target)
    letter_tensor = load_letter(letter_dir, image_size, pixel_depth, verbose=verbose, min_nimages=min_nimages)
    try:
      with open(target, 'wb') as handle:
        pickle.dump(letter_tensor, handle, pickle.HIGHEST_PROTOCOL)
    except Exception as err:
      # Best effort: report the failure and carry on with the next letter
      print('Unable to save data to', target, ':', err)

  return dataset_files

In [None]:
## Pickle every training letter directory (existing pickles are reused); keep the pickle paths
train_datasets = pickle_letters(train_data_paths, image_size, pixel_depth)

In [None]:
## Pickle every test letter directory (existing pickles are reused); keep the pickle paths
test_datasets = pickle_letters(test_data_paths, image_size, pixel_depth)