In [1]:
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
import tensorflow as tf

from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression

from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

%matplotlib inline

In [None]:
# Base URL; archive file names are appended to it when downloading.
url = 'https://commondatastorage.googleapis.com/books1000/'
# NOTE(review): not referenced by any code visible in this notebook —
# presumably left over from a download progress hook; confirm before removing.
last_percent_reported = None
# Directory downloads are written to and archives are extracted into.
data_root = '.'
        
def maybe_download(filename, expected_bytes, force=False):
    """Fetch `filename` into `data_root` when needed and check its size.

    Args:
        filename: Name of the file, relative to both `url` and `data_root`.
        expected_bytes: Size in bytes the local file must have.
        force: Download even if the local file already exists.

    Returns:
        Path of the local file.

    Raises:
        Exception: if the local file's size differs from `expected_bytes`.
    """
    target_path = os.path.join(data_root, filename)
    needs_fetch = force or not os.path.exists(target_path)
    if needs_fetch:
        print('Attempting to download:', filename)
        urlretrieve(url + filename, target_path)
        print('\nDownload Complete!')

    # Guard clause: a size mismatch means a truncated or wrong file.
    if os.stat(target_path).st_size != expected_bytes:
        raise Exception(
          'Failed to verify ' + target_path + '.')
    print('Found and verified', target_path)
    return target_path

# Fetch both archives (skipped when already on disk); the second argument is
# the byte size each local file is verified against.
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)

In [None]:
# Number of class folders each extracted archive must contain.
num_classes = 10
# Seed NumPy's global RNG so the shuffles/permutations below are repeatable.
np.random.seed(133)

def maybe_extract(filename, force=False):
    """Extract a ``.tar.gz`` archive if needed and list its class folders.

    Args:
        filename: Path of the archive. The extraction root is this path with
            its last two extensions removed.
        force: Extract even if the root directory already exists.

    Returns:
        Sorted list of paths of the sub-directories found in the root.

    Raises:
        Exception: if the number of sub-directories differs from
            ``num_classes``.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        # Flush so the message is visible before the long-running extraction.
        sys.stdout.flush()
        # NOTE: extractall() trusts member paths; only use on trusted archives.
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(filename) as tar:
            tar.extractall(data_root)
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
  
# Unpack both archives (when not already unpacked) and keep the per-class
# folder paths for the following cells.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

In [None]:
import random
from IPython.display import Image

def display_random(folder):
    """Show one randomly chosen regular file from `folder` as an inline image."""
    candidates = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            candidates.append(entry)
    chosen_path = os.path.join(folder, random.choice(candidates))
    display(Image(filename=chosen_path))

foldername = "notMNIST_large"

# Show one random image per class folder, in alphabetical order.
# Only descend into sub-directories: maybe_pickle() writes '<letter>.pickle'
# files next to the class folders, and treating such a file as a folder makes
# os.listdir() fail when this cell is re-run.
for letter in sorted(os.listdir(foldername)):
    letter_dir = os.path.join(foldername, letter)
    if not os.path.isdir(letter_dir):
        continue
    print(letter)
    display_random(letter_dir)

In [None]:
# Expected geometry and value range of the raw images; load_letter() uses
# these to validate shapes and to normalise pixel values.
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
    """Read every image in `folder` into one normalised float32 array.

    Each image is converted as (pixels - pixel_depth / 2) / pixel_depth.
    Files whose loading raises IOError or ValueError are reported and skipped.

    Args:
        folder: Directory holding the image files of one letter.
        min_num_images: Minimum number of images that must load successfully.

    Returns:
        Array of shape (loaded_count, image_size, image_size).

    Raises:
        Exception: if an image does not have shape (image_size, image_size),
            or if fewer than `min_num_images` images were loaded.
    """
    file_names = os.listdir(folder)
    expected_shape = (image_size, image_size)
    # Allocate for the worst case (every file loads); trimmed after the loop.
    dataset = np.ndarray(shape=(len(file_names),) + expected_shape,
                         dtype=np.float32)
    print(folder)
    loaded = 0
    for file_name in file_names:
        image_file = os.path.join(folder, file_name)
        try:
            raw_pixels = imageio.imread(image_file).astype(float)
            image_data = (raw_pixels - pixel_depth / 2) / pixel_depth
            if image_data.shape != expected_shape:
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[loaded, :, :] = image_data
            loaded += 1
        except (IOError, ValueError) as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    # Drop the unused tail left by skipped files.
    dataset = dataset[0:loaded, :, :]
    if loaded < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (loaded, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    """Pickle the images of each class folder to ``<folder>.pickle``.

    Args:
        data_folders: Iterable of class folder paths.
        min_num_images_per_class: Passed to load_letter() as the minimum
            number of images a folder must yield.
        force: Re-create pickles that already exist.

    Returns:
        List of pickle file paths, one per folder, in input order. A path is
        listed even when writing it failed (the failure is only printed).
    """
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        # The check must run per folder, inside the loop.
        if os.path.exists(set_filename) and not force:
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names

# Pickle every class folder; the second argument is the minimum number of
# images load_letter() must read for each class.
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

In [None]:
def display_numbers_of_images(data_folders):
    """Print how many images the pickle of each class folder contains.

    For every folder the file ``<folder>.pickle`` is loaded and its length is
    printed. Pickles that cannot be read are reported and skipped.

    Args:
        data_folders: Iterable of class folder paths.
    """
    for folder in data_folders:
        pickle_filename = folder + '.pickle'
        try:
            # NOTE: pickle.load can execute arbitrary code; only load pickles
            # produced by this notebook.
            with open(pickle_filename, 'rb') as f:
                dataset = pickle.load(f)
        except Exception as e:
            print('Unable to read data from ', pickle_filename, ":", e)
            # Nothing to count for this folder; do not report a stale dataset.
            continue
        print("Number of images in ", folder, ": ", len(dataset))

# Print the image count of every training class pickle.
display_numbers_of_images(train_folders)

In [None]:
def make_arrays(nb_rows, img_size):
    """Allocate uninitialised image and label arrays for `nb_rows` samples.

    Returns:
        (dataset, labels) where dataset is float32 with shape
        (nb_rows, img_size, img_size) and labels is int32 with shape
        (nb_rows,); (None, None) when `nb_rows` is falsy.
    """
    if not nb_rows:
        return None, None
    images = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    targets = np.ndarray(nb_rows, dtype=np.int32)
    return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
    """Build class-balanced training (and optional validation) arrays.

    Each pickle is loaded and shuffled in place with NumPy's global RNG. Its
    first `valid_size // n` rows go to the validation set and the next
    `train_size // n` rows to the training set, where n is the number of
    pickle files. The label is the index of the pickle in `pickle_files`.

    Args:
        pickle_files: Per-class pickle paths, in label order.
        train_size: Total number of training rows to allocate.
        valid_size: Total number of validation rows; 0 disables validation.

    Returns:
        (valid_dataset, valid_labels, train_dataset, train_labels); the two
        validation entries are None when `valid_size` is 0.

    Raises:
        Whatever loading or copying raised, after printing the offending file.
    """
    class_count = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // class_count
    tsize_per_class = train_size // class_count
    take_per_class = vsize_per_class + tsize_per_class

    for label, pickle_file in enumerate(pickle_files):
        # Destination rows for this class in the merged arrays.
        valid_rows = slice(label * vsize_per_class, (label + 1) * vsize_per_class)
        train_rows = slice(label * tsize_per_class, (label + 1) * tsize_per_class)
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
            np.random.shuffle(letter_set)

            if valid_dataset is not None:
                valid_dataset[valid_rows, :, :] = letter_set[:vsize_per_class, :, :]
                valid_labels[valid_rows] = label

            # Training rows come right after the ones used for validation.
            train_dataset[train_rows, :, :] = (
                letter_set[vsize_per_class:take_per_class, :, :])
            train_labels[train_rows] = label
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels
            
            
# Total number of samples per split (divided evenly across classes by
# merge_datasets).
train_size = 200000
valid_size = 10000
test_size = 10000

# Training and validation samples are both drawn from the training pickles.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
# No validation part is requested for the test pickles, so the first two
# return values are None and are discarded.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

In [None]:
def randomize(dataset, labels):
    """Return copies of `dataset` and `labels` reordered by one shared permutation.

    The permutation is drawn from NumPy's global RNG and indexes the first
    axis of both arrays, so sample/label pairs stay aligned.
    """
    order = np.random.permutation(labels.shape[0])
    return dataset[order, :, :], labels[order]
# merge_datasets() fills the arrays one class after another, so shuffle each
# split to mix the classes.
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

In [None]:
# Maps integer class labels 0..9 to the letters 'A'..'J' used as plot titles.
pretty_labels = dict(enumerate('ABCDEFGHIJ'))

def displ_random(dataset, label):
    """Plot eight randomly chosen samples in a 2x4 grid, titled by letter.

    Args:
        dataset: Indexable collection of images.
        label: Integer labels aligned with `dataset`; each must be a key of
            `pretty_labels`. At least eight entries are required.
    """
    chosen = random.sample(range(len(label)), 8)
    for position, item_num in enumerate(chosen, start=1):
        plt.subplot(2, 4, position)
        plt.axis('off')
        plt.title(pretty_labels[label[item_num]])
        plt.imshow(dataset[item_num])
        
# Visual sanity check: eight random training images with their labels.
displ_random(train_dataset, train_labels)

In [None]:
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

# Write all six arrays to a single pickle file.
try:
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    # `with` closes the file even when pickle.dump raises part-way through.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

In [None]:
# Shape parameters read by reformat() below.
image_size = 28  # height and width each image is reshaped to
num_labels = 10  # width of the one-hot label vectors
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
    """Convert images to 4-D float32 and integer labels to one-hot float32.

    Returns:
        (images, one_hot): images has shape
        (-1, image_size, image_size, num_channels); one_hot has one row per
        label and `num_labels` columns, with 1.0 in the column equal to the
        label value.
    """
    target_shape = (-1, image_size, image_size, num_channels)
    images = dataset.reshape(target_shape).astype(np.float32)
    # Broadcasting a column of labels against 0..num_labels-1 gives a boolean
    # matrix with exactly one True per row for in-range labels.
    class_ids = np.arange(num_labels)
    one_hot = (class_ids == labels[:, None]).astype(np.float32)
    return images, one_hot

# Convert every split to 4-D images and one-hot labels.
# NOTE(review): the names are rebound to the converted arrays, so re-running
# this cell alone would one-hot encode already one-hot labels — re-run from
# the merge cell instead.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

In [None]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max matches in both arrays.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D array of the same layout (e.g. one-hot targets).
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    hits = np.sum(predicted_classes == true_classes)
    return 100.0 * hits / predictions.shape[0]

In [None]:
import numpy as np

# Pad images with 0s
# Axes 1 and 2 (height, width) get 2 pixels on each side; the sample and
# channel axes are left untouched.
# NOTE(review): the arrays are overwritten in place, so re-running this cell
# pads again. Also, load_letter() maps raw pixel 0 to -0.5, so a 0.0 border is
# not the same value as a raw-zero pixel — confirm this is intended.
train_dataset     = np.pad(train_dataset, ((0,0),(2,2),(2,2),(0,0)), 'constant')
valid_dataset = np.pad(valid_dataset, ((0,0),(2,2),(2,2),(0,0)), 'constant')
test_dataset       = np.pad(test_dataset, ((0,0),(2,2),(2,2),(0,0)), 'constant')
    
print("Updated Image Shape: {}".format(train_dataset[0].shape))

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Pick one training sample at random and show it with its class index.
# random.randint() includes its upper bound, so randint(0, len(...)) could
# return an index one past the end; randrange() excludes the upper bound.
index = random.randrange(len(train_dataset))
image = train_dataset[index].squeeze()  # drop the single channel axis

plt.figure(figsize=(1,1))
plt.imshow(image, cmap="gray")
print(np.argmax(train_labels[index]))

In [None]:
# Check classes in validation set are balanced
# (number of non-zero entries in each column of the one-hot label matrix).
# This must stay the last expression of the cell so the counts are displayed.
np.count_nonzero(valid_labels, axis=0)

In [None]:
image_size = 28 + 4  # 28x28 images padded by 2 pixels on every side
batch_size = 16
beta = 1e-3  # weight of the L2 penalty added to the loss

graph = tf.Graph()

with graph.as_default():

    # Input data. Training batches are fed at run time; the validation and
    # test sets are baked into the graph as constants.
    tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # Step counter driving the learning-rate schedule; incremented by the
    # optimizer on every update (see minimize() below).
    global_step = tf.Variable(0, trainable=False)

    # Variables
    c1_w = tf.Variable(tf.truncated_normal([5, 5, num_channels, 6], stddev=0.1))
    c1_b = tf.Variable(tf.zeros([6]))

    c3_w = tf.Variable(tf.truncated_normal([5, 5, 6, 16], stddev=0.1))
    c3_b = tf.Variable(tf.zeros([16]))

    f5_w = tf.Variable(tf.truncated_normal([400, 120], stddev=0.1))
    f5_b = tf.Variable(tf.zeros([120]))

    f6_w = tf.Variable(tf.truncated_normal([120, 84], stddev=0.1))
    f6_b = tf.Variable(tf.zeros([84]))

    f7_w = tf.Variable(tf.truncated_normal([84, 10], stddev=0.1))
    f7_b = tf.Variable(tf.zeros([10]))

    def model(data):
        """Forward pass: conv, pool, conv, pool, then three dense layers.

        Args:
            data: Tensor of shape (batch, 32, 32, num_channels).

        Returns:
            Tensor of shape (batch, 10) holding unnormalised class scores
            (logits); softmax is applied by the callers.
        """
        # C1: 32x32x1 -> 28x28x6
        c1 = tf.nn.conv2d(data, c1_w, [1, 1, 1, 1], padding='VALID') + c1_b
        c1 = tf.nn.relu(c1)
        shape = c1.get_shape().as_list()
        assert shape[1:] == [28, 28, 6], "The shape is %s" % shape

        # S2: 28x28x6 -> 14x14x6
        s2 = tf.nn.max_pool(c1, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        shape = s2.get_shape().as_list()
        assert shape[1:] == [14, 14, 6], "The shape is %s" % shape

        # C3: 14x14x6 -> 10x10x16
        c3 = tf.nn.conv2d(s2, c3_w, [1, 1, 1, 1], padding='VALID') + c3_b
        c3 = tf.nn.relu(c3)
        shape = c3.get_shape().as_list()
        assert shape[1:] == [10, 10, 16], "The shape is %s" % shape

        # S4: 10x10x16 -> 5x5x16
        s4 = tf.nn.max_pool(c3, [1, 2, 2, 1], [1, 2, 2, 1], padding='VALID')
        shape = s4.get_shape().as_list()
        assert shape[1:] == [5, 5, 16], "The shape is %s" % shape

        # F5: 5x5x16 -> 120 (flattened to 400 features first)
        f5_in = tf.contrib.layers.flatten(s4)
        f5 = tf.nn.relu(tf.matmul(f5_in, f5_w) + f5_b)

        # F6: 120 -> 84
        f6 = tf.nn.relu(tf.matmul(f5, f6_w) + f6_b)

        # F7: 84 -> 10. No activation here: the loss and the prediction ops
        # apply softmax to these logits themselves.
        f7 = tf.matmul(f6, f7_w) + f7_b
        return f7

    # Training computation: mean cross-entropy plus L2 penalty on all weights.
    logits = model(tf_train_dataset)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    l2_penalty = (tf.nn.l2_loss(c1_w) + tf.nn.l2_loss(c3_w) + tf.nn.l2_loss(f5_w) +
                  tf.nn.l2_loss(f6_w) + tf.nn.l2_loss(f7_w))
    loss = cross_entropy + beta * l2_penalty

    # Optimizer.
    learning_rate = tf.train.inverse_time_decay(0.05, global_step, 500, 0.85, staircase=True)
    # global_step must be handed to minimize(); otherwise it stays at 0 and
    # the learning rate above never decays.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
        loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
# Number of minibatch updates to run.
num_steps = 10001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Slide a batch-sized window through the training set; the modulo
        # wraps the offset so every slice holds exactly batch_size samples.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        # One optimisation step; also fetch the loss and the batch predictions.
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        # Report progress every 50 steps, including accuracy on the whole
        # validation set.
        if (step % 50 == 0):
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
    valid_prediction.eval(), valid_labels))
    # Final evaluation on the test set, once training has finished.
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))