In [2]:
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# MNIST TFRecord Creator

This notebook creates TFRecord files from the MNIST dataset included with Keras and then uploads them to S3 so that they can be used to train a TensorFlow model with Amazon SageMaker.

In [4]:
import os
import numpy as np
import sagemaker
from keras.datasets import mnist
import tensorflow as tf
# TensorFlow 1.x API: turn on eager execution before any other TensorFlow
# work is done in this kernel.
tf.enable_eager_execution()
# Execution role reported by the SageMaker SDK for this notebook environment.
# NOTE(review): `role` is not referenced by any cell visible in this notebook.
role = sagemaker.get_execution_role()

## SageMaker-Specific Setup and Config

In [5]:
def load_mnist_data():
    """Loads MNIST through Keras and adds an explicit channel axis.

    Returns:
        A ``(train_data, test_data)`` tuple. Each element is a dictionary
        with an ``'images'`` entry (the image array reshaped to
        ``[-1, 28, 28, 1]``) and a ``'labels'`` entry (the label array
        exactly as returned by ``mnist.load_data()``).
    """
    image_shape = [-1, 28, 28, 1]

    def _to_dict(split):
        # Each split is an (images, labels) pair.
        pixels, targets = split
        return {'images': np.reshape(pixels, image_shape), 'labels': targets}

    train_split, test_split = mnist.load_data()
    return _to_dict(train_split), _to_dict(test_split)

In [6]:
def export_tfrecords(data_set, name, directory):
    """Converts an image/label dataset to a single tfrecords file.

    Every example is written as a serialized ``tf.train.Example`` with the
    int64 features ``height``, ``width``, ``depth`` and ``label`` and the
    bytes feature ``image_raw`` (the raw buffer of one image).

    Args:
        data_set: Dictionary containing a numpy array of images under
            ``'images'`` (indexed as [example, row, column, channel]) and the
            matching labels under ``'labels'``.
        name: Name given to the exported tfrecord dataset; the file written
            is ``<name>.tfrecords``.
        directory: Directory that the tfrecord file will be saved in.

    Raises:
        ValueError: If the number of labels differs from the number of images.
    """
    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    images = data_set['images']
    labels = data_set['labels']
    num_examples, rows, cols, depth = images.shape[:4]

    # Fail before touching the filesystem instead of leaving a partially
    # written file (too few labels) or silently dropping labels (too many).
    if len(labels) != num_examples:
        raise ValueError(
            'Expected {} labels to match the images, got {}.'.format(
                num_examples, len(labels)))

    filename = os.path.join(directory, name + '.tfrecords')
    print('Writing', filename)

    # The context manager closes the writer even if serialization fails
    # part-way through the loop.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for image, label in zip(images, labels):
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': _int64_feature(rows),
                'width': _int64_feature(cols),
                'depth': _int64_feature(depth),
                'label': _int64_feature(int(label)),
                # tobytes() replaces the deprecated ndarray.tostring() alias.
                'image_raw': _bytes_feature(image.tobytes())}))
            writer.write(example.SerializeToString())

In [7]:
# Load both MNIST splits as dictionaries with 'images' and 'labels' entries.
train_data, test_data = load_mnist_data()

# Create MNIST TFRecord Files Locally


In [8]:
# One (split name, in-memory dataset) pair per output directory under ./data.
splits = (
    ('train', train_data),
    ('test', test_data),
)

# Create every output directory first, then write one tfrecords file per split.
for split_name, _ in splits:
    os.makedirs('./data/{}/'.format(split_name), exist_ok=True)

for split_name, split_data in splits:
    export_tfrecords(split_data, 'mnist_' + split_name, './data/' + split_name)

Writing ./data/train/mnist_train.tfrecords
Writing ./data/test/mnist_test.tfrecords


# Upload MNIST Data to S3

In [10]:
# Reuse a single session for the bucket lookup and both uploads.
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/ml-model-migration'

# NOTE: from here on train_data / test_data hold the values returned by
# upload_data (the S3 locations printed below), no longer the in-memory
# dataset dictionaries. Re-run the loading cell before re-running the export
# cell.
train_data = session.upload_data(
    path='./data/train', bucket=bucket, key_prefix=prefix + '/data/mnist/train')
# Upload the test directory here; this previously re-uploaded './data/train'
# under the test prefix.
test_data = session.upload_data(
    path='./data/test', bucket=bucket, key_prefix=prefix + '/data/mnist/test')

print("Training data uploaded to {}".format(train_data))
print("Test data uploaded to {}".format(test_data))

Training data uploaded to s3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/train
Test data uploaded to s3://sagemaker-us-east-2-708267171719/sagemaker/ml-model-migration/data/mnist/test
