# Raw data spec
* MNIST dataset: 70,000 rows × 785 columns (1 label + 28 × 28 pixel values per row)

# Preprocess

In [None]:
%%writefile preprocessing.py
import argparse
import os
import numpy as np
import pickle, gzip
from sklearn.model_selection import train_test_split

if __name__=='__main__':
    # Processing-job entry point: read the raw CSV from the processing input
    # directory, split it into train/test sets, and write one gzipped pickle
    # holding (train_data, train_label, test_data, test_label).

    parser = argparse.ArgumentParser()
    # Fraction of rows held out for the test set. Optional; the default keeps
    # the 0.2 split this script has always used.
    parser.add_argument('--split-ratio', type=float, default=0.2)
    # parse_known_args: ignore any extra arguments the caller may pass.
    args, _ = parser.parse_known_args()

    input_data_path = os.path.join('/opt/ml/processing/input', 'raw_data.csv')

    raw_data = np.loadtxt(input_data_path, delimiter=',')

    # Column 0 is the label; every remaining column is feature data.
    data = raw_data[:, 1:]
    label = raw_data[:, 0]

    # Use the configured ratio rather than a second hard-coded literal, so the
    # named value and the value actually applied cannot drift apart.
    split_ratio = args.split_ratio
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=split_ratio)

    output_path = os.path.join('/opt/ml/processing/output', 'dataset.pkl.gz')

    with gzip.open(output_path, 'wb') as f:
        pickle.dump((train_data, train_label, test_data, test_label), f)

In [None]:
%%time
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Role used by the processing job (and reused later for training).
role = get_execution_role()

sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

# S3 location of the raw CSV; replace the placeholder bucket/prefix.
input_data = 's3://your-bucket/your-prefix/raw_data.csv'

# The container-side paths below are the ones preprocessing.py reads from
# and writes to.
raw_csv_input = ProcessingInput(
    source=input_data,
    destination='/opt/ml/processing/input',
)
dataset_output = ProcessingOutput(
    output_name='output',
    source='/opt/ml/processing/output',
    destination='s3://your-bucket/your-prefix',
)

sklearn_processor.run(
    code='preprocessing.py',
    inputs=[raw_csv_input],
    outputs=[dataset_output],
)

# Train

In [None]:
%%writefile mnist.py

import tensorflow as tf
import argparse
import os, time
import numpy as np
import json
import gzip, pickle

if __name__ == "__main__":
    # Training entry point: load the pickled train/test split from the
    # training channel, fit a small dense classifier, report test metrics,
    # and save the model under the SageMaker model directory.

    start = time.time()

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--sm-model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    # The hosts value is a JSON-encoded list. Parse it with json.loads both
    # for the environment default and for a command-line override
    # (type=list would split the string into single characters). Fall back to
    # an empty list so a missing SM_HOSTS does not crash json.loads(None).
    parser.add_argument('--hosts', type=json.loads,
                        default=json.loads(os.environ.get('SM_HOSTS', '[]')))
    parser.add_argument('--current-host', type=str, default=os.environ.get('SM_CURRENT_HOST'))
    args, _ = parser.parse_known_args()

    input_path = os.path.join(args.train, 'dataset.pkl.gz')
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # feed this script datasets produced by a trusted preprocessing step.
    with gzip.open(input_path, 'rb') as f:
        train_data, train_label, test_data, test_label = pickle.load(f)

    # One hidden layer followed by a 10-way softmax output.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(train_data, train_label, epochs=3, verbose=2)
    test_loss, test_acc = model.evaluate(test_data, test_label, verbose=0)
    print('Test Average loss: {}, Test Accuracy: {};'.format(test_loss, test_acc))

    # Pass only the target path: the second positional parameter of
    # Model.save is `overwrite`, not a file name, so a name given there is
    # never used as one. '000000001' is the version sub-directory.
    model.save(os.path.join(args.sm_model_dir, '000000001'))

    print('training time: {}'.format(time.time()-start))

In [None]:
from sagemaker.tensorflow import TensorFlow

# Spot-instance settings grouped together. The run/wait limits are
# presumably in seconds per the SageMaker SDK; confirm before tuning.
spot_training_options = dict(
    train_use_spot_instances=True,
    train_max_run=600,
    train_max_wait=1200,
)

# Estimator that runs mnist.py as the training script on a single instance.
mnist_estimator = TensorFlow(
    entry_point='mnist.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m5.xlarge',
    framework_version='2.1.0',
    py_version='py3',
    **spot_training_options
)

In [None]:
%%time
# S3 location of the pickled dataset written by the preprocessing step;
# replace the placeholder bucket/prefix.
training_dataset_uri = 's3://your-bucket/your-prefix/dataset.pkl.gz'
mnist_estimator.fit(training_dataset_uri)

# Deploy

In [None]:
# Deploy the trained model and keep the returned predictor for inference.
predictor = mnist_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

# Inference test

In [None]:
import numpy as np
# Local comma-separated sample file used to exercise the endpoint.
test_sample_path = '../00_Basics/test_sample.csv'
test_sample = np.loadtxt(test_sample_path, delimiter=',')

In [None]:
# Take at most the first 10 rows: column 0 is the label, the remaining
# columns are the input sent to the endpoint.
test_data = test_sample[:10, 1:]
test_label = test_sample[:10, 0]
predictions = predictor.predict(test_data)
# Pair each returned score vector with its label instead of indexing a fixed
# 0..9 range, so a sample file with fewer than 10 rows does not raise
# IndexError.
for scores, label in zip(predictions['predictions'], test_label):
    # The predicted class is the index of the highest score.
    prediction = np.argmax(scores)
    print('prediction is {}, label is {}, matched: {}'.format(prediction, label, prediction == label))

# Delete Endpoint

In [None]:
# Delete the endpoint behind `predictor` now that the inference test is done.
predictor.delete_endpoint()