# AWS re:Invent 2018 AIM350

## TensorFlow training script launcher

In [1]:
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role

### Define SageMaker Session and Role

In [3]:
# Resolve the execution role and open the SageMaker session; later cells use
# `role` for the estimator and `sagemaker_session` for the S3 uploads.
role = get_execution_role()
sagemaker_session = sagemaker.Session()

### Define S3 bucket and prefix for datasets and models

In [2]:
# Bucket and common key prefix shared by everything this notebook stores in S3.
s3_bucket = 'mh-s3-icn-sagemaker'
s3_prefix = 'reinvent-aim350/tf'

# Key prefixes (relative to the bucket) used as upload targets for the datasets.
traindata_s3_prefix, testdata_s3_prefix = (
    pattern.format(s3_prefix)
    for pattern in ('{}/data/train', '{}/data/test')
)

# Full S3 URIs handed to the estimator: model output location and code upload location.
output_s3, code_location_s3 = (
    pattern.format(s3_bucket, s3_prefix)
    for pattern in ('s3://{}/{}/models/', 's3://{}/{}/codes')
)

### Upload training and testing files to S3 bucket

In [4]:
# Upload the local train/test directories to S3; each call returns the S3 URI
# of the uploaded data.
train_s3 = sagemaker_session.upload_data(
    path='./data/train/',
    bucket=s3_bucket,
    key_prefix=traindata_s3_prefix,
)
test_s3 = sagemaker_session.upload_data(
    path='./data/test/',
    bucket=s3_bucket,
    key_prefix=testdata_s3_prefix,
)

# Channel name -> S3 URI; the training container exposes each channel under
# /opt/ml/input/data/<channel name>.
inputs = {
    'training': train_s3,
    'testing': test_s3,
}

print(inputs)

{'training': 's3://mh-s3-icn-sagemaker/reinvent-aim350/tf/data/train', 'testing': 's3://mh-s3-icn-sagemaker/reinvent-aim350/tf/data/test'}


In [5]:
!aws s3 ls s3://mh-s3-icn-sagemaker/reinvent-aim350/tf --recursive --summary

2018-11-24 14:04:49       2483 reinvent-aim350/tf/codes/tf-scriptmode-mnist-2018-11-24-14-04-48-455/source/sourcedir.tar.gz
2018-11-24 14:09:28   62720128 reinvent-aim350/tf/data/test/x_test.npy
2018-11-24 14:09:28      10128 reinvent-aim350/tf/data/test/y_test.npy
2018-11-24 14:09:24  376320128 reinvent-aim350/tf/data/train/x_train.npy
2018-11-24 14:09:24      60128 reinvent-aim350/tf/data/train/y_train.npy
2018-11-24 14:07:22    4141248 reinvent-aim350/tf/models/tf-scriptmode-mnist-2018-11-24-14-04-48-455/output/model.tar.gz


### Define Hyperparameters

In [6]:
# Hyperparameters forwarded to the training script as command-line
# arguments (here: `--epochs 1`).
hyperparameters = {
    'epochs': 1,
}

### Define SageMaker TensorFlow Estimator

A few estimator arguments are worth noting:

* **script_mode (bool)**: If set to True, the estimator will use the Script Mode containers (default: False). This will be ignored if py_version is set to 'py3'.

* **model_dir (str)**: S3 location where the checkpoint data and models can be exported to during training (default: None). If not specified, a default S3 URI will be generated. It is passed to the training script as one of the command line arguments.

In [7]:
# TensorFlow estimator for the MNIST training script.
estimator = TensorFlow(
    # What to run: the entry script, taken from the local source directory,
    # together with the hyperparameters defined above.
    entry_point='tf_mnist_train_sagemaker.py',
    source_dir='./tf-src',
    hyperparameters=hyperparameters,
    # Runtime: TensorFlow 1.11 on Python 3.
    framework_version='1.11',
    py_version='py3',
    # Infrastructure: one ml.c4.2xlarge instance, run under the execution role.
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    # S3 locations for the packaged source code and the trained model,
    # plus the prefix used when naming training jobs.
    code_location=code_location_s3,
    output_path=output_s3,
    base_job_name='tf-scriptmode-mnist',
)

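Amazon SageMaker runs the specified TensorFlow training script using a command like the one below (the `--model_dir` value is an S3 URI generated for each job, so it will differ from run to run):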

```
/usr/bin/python tf_mnist_train_sagemaker.py --epochs 1 --model_dir s3://sagemaker-ap-northeast-2-850550765017/sagemaker-tensorflow-scriptmode-2018-11-24-09-13-09-790/model
```

In [8]:
%%time

estimator.fit(inputs)

INFO:sagemaker:Creating training-job with name: tf-scriptmode-mnist-2018-11-24-14-09-28-777


2018-11-24 14:09:28 Starting - Starting the training job...
2018-11-24 14:09:51 Starting - Launching requested ML instances......
2018-11-24 14:10:49 Starting - Preparing the instances for training......
2018-11-24 14:11:49 Downloading - Downloading input data
2018-11-24 14:11:49 Training - Training image download completed. Training in progress.
[31m2018-11-24 14:11:49,362 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[31m2018-11-24 14:11:49,364 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2018-11-24 14:11:49,690 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2018-11-24 14:11:49,701 sagemaker-containers INFO     Invoking user script
[0m
[31mTraining Env:
[0m
[31m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "testing": "/opt/ml/input/data/testing",
        "training": "/opt/ml/input/data/training"
    },
    "current_hos

In [9]:
!aws s3 ls --recursive s3://mh-s3-icn-sagemaker/reinvent-aim350/tf/

2018-11-24 14:04:49       2483 reinvent-aim350/tf/codes/tf-scriptmode-mnist-2018-11-24-14-04-48-455/source/sourcedir.tar.gz
2018-11-24 14:09:29       2485 reinvent-aim350/tf/codes/tf-scriptmode-mnist-2018-11-24-14-09-28-777/source/sourcedir.tar.gz
2018-11-24 14:09:28   62720128 reinvent-aim350/tf/data/test/x_test.npy
2018-11-24 14:09:28      10128 reinvent-aim350/tf/data/test/y_test.npy
2018-11-24 14:09:24  376320128 reinvent-aim350/tf/data/train/x_train.npy
2018-11-24 14:09:24      60128 reinvent-aim350/tf/data/train/y_train.npy
2018-11-24 14:07:22    4141248 reinvent-aim350/tf/models/tf-scriptmode-mnist-2018-11-24-14-04-48-455/output/model.tar.gz
2018-11-24 14:12:08    4145134 reinvent-aim350/tf/models/tf-scriptmode-mnist-2018-11-24-14-09-28-777/output/model.tar.gz


In [10]:
# estimator?

In [11]:
# sagemaker.estimator.Framework?

## Automatic Model Tuning

In [12]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

In [13]:
# Search space for the tuner: one continuous range per tunable hyperparameter.
hyperparameter_ranges = {
    'learning_rate': ContinuousParameter(0.001, 0.2),
    'dropout': ContinuousParameter(0.0, 0.4),
    'weighted': ContinuousParameter(2, 6),
}

# Regexes used to pull metric values out of the training job's log output.
# The leading space in each pattern keeps ' loss: ' / ' acc: ' from also
# matching inside 'val_loss: ' / 'val_acc: '.
metric_definitions = [
    {'Name': 'loss',
     'Regex': ' loss: ([0-9\\.]+)'},
    {'Name': 'val_loss',
     'Regex': ' val_loss: ([0-9\\.]+)'},
    {'Name': 'acc',
     'Regex': ' acc: ([0-9\\.]+)'},
    {'Name': 'val_acc',
     'Regex': ' val_acc: ([0-9\\.]+)'},
]

# Metric the tuner optimizes, and the direction of optimization.
# NOTE(review): this tunes on the training loss although 'val_loss' is also
# captured above -- confirm which one is intended.
objective_metric_name = 'loss'
objective_type = 'Minimize'

In [None]:
# Hyperparameter tuning job built on the estimator above: up to 100 training
# jobs in total, at most 4 running concurrently.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=100,
    max_parallel_jobs=4,
    base_tuning_job_name='tf-tuning',
)

# Launch the tuning job on the same input channels used for the single training run.
tuner.fit(inputs)