In [22]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

# Region name of the default boto3 session.
region = boto3.session.Session().region_name

# IAM role used for the SageMaker training jobs.
# On a SageMaker notebook instance, use the next line instead:
# role = get_execution_role()
# On a self-managed notebook, look the role ARN up in the IAM console and either
# export it as SAGEMAKER_ROLE_ARN before starting the kernel or edit the fallback
# value below. When the variable is unset, the original hard-coded ARN is used.
role = os.environ.get(
    "SAGEMAKER_ROLE_ARN",
    "arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235",
)

In [23]:
# Make sure the installed sagemaker version is 2.4.0 or newer
print(sagemaker.__version__)

2.4.0


In [24]:
# S3 prefix holding the training images, one sub-folder per class.
input_data = "s3://nowfox/data/cat-vs-dog-1000/"
# S3 prefix passed to the estimator as its output_path.
output_data = "s3://nowfox/data/cat-vs-dog-output/"

## 准备图片

将原始图片按类别分别上传到 `input_data` 指向的 S3 目录下，每个类别对应一个子目录：
```
input_data
├── class1
│   ├── image001.jpg
│   ├── image002.jpg
│   └── ...
├── class2
│   ├── image001.jpg
│   ├── image002.jpg
│   └── ...
└── classn
    ├── image001.jpg
    ├── image002.jpg
    └── ...
```

In [25]:
from sagemaker.tensorflow import TensorFlow

# A GPU instance type is recommended for training.
# To run in local mode instead, set instance_type = 'local'.
instance_type = 'ml.p3.2xlarge'
model_dir = '/opt/ml/model'

# epoch_count and batch_size can be changed through `hyperparameters`.
# Optional settings left disabled by the original author:
#   train_volume_size=50, train_max_run=432000
#     NOTE(review): these look like SDK v1 parameter names; SDK v2 presumably
#     expects volume_size / max_run -- confirm before enabling.
#   metric_definitions=[{'Name': 'loss', 'Regex': 'loss = (.*?),'},
#                       {'Name': 'epoch', 'Regex': 'Step_Train = (.*?),'}]
estimator = TensorFlow(
    entry_point='train.py',
    source_dir='./source',
    role=role,
    instance_count=1,
    instance_type=instance_type,
    framework_version='1.15.2',
    py_version='py3',
    hyperparameters=dict(epoch_count=30, batch_size=20),
    model_dir=model_dir,
    output_path=output_data,
    use_spot_instances=True,
    max_wait=432000,  # presumably seconds (= 5 days); confirm against the SDK docs
)


In [26]:
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space for the tuner: a continuous range for `lr` and a fixed set of
# candidate values for `batch_size`.
hyperparameter_ranges = dict(
    lr=ContinuousParameter(1e-5, 1e-4),
    batch_size=CategoricalParameter([16, 32, 64]),
)

In [27]:
# Author's note: when the Regex matches in several places, the last match is the one
# that gets used, and that last value is not necessarily the one you want. The
# training code should therefore post-process the value itself and print it with a
# dedicated marker so that it can be matched precisely.
objective_type = 'Maximize'
objective_metric_name = 'test acc'
metric_definitions = [
    {'Name': objective_metric_name, 'Regex': 'test acc:([0-9\\.]+)'},
]

In [28]:
# Tuner that runs up to 9 training jobs, one at a time, optimising the
# objective metric defined above.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=9,
    max_parallel_jobs=1,
)

In [29]:
# Start the hyperparameter tuning job, using the S3 prefix in input_data as input.
tuner.fit(inputs=input_data)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### 打印 model_data 路径，下载并解压模型

In [30]:
# Estimator for the best training job found by the tuner.
best_estimator = tuner.best_estimator()

# Expose the model artifact location to the shell cells below.
# Values stored in os.environ must be strings, hence the str() conversion.
os.environ['S3_URL'] = str(best_estimator.model_data)
print(best_estimator.model_data)


2020-09-08 02:56:37 Starting - Preparing the instances for training
2020-09-08 02:56:37 Downloading - Downloading input data
2020-09-08 02:56:37 Training - Training image download completed. Training in progress.
2020-09-08 02:56:37 Uploading - Uploading generated training model
2020-09-08 02:56:37 Completed - Training job completed
s3://nowfox/data/cat-vs-dog-output/tensorflow-training-200908-0224-003-ac81af1c/output/model.tar.gz


In [34]:
%%sh

# Show the model artifact location exported from Python via os.environ['S3_URL'].
# Quoted so the value is printed verbatim, without word splitting or globbing.
echo "${S3_URL}"

s3://nowfox/data/cat-vs-dog-output/tensorflow-training-200908-0224-003-ac81af1c/output/model.tar.gz


In [18]:
%%sh

# Stop at the first failing command, so that a failed cd or download is never
# followed by extracting a stale or missing archive in the wrong directory.
set -e

# S3_URL is exported by the notebook cell that prints best_estimator.model_data.
: "${S3_URL:?S3_URL is not set - run the best_estimator cell first}"

# Make sure the target directory exists before entering it.
mkdir -p output
cd output

# Download the trained model archive and unpack it in place.
aws s3 cp "${S3_URL}" ./model.tar.gz
tar -xvzf ./model.tar.gz

/home/ec2-user/sagemaker-encapsulation/output
download: s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-08-24-09-33-46-405/output/model.tar.gz to ./model.tar.gz
tf_server/
tf_server/1/
tf_server/1/saved_model.pb
tf_server/1/variables/
tf_server/1/variables/variables.index
tf_server/1/variables/variables.data-00000-of-00001
model.h5


请确保模型文件已解压到以下目录结构中，

后续的 `inference.ipynb` 会用到该目录下的模型文件。

```
output
└── tf_server
    └── 1
        ├── saved_model.pb
        └── variables
            ├── variables.data-00000-of-00001
            └── variables.index
```

```