In [1]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

# Region configured for the default boto3 session.
region = boto3.session.Session().region_name

# IAM role that SageMaker assumes when it runs the training job.
# - On a SageMaker notebook instance, use this instead:  role = get_execution_role()
# - On a self-managed notebook, export SAGEMAKER_ROLE_ARN with a role ARN copied from the
#   IAM console. The literal below is the original author's role and is only the fallback,
#   so the notebook can be reused in another account without editing this cell.
role = os.environ.get(
    "SAGEMAKER_ROLE_ARN",
    "arn:aws-cn:iam::315505707008:role/service-role/AmazonSageMaker-ExecutionRole-20200430T124235",
)

In [2]:
# This notebook requires SageMaker Python SDK 2.4.0 or newer. Enforce that here so an
# outdated installation fails with a clear message instead of a confusing error later on.
MIN_SAGEMAKER_VERSION = (2, 4)
installed_version = tuple(int(part) for part in sagemaker.__version__.split(".")[:2])
if installed_version < MIN_SAGEMAKER_VERSION:
    raise RuntimeError(
        f"sagemaker {sagemaker.__version__} is too old for this notebook; "
        "upgrade with: pip install 'sagemaker>=2.4.0'"
    )
print(sagemaker.__version__)

2.4.0


In [3]:
# S3 prefix holding the raw training images, one sub-folder per class.
input_data = "s3://nowfox/data/cat-vs-dog-1000/"
# S3 prefix passed to the estimator as output_path; the training output is written below it.
output_data = "s3://nowfox/data/cat-vs-dog-output/"

## 准备图片

原始数据按不同分类上传到input_data目录
```
input_data
├── class1
│   ├── image001.jpg
│   ├── image002.jpg
│   └── ...
├── class2
│   ├── image001.jpg
│   ├── image002.jpg
│   └── ...
└── classn
    ├── image001.jpg
    ├── image002.jpg
    └── ...
```

In [6]:
from sagemaker.tensorflow import TensorFlow

# A GPU instance type is recommended for this training job.
instance_type = "ml.p3.2xlarge"
# instance_type = "local"
model_dir = "/opt/ml/model"

# epoch_count and batch_size may be tuned; they are passed to train.py as hyperparameters.
estimator = TensorFlow(
    # Training script and the directory that is uploaded with it.
    entry_point="train.py",
    source_dir="./source",
    # Framework container selection.
    framework_version="1.15.2",
    py_version="py3",
    # Execution role and compute resources.
    role=role,
    instance_type=instance_type,
    instance_count=1,
    # Script hyperparameters and model/output locations.
    hyperparameters={"epoch_count": 30, "batch_size": 32},
    model_dir=model_dir,
    output_path=output_data,
    # Managed spot training; max_wait is given in seconds (432000 s = 5 days).
    use_spot_instances=True,
    max_wait=432000,
    # Optional settings, disabled in the original notebook. The names there were
    # train_volume_size / train_max_run; NOTE(review): SDK v2 appears to call these
    # volume_size / max_run - confirm before enabling.
    # volume_size=50,
    # max_run=432000,
    # metric_definitions=[
    #     {'Name': 'loss', 'Regex': 'loss = (.*?),'},
    #     {'Name': 'epoch', 'Regex': 'Step_Train = (.*?),'},
    # ],
)


In [8]:
# Start the training job and stream its log. The S3 prefix is passed as a plain URI string;
# the job log shows it mounted as the "training" channel (/opt/ml/input/data/training).
result = estimator.fit(inputs=input_data)

2020-09-07 09:45:14 Starting - Starting the training job...
2020-09-07 09:45:16 Starting - Launching requested ML instances......
2020-09-07 09:46:37 Starting - Preparing the instances for training............
2020-09-07 09:48:32 Downloading - Downloading input data...
2020-09-07 09:49:06 Training - Downloading the training image...
[0m
[34m2020-09-07 09:49:31,449 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-09-07 09:49:31,858 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "training": "/opt/ml/input/data/training"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "epoch_count": 30,
        "batch_size": 32,
        "model_dir": "/opt/ml/model"
    },
    "input_config_dir": "/opt/m

### 打印 model_data 路径， 下载并且解压

In [15]:
# Show where the trained model archive was written.
print(estimator.model_data)
# Export the same location as S3_URL so the %%sh cells can read it.
# os.environ only accepts string values, hence the explicit str() conversion.
os.environ.update(S3_URL=str(estimator.model_data))
  

s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-08-24-09-33-46-405/output/model.tar.gz


In [16]:
%%sh

# Print the model artifact URI exported by the preceding Python cell. The expansion is
# quoted so the value is printed verbatim (no word splitting or glob expansion).
echo "${S3_URL}"

s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-08-24-09-33-46-405/output/model.tar.gz


In [18]:
%%sh

# Download the trained model archive from S3 and unpack it into ./output.
# Abort on the first failing command, or if S3_URL is unset, so that tar never runs
# against a missing or stale archive or in the wrong directory.
set -eu

# Create the target directory if it does not exist yet, then work inside it.
mkdir -p output
cd output

# S3_URL is exported by the Python cell that prints estimator.model_data.
aws s3 cp "${S3_URL}" ./model.tar.gz

tar -xvzf ./model.tar.gz

/home/ec2-user/sagemaker-encapsulation/output
download: s3://nowfox/data/cat-vs-dog-output/tensorflow-training-2020-08-24-09-33-46-405/output/model.tar.gz to ./model.tar.gz
tf_server/
tf_server/1/
tf_server/1/saved_model.pb
tf_server/1/variables/
tf_server/1/variables/variables.index
tf_server/1/variables/variables.data-00000-of-00001
model.h5


确保模型文件保存到以下目录

`inference.ipynb` 里面会用到

```
output
└── tf_server
    └── 1
        ├── saved_model.pb
        └── variables
            ├── variables.data-00000-of-00001
            └── variables.index
```

```