In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import sys
import IPython
import os
import numpy as np

if int(sagemaker.__version__.split('.')[0]) == 1:
    print("Installing  SageMaker Version and restarting the kernel")
    !{sys.executable} -m pip install --upgrade sagemaker

else:
    print("Version is good")

Version is good


In [3]:
from sagemaker.tensorflow import TensorFlow
from keras.models import load_model
from sagemaker.tensorflow.model import TensorFlowModel
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.saved_model import builder
from tensorflow.python.saved_model.signature_def_utils import predict_signature_def
from tensorflow.python.saved_model import tag_constants

In [4]:
role = get_execution_role()
sess = sagemaker.Session()
region = boto3.session.Session().region_name
print("Region = {}".format(region))
sm = boto3.Session().client('sagemaker')
rawbucket= sess.default_bucket()

Region = us-east-1


In [5]:
if not os.path.exists('Diabetic-Retinopathy'):
    !unzip Diabetic-Retinopathy.zip

In [6]:
def get_numpy_array_from_image_directory(directory):

    """
    """
    num_images = sum([len(files) for _, _, files in os.walk(directory)])
    datagen = ImageDataGenerator()
    itr = datagen.flow_from_directory(directory=directory, target_size=(224,224), batch_size=num_images, class_mode='categorical')
    X,y = itr.next()
    return X,y 

train_images_location = 'Diabetic-Retinopathy/train'
val_images_location = 'Diabetic-Retinopathy/validation'
test_images_location = 'Diabetic-Retinopathy/test'
X_train, y_train = get_numpy_array_from_image_directory(train_images_location)
X_val, y_val = get_numpy_array_from_image_directory(val_images_location)
X_test, y_test = get_numpy_array_from_image_directory(test_images_location)
    

## alternatively, try tfds.ImageFolder for Prefetch Dataset 
## OR tf.keras.utils.image_dataset_from_directory for Batch Dataset
## Try to combine both and save to S3 or custom directory and load again. 

Found 2611 images belonging to 4 classes.
Found 125 images belonging to 4 classes.
Found 125 images belonging to 4 classes.


In [8]:
import shutil

train_dir = './train'
val_dir = './val'

if os.path.exists(train_dir):
    shutil.rmtree(train_dir)
    
if os.path.exists(val_dir):
    shutil.rmtree(val_dir)
    
os.makedirs(train_dir)
os.makedirs(os.path.join(train_dir, 'X'))
os.makedirs(os.path.join(train_dir, 'y'))
    
os.makedirs(val_dir)
os.makedirs(os.path.join(val_dir, 'X'))
os.makedirs(os.path.join(val_dir, 'y'))


In [9]:
np.save(os.path.join(train_dir + '/X/' + 'trainX.npy'), X_train)
np.save(os.path.join(train_dir + '/y/' + 'trainy.npy'), y_train)
np.save(os.path.join(val_dir + '/X/' + 'valX.npy'), X_val)
np.save(os.path.join(val_dir + '/y/' + 'valy.npy'), y_val)

In [10]:
prefix = 'diabetic-retinopathy'
training_data_path = sess.upload_data(path=train_dir, key_prefix=prefix+"/input/train")
validation_data_path = sess.upload_data(path=val_dir, key_prefix=prefix+"/input/validation")

In [11]:
print(training_data_path)

s3://sagemaker-us-east-1-624905002474/diabetic-retinopathy/input/train


In [12]:
training_data_channel = sagemaker.TrainingInput(s3_data=training_data_path)
validation_data_channel = sagemaker.TrainingInput(s3_data=validation_data_path)

In [13]:
training_data_channel.config

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
   'S3Uri': 's3://sagemaker-us-east-1-624905002474/diabetic-retinopathy/input/train',
   'S3DataDistributionType': 'FullyReplicated'}}}

In [14]:
#in case you stop training midway and want to access checkpoints, they will be found on below location as it 
#will be synchronised with /opt/ml/checkpoints
#https://repost.aws/questions/QUrXX2MIygS5igas27GrAhHw/how-to-checkpoint-sage-maker-model-artifact-during-a-training-job
#https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProcessingS3Output.html
# TODO: Check if it is possible to create a new directory in container using the training script

#The default location to save the checkpoint files is /opt/ml/checkpoints, and SageMaker syncs these files to 
#the specific S3 bucket. Both local and S3 checkpoint locations are customizable.
#https://aws.amazon.com/blogs/machine-learning/implement-checkpointing-with-tensorflow-for-amazon-sagemaker-managed-spot-training/
#TODO: Check how can you provide custom local checkpoints path
s3_checkpoint_uri = f's3://{sess.default_bucket()}/diabetic-retinopathy/checkpoints'

In [15]:
#s3 path where trained model would be saved
output_s3_path = f's3://{sess.default_bucket()}/diabetic-retinopathy/model' 

In [16]:
estimator = TensorFlow(entry_point = 'train.py',
                       instance_type='ml.m5.xlarge',
                       instance_count =8,
                       role = role,
                       source_dir = 'source', # <- add documentation for this
                       framework_version='2.2.0', 
                       py_version='py37',
                       script_mode=True,
                       checkpoint_s3_uri = s3_checkpoint_uri, 
                       output_path = output_s3_path,
                       distribution={'parameter_server': {'enabled': True}}
                        )


In [17]:
estimator.fit({"traindata":training_data_path, "valdata":validation_data_path})

2022-07-03 11:47:41 Starting - Starting the training job...
2022-07-03 11:48:07 Starting - Preparing the instances for trainingProfilerReport-1656848860: InProgress
.........
2022-07-03 11:49:35 Downloading - Downloading input data.........
2022-07-03 11:51:02 Training - Training image download completed. Training in progress.[35m2022-07-03 11:51:04.855333: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[35m2022-07-03 11:51:04.858787: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:106] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[35m2022-07-03 11:51:04.947351: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[32m2022-07-03 11:51:05.005589: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:425] Initializing the SageMaker Profiler.[0m
[32m2022-07-03 11:51:05.011098: W

In [None]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


----------------------------------------

In [30]:
! aws sagemaker list-models

{
    "Models": []
}


In [27]:

# MODEL_LOCATION = os.path.join(os.environ.get('SM_MODEL_DIR'), 'v0', 'dr_model.h5') <- Thus doesn't work if 
# training is not completed. The contents of opt/ml/model are only accessible at the end of training 
# https://discuss.huggingface.co/t/how-to-access-to-opt-ml-model-before-the-end-of-the-model-training/12669
# The SageMaker training mechanism uses training containers on Amazon EC2 instances, and the checkpoint files are
# saved under a local directory of the containers (default is /opt/ml/checkpoints). SageMaker automatically 
# syncs the checkpoints in that directory with Amazon S3.
# https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html
#
#
# So Let us try to load model checkpoints from a local directory first before using checkpoint directory
MODEL_LOCATION = os.path.join('./checkpoints', 'dr_model.h5')
saved_model = load_model(MODEL_LOCATION)

OSError: SavedModel file does not exist at: ./checkpoints/dr_model.h5/{saved_model.pbtxt|saved_model.pb}

In [None]:
def convert_h5_to_aws(loaded_model):
    
    """
    given a pre-trained keras model, this function converts it to a TF protobuf format
    and saves it in the file structure which aws expects
    """  
    
    # This is the file structure which AWS expects. Cannot be changed. 
    model_version = '1'
    export_dir = 'export/Servo/' + model_version
    
    # Build the Protocol Buffer SavedModel at 'export_dir'
    builder = builder.SavedModelBuilder(export_dir)
    
    # Create prediction signature to be used by TensorFlow Serving Predict API
    signature = predict_signature_def(
        inputs={"inputs": loaded_model.input}, outputs={"score": loaded_model.output})
    
    from keras import backend as K
    with K.get_session() as sess:
        # Save the meta graph and variables
        builder.add_meta_graph_and_variables(
            sess=sess, tags=[tag_constants.SERVING], signature_def_map={"serving_default": signature})
        builder.save()
    
    #create a tarball/tar file and zip it
    import tarfile
    with tarfile.open('model.tar.gz', mode='w:gz') as archive:
        archive.add('export', recursive=True)
        
convert_h5_to_aws(saved_model)

In [None]:
!touch dummy_train.py #create an empty python file

In [None]:
sess.upload_data(path='model.tar.gz', key_prefix=prefix+'/output/best_model/')

sagemaker_model = TensorFlowModel(model_data = 's3://' + rawbucket +'/'+ prefix + '/output/best_model/model.tar.gz',
                                  role = role,
                                  framework_version = '2.1.0',
                                  entry_point = 'dummy_train.py')