# Fine-tune MobileNet V2 with SageMaker
  
This notebook should be run in a SageMaker Studio Jupyter notebook on a small instance type (ml.t3.medium is sufficient).  
The training job runs on a separate, more powerful instance, as defined below.

### Start SageMaker session

In [30]:
import sagemaker, boto3, json
from sagemaker.session import Session

# Build a single SageMaker session and reuse it. Constructing a second Session
# only repeats the SDK config lookup (and its INFO log lines) without adding anything.
sagemaker_session = Session()
# `sess` is kept as an alias because later cells call `sess.default_bucket()`.
sess = sagemaker_session

# IAM identity (role ARN) the training job and endpoint will run under
aws_role = sagemaker_session.get_caller_identity_arn()
# Region configured for the default boto3 session
aws_region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Define model and training instance type

In [37]:
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.estimator import Estimator

# Pretrained model to fine-tune, identified by its model id and version specifier
model_id = "tensorflow-ic-imagenet-mobilenet-v2-100-224-classification-4"
model_version = "*"

# Instance type the training job runs on (separate from the notebook instance)
training_instance_type = "ml.g4dn.xlarge"

### Retrieve training uris

In [38]:
# Every lookup below targets the same model id / version pair
_model_ref = {"model_id": model_id, "model_version": model_version}

# Docker image in which the training job runs
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="training",
    instance_type=training_instance_type,
    **_model_ref,
)

# Training script package
train_source_uri = script_uris.retrieve(script_scope="training", **_model_ref)

# Pretrained model tarball used as the starting point for transfer learning
train_model_uri = model_uris.retrieve(model_scope="training", **_model_ref)

### Define hyperparameters

In [39]:
# Retrieve the default hyper-parameters for fine-tuning the model.
# The module is reached through the `sagemaker` package rather than the bare name
# `hyperparameters`: this cell rebinds that name to the resulting dict, so with the
# bare name a second run of the cell fails with
# "AttributeError: 'dict' object has no attribute 'retrieve_default'".
hyperparameters = sagemaker.hyperparameters.retrieve_default(
    model_id=model_id, model_version=model_version
)

# [Optional] Override default hyperparameters with custom values
# (all values are passed as strings, matching the defaults printed below)
hyperparameters["epochs"] = "20"
# hyperparameters["augmentation"] = 'False'
hyperparameters["early_stopping"] = 'True'
# NOTE(review): a min_delta of 1.0 means a change only counts as an improvement when
# it is at least 1.0. For an accuracy-like metric in [0, 1] that presumably never
# happens, so training would stop after `early_stopping_patience` epochs regardless
# of `epochs` — confirm this value is intended.
hyperparameters["early_stopping_min_delta"] = '1.0'

# Show one hyperparameter per line
print(str(hyperparameters).replace(", ", "\n"))

{'train_only_top_layer': 'True'
'epochs': '20'
'batch_size': '32'
'optimizer': 'adam'
'learning_rate': '0.001'
'beta_1': '0.9'
'beta_2': '0.999'
'momentum': '0.9'
'epsilon': '1e-07'
'rho': '0.95'
'initial_accumulator_value': '0.1'
'reinitialize_top_layer': 'Auto'
'early_stopping': 'True'
'early_stopping_patience': '5'
'early_stopping_min_delta': '1.0'
'dropout_rate': '0.2'
'regularizers_l2': '0.0001'
'label_smoothing': '0.1'
'image_resize_interpolation': 'bilinear'
'augmentation': 'False'
'augmentation_random_flip': 'horizontal_and_vertical'
'augmentation_random_rotation': '0.2'
'augmentation_random_zoom': '0.1'
'binary_mode': 'False'
'eval_metric': 'accuracy'
'validation_split_ratio': '0.2'
'random_seed': '123'}


### Train with Automatic Model Tuning (HPO)

Amazon SageMaker automatic model tuning, also known as hyperparameter tuning, finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose. We will use a HyperparameterTuner object to interact with Amazon SageMaker hyperparameter tuning APIs.

In [40]:
from sagemaker.tuner import ContinuousParameter

# Use AMT for tuning and selecting the best model.
# When False, a single training job is launched with the fixed hyperparameters instead.
use_amt = False

# Objective metric for AMT: how to extract it from the training logs ("metrics")
# and whether the best model is the one that maximizes or minimizes it ("type").
amt_metric_definitions = {
    "metrics": [{"Name": "val_accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"}],
    "type": "Maximize",
}

# You can select from the hyperparameters supported by the model, and configure ranges of values to be searched for training the optimal model.(https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)
# The key must be a hyperparameter name the training script actually accepts. The
# defaults retrieved for this model expose `learning_rate` (there is no
# `adam-learning-rate` entry), so that is the name tuned here.
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.0001, 0.1, scaling_type="Logarithmic")
}

# Increase the total number of training jobs run by AMT, for increased accuracy (and training time).
max_jobs = 6
# Change parallel training jobs run by AMT to reduce total training time, constrained by your account limits.
# If max_jobs == max_parallel_jobs, Bayesian search degenerates into random search.
max_parallel_jobs = 2

### Set up S3 input and output buckets

In [41]:
# --- Input: location of the preprocessed training images ---
training_data_bucket = "isicbucket"
training_data_prefix = "preprocessed/preprocess_ps_224"
training_dataset_s3_path = f"s3://{training_data_bucket}/{training_data_prefix}"

# Base name given to the training / tuning jobs launched later on
training_job_name = "isic-mobilenet-v2-finetune"

# --- Output: the model and its artefacts go to the session's default bucket ---
output_prefix = "isic_mobilenet-v2_finetune"
output_bucket = sess.default_bucket()
s3_output_location = f"s3://{output_bucket}/{output_prefix}/ps_output_v2"

In [42]:
from sagemaker.utils import name_from_base

In [43]:
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base
from sagemaker.tuner import HyperparameterTuner

# Name/regex pairs used to pull metric values out of the training job logs
training_metric_definitions = [
    {"Name": "val_accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"},
    {"Name": "val_loss", "Regex": "val_loss: ([0-9\\.]+)"},
    {"Name": "train_accuracy", "Regex": "- accuracy: ([0-9\\.]+)"},
    {"Name": "train_loss", "Regex": "- loss: ([0-9\\.]+)"},
]

# Everything the estimator needs: container, script, pretrained weights,
# compute, hyperparameters and where to write the result.
estimator_settings = dict(
    role=aws_role,
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1,
    instance_type=training_instance_type,
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_location,
    base_job_name=training_job_name,
    metric_definitions=training_metric_definitions,
)
ic_estimator = Estimator(**estimator_settings)

# Single "training" channel pointing at the dataset in S3
training_inputs = {"training": training_dataset_s3_path}

if not use_amt:
    # Plain training job with the fixed hyperparameters, streaming the logs
    ic_estimator.fit(training_inputs, logs=True)
else:
    # Tuning job: search the configured ranges and rank jobs by the objective metric
    objective_metric_name = amt_metric_definitions["metrics"][0]["Name"]
    hp_tuner = HyperparameterTuner(
        estimator=ic_estimator,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=amt_metric_definitions["metrics"],
        objective_type=amt_metric_definitions["type"],
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
        base_tuning_job_name=training_job_name,
    )
    hp_tuner.fit(training_inputs)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: isic-mobilenet-v2-finetune-2024-03-08-03-06-40-946


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateTrainingJob operation: The account-level service limit 'ml.g4dn.xlarge for training job usage' is 1 Instances, with current utilization of 1 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

### Create and fit SageMaker Estimator

In [9]:
# # Create SageMaker Estimator instance
# tf_ic_estimator = Estimator(
#     role=aws_role,
#     image_uri=train_image_uri,
#     source_dir=train_source_uri,
#     model_uri=train_model_uri,
#     entry_point="transfer_learning.py",
#     instance_count=1,
#     instance_type=training_instance_type,
#     max_run=360000,
#     hyperparameters=hyperparameters,
#     output_path=s3_output_location,
#     base_job_name = training_job_name
# )

# # Use S3 path of the training data to launch SageMaker TrainingJob
# tf_ic_estimator.fit({"training": training_dataset_s3_path}, logs=True)

## Print where the model artifact is saved

In [13]:
import boto3

sm_boto3 = boto3.client("sagemaker")

# Resolve the training job whose artifact should be reported. With AMT enabled the
# estimator is only fitted through the tuner, so the job has to come from the tuner
# (its best job); otherwise it is the job the estimator launched itself.
if use_amt:
    # The best job is only meaningful once the tuning job has finished
    hp_tuner.wait()
    training_job_name = hp_tuner.best_training_job()
    print(f'Training name: {training_job_name}\n')
else:
    latest_job = ic_estimator.latest_training_job
    training_job_name = latest_job.job_name
    print(f'Training name: {training_job_name}\n')
    # Block until the job is done, without streaming its logs
    latest_job.wait(logs="None")

# Look up the S3 location of the model tarball for that job
artifact = sm_boto3.describe_training_job(
    TrainingJobName=training_job_name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("\nModel artifact persisted at " + artifact)

Training name: isic-mobilenet-v2-finetune-2024-03-08-02-16-38-963


2024-03-08 02:28:01 Starting - Preparing the instances for training
2024-03-08 02:28:01 Downloading - Downloading the training image
2024-03-08 02:28:01 Training - Training image download completed. Training in progress.
2024-03-08 02:28:01 Uploading - Uploading generated training model
2024-03-08 02:28:01 Completed - Training job completed

Model artifact persisted at s3://sagemaker-us-west-2-766088526747/isic_mobilenet-v2_finetune/ps_output_v2/isic-mobilenet-v2-finetune-2024-03-08-02-16-38-963/output/model.tar.gz


## Extract Training performance metrics


In [16]:
import sagemaker
from IPython.core.display import Markdown

# Job whose metrics are shown: the tuner's best job with AMT, else the estimator's job
training_job_name = (
    hp_tuner.best_training_job()
    if use_amt
    else ic_estimator.latest_training_job.job_name
)

sagemaker_session = sagemaker.Session()

# CloudWatch console URL pre-filtered to the training-job metrics of that job
link = "".join(
    [
        "https://console.aws.amazon.com/cloudwatch/home?region=",
        sagemaker_session.boto_region_name,
        "#metricsV2:query=%7B/aws/sagemaker/TrainingJobs,TrainingJobName%7D%20",
        training_job_name,
    ]
)
display(Markdown("CloudWatch metrics: [link](" + link + ")"))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


CloudWatch metrics: [link](https://console.aws.amazon.com/cloudwatch/home?region=us-west-2#metricsV2:query=%7B/aws/sagemaker/TrainingJobs,TrainingJobName%7D%20isic-mobilenet-v2-finetune-2024-03-08-02-16-38-963)

In [17]:
from sagemaker import TrainingJobAnalytics

# Load the metrics recorded for the selected training job into a DataFrame
job_analytics = TrainingJobAnalytics(training_job_name=training_job_name)
df = job_analytics.dataframe()

# Preview the first ten rows
df.head(10)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


Unnamed: 0,timestamp,metric_name,value
0,0.0,val_accuracy,0.6998
1,60.0,val_accuracy,0.7065
2,120.0,val_accuracy,0.7205
3,180.0,val_accuracy,0.7217
4,240.0,val_accuracy,0.723863
5,0.0,val_loss,1.1504
6,60.0,val_loss,1.1231
7,120.0,val_loss,1.0999
8,180.0,val_loss,1.1127
9,240.0,val_loss,1.091067


In [15]:
# Collect the names of the metrics defined for the training job
metric_names = []
for definition in training_metric_definitions:
    metric_names.append(definition["Name"])
metric_names

['val_accuracy', 'val_loss', 'train_accuracy', 'train_loss']

### Deploy model for inference

In [None]:
# Instance type that hosts the inference endpoint
inference_instance_type = "ml.g4dn.xlarge"

# Retrieve the inference docker container uri
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="inference",
    model_id=model_id,
    model_version=model_version,
    instance_type=inference_instance_type,
)
# Retrieve the inference script uri
deploy_source_uri = script_uris.retrieve(
    model_id=model_id, model_version=model_version, script_scope="inference"
)

# Unique endpoint name derived from the model id
endpoint_name = name_from_base(f"jumpstart-example-FT-{model_id}-")

# Deploy from the object that was actually trained above: the tuner when AMT was
# used (which deploys its best job), otherwise the estimator. The previous
# `tf_ic_estimator` name only exists in a commented-out cell and raised NameError.
trained_model_source = hp_tuner if use_amt else ic_estimator
finetuned_predictor = trained_model_source.deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    entry_point="inference.py",
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    endpoint_name=endpoint_name,
)

### Download example images for inference

In [3]:
# s3_bucket = "isicbucket"
# key_prefix = "ISIC_Raw_Images/"

# def download_from_s3(images):
#     for filename, image_key in images.items():
#         boto3.client("s3").download_file(s3_bucket, f"{key_prefix}/{image_key}", filename)


# test_images = {
#     "img1.jpg": "roses/10503217854_e66a804309.jpg",
#     "img2.jpg": "sunflowers/1008566138_6927679c8a.jpg",
# }
# download_from_s3(test_images)

NameError: name 'flower_images' is not defined