# Train and track SageMaker Training Jobs with the SageMaker Experiments package

Based on [this example notebook](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-experiments/sagemaker_job_tracking/tensorflow_script_mode_training_job.ipynb)

In [5]:
import os
import boto3
import json
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run
from sagemaker.utils import unique_name_from_base

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [6]:
# Two session handles: a plain boto3 session for low-level clients and a
# SageMaker SDK session for the higher-level helpers.
boto_sess = boto3.Session()
sagemaker_session = Session()

# Region resolved by the boto3 session, plus a low-level SageMaker client.
region = boto_sess.region_name
sm = boto_sess.client("sagemaker")

# Execution role and the SageMaker session's default S3 bucket.
role = get_execution_role()
default_bucket = sagemaker_session.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Prepare the training script

In [2]:
!mkdir -p script

In [12]:
%%writefile ./script/train.py

import os
import subprocess
import sys

# Upgrade the SageMaker SDK before it is imported below. Invoking pip through
# the running interpreter guarantees the package lands in the environment this
# script actually uses. The upgrade stays best-effort (the job continues if it
# fails), but the failure is now visible in the job logs instead of being
# silently ignored.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-U", "sagemaker"],
    check=False,
)
if _pip_result.returncode != 0:
    print(
        "WARNING: 'pip install -U sagemaker' exited with code {}; "
        "continuing with the preinstalled SDK".format(_pip_result.returncode)
    )

import numpy as np
import tensorflow as tf
import pandas as pd
import argparse

from sagemaker.session import Session
from sagemaker.experiments import load_run
from sagemaker.s3 import S3Downloader

import boto3

# The launching notebook passes the region through the REGION environment
# variable. Fall back to AWS_REGION (presumably set by the training container
# -- confirm) and finally to boto3's own resolution (region_name=None) rather
# than raising KeyError when REGION is missing.
_region_name = os.environ.get("REGION") or os.environ.get("AWS_REGION")

boto_session = boto3.session.Session(region_name=_region_name)
sagemaker_session = Session(boto_session=boto_session)
s3 = boto3.client("s3")


def parse_args(argv=None):
    """Parse data-channel locations and hyperparameters from the command line.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        The ``(namespace, unknown_args)`` tuple produced by
        ``ArgumentParser.parse_known_args``; unrecognised arguments are
        returned in ``unknown_args`` instead of raising.
    """
    parser = argparse.ArgumentParser()

    # ``main`` reads ``args.train`` / ``args.test`` and the training job is
    # launched with channels named "train" and "test", so the defaults come
    # from the matching SM_CHANNEL_<NAME> environment variables.
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))

    # Original channel options, kept for backward compatibility.
    parser.add_argument("--training", type=str, default=os.environ.get('SM_CHANNEL_TRAINING'))
    parser.add_argument("--validation", type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))

    # Hyperparameters.
    parser.add_argument("--epochs", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--dropout", type=float, default=0.1)

    return parser.parse_known_args(argv)


class ExperimentCallback(tf.keras.callbacks.Callback):
    """Keras callback that forwards each epoch's metrics to an experiment run.

    At the end of every epoch, each entry of the ``logs`` dict handed over by
    Keras is recorded on ``run`` via ``run.log_metric`` and echoed to stdout.
    """

    def __init__(self, run, model, x_test, y_test):
        """Store the run handle and the objects supplied by the caller.

        Args:
            run: Object exposing ``log_metric(name=..., value=..., step=...)``.
            model: The model being trained.
            x_test: Stored as ``self.x_test``; not read by this class.
            y_test: Stored as ``self.y_test``; not read by this class.
        """
        # Let the Keras base class initialise its own bookkeeping first.
        super().__init__()
        self.run = run
        self.model = model
        self.x_test = x_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        """Log every metric in ``logs`` against the epoch index.

        Args:
            epoch: Index of the epoch that just finished; used as the step.
            logs: Mapping of metric name to value; ``None`` is treated as
                empty so the declared default no longer raises.
        """
        # Snapshot the items so logging cannot be affected by later mutation.
        for key, value in list((logs or {}).items()):
            self.run.log_metric(name=key, value=value, step=epoch)
            print("{} -> {}".format(key, value))

def download_data_from_s3():
    """Placeholder: performs no work and returns ``None``."""
    return None
            

def load_data(train_dir, test_dir, batch_size, image_size):
    """Build the training, validation and test image datasets.

    The images under ``train_dir`` are split 80/20 into training and
    validation subsets; ``test_dir`` is loaded in full. Labels are inferred
    from the directory structure and one-hot encoded.

    Args:
        train_dir: Directory holding the training images.
        test_dir: Directory holding the test images.
        batch_size: Batch size applied to all three datasets.
        image_size: ``(height, width)`` the images are resized to.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of datasets.
    """
    # Options shared by every dataset.
    common_options = dict(
        labels='inferred',
        label_mode='categorical',
        color_mode='rgb',
        batch_size=batch_size,
        image_size=image_size,
        shuffle=True,
        seed=42,
    )

    # ``subset='both'`` is only accepted by newer TensorFlow releases. Two
    # calls with the same seed and validation_split work on older releases
    # too; the shared seed keeps the two subsets from overlapping.
    train_data = tf.keras.utils.image_dataset_from_directory(
        directory=train_dir,
        validation_split=0.2,
        subset='training',
        **common_options,
    )
    val_data = tf.keras.utils.image_dataset_from_directory(
        directory=train_dir,
        validation_split=0.2,
        subset='validation',
        **common_options,
    )

    test_data = tf.keras.utils.image_dataset_from_directory(
        directory=test_dir,
        **common_options,
    )

    return train_data, val_data, test_data

    
def build_model(num_classes, input_shape, dropout, optimizer=None):
    """Build and compile a ResNet50-based image classifier.

    Args:
        num_classes: Number of units in the final softmax layer.
        input_shape: Input shape handed to the ResNet50 backbone.
        dropout: Rate used by both dropout layers in the head.
        optimizer: Optimizer passed to ``compile``. When ``None`` (the
            default), a fresh ``Adam(0.01)`` is created on every call, so
            separate models never share one optimizer instance.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(0.01)

    base_model = tf.keras.applications.ResNet50(
        include_top=False,
        input_shape=input_shape,
        pooling='avg',
    )

    # Accept uint8 images, cast to float and apply the ResNet preprocessing
    # inside the graph so the saved model carries it along.
    i = tf.keras.Input(shape=(None, None, 3,), dtype=tf.uint8)
    x = tf.cast(i, tf.float32)
    x = tf.keras.applications.resnet.preprocess_input(x)
    x = base_model(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Dense(units=64, activation='relu')(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    o = tf.keras.layers.Dense(num_classes, activation="softmax")(x)

    model = tf.keras.Model(inputs=[i], outputs=[o])

    # Keep accuracy at index 1 of the evaluate() output, as callers expect.
    metrics = ["accuracy"]
    # The "f1_score" string identifier is not known to older Keras releases,
    # so the metric is added only when the F1Score class exists. Macro
    # averaging yields a single scalar, which is what the per-epoch metric
    # logging consumes.
    f1_score_cls = getattr(tf.keras.metrics, "F1Score", None)
    if f1_score_cls is not None:
        metrics.append(f1_score_cls(average="macro", name="f1_score"))
    metrics.append(tf.keras.metrics.AUC())

    model.compile(
        loss="categorical_crossentropy",
        optimizer=optimizer,
        metrics=metrics,
    )

    return model


def main():
    """Train, evaluate and save the classifier while logging to the run.

    Parses the command-line arguments, builds the model and the datasets,
    fits the model inside the experiment run returned by ``load_run``, logs
    the final test loss and accuracy, and saves the model.
    """
    num_classes = 6
    input_shape = (224, 224, 3)

    args, _ = parse_args()
    print("Args are : ", args)

    model = build_model(
        num_classes=num_classes,
        input_shape=input_shape,
        dropout=args.dropout,
    )
    model.summary()

    # The job is launched with channels named "train" and "test"; fall back
    # to their SM_CHANNEL_* environment variables when the parsed arguments
    # do not carry the directories.
    train_dir = getattr(args, "train", None) or os.environ.get("SM_CHANNEL_TRAIN")
    test_dir = getattr(args, "test", None) or os.environ.get("SM_CHANNEL_TEST")

    train_data, val_data, test_data = load_data(
        train_dir=train_dir,
        test_dir=test_dir,
        batch_size=args.batch_size,
        image_size=input_shape[:-1]
    )

    ###
    # `load_run` will use the run defined when calling the estimator
    ###
    with load_run(sagemaker_session=sagemaker_session) as run:
        # The test dataset yields (images, labels) batches together, so it is
        # passed as x_test and there is no separate y_test to hand over.
        model.fit(
            x=train_data,
            epochs=args.epochs,
            callbacks=[ExperimentCallback(run, model, test_data, None)],
            validation_data=val_data,
            class_weight=None,
        )

        score = model.evaluate(x=test_data, verbose=0)
        print("Test loss:", score[0])
        print("Test accuracy:", score[1])

        run.log_metric(name="Final Test Loss", value=score[0])
        run.log_metric(name="Final Test Accuracy", value=score[1])

        # Honour SM_MODEL_DIR when it is set; default to the original path.
        model.save(os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))


if __name__ == "__main__":
    main()

Overwriting ./script/train.py


### Create an Experiment and launch a training job

In [13]:
from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.experiments.run import Run

# Dataset prefix and the experiment that groups the runs.
s3_data_uri = "s3://isicbucket/preprocessed/224x224_center_crop_without_unknown_vascular_dermatofibroma/"
exp_name = "tensorflow-script-mode-experiment"

# Hyperparameters, logged on the run and forwarded to the training script.
batch_size = 128
epochs = 5
dropout = 0.1

with Run(experiment_name=exp_name, sagemaker_session=sagemaker_session) as run:
    # Record the hyperparameters on the run before the job starts.
    for param_name, param_value in (
        ("batch_size", batch_size),
        ("epochs", epochs),
        ("dropout", dropout),
    ):
        run.log_parameter(param_name, param_value)

    estimator = TensorFlow(
        entry_point="./script/train.py",
        role=role,
        model_dir=False,
        hyperparameters={"epochs": epochs, "batch_size": batch_size, "dropout": dropout},
        framework_version="2.8",
        py_version="py39",
        instance_type="ml.m5.xlarge",
        instance_count=1,
        keep_alive_period_in_seconds=3600,
        # The training script reads the region from this variable.
        environment={"REGION": region},
    )

    # Channel name -> S3 prefix; the "test" channel points at the val/ prefix.
    input_channels = {
        "train": s3_data_uri + "train/",
        "test": s3_data_uri + "val/",
    }
    estimator.fit(input_channels)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-03-22-22-00-59-292


Using provided s3_resource
2024-03-22 22:01:00 Starting - Starting the training job......
2024-03-22 22:01:44 Starting - Preparing the instances for training...
2024-03-22 22:02:17 Downloading - Downloading input data...
2024-03-22 22:02:32 Downloading - Downloading the training image...
2024-03-22 22:03:28 Training - Training image download completed. Training in progress.
2024-03-22 22:03:28 Uploading - Uploading generated training model.[34m2024-03-22 22:03:23.218026: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-03-22 22:03:23.218195: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2024-03-22 22:03:23.247257: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2024-03-22 22:03:25,111 sagemaker-training-toolkit 

ClientError: An error occurred (ValidationException) when calling the UpdateTrialComponent operation: 2 validation errors detected: Value 'Error for Training job tensorflow-training-2024-03-22-22-00-59-292: Failed. Reason: AlgorithmError: Framework Error: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/trainer.py", line 87, in train
    entrypoint()
  File "/usr/local/lib/python3.9/site-packages/sagemaker_tensorflow_container/training.py", line 281, in main
    train(env, mapping.to_cmd_args(user_hyperparameters))
  File "/usr/local/lib/python3.9/site-packages/sagemaker_tensorflow_container/training.py", line 214, in train
    entry_point.run(
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/entry_point.py", line 92, in run
    files.download_and_extract(uri=uri, path=environment.code_dir)
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/files.py", line 138, in download_and_extract
    s3_download(uri, dst)
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/files.py", line 174, in s3_download
    s3.Bucket(bucket).download_file(key, dst)
  File "/usr/local/lib/python3.9/site-packages/boto3/s3/inject.py", line 277, in bucket_do' at 'status.message' failed to satisfy constraint: Member must have length less than or equal to 1024; Value 'Error for Training job tensorflow-training-2024-03-22-22-00-59-292: Failed. Reason: AlgorithmError: Framework Error: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/trainer.py", line 87, in train
    entrypoint()
  File "/usr/local/lib/python3.9/site-packages/sagemaker_tensorflow_container/training.py", line 281, in main
    train(env, mapping.to_cmd_args(user_hyperparameters))
  File "/usr/local/lib/python3.9/site-packages/sagemaker_tensorflow_container/training.py", line 214, in train
    entry_point.run(
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/entry_point.py", line 92, in run
    files.download_and_extract(uri=uri, path=environment.code_dir)
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/files.py", line 138, in download_and_extract
    s3_download(uri, dst)
  File "/usr/local/lib/python3.9/site-packages/sagemaker_training/files.py", line 174, in s3_download
    s3.Bucket(bucket).download_file(key, dst)
  File "/usr/local/lib/python3.9/site-packages/boto3/s3/inject.py", line 277, in bucket_do' at 'status.message' failed to satisfy constraint: Member must satisfy regular expression pattern: .*

In [None]:
estimator.model_data

In [18]:
from sagemaker.s3 import S3Downloader

# Copy everything under the validation prefix into /tmp/.
val_s3_uri = 's3://isicbucket/preprocessed/224x224_center_crop_without_unknown_vascular_dermatofibroma/val/'
S3Downloader.download(s3_uri=val_s3_uri, local_path="/tmp/", sagemaker_session=sagemaker_session)

OSError: [Errno 16] Device or resource busy: '/tmp/..0FAD574f' -> '/tmp/.'

{'py_versions': ['py39'],
 'registries': {'af-south-1': '626614931356',
  'il-central-1': '780543022126',
  'ap-east-1': '871362719292',
  'ap-northeast-1': '763104351884',
  'ap-northeast-2': '763104351884',
  'ap-northeast-3': '364406365360',
  'ap-south-1': '763104351884',
  'ap-south-2': '772153158452',
  'ap-southeast-1': '763104351884',
  'ap-southeast-2': '763104351884',
  'ap-southeast-3': '907027046896',
  'ap-southeast-4': '457447274322',
  'ca-central-1': '763104351884',
  'cn-north-1': '727897471807',
  'cn-northwest-1': '727897471807',
  'eu-central-1': '763104351884',
  'eu-central-2': '380420809688',
  'eu-north-1': '763104351884',
  'eu-south-1': '692866216735',
  'eu-south-2': '503227376785',
  'eu-west-1': '763104351884',
  'eu-west-2': '763104351884',
  'eu-west-3': '763104351884',
  'me-south-1': '217643126080',
  'me-central-1': '914824155844',
  'sa-east-1': '763104351884',
  'us-east-1': '763104351884',
  'us-east-2': '763104351884',
  'us-gov-east-1': '446045086