# Train and track SageMaker Training Jobs with the SageMaker Experiments package

Based on [this example notebook](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-experiments/sagemaker_job_tracking/tensorflow_script_mode_training_job.ipynb)

In [2]:
import os
import boto3
import json
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run
from sagemaker.utils import unique_name_from_base

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
# SageMaker SDK session and a plain boto3 session, both created with default settings.
sagemaker_session = Session()
boto_sess = boto3.Session()

# Execution role and default bucket; the experiment-launch cell further down
# passes the role to the estimator and writes its output under this bucket.
role = get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Low-level SageMaker client and the session's region. The region is
# interpolated into the training image URI in the launch cell.
# NOTE(review): `sm` is not used in the cells visible here — confirm it is still needed.
sm = boto_sess.client("sagemaker")
region = boto_sess.region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Prepare the training script

In [4]:
!mkdir -p script

In [49]:
%%writefile ./script/train.py

import os

#os.system("pip install -U sagemaker")

import tensorflow as tf
import argparse

from sagemaker.session import Session
from sagemaker.experiments import load_run
from sagemaker.s3 import S3Downloader

import boto3


# Region used to build the SageMaker session that `load_run` talks to.
# The launching notebook passes REGION through the estimator's `environment`;
# fall back to the standard AWS variables so that a missing REGION does not
# abort the script with a KeyError before training even starts. If none of
# them is set, boto3 resolves the region from its own configuration chain.
_region_name = (
    os.environ.get("REGION")
    or os.environ.get("AWS_REGION")
    or os.environ.get("AWS_DEFAULT_REGION")
)

boto_session = boto3.session.Session(region_name=_region_name)
sagemaker_session = Session(boto_session=boto_session)
# NOTE(review): this client is built from boto3's default session, not from
# `boto_session`, and is not used in the code visible here — confirm intent.
s3 = boto3.client("s3")


def parse_args():
    """Read the training options from the command line.

    The data-channel options default to the ``SM_CHANNEL_TRAIN`` and
    ``SM_CHANNEL_TEST`` environment variables (``None`` when unset).

    Returns:
        The ``(namespace, leftovers)`` pair produced by
        ``ArgumentParser.parse_known_args``: unrecognized arguments are
        collected in ``leftovers`` instead of raising an error.
    """

    # (flag, converter, default) for every supported option, in declaration order.
    option_specs = (
        ("--train", str, os.environ.get('SM_CHANNEL_TRAIN')),
        ("--test", str, os.environ.get('SM_CHANNEL_TEST')),
        ("--epochs", int, 5),
        ("--batch_size", int, 64),
        ("--dropout", float, 0.01),
    )

    cli = argparse.ArgumentParser()
    for flag, converter, fallback in option_specs:
        cli.add_argument(flag, type=converter, default=fallback)

    return cli.parse_known_args()


class ExperimentCallback(tf.keras.callbacks.Callback):
    """Keras callback that forwards per-epoch metrics to an experiment run.

    At the end of every epoch each entry of ``logs`` is sent to
    ``run.log_metric`` (with the epoch index as ``step``) and echoed to stdout.
    """

    def __init__(self, run, model, val_data):
        """Store the run and training objects on the callback.

        Args:
            run: Object exposing ``log_metric(name=..., value=..., step=...)``.
            model: Model being trained; stored on the callback.
            val_data: Validation dataset; stored but not read by this class.
        """
        # Let the Keras base class set up its own bookkeeping attributes first.
        super().__init__()

        self.run = run
        self.model = model
        self.val_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        """Log every metric in ``logs`` for the epoch that just finished.

        Args:
            epoch: Index of the finished epoch; used as the metric ``step``.
            logs: Mapping of metric name to value. ``None`` (the declared
                default) is treated as an empty mapping instead of raising.
        """
        for key, value in (logs or {}).items():
            self.run.log_metric(name=key, value=value, step=epoch)
            # NOTE: the ".2f" format assumes a scalar metric value.
            print("{:-<25}> {:.2f}".format(key, value))
            

def load_data(train_dir, test_dir, batch_size, image_size):
    """Build the training, validation and test image datasets.

    Args:
        train_dir: Directory read for the training/validation data; 20% of it
            is held out as the validation subset.
        test_dir: Directory read for the test data.
        batch_size: Batch size used for all three datasets.
        image_size: Image size passed to the dataset loader.

    Returns:
        Tuple ``(train_data, val_data, test_data)``.
    """

    # Options shared by both loader calls.
    shared_options = dict(
        labels='inferred',
        label_mode='categorical',
        color_mode='rgb',
        batch_size=batch_size,
        image_size=image_size,
        shuffle=True,
        seed=42,
    )
    from_directory = tf.keras.utils.image_dataset_from_directory

    print("\n\nLoading training/validation data:\n")
    # subset='both' makes the loader return the (training, validation) pair.
    train_ds, val_ds = from_directory(
        directory=train_dir,
        validation_split=0.2,
        subset='both',
        **shared_options,
    )

    print("\nLoading testing data:\n")
    test_ds = from_directory(directory=test_dir, **shared_options)
    print("\n")

    return train_ds, val_ds, test_ds

    
def build_model(num_classes, input_shape, dropout):
    """Assemble and compile a classifier on top of a frozen ResNet50.

    Args:
        num_classes: Number of units in the final softmax layer.
        input_shape: Input shape handed to the ResNet50 base model.
        dropout: Rate used by both dropout layers in the head.

    Returns:
        The compiled ``tf.keras.Model``.
    """

    keras = tf.keras

    # Headless ResNet50 with average pooling; its weights are not trained.
    backbone = keras.applications.ResNet50(
        include_top=False,
        input_shape=input_shape,
        pooling='avg',
    )
    backbone.trainable = False

    # uint8 images go in; the cast and ResNet50 preprocessing are part of the graph.
    image_input = keras.Input(shape=(None, None, 3), dtype=tf.uint8)
    features = keras.applications.resnet50.preprocess_input(
        tf.cast(image_input, tf.float32)
    )
    features = backbone(features, training=False)

    # Classification head, applied in the order listed.
    head_layers = [
        keras.layers.Flatten(),
        keras.layers.Dropout(dropout),
        keras.layers.Dense(units=64, activation='relu'),
        keras.layers.Dropout(dropout),
        keras.layers.Dense(units=num_classes, activation="softmax"),
    ]
    for layer in head_layers:
        features = layer(features)

    classifier = keras.Model(inputs=[image_input], outputs=[features])

    tracked_metrics = [
        keras.metrics.CategoricalAccuracy(),
        keras.metrics.Precision(),
        keras.metrics.Recall(),
        # keras.metrics.F1Score() is left out: it breaks when logged to SageMaker Experiments.
    ]
    classifier.compile(
        loss=keras.losses.CategoricalCrossentropy(),
        optimizer=keras.optimizers.Adam(0.001),
        metrics=tracked_metrics,
    )

    return classifier


def main():
    """Train the classifier, evaluate it on the test set and save it.

    Steps:
      1. Parse the command-line options.
      2. Count the classes as the number of immediate sub-directories of the
         training directory.
      3. Build the model and load the train/validation/test datasets.
      4. Inside the loaded experiment run: fit the model (per-epoch metrics
         are logged by ``ExperimentCallback``), evaluate on the test data,
         print and log the final scores, then save the model.

    The model is written to the directory named by ``SM_MODEL_DIR``, falling
    back to ``/opt/ml/model`` when that variable is not set.
    """

    input_shape = (224, 224, 3)

    args, _ = parse_args()
    print("Args are : ", args)

    # next(os.walk(...))[1] is the list of directories directly under args.train.
    num_classes = len(next(os.walk(args.train))[1])

    model = build_model(
        num_classes=num_classes,
        input_shape=input_shape,
        dropout=args.dropout,
    )
    model.summary()

    train_data, val_data, test_data = load_data(
        train_dir=args.train,
        test_dir=args.test,
        batch_size=args.batch_size,
        image_size=input_shape[:-1],  # height and width only, channels dropped
    )

    # Prefer the model directory advertised by the training environment over a
    # hard-coded path, so the script also works where the path differs.
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    # load_run will use the run defined when calling the estimator
    with load_run(sagemaker_session=sagemaker_session) as run:
        model.fit(
            x=train_data,
            epochs=args.epochs,
            callbacks=[ExperimentCallback(run, model, val_data)],
            validation_data=val_data,
            class_weight=None,  # TODO
            verbose=2,  # SageMaker spams the output with %010 backspace characters when verbose=1
        )

        # score follows the compile order: loss, accuracy, precision, recall.
        score = model.evaluate(x=test_data, verbose=0)

        print("\n\nTest loss:", score[0])
        print("Test accuracy:", score[1])
        print("Test precision:", score[2])
        print("Test recall:", score[3], end="\n\n")

        run.log_metric(name="Final Test Loss", value=score[0])
        run.log_metric(name="Final Test Accuracy", value=score[1])
        run.log_metric(name="Final Test Precision", value=score[2])
        run.log_metric(name="Final Test Recall", value=score[3])

        # run.log_confusion_matrix() TODO

        model.save(model_dir)


# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

Overwriting ./script/train.py


### Create an Experiment and launch a training job

In [50]:
from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.experiments.run import Run
from datetime import datetime

# Launch timestamp; it prefixes the experiment name so each launch gets its own name.
timestamp = datetime.now().strftime("%y%m%d-%H%M%S")

# Hyperparameters: logged on the run and handed to the estimator.
hyperparameters = {
    'batch_size': 128,
    'epochs': 5,
    'dropout': 0.01,
}

# Instance and data specs
device = 'gpu'
s3_data_uri = "s3://isicbucket/preprocessed/224x224_center_crop_without_unknown_vascular_dermatofibroma/"

# Training image tag and instance type for every supported device.
device_specs = {
    'gpu': ('2.14.1-gpu-py310-cu118-ubuntu20.04-sagemaker', 'ml.g4dn.xlarge'),
    'cpu': ('2.14.1-cpu-py310-ubuntu20.04-sagemaker', 'ml.m5.xlarge'),
}

# Fail here with a clear message instead of a NameError on image_uri /
# instance_type further down when `device` is misspelled or unsupported.
if device not in device_specs:
    raise ValueError(
        f"Unsupported device {device!r}; expected one of {sorted(device_specs)}"
    )

image_tag, instance_type = device_specs[device]
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/tensorflow-training:{image_tag}'

# Dots (from the instance type) are stripped from the experiment name.
# NOTE(review): presumably because experiment names may not contain '.' — confirm.
experiment_name = (
    f"{timestamp}-SkinScreen-{instance_type}-{device}-{hyperparameters['epochs']}e"
).replace('.', '')

# The same key prefix is used for the run's artifacts and the job output.
experiment_prefix = f"Experiments/{experiment_name}"

output_bucket = default_bucket
s3_output_uri = f"s3://{output_bucket}/{experiment_prefix}"

with Run(
    experiment_name=experiment_name,
    sagemaker_session=sagemaker_session,
    artifact_prefix=experiment_prefix,
) as run:
    run.log_parameters(hyperparameters)

    estimator = TensorFlow(
        entry_point="./script/train.py",
        role=role,
        model_dir=False,  # Set to None when ready to save model to S3 for deployment
        image_uri=image_uri,
        hyperparameters=hyperparameters,
        instance_type=instance_type,
        instance_count=1,
        keep_alive_period_in_seconds=3600,
        environment={"REGION": region},  # read by script/train.py
        output_path=s3_output_uri,
    )

    # The "test" channel is fed from the dataset's val/ folder.
    estimator.fit(
        {
            "train": s3_data_uri + "train/",
            "test": s3_data_uri + "val/",
        }
    )


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-03-26-01-10-22-700


Using provided s3_resource
2024-03-26 01:10:22 Starting - Starting the training job...
2024-03-26 01:10:46 Starting - Preparing the instances for training...
2024-03-26 01:11:22 Downloading - Downloading input data......
2024-03-26 01:12:07 Downloading - Downloading the training image...............
2024-03-26 01:14:32 Training - Training image download completed. Training in progress.[34m2024-03-26 01:14:44.747871: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.[0m
[34m2024-03-26 01:14:44.790194: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered[0m
[34m2024-03-26 01:14:44.790243: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft

In [51]:
estimator.model_data

's3://sagemaker-us-west-2-766088526747/Experiments/240326-011021-SkinScreen-mlg4dnxlarge-gpu-5e/tensorflow-training-2024-03-26-01-10-22-700/output/model.tar.gz'