* changed by nov05 on 2024-12-01  
* local conda env `awsmle_py310`  

In [8]:
## Opens the AWS shared credentials file in Notepad so it can be edited before the
## next cell resets the boto3 session. Windows-only; the path is specific to this machine.
!notepad C:\Users\guido\.aws\credentials

In [9]:
## reset the session after updating credentials
import os

import boto3
## Drop boto3's cached default session so later AWS calls build a fresh one.
boto3.DEFAULT_SESSION = None

from sagemaker import get_execution_role

## Role used when this notebook runs outside SageMaker. Set the SAGEMAKER_ROLE_ARN
## environment variable to use your own role without editing this cell.
FALLBACK_ROLE_ARN = os.environ.get(
    "SAGEMAKER_ROLE_ARN",
    "arn:aws:iam::061096721307:role/service-role/AmazonSageMaker-ExecutionRole-20241128T055392",
)

try:
    role_arn = get_execution_role()  ## get role ARN
except ValueError:
    ## get_execution_role() raises ValueError when the active credentials belong to an
    ## IAM user rather than a role, which is common when running locally.
    role_arn = FALLBACK_ROLE_ARN
if 'AmazonSageMaker-ExecutionRole' not in role_arn:
    ## e.g. a local lab role such as ".../role/voclabs" is not a SageMaker execution role
    role_arn = FALLBACK_ROLE_ARN
print("👉 Role ARN:", role_arn) ## If local, Role ARN: arn:aws:iam::807711953667:role/voclabs

👉 Role ARN: arn:aws:iam::061096721307:role/service-role/AmazonSageMaker-ExecutionRole-20241128T055392


# Hyperparameter Tuning in SageMaker
On this page we will see how to automatically tune hyperparameters when training models.
We can specify three types of hyperparameters to tune in SageMaker:
- `IntegerParameter`: These are parameters that can take any integer value. For instance, the number of layers in a model or the number of epochs can be an integer parameter.
- `ContinuousParameter`: These are parameters that can take a continuous value, like the learning rate.
- `CategoricalParameter`: Parameters that can take only a few specific values are called categorical parameters. For instance, the batch size can be treated as a categorical parameter.

The first step will be to import these as well as the `HyperparameterTuner` class.

## `pytorch_mnist.py`
<details>
  <summary> Click here to see the full script code </summary>
   
``` python
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision.datasets import MNIST

class Net(nn.Module):
    """Small convolutional classifier that returns log-probabilities over 10 classes.

    Two 3x3 convolutions are followed by a 2x2 max-pool, dropout, and two linear
    layers. ``fc1`` expects 9216 (= 64 * 12 * 12) flattened features, which is what
    the convolution/pool stack produces for 1x28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order as before, so state_dict keys and
        # seeded weight initialisation are unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        hidden = F.relu(self.fc1(torch.flatten(features, 1)))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)


def train(model, train_loader, optimizer, epoch):
    """Run one pass over ``train_loader``, updating ``model`` with ``optimizer``.

    Args:
        model: module whose output is fed to ``F.nll_loss`` (i.e. log-probabilities).
        train_loader: iterable of ``(data, target)`` batches exposing ``.dataset``.
        optimizer: optimizer bound to ``model``'s parameters.
        epoch: epoch number, used only in the progress message.
    """
    # Make sure dropout is active while fitting, regardless of the mode the caller
    # (or a previous evaluation pass) left the model in.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # Report progress every 100 batches.
        if batch_idx % 100 == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )


def test(model, test_loader):
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(
        "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def main():
    """Parse CLI hyperparameters, train the MNIST CNN, and save its weights.

    Command-line flags: ``--batch-size``, ``--test-batch-size``, ``--epochs``, ``--lr``.
    The weights are written to ``mnist_cnn.pt`` inside the directory named by the
    ``SM_MODEL_DIR`` environment variable, or the current directory when it is unset.
    """
    import os

    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=2,
        metavar="N",
        help="number of epochs to train (default: 2)",
    )
    parser.add_argument(
        "--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)"
    )
    args = parser.parse_args()

    train_kwargs = {"batch_size": args.batch_size}
    test_kwargs = {"batch_size": args.test_batch_size}

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
    )
    # Download from the SageMaker sample bucket instead of the default MNIST mirrors.
    MNIST.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/"]
    dataset1 = MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net()

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    for epoch in range(1, args.epochs + 1):
        train(model, train_loader, optimizer, epoch)
        test(model, test_loader)

    # SageMaker packages the contents of SM_MODEL_DIR as the job's model artifact;
    # saving elsewhere would leave the artifact without the trained weights.
    # Outside SageMaker the variable is unset and the file lands in the working directory.
    model_dir = os.environ.get("SM_MODEL_DIR", ".")
    torch.save(model.state_dict(), os.path.join(model_dir, "mnist_cnn.pt"))


# Run training only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

```
</details>

In [10]:
import sagemaker
from sagemaker.tuner import (
    # IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

Next we will create a PyTorch estimator like we have been doing before. If there are any fixed hyperparameters in your model, you can specify them here.

In [None]:
from pathlib import Path

from sagemaker.pytorch import PyTorch

## Build the script path from its parts instead of a backslash-separated literal:
## "\s" and "\p" are invalid string escapes, and the literal only resolved on Windows.
ENTRY_POINT = Path("..") / "script mode" / "scripts" / "pytorch_mnist.py"

estimator = PyTorch(
    entry_point=str(ENTRY_POINT),  ## my own script
    base_job_name="hpo-pytorch-mnist",  ## s3 folder 
    role=role_arn,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

The hyperparameters we want to tune are specified in a dictionary as shown below.

In [12]:
## Search space: the keys are the names the tuner varies ("lr", "batch-size").
lr_range = ContinuousParameter(0.001, 0.1)
batch_size_choices = CategoricalParameter([32, 64, 128, 256, 512])
hyperparameter_ranges = {"lr": lr_range, "batch-size": batch_size_choices}

We also need to specify the metric that we are trying to optimize and how SageMaker can identify it in the training logs. Since we are optimizing for loss, our objective needs to be minimized. Other metrics, like accuracy, would need to be maximized.

In [13]:
## The objective is a loss, so lower is better.
objective_type = "Minimize"
objective_metric_name = "average test loss"
## The regex captures the number from the "Test set: Average loss: ..." line in the training log.
metric_definitions = [
    dict(Name=objective_metric_name, Regex="Test set: Average loss: ([0-9\\.]+)"),
]

Next we will create our hyperparameter tuner object with our estimator, hyperparameter dict, and the metric details. We also need to tell SageMaker how many jobs to run in total and how many to run in parallel.

In [20]:
## Every argument is passed by keyword so the call does not depend on parameter order.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=20,
    max_parallel_jobs=10,
)

Finally, we can start our training. Once it finishes, we can also see the name of the best training job and get its hyperparameters.

In [21]:
%%time
## wait=True keeps this cell busy until the tuning job finishes (about 26 minutes in the recorded run)
tuner.fit(wait=True)
## find the job in "SageMaker - Training - Hyperparameter tuning jobs"
## job name, e.g. pytorch-training-241201-0430 

.............................................................................................................................................................................................................................................................................................................!
CPU times: total: 1.66 s
Wall time: 26min 30s


In [22]:
## Name of the training job the tuner picked as best (displayed as a string)
tuner.best_training_job()

'pytorch-training-241201-0430-003-c42bc262'

In [23]:
## Estimator for the best training job; the call also prints that job's status log
tuner.best_estimator()


2024-12-01 10:43:25 Starting - Preparing the instances for training
2024-12-01 10:43:25 Downloading - Downloading the training image
2024-12-01 10:43:25 Training - Training image download completed. Training in progress.
2024-12-01 10:43:25 Uploading - Uploading generated training model
2024-12-01 10:43:25 Completed - Resource reused by training job: pytorch-training-241201-0430-015-5b1c9cd3


<sagemaker.pytorch.estimator.PyTorch at 0x1f53ea67250>

In [24]:
## Hyperparameters of the best job as a dict, including the tuned "lr" and "batch-size"
tuner.best_estimator().hyperparameters()


2024-12-01 10:43:25 Starting - Preparing the instances for training
2024-12-01 10:43:25 Downloading - Downloading the training image
2024-12-01 10:43:25 Training - Training image download completed. Training in progress.
2024-12-01 10:43:25 Uploading - Uploading generated training model
2024-12-01 10:43:25 Completed - Resource reused by training job: pytorch-training-241201-0430-015-5b1c9cd3


{'_tuning_objective_metric': '"average test loss"',
 'batch-size': '"64"',
 'lr': '0.013365881050587397',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"hpo-pytorch-mnist-2024-12-01-10-30-02-638"',
 'sagemaker_program': '"pytorch_mnist.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-061096721307/hpo-pytorch-mnist-2024-12-01-10-30-02-638/source/sourcedir.tar.gz"'}

In [None]:
# predictor = tuner.deploy(
#     initial_instance_count=1, 
#     instance_type="ml.t2.medium")