In [1]:
import os
import json

import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

In [2]:
# Open a SageMaker session; its default bucket is used for the output location below.
sess = sagemaker.Session()

# IAM role associated with this notebook instance.
role = get_execution_role()

# Job output goes under the "test2" prefix of the session's default bucket.
output_path = f"s3://{sess.default_bucket()}/test2"

In [3]:
# "local" runs the training script on the same machine that runs this notebook.
instance_type = "local"

estimator = PyTorch(
    # Training code: the script and the directory that holds it.
    source_dir="code",
    entry_point="train.py",
    # Framework / Python version of the training image.
    framework_version="1.5.0",
    py_version="py3",
    # Where the job runs and under which role.
    instance_type=instance_type,
    instance_count=1,
    role=role,
    # Destination for the job's output.
    output_path=output_path,
    hyperparameters={
        "batch-size": 64,
        "epochs": 10,
        "learning-rate": 1e-3,
    },
)

In [4]:
# Launch training. Each input channel points at a CSV under ml-latest-small/,
# addressed with a file:// URI.
ratings, movies = (
    "file://ml-latest-small/ratings.csv",
    "file://ml-latest-small/movies.csv",
)

channels = dict(ratings=ratings, movies=movies)
estimator.fit(inputs=channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-02-25-18-17-33-379
INFO:sagemaker.local.local_session:Starting training job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-tc2h3:
    command: train
    container_name: ihf1td93ms-algo-1-tc2h3
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3
    networks:
      sagemaker-local:
        aliases:
        - algo-1-tc2h3
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmpgzn1nlfz/algo-1-tc2h3/input:/opt/ml/input
    - /tmp/tm

Login Succeeded


INFO:sagemaker.local.image:image pulled: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3


Creating ihf1td93ms-algo-1-tc2h3 ... 
Creating ihf1td93ms-algo-1-tc2h3 ... done
Attaching to ihf1td93ms-algo-1-tc2h3
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:03,739 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:03,762 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:03,778 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:03,793 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:04,018 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mihf1td93ms-algo-1-tc2h3 |[0m 2023-02-25 18:19:04,032 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mihf1td93ms-algo-1-

INFO:root:creating /tmp/tmpgzn1nlfz/artifacts/output/data
INFO:root:copying /tmp/tmpgzn1nlfz/algo-1-tc2h3/output/success -> /tmp/tmpgzn1nlfz/artifacts/output
INFO:root:copying /tmp/tmpgzn1nlfz/model/model.pth -> /tmp/tmpgzn1nlfz/artifacts/model


===== Job Complete =====


In [6]:
dummyModel_data = estimator.model_data
print("Model artifact saved at:\n", dummyModel_data)
%store dummyModel_data

Model artifact saved at:
 s3://sagemaker-us-east-1-472925017889/test2/pytorch-training-2023-02-25-18-17-33-379/model.tar.gz
Stored 'dummyModel_data' (str)
