In [2]:
import sagemaker

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
# role = "arn:aws:iam::636218042492:role/service-role/AmazonSageMaker-ExecutionRole-20220829T114489"
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::636218042492:role/service-role/AmazonSageMaker-ExecutionRole-20220829T114489
sagemaker bucket: sagemaker-us-east-1-636218042492
sagemaker session region: us-east-1


In [3]:
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
print(torchvision.__version__)

1.10.0 True
0.11.1


# Train Model

In [3]:
from sagemaker.huggingface import HuggingFace
import time

print(sagemaker.__version__)

# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,                          # number of training epochs
                 'train_batch_size': 4,               # batch size for training
                 'eval_batch_size': 2,                # batch size for evaluation
                 'learning_rate': 5e-5,                # learning rate used during training
                 'model_id':'microsoft/layoutlmv2-base-uncased', # pre-trained model
                 'fp16': True,                         # Whether to use 16-bit (mixed) precision training
                }

2.107.0


In [4]:
# define Training Job Name 
job_name = f'huggingface-workshop-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = '',        # fine-tuning script used in training jon
#     source_dir           = './',       # directory where fine-tuning script is stored
    # instance_type        = 'ml.p3.2xlarge',   # instances type used for the training job
    instance_type        = 'local_gpu',
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    # transformers_version = '4.17.0',           # the transformers version used in the training job
    # pytorch_version      = '1.10.2',           # the pytorch_version version used in the training job
    py_version           = 'py38',            # the python version used in the training job
    image_uri            = '636218042492.dkr.ecr.us-east-1.amazonaws.com/huggingface-sagemaker-pytorch-training-detectron2:latest',
    hyperparameters      = hyperparameters,   # the hyperparameter used for running the training job
)

In [5]:
huggingface_estimator.training_image_uri()
huggingface_estimator.instance_type
job_name

'huggingface-workshop-2022-09-06-05-57-25'

In [6]:
training_input_path = f's3://{sess.default_bucket()}/dataset-pp/train'
test_input_path = f's3://{sess.default_bucket()}/dataset-pp/test'

# define a data input dictionary with our uploaded s3 uris
data = {
    'train': training_input_path,
    'test': test_input_path
}

# starting the train job with our uploaded datasets as input
huggingface_estimator.fit(data, wait=True)

Login Succeeded


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Creating plt27dpd0o-algo-1-iw4xu ... 
Creating plt27dpd0o-algo-1-iw4xu ... done
Attaching to plt27dpd0o-algo-1-iw4xu
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,235 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,266 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,266 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,273 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,304 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36mplt27dpd0o-algo-1-iw4xu |[0m 2022-09-06 06:02:46,335 sagemaker-training-toolkit INFO     instance_groups entry not present in resource_config
[36m

Failed to delete: /tmp/tmpocaofvyn/algo-1-iw4xu Please remove it manually.


===== Job Complete =====


# Deploy from Model

In [4]:
from sagemaker.huggingface import HuggingFaceModel

# Hub Model configuration. https://huggingface.co/models
hub = {
    'HF_MODEL_ID':'microsoft/layoutlmv2-base-uncased',
    'HF_TASK':'token-classification'
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data="s3://sagemaker-us-east-1-636218042492/huggingface-workshop-2022-09-06-05-57-2-2022-09-06-05-57-31-172/model.tar.gz",  # path to your trained SageMaker model
    role=role,                                            # IAM role with permissions to create an endpoint
    transformers_version="4.17.0",                       # Transformers version used
    pytorch_version="1.10.2",                             # PyTorch version used
    py_version='py38',                                    # Python version used
    env=hub,
    image_uri='636218042492.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference-detectron2:latest'
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge"
)

-----------!

# Send request to Endpoint

In [5]:
import boto3

client = boto3.client('sagemaker-runtime')

In [None]:
custom_attributes = "c000b4f9-df62-4c85-a0bf-7c525f9104a4"  # An example of a trace ID.
endpoint_name = "..."                                       # Your endpoint name.
content_type = "..."                                        # The MIME type of the input data in the request body.
accept = "..."                                              # The desired MIME type of the inference in the response.
payload = "..."                                             # Payload for inference.
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    CustomAttributes=custom_attributes,
    ContentType=content_type,
    Accept=accept,
    Body=payload
)


print(response['CustomAttributes'])                         # If model receives and updates the custom_attributes header 
                                                            # by adding "Trace id: " in front of custom_attributes in the request,
                                                            # custom_attributes in response becomes
                                                            # "Trace ID: c000b4f9-df62-4c85-a0bf-7c525f9104a4"