In [35]:
import sagemaker
import boto3

import os

# Execution-role ARN handed to the SageMaker estimator/model below.
# Set SAGEMAKER_ROLE_ARN to override it without editing the notebook; an unset or
# empty variable falls back to the previously hard-coded role.
role = os.environ.get("SAGEMAKER_ROLE_ARN") or "arn:aws:iam::371087393859:role/defaultrole"
bucket = "ir-sagemaker"
# boto3 session pinned to a named profile and region (the cell output shows the
# credentials being read from the shared credentials file).
session = boto3.Session(profile_name="lprofile", region_name="us-east-1")

# SageMaker session layered on that boto3 session, with `bucket` as its default bucket.
sm_session = sagemaker.Session(boto_session=session, default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [36]:

# Echo the resolved role, bucket and region so the cell output records which
# configuration the rest of the notebook ran against.
role_arn_line = f"sagemaker role arn: {role}"
print(role_arn_line)

default_bucket_name = sm_session.default_bucket()
print(f"sagemaker bucket: {default_bucket_name}")

session_region = sm_session.boto_region_name
print(f"sagemaker session region: {session_region}")


sagemaker role arn: arn:aws:iam::371087393859:role/defaultrole
sagemaker bucket: ir-sagemaker
sagemaker session region: us-east-1


In [37]:
from sagemaker.s3 import S3Uploader
bucket = sm_session.default_bucket()
prefix = "modernbert"

# Each split lives at s3://<bucket>/<prefix>/<split>/<split>.jsonl.
s3_root = f"s3://{bucket}/{prefix}"
train_uri, val_uri, test_uri = (
    f"{s3_root}/{split}/{split}.jsonl" for split in ("train", "val", "test")
)

In [6]:
# Upload each local JSONL split under its own S3 prefix and keep the value
# returned by the uploader as the split's URI.
# `sagemaker_session=sm_session` makes the upload use the profile/region configured
# for this notebook; without it S3Uploader builds a default session instead.
train_uri = S3Uploader.upload(
    "modernbert/data/train/train.jsonl",
    f"s3://{bucket}/{prefix}/train",
    sagemaker_session=sm_session,
)
val_uri = S3Uploader.upload(
    "modernbert/data/val/val.jsonl",
    f"s3://{bucket}/{prefix}/val",
    sagemaker_session=sm_session,
)
test_uri = S3Uploader.upload(
    "modernbert/data/test/test.jsonl",
    f"s3://{bucket}/{prefix}/test",
    sagemaker_session=sm_session,
)

In [42]:
from sagemaker.huggingface import HuggingFace
# Hyperparameters forwarded to the training job.
# NOTE(review): presumably parsed by train_sm.py, with "deepspeed" naming a config
# file inside source_dir — verify against the training script.
hyper = {"learning_rate":3e-5,
         "num_train_epochs":3,
         "temperature":0.05,
         "deepspeed": "ds_zero3.json"}

# Training-job definition: runs modernbert/train_sm.py on a single ml.g5.12xlarge.
est = HuggingFace(
    entry_point="train_sm.py",
    source_dir="modernbert",
    role=role,
    # Run the job through the session built from the named boto3 profile, so
    # credentials, region and default bucket match the rest of the notebook.
    # Without this the estimator creates its own default session.
    sagemaker_session=sm_session,
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    # NOTE(review): no "processes_per_host" is given for MPI — confirm the launcher
    # starts the intended number of worker processes on this instance.
    distribution={"mpi": {"enabled": True}},
    transformers_version="4.54.0",
    pytorch_version="2.5.1",
    py_version="py311",
    hyperparameters=hyper,
    # Environment variables passed to the estimator for the training job.
    environment={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "NCCL_DEBUG": "INFO"
    },
    output_path=f"s3://{bucket}/{prefix}/outputs"
)

In [43]:
# Start the training job, handing it the three dataset URIs keyed by split name.
input_channels = {
    "train": train_uri,
    "val": val_uri,
    "test": test_uri,
}
est.fit(input_channels)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-07-28-22-30-43-226


2025-07-28 22:30:47 Starting - Starting the training job
2025-07-28 22:30:47 Pending - Training job waiting for capacity...
2025-07-28 22:31:17 Pending - Preparing the instances for training......
2025-07-28 22:32:03 Downloading - Downloading the training image........................
2025-07-28 22:36:00 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 550.163.01[0m
[34mCurrent installed NVIDIA driver version is 550.163.01[0m
[34mSkipping CUDA compat setup as newer NVIDIA driver is installed[0m
[34m2025-07-28 22:36:39,572 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-07-28 22:36:39,608 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-07-28 22:36

In [None]:
from sagemaker.huggingface import HuggingFaceModel
# Wrap the trained artifact (est.model_data) for hosting.
# NOTE(review): these framework versions differ from the training job's
# (transformers 4.54.0 / pytorch 2.5.1 / py311) — confirm this combination exists as
# an inference image and can load the trained model before deploying.
model = HuggingFaceModel(
    model_data=est.model_data,
    role=role,
    # Deploy through the same profile/region-bound session used elsewhere in the
    # notebook instead of an implicit default session.
    sagemaker_session=sm_session,
    transformers_version="4.42",
    pytorch_version="2.2",
    py_version="py39",
)
# Create a one-instance real-time endpoint; this starts billing until it is deleted.
predictor = model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")

In [None]:
# Smoke-test the endpoint with one request, then always tear it down: the
# `finally` guarantees the endpoint is deleted even if the request raises.
try:
    prediction = predictor.predict({"inputs":"Example prefix …"})
finally:
    predictor.delete_endpoint()    # stop billing

# Last expression of the cell, so the notebook displays the response
# (previously the result of predict() was discarded).
prediction

In [28]:
import os, torch
# Check whether the allocator override is set in *this* kernel's environment.
# NOTE(review): the estimator's `environment` dict is passed to the training job,
# not exported locally, so a value of None here is presumably expected — confirm.
ALLOC_CONF_ENV_KEY = "PYTORCH_CUDA_ALLOC_CONF"
alloc_conf = os.environ.get(ALLOC_CONF_ENV_KEY)
print("Allocator setting:", alloc_conf)

Allocator setting: None
