In [47]:
import sagemaker
import boto3

import os

# Execution-role ARN and default bucket for this notebook. Either can be
# overridden through an environment variable so the notebook can be pointed at
# another role/bucket without editing code; the literals below remain the
# defaults, so behavior is unchanged when the variables are unset.
role = os.environ.get("SAGEMAKER_ROLE_ARN", "arn:aws:iam::371087393859:role/defaultrole")
bucket = os.environ.get("SAGEMAKER_BUCKET", "ir-sagemaker")

# boto3 session pinned to a named profile and an explicit region.
session = boto3.Session(profile_name="lprofile", region_name="us-east-1")

# SageMaker session built on top of that boto3 session; `bucket` becomes the
# value returned by sm_session.default_bucket().
sm_session = sagemaker.Session(boto_session=session, default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [48]:

# Echo the resolved configuration (role, bucket, region) in a single call so
# the reader can confirm what the rest of the notebook will run against.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sm_session.default_bucket()}",
    f"sagemaker session region: {sm_session.boto_region_name}",
    sep="\n",
)


sagemaker role arn: arn:aws:iam::371087393859:role/defaultrole
sagemaker bucket: ir-sagemaker
sagemaker session region: us-east-1


In [49]:
import sys

# Show the version string of the interpreter running this notebook.
print(f"{sys.version}")

3.11.13 (main, Jun  5 2025, 13:12:00) [GCC 11.2.0]


In [50]:
from sagemaker.s3 import S3Uploader
# Re-read the bucket from the SageMaker session and fix the key prefix that
# all objects of this experiment live under.
bucket = sm_session.default_bucket()
prefix = "modernbert"

# Each split lives at s3://<bucket>/<prefix>/<split>/<split>.jsonl; build the
# three URIs from that single template instead of spelling each one out.
train_uri, val_uri, test_uri = (
    f"s3://{bucket}/{prefix}/{split}/{split}.jsonl"
    for split in ("train", "val", "test")
)

In [6]:
# Upload each local split to its own S3 prefix and keep the value returned by
# upload() as the channel URI. sm_session is passed explicitly so the uploads
# use the same profile/region as the rest of the notebook rather than whatever
# default session the SDK would otherwise create.
train_uri = S3Uploader.upload(
    "modernbert/data/train/train.jsonl",
    f"s3://{bucket}/{prefix}/train",
    sagemaker_session=sm_session,
)
val_uri = S3Uploader.upload(
    "modernbert/data/val/val.jsonl",
    f"s3://{bucket}/{prefix}/val",
    sagemaker_session=sm_session,
)
test_uri = S3Uploader.upload(
    "modernbert/data/test/test.jsonl",
    f"s3://{bucket}/{prefix}/test",
    sagemaker_session=sm_session,
)

In [51]:
from sagemaker.huggingface import HuggingFace

# Capture pattern for a metric value as it appears in the training log dict,
# e.g. 2.7316, 1 or 1.721739e-05. A single character class is used so that:
#   * scientific notation is captured in full (the previous pattern stopped
#     after the mantissa, so learning_rate was recorded as 1.72 instead of
#     1.72e-05);
#   * there is exactly one capture group;
#   * no backslash escapes are needed inside a normal string literal.
_METRIC_VALUE = "([0-9.eE+-]+)"

# One definition per logged key. The leading quote in the regex anchors on the
# full key name, so 'loss' does not also match 'eval_loss'.
metric_definitions = [
    {"Name": name, "Regex": f"'{name}': {_METRIC_VALUE}"}
    for name in (
        "loss",
        "learning_rate",
        "eval_loss",
        "eval_accuracy",
        "eval_f1",
        "eval_precision",
        "eval_recall",
        "eval_runtime",
        "eval_samples_per_second",
        "epoch",
    )
]

# Hyperparameters handed to the estimator below.
# NOTE(review): "temperature" and "deepspeed" are presumably consumed by
# train_sm.py; confirm against the entry point.
hyper = dict(
    learning_rate=3e-5,
    num_train_epochs=5,
    temperature=0.05,
    deepspeed="ds_zero3.json",
)

# Hugging Face estimator for the training job: runs train_sm.py from the local
# "modernbert" source directory on a single ml.g5.12xlarge instance.
est = HuggingFace(
    entry_point="train_sm.py",
    source_dir="modernbert",
    role=role,
    # Use the profile/region-pinned session created earlier; without this the
    # estimator builds its own default session and may submit the job with
    # different credentials or to a different region than the data uploads.
    sagemaker_session=sm_session,
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    # NOTE(review): MPI launcher is enabled with no further options; confirm
    # this starts the intended number of processes for the deepspeed config.
    distribution={"mpi": {"enabled": True}},
    transformers_version="4.49.0",
    pytorch_version="2.5.1",
    py_version="py311",
    hyperparameters=hyper,
    metric_definitions=metric_definitions,
    # Environment variables set inside the training container.
    environment={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "NCCL_DEBUG": "INFO",
    },
    output_path=f"s3://{bucket}/{prefix}/outputs",
)

In [None]:
# Map each input channel name to the S3 URI of its dataset, then launch the
# training job with those channels.
input_channels = dict(train=train_uri, val=val_uri, test=test_uri)
est.fit(input_channels)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-07-30-12-35-05-226


In [46]:
from sagemaker.analytics import TrainingJobAnalytics

# Training job whose metrics are inspected.
# NOTE(review): this name is hard-coded; update it (or use the name of the job
# started by est.fit) to look at a different run.
job_name = 'huggingface-pytorch-training-2025-07-30-03-59-45-613'

# Fetch the job's recorded metrics as a DataFrame. sm_session is passed so the
# lookup uses the same profile/region as the rest of the notebook instead of a
# default session.
df = TrainingJobAnalytics(
    training_job_name=job_name,
    sagemaker_session=sm_session,
).dataframe()
print(df)



    timestamp              metric_name      value
0         0.0                     loss   3.360500
1       180.0                     loss   2.731600
2         0.0            learning_rate   1.721739
3       180.0            learning_rate   4.173913
4         0.0            eval_accuracy   0.031250
5         0.0                  eval_f1   0.019156
6         0.0           eval_precision   0.022735
7         0.0              eval_recall   0.031250
8         0.0             eval_runtime  21.798900
9         0.0  eval_samples_per_second  22.157000
10        0.0                    epoch   0.430000
11      180.0                    epoch   0.870000
12      240.0                    epoch   1.000000
