In [13]:
import wandb
# Authenticate with Weights & Biases, then read back the API key that is
# configured for this environment so it can be forwarded to the training job.
wandb.login()
settings = wandb.setup().settings
current_api_key = wandb.wandb_lib.apikey.api_key(settings=settings)

# The lookup can come back empty when no key is configured. Fail here, in the
# notebook, rather than launching a remote job that cannot log to W&B.
if not current_api_key:
    raise RuntimeError(
        "No Weights & Biases API key is configured; "
        "run `wandb login` before launching the training job."
    )

In [14]:
import sagemaker
import boto3
# Build the AWS session from the named profile "cornell" instead of the
# default credential chain.
boto3_session = boto3.Session(
    profile_name="cornell",
)

# Wrap the boto3 session so every SageMaker call in this notebook uses the
# same profile.
session = sagemaker.Session(
    boto_session=boto3_session,
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [15]:
from sagemaker.pytorch import PyTorch

# This job name is used as the prefix of the SageMaker training job name, which
# makes it easy to find the job in the SageMaker Training jobs console.
job_name = "clip-forge-autoencoder"
print("Job name: ", job_name)

# This is the location that SageMaker will automatically store (and load)
# checkpoints from. We can use this to automatically resume training from an
# earlier checkpoint when using Spot instances.
checkpoint_bucket = session.default_bucket()
autoresume_checkpoint_prefix = f"{job_name}/01"
autoresume_checkpoint_s3_uri = f"s3://{checkpoint_bucket}/{autoresume_checkpoint_prefix}"
print(f"Checkpoints resumable from {autoresume_checkpoint_s3_uri}")

# Managed Spot Training requires max_wait >= max_run. The runtime limit is
# spelled out here (24 hours, the same value as the SDK default) and the wait
# limit is derived from it, so the two cannot drift apart if the runtime is
# changed later.
max_run_seconds = 24 * 60 * 60
max_wait_seconds = max_run_seconds + 1

estimator = PyTorch(
    base_job_name=job_name,
    source_dir=".",
    entry_point="train_autoencoder.py",  # The entry point that launches the training script with options
    role="arn:aws:iam::870747888580:role/SageMakerTrainingRole",
    sagemaker_session=session,
    framework_version="1.13.1",  # PyTorch version to use
    py_version="py39",  # Python version to use
    instance_count=1,  # Number of instances to launch
    instance_type="ml.g5.8xlarge",  # Instance type to launch
    debugger_hook_config=False,
    # NOTE(review): values passed via `environment` are part of the training
    # job definition, so the key is likely readable by anyone who can describe
    # the job -- consider a secrets store instead; confirm before sharing access.
    environment={"WANDB_API_KEY": current_api_key},
    input_mode="FastFile",
    checkpoint_s3_uri=autoresume_checkpoint_s3_uri,  # S3 location to automatically load and store checkpoints from
    use_spot_instances=True,  # Use Managed Spot Training
    max_run=max_run_seconds,  # Maximum training runtime, in seconds
    max_wait=max_wait_seconds,  # Must be at least max_run; includes time spent waiting for Spot capacity
    hyperparameters={
        "batch_size": "128",
        "test_batch_size": "128",
        # "lr": "1.0964781961431852e-05" # Learned from the PyTorch Lightning LR Finder
        "lr": "0.001",
        # "checkpoint": "maxdumas/clip_forge/model-clip-forge-autoencoder-2023-02-26-05-03-01-430-9xm6yj-algo-1:v9"
    },
)

Job name:  clip-forge-autoencoder
Checkpoints resumable from s3://sagemaker-us-east-1-870747888580/clip-forge-autoencoder/01


In [16]:
# Launch the training job. The single "train" input channel is read from the
# S3 prefix below; with the default arguments this call streams the job's
# logs into the notebook until the job finishes or the cell is interrupted.
estimator.fit(
    inputs={"train": "s3://cornell-mfd64/text2building/shapenet"},
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: clip-forge-autoencoder-2023-02-26-16-19-14-432


2023-02-26 16:19:36 Starting - Starting the training job...
2023-02-26 16:19:52 Starting - Preparing the instances for training.........
2023-02-26 16:21:38 Downloading - Downloading input data................................................
2023-02-26 16:29:32 Interrupted - Training job interrupted..................
2023-02-26 16:32:54 Starting - Starting the training job...
2023-02-26 16:33:32 Starting - Preparing the instances for training............
2023-02-26 16:35:10 Downloading - Downloading input data............
2023-02-26 16:37:41 Training - Downloading the training image.........
2023-02-26 16:38:57 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-02-26 16:39:32,132 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-02-26 16:39:32,147 sagemaker-training-toolkit 

KeyboardInterrupt: 