In [1]:
import sagemaker
from sagemaker import get_execution_role

In [8]:
# Create the SageMaker session once and reuse it for the role lookup, so that
# get_execution_role() does not build a second default Session internally.
sess = sagemaker.Session()
role = sagemaker.get_execution_role(sagemaker_session=sess)

# Project bucket name is derived from the AWS account id of the session.
bucket = f"az-ade-{sess.account_id()}"
bucket

'az-ade-905847418383'

In [9]:
# S3 locations of the train / validation data under the bucket's
# processing_output/ prefix.
training_input_path, val_input_path = (
    f's3://{bucket}/processing_output/{split_dir}'
    for split_dir in ('train_data', 'validation_data')
)

In [30]:
from sagemaker.huggingface import HuggingFace

# Settings forwarded to the training job as hyperparameters.
hyperparameters = dict(
    epochs=2,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [31]:
# Hugging Face estimator for the training job: uses scripts/train.py as the
# entry point on a single ml.p3.2xlarge instance and points its output at the
# bucket's training_output/ prefix.
huggingface_estimator = HuggingFace(
    # training code and its hyperparameters
    entry_point='train.py',
    source_dir='./scripts',
    hyperparameters=hyperparameters,
    # framework / interpreter versions
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    # compute and permissions
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # job naming, output location, profiler switch
    base_job_name="az-ade-training",
    output_path=f's3://{bucket}/training_output/',
    disable_profiler=True,
)

In [32]:
# Kick off the training job, handing it the uploaded train and validation
# datasets as the 'train' and 'val' inputs.
huggingface_estimator.fit(
    inputs={
        'train': training_input_path,
        'val': val_input_path,
    }
)

2021-10-05 11:20:24 Starting - Starting the training job...
2021-10-05 11:20:28 Starting - Launching requested ML instances......
2021-10-05 11:21:44 Starting - Preparing the instances for training.........
2021-10-05 11:23:18 Downloading - Downloading input data...
2021-10-05 11:23:33 Training - Downloading the training image.................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-10-05 11:26:36,161 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-10-05 11:26:36,184 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-10-05 11:26:36,192 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-10-05 11:26:36,622 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    

In [34]:
# Read the name of the most recent training job from the estimator and
# display it.
latest_job = huggingface_estimator.latest_training_job
training_job_name = latest_job.name
training_job_name

'az-ade-training-2021-10-05-11-20-24-189'

In [35]:
# Persist training_job_name with IPython's %store magic so it can be restored
# later (e.g. in another notebook/kernel) with `%store -r training_job_name`.
%store training_job_name

Stored 'training_job_name' (str)
