# Huggingface Sagemaker-sdk extension example using `Trainer` class

## Install the requirements (if you haven't already) and set up ipywidgets for datasets in SageMaker Studio

In [1]:
%%capture
!pip install -r ../requirements.txt --upgrade

In [2]:
%%capture
# Installs ipywidgets through conda when the SAGEMAKER_TRAINING_MODULE environment
# variable is present, then restarts the kernel so the new package is picked up.
# NOTE(review): presumably this variable identifies a SageMaker Studio kernel
# (see the section heading) — confirm.
import os 
import IPython
if 'SAGEMAKER_TRAINING_MODULE' in os.environ:
    !conda install -c conda-forge ipywidgets -y
    IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used

## Initializing Sagemaker Session with local AWS Profile

Outside of SageMaker notebooks, `get_execution_role()` will raise an exception because it cannot determine which role SageMaker requires.

To solve this issue, pass the IAM role name instead of using `get_execution_role()`.

Therefore, you have to create an IAM role with the correct permissions for SageMaker to start training jobs and download files from S3. Beware that you need S3 permissions at bucket level (`"arn:aws:s3:::sagemaker-*"`) and at object level (`"arn:aws:s3:::sagemaker-*/*"`).

You can read [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) how to create a role with the right permissions.

In [3]:
# Name of the local AWS profile (configured in ~/.aws/credentials) to use when the
# notebook runs outside SageMaker. Optional if only the default profile exists.
local_profile_name = 'default'

# Name of the IAM role SageMaker should use; it needs the permissions described above.
role_name = "AmazonSageMaker-ExecutionRole-20201222T210251"

In [4]:
import os

import sagemaker

try:
    # First attempt: default session plus the execution role SageMaker resolves itself.
    sess = sagemaker.Session()
    role = sagemaker.get_execution_role()
except Exception:
    # Fallback for when the attempt above fails (e.g. running outside SageMaker,
    # see the text above): resolve the role ARN from `role_name` through IAM.
    import boto3

    if local_profile_name:
        # The profile is selected through the environment variable (not a boto3
        # session object) because local mode cannot use a boto3 session.
        os.environ['AWS_PROFILE'] = local_profile_name

    iam = boto3.client('iam')
    sess = sagemaker.Session()
    # Look up the full role ARN from the configured role name.
    role = iam.get_role(RoleName=role_name)['Role']['Arn']

print(role)


Couldn't call 'get_role' to get Role ARN from role name lagunas to get Role path.


arn:aws:iam::854676674973:role/service-role/AmazonSageMaker-ExecutionRole-20201222T210251


### Sagemaker Session prints

In [5]:
# Quick look at the session: objects stored under datasets/, the default bucket and the region.
print(sess.list_s3_files(sess.default_bucket(),'datasets/')) # list objects in s3 under datasets/
print(sess.default_bucket()) # s3 bucketname
print(sess.boto_region_name) # aws region of sagemaker session

['datasets/imdb/test/dataset.arrow', 'datasets/imdb/test/dataset_info.json', 'datasets/imdb/test/state.json', 'datasets/imdb/train/dataset.arrow', 'datasets/imdb/train/dataset_info.json', 'datasets/imdb/train/state.json']
sagemaker-eu-west-1-854676674973
eu-west-1


# Imports

Since we are using the `.py` module directly from `huggingface/` we have to adjust our `sys.path` to be able to import our estimator

In [6]:
import os
import sys

# Absolute path of the directory two levels above the working directory; it is put on
# sys.path (only once) so that the local `huggingface` module can be imported.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)


# Preprocessing the data

## Upload data to sagemaker S3

## Create a local estimator for testing

You run PyTorch training scripts on SageMaker by creating PyTorch Estimators. SageMaker training of your script is invoked when you call `fit` on a PyTorch Estimator. The following code sample shows how you train a custom PyTorch script `train.py`, passing in hyperparameters (such as `epochs`). We are not going to pass any data into the SageMaker training job; instead, it will be downloaded in `train.py`.

In SageMaker you can test your training in "local mode" by setting your `instance_type` to `'local'`.


## Importing custom sdk-extension for HuggingFace

In [7]:
from huggingface.estimator import HuggingFace

## Create a local Estimator

The following code sample shows how you train the custom HuggingFace script `nn_pruning_train.py`, passing in two hyperparameters (`num-train-epochs`, `per-device-train-batch-size`). We are not going to pass any data into the SageMaker training job; instead, it will be downloaded inside the training script.

In [22]:

# Switch between SageMaker local mode and a managed training instance.
local = False
if local:
    instance_type = "local-gpu"
    # NOTE: this overwrites the notebook-wide `sess` with None; the estimator
    # created further down receives `sess` as its `sagemaker_session`.
    sess = None
    batch_size = 1
else:
    instance_type = "ml.p3.2xlarge"
    # Alias of the session; not read by any other visible cell, kept as-is.
    sagemaker_session = sess
    batch_size = 16

# Example of a metrics line printed by the training job (this is the format the
# metric regexes are matched against):
# {'loss': 12.476279296875, 'learning_rate': 0.0004629629629629629, 'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss': 5.739205135345459, 'distil_loss': 13.224848453521728, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.2500004979968071, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.24996592348814012, 'regu_loss': 0.49996642220020293, 'nnz_perc': 1.0, 'epoch': 0.04518344478583047}
    
def build_metric_definitions():
    """Build the metric definitions used to parse the training job's log output.

    The training job prints its metrics as Python dict reprs, e.g.
    ``{'loss': 12.47, ..., 'epoch': 0.045}``. One definition is produced per
    metric. Each definition is a dict with a ``Name`` (``"train:<metric>"`` or
    ``"validation:<metric>"``) and a ``Regex`` whose single capture group is
    the metric's numeric value. Validation metrics are looked up with an
    ``eval_`` prefix on the logged key.

    Returns:
        list[dict]: definitions for all training metrics followed by the
        validation metrics.
    """
    train_metrics = [
        'loss',
        'learning_rate',
        'threshold',
        'ampere_temperature',
        'regu_lambda',
        'ce_loss',
        'distil_loss',
        'nnz_perc_attention',
        'regu_loss_attention',
        'nnz_perc_dense',
        'regu_loss_dense',
        'regu_loss',
        'nnz_perc',
        'epoch',
    ]
    # NOTE(review): the job log visible in this notebook contains 'eval_exact_match'
    # and 'eval_f1' but no 'eval_precision' — confirm that "precision" is intended.
    eval_metrics = ["f1", "precision"]

    # Capture the number itself (optional sign, then digits / '.' / exponent) instead
    # of "anything up to the next comma": the last key of each logged dict ('epoch')
    # is followed by '}' rather than ',', so a comma-terminated pattern never matches it.
    number = r"([-+]?[0-9][0-9.eE+-]*)"

    metric_types = {"train": ("", train_metrics), "validation": ("eval_", eval_metrics)}
    return [
        {'Name': f"{k}:{m}", 'Regex': f"'{prefix}{m}': {number}"}
        for k, (prefix, metrics) in metric_types.items()
        for m in metrics
    ]
        

    
# Metric definitions that are handed to the estimator below.
metric_definitions = build_metric_definitions()

from nn_pruning.examples.question_answering.qa_sparse_xp import SparseQAShortNamer

# Hyperparameters for the training job; the batch size follows the
# local / remote choice made above.
hyperparameters = {
    "num-train-epochs": 0.1,
    "per-device-train-batch-size": batch_size,
}

def get_hp_name(hyper_parameters):
    """Return the short name that SparseQAShortNamer derives from the hyperparameters.

    Dashes in the hyperparameter keys are replaced by underscores before the
    mapping is passed to ``SparseQAShortNamer.shortname``.
    """
    normalized = {}
    for key, value in hyper_parameters.items():
        normalized[key.replace("-", "_")] = value

    return SparseQAShortNamer().shortname(normalized)
    
# Job name prefix: "nn-pruning-" followed by the hyperparameter short name with its
# first three characters dropped and every "." replaced by "-".
base_job_name = f"nn-pruning-{get_hp_name(hyperparameters)[3:].replace('.', '-')}"
print(base_job_name)

# Estimator for the nn_pruning training script, using the instance type, session,
# metric definitions and hyperparameters prepared above.
huggingface_estimator = HuggingFace(
    entry_point='nn_pruning_train.py',
    source_dir='../scripts',
    sagemaker_session=sess,
    base_job_name=base_job_name,
    instance_type=instance_type,
    instance_count=1,
    role=role,
    framework_version={'transformers': '4.1.1', 'datasets': '1.1.3'},
    py_version='py3',
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
)

nn-pruning-nte0-1
IMAGE_URI 854676674973.dkr.ecr.eu-west-1.amazonaws.com/huggingface-nn-pruning-training:0.0.1-gpu-transformers4.1.1-datasets1.1.3-cu110


In [23]:
# Show the container image URI the estimator resolved for the training job.
huggingface_estimator.image_uri

'854676674973.dkr.ecr.eu-west-1.amazonaws.com/huggingface-nn-pruning-training:0.0.1-gpu-transformers4.1.1-datasets1.1.3-cu110'

In [24]:
# Start the training job; no input channels are passed to fit().
huggingface_estimator.fit()

2021-01-14 16:45:13,020 - sagemaker.image_uris - INFO - Defaulting to the only supported framework/algorithm version: latest.
2021-01-14 16:45:13,024 - sagemaker.image_uris - INFO - Ignoring unnecessary instance type: None.
2021-01-14 16:45:13,181 - sagemaker - INFO - Creating training-job with name: nn-pruning-nte0-1-2021-01-14-15-45-12-691
2021-01-14 15:45:13 Starting - Starting the training job...
2021-01-14 15:45:36 Starting - Launching requested ML instancesProfilerReport-1610639113: InProgress
......
2021-01-14 15:46:38 Starting - Preparing the instances for training......
2021-01-14 15:47:38 Downloading - Downloading input data
2021-01-14 15:47:38 Training - Downloading the training image.....................
2021-01-14 15:51:20 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-01-14 15:51:14,885 sagemaker-training-toolkit I

[34m{'loss': 12.78487548828125, 'learning_rate': 0.00018518518518518518, 'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss': 5.920670676231384, 'distil_loss': 13.547569284439087, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.24999995067715644, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.2499970443546772, 'regu_loss': 0.49999699532985686, 'nnz_perc': 1.0, 'epoch': 0.018073377914332188}[0m
[34m{'loss': 12.50663330078125, 'learning_rate': 0.00037037037037037035, 'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss': 5.75607280254364, 'distil_loss': 13.256698904037476, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.25000024259090425, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.2499660810828209, 'regu_loss': 0.4999663245677948, 'nnz_perc': 1.0, 'epoch': 0.036146755828664376}[0m






[34m{'eval_exact_match': 1.315042573320719, 'eval_f1': 9.593677739438615, 'eval_threshold': 0.1, 'eval_ampere_temperature': 20.0, 'eval_regu_lambda': 10.0, 'ce_loss': 5.342538719177246, 'distil_loss': 12.515705890655518, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.2500021034479141, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.24990336656570433, 'regu_loss': 0.4999054712057114, 'nnz_perc': 1.0, 'epoch': 0.04518344478583047}[0m
[34m{'loss': 11.19134765625, 'learning_rate': 0.0005555555555555556, 'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss': 4.896269073486328, 'distil_loss': 11.216312217712403, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.2500062555074692, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.24983613640069963, 'regu_loss': 0.4998423945903778, 'nnz_perc': 1.0, 'epoch': 0.054220133742996564}[0m
[34m{'loss': 9.512986450195312, 'learning_rate': 0.0007407407407407407, 'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss'





[34m{'eval_exact_match': 14.3519394512772, 'eval_f1': 26.77266717929784, 'eval_threshold': 0.1, 'eval_ampere_temperature': 20.0, 'eval_regu_lambda': 10.0, 'epoch': 0.09036688957166095}[0m
[34m{'threshold': 0, 'ampere_temperature': 0.0, 'regu_lambda': 0.0, 'ce_loss': 3.323194795184665, 'distil_loss': 7.077506692321212, 'nnz_perc_attention': 1.0, 'regu_loss_attention': 0.25007145051602964, 'nnz_perc_dense': 1.0, 'regu_loss_dense': 0.24922610405418608, 'regu_loss': 0.49929755705374257, 'nnz_perc': 1.0, 'epoch': 0.10012651364540033}[0m






[34m01/14/2021 15:51:25 - INFO - __main__ -   Training/evaluation parameters {'model_name_or_path': 'bert-base-uncased', 'dataset_name': 'squad', 'do_train': 1, 'do_eval': 1, 'per_device_train_batch_size': 16, 'max_seq_length': 384, 'doc_stride': 128, 'num_train_epochs': 0.1, 'logging_steps': 100, 'save_steps': 5000, 'eval_steps': 250, 'save_total_limit': 5, 'seed': 17, 'evaluation_strategy': 'steps', 'learning_rate': 3e-05, 'mask_scores_learning_rate': 0.01, 'output_dir': '/opt/ml/model', 'logging_dir': '/opt/ml/model', 'overwrite_cache': 0, 'overwrite_output_dir': 1, 'warmup_steps': 5400, 'initial_warmup': 1, 'final_warmup': 10, 'initial_threshold': 0, 'final_threshold': 0.1, 'dense_pruning_method': 'sigmoied_threshold:1d_alt', 'dense_block_rows': 1, 'dense_block_cols': 1, 'dense_lambda': 1.0, 'attention_pruning_method': 'sigmoied_threshold', 'attention_block_rows': 1, 'attention_block_cols': 1, 'attention_lambda': 1.0, 'ampere_pruning_method': 'disabled', 'mask_init': 'constant', '


2021-01-14 16:09:56 Uploading - Uploading generated training model
2021-01-14 16:11:17 Completed - Training job completed
ProfilerReport-1610639113: IssuesFound
Training seconds: 1418
Billable seconds: 1418


## Create an Estimator

The following code sample shows how you train a custom HuggingFace script `train.py`, passing in three hyperparameters (`epochs`, `train_batch_size`, `model_name`). The training and test data are passed to the SageMaker training job as the `train` and `test` input channels in the `fit` call below.


In [None]:
from huggingface.estimator import HuggingFace

# Estimator for the generic `train.py` script on a managed GPU instance.
# Note that this rebinds `huggingface_estimator` from the previous section.
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='../scripts',
    sagemaker_session=sess,
    base_job_name='huggingface-sdk-extension',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version={'transformers': '4.1.1', 'datasets': '1.1.3'},
    py_version='py3',
    hyperparameters={
        'epochs': 1,
        'train_batch_size': 32,
        'model_name': 'distilbert-base-uncased',
    },
)

In [None]:
# The input paths were not defined anywhere in this notebook, so a fresh
# "Restart & Run All" failed here with a NameError. They are built from the
# prefixes shown by the S3 listing printed earlier (datasets/imdb/train and
# datasets/imdb/test in the session's default bucket).
# NOTE(review): adjust the prefixes if your datasets are stored elsewhere.
training_input_path = f"s3://{sess.default_bucket()}/datasets/imdb/train"
test_input_path = f"s3://{sess.default_bucket()}/datasets/imdb/test"

# Start the training job with the two datasets as the 'train' and 'test' channels.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

# Estimator Parameters

### Get S3 url for model data

In [None]:
# S3 location of the model data produced by the training job.
huggingface_estimator.model_data

### Get latest training job name

In [None]:
# Name of the most recent training job started from this estimator.
huggingface_estimator.latest_training_job.name

### Attach to old estimator 

e.g. to get model data

In [None]:
# Name of a previously run training job to attach to; replace with one of your own jobs.
old_job_name = 'huggingface-sdk-extension-2020-12-27-15-25-50-506'

In [None]:
from sagemaker.estimator import Estimator

In [None]:
# Attach an estimator object to the existing training job named in old_job_name.
huggingface_estimator_loaded = Estimator.attach(old_job_name)

In [None]:
# Model data location of the attached (old) training job.
huggingface_estimator_loaded.model_data

### Download model from s3

**using huggingface utils**

In [None]:
from huggingface.utils import download_model

# Download the attached job's model data with the helper from huggingface.utils,
# unzipping it and using the training job's name as the model directory.
download_model(
    model_data=huggingface_estimator_loaded.model_data,
    unzip=True,
    model_dir=huggingface_estimator_loaded.latest_training_job.name,
)

**using class built-in method**

In [None]:
# Same download through the estimator's own method, this time without unzipping.
huggingface_estimator.download_model(unzip=False)

### Access logs

until [PR](https://github.com/aws/sagemaker-python-sdk/pull/2059) is merged

In [None]:
# Fetch the logs of the latest training job through the estimator's SageMaker session.
huggingface_estimator.sagemaker_session.logs_for_job(huggingface_estimator.latest_training_job.name, wait=True)

**after merged PR**

In [None]:
# Shortcut for the call above; only available once the linked PR is part of the installed SDK.
huggingface_estimator.logs()