# Huggingface Sagemaker-sdk extension example using `Trainer` class

## Installs requirements if you haven´t already done it and sets up ipywidgets for datasets in sagemaker studio

In [10]:
%%capture
!pip install -r ../requirements.txt --upgrade

In [5]:
%%capture
import os 
import IPython
if 'SAGEMAKER_TRAINING_MODULE' in os.environ:
    !conda install -c conda-forge ipywidgets -y
    IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used

## Install git-lfs

In [None]:
#!brew install git-lfs
# ! curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
# !sudo apt-get install git-lfs


## Initializing Sagemaker Session with local AWS Profile

From outside these notebooks, `get_execution_role()` will return an exception because it does not know what is the role name that SageMaker requires.

To solve this issue, pass the IAM role name instead of using `get_execution_role()`.

Therefore you have to create an IAM-Role with correct permission for sagemaker to start training jobs and download files from s3. Beware that you need s3 permission on bucket-level `"arn:aws:s3:::sagemaker-*"` and on object-level     `"arn:aws:s3:::sagemaker-*/*"`. 

You can read [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) how to create a role with right permissions.

In [1]:
# local aws profile configured in ~/.aws/credentials
local_profile_name='hf-sm' # optional if you only have default configured

# role name for sagemaker -> needs the described permissions from above
role_name = "SageMakerRole"

In [2]:
import sagemaker
import os
try:
    sess = sagemaker.Session()
    role = sagemaker.get_execution_role()
except Exception:
    import boto3
    # creates a boto3 session using the local profile we defined
    if local_profile_name:
        os.environ['AWS_PROFILE'] = local_profile_name # setting env var bc local-mode cannot use boto3 session
        #bt3 = boto3.session.Session(profile_name=local_profile_name)
        #iam = bt3.client('iam')
        # create sagemaker session with boto3 session
        #sess = sagemaker.Session(boto_session=bt3)
    iam = boto3.client('iam')
    sess = sagemaker.Session()
    # get role arn
    role = iam.get_role(RoleName=role_name)['Role']['Arn']
    


print(role)


Couldn't call 'get_role' to get Role ARN from role name philipp to get Role path.


arn:aws:iam::558105141721:role/SageMakerRole


### Sagemaker Session prints

In [3]:
print(sess.list_s3_files(sess.default_bucket(),'datasets/')) # list objects in s3 under datsets/
print(sess.default_bucket()) # s3 bucketname
print(sess.boto_region_name) # aws region of sagemaker session

['datasets/imdb/small/test/dataset.arrow', 'datasets/imdb/small/test/dataset_info.json', 'datasets/imdb/small/test/state.json', 'datasets/imdb/small/test/test_dataset.pt', 'datasets/imdb/small/train/dataset.arrow', 'datasets/imdb/small/train/dataset_info.json', 'datasets/imdb/small/train/state.json', 'datasets/imdb/small/training/train_dataset.pt', 'datasets/imdb/test/dataset.arrow', 'datasets/imdb/test/dataset_info.json', 'datasets/imdb/test/state.json', 'datasets/imdb/train/dataset.arrow', 'datasets/imdb/train/dataset_info.json', 'datasets/imdb/train/state.json']
sagemaker-eu-central-1-558105141721
eu-central-1


# Imports

Since we are using the `.py` module directly from `huggingface/` we have to adjust our `sys.path` to be able to import our estimator

In [3]:
import sys, os

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)


# Preprocessing the data

In [121]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [122]:
# load dataset
dataset = load_dataset('imdb')

# download tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

#helper tokenizer function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

# load dataset
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
test_dataset = test_dataset.shuffle().select(range(10000)) # smaller the size for test dataset to 10k 

# sample a to small dataset for training
#train_dataset = train_dataset.shuffle().select(range(2000)) # smaller the size for test dataset to 10k 
#test_dataset = test_dataset.shuffle().select(range(150)) # smaller the size for test dataset to 10k 


# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# set format for pytorch
train_dataset.rename_column_("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.rename_column_("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Reusing dataset imdb (/Users/philippschmid/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)
Reusing dataset imdb (/Users/philippschmid/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)
Loading cached shuffled indices for dataset at /Users/philippschmid/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-f7ed38da5ada7a37.arrow


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Upload data to sagemaker S3

In [123]:
import glob
def upload_data_to_s3(dataset=None,prefix='datasets',split_type='train'):
    """helper function with saves the dataset locally using dataset.save_to_disk() and upload its then to s3. """
    
    temp_prefix =f"{prefix}/{split_type}"
    # saves datasets in local directory
    dataset.save_to_disk(f"./{temp_prefix}")
    
    # loops over saved files and uploads them to s3 
    for file in glob.glob(f"./{temp_prefix}/*"):
        sess.upload_data(file, key_prefix=temp_prefix)

    # return s3 url to files for estimator.fit()
    return f"s3://{sess.default_bucket()}/{temp_prefix}"

In [124]:
prefix = 'datasets/imdb'

training_input_path  = upload_data_to_s3(dataset=train_dataset,prefix=prefix,split_type='train')
test_input_path      = upload_data_to_s3(dataset=test_dataset,prefix=prefix,split_type='test')

print(training_input_path)
print(test_input_path)


s3://sagemaker-eu-central-1-558105141721/datasets/imdb/train
s3://sagemaker-eu-central-1-558105141721/datasets/imdb/test


## Create an Estimator

The following code sample shows how you train a custom HuggingFace script `train.py`, passing in three hyperparameters (`epochs`,`train_batch_size`,`model_name`). We are not going to pass any data into sagemaker training job instead it will be downloaded in `train.py`


In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
training_input_path  = "s3://sagemaker-eu-central-1-558105141721/datasets/imdb/small/train"
test_input_path      = "s3://sagemaker-eu-central-1-558105141721/datasets/imdb/small/train"

In [10]:
from huggingface.estimator import HuggingFace


huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='../scripts',
                            sagemaker_session=sess,
                            base_job_name='huggingface-sdk-extension',
                            instance_type='ml.p3.2xlarge',
                            instance_count=1,
                            role=role,
                            framework_version={'transformers':'4.1.1','datasets':'1.1.3'},
                            py_version='py3',
                            hyperparameters = {'epochs': 1,
                                               'train_batch_size': 32,
                                               'model_name':'distilbert-base-uncased'
                                                })

In [11]:
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path},)

Starting training
2021-01-06 16:38:17 Starting - Starting the training job...
2021-01-06 16:38:41 Starting - Launching requested ML instancesProfilerReport-1609951097: InProgress
......
2021-01-06 16:39:45 Starting - Preparing the instances for training......
2021-01-06 16:40:46 Downloading - Downloading input data...
2021-01-06 16:41:03 Training - Downloading the training image....................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-01-06 16:44:29,297 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-01-06 16:44:29,320 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-01-06 16:44:32,340 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-01-06 16:44:32,764 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0

[34m{'eval_loss': 0.6557341814041138, 'eval_accuracy': 0.796, 'eval_f1': 0.7838983050847458, 'eval_precision': 0.8495981630309989, 'eval_recall': 0.727630285152409, 'epoch': 1.0}[0m
[34m{'epoch': 1.0}[0m
[34m***** Eval results *****[0m
[34m#015Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 442/442 [00:00<00:00, 392kB/s][0m
[34m#015Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]#015Downloading:   1%|▏         | 3.76M/268M [00:00<00:07, 37.6MB/s]#015Downloading:   3%|▎         | 8.43M/268M [00:00<00:06, 39.9MB/s]#015Downloading:   5%|▍         | 13.3M/268M [00:00<00:06, 42.3MB/s]#015Downloading:   7%|▋         | 18.4M/268M [00:00<00:05, 44.6MB/s]#015Downloading:   9%|▉         | 23.9M/268M [00:00<00:05, 47.2MB/s]#015Downloading:  11%|█         | 29.4M/268M [00:00<00:04, 49.3MB/s]#015Downloading:  13%|█▎        | 34.9M/268M [00:00<00:04, 51.0MB/s]#015Downloading:  15%|█▌        | 40.5M/268M [00:00<00:04, 52.2MB/s]#015Downloading:  


2021-01-06 16:45:46 Uploading - Uploading generated training model
2021-01-06 16:46:47 Completed - Training job completed
ProfilerReport-1609951097: NoIssuesFound
Training seconds: 341
Billable seconds: 341
uploading model to hub


# Upload model to manually (soon integrated)

### Logn into HF Hub

In [93]:
import os
from transformers.hf_api import HfApi
from huggingface.HfRepository import HfRepository
import getpass

In [16]:


username = input("username: ")
password = getpass.getpass("password: ")

huggingface_token = HfApi().login(username, password)

username: philschmid
password: ········


### Create Repository for model

In [17]:
repo_name="sagemaker-test-123"

repo_url = HfApi().create_repo(token=huggingface_token,name=repo_name)

### Initialize HF Repository API

In [22]:
!git --version
!git lfs --version

git version 2.24.2 (Apple Git-127)
git-lfs/2.13.1 (GitHub; darwin amd64; go 1.15.5)


In [92]:
model_dir = "./model/huggingface-sdk-extension-2021-01-06-16-38-17-142"

In [94]:
model_repo = HfRepository(repo_url=repo_url,huggingface_token=huggingface_token,model_dir=model_dir)

**create model card**

In [95]:
model_repo.create_model_card(dataset="imdb", 
                             model_id=huggingface_estimator.latest_training_job.name,
                             hyperparameters=huggingface_estimator.hyperparameters(),
                             eval_results={"nothing":'logged'})

In [96]:
!cat {model_dir}/README.md


---
tags:
- sagemaker
datasets:
- imdb
---
## huggingface-sdk-extension-2021-01-06-16-38-17-142 Trained from SageMaker HuggingFace extension.

#### Hyperparameters
```json
{'epochs': '1', 'train_batch_size': '32', 'model_name': '"distilbert-base-uncased"', 'sagemaker_submit_directory': '"s3://sagemaker-eu-central-1-558105141721/huggingface-sdk-extension-2021-01-06-16-38-17-142/source/sourcedir.tar.gz"', 'sagemaker_program': '"train.py"', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"huggingface-sdk-extension-2021-01-06-16-38-17-142"', 'sagemaker_region': '"eu-central-1"'}
```

#### Eval
| key | value |
| --- | ----- |
| nothing | logged |


In [97]:
model_repo.commit_files_and_push_to_hub()