# HuggingFace Hub meets Amazon SageMaker

Fine-tune a multi-class text classification model with the Hugging Face `Trainer` on the `emotion` dataset and push the resulting model to the Hugging Face Hub.

From: https://github.com/huggingface/notebooks/blob/master/sagemaker/14_train_and_push_to_hub/sagemaker-notebook.ipynb

In [1]:
!pip install "sagemaker>=2.69.0" "transformers==4.12.3" --upgrade
# using older dataset due to incompatibility of sagemaker notebook & aws-cli with > s3fs and fsspec to >= 2021.10
!pip install  "datasets==1.13" --upgrade

Collecting huggingface-hub<1.0,>=0.1.0
  Using cached huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.0.19
    Uninstalling huggingface-hub-0.0.19:
      Successfully uninstalled huggingface-hub-0.0.19
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 1.13.0 requires huggingface-hub<0.1.0,>=0.0.18, but you have huggingface-hub 0.1.2 which is incompatible.[0m
Successfully installed huggingface-hub-0.1.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


Collecting huggingface-hub<0.1.0,>=0.0.18
  Using cached huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
Installing collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.1.2
    Uninstalling huggingface-hub-0.1.2:
      Successfully uninstalled huggingface-hub-0.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.12.3 requires huggingface-hub<1.0,>=0.1.0, but you have huggingface-hub 0.0.19 which is incompatible.[0m
Successfully installed huggingface-hub-0.0.19
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import re

import sagemaker


def _version_tuple(version):
    """Return the first three numeric components of ``version`` as a tuple of ints.

    Missing components are padded with zeros, so "2.69" becomes (2, 69, 0).
    """
    numbers = [int(part) for part in re.findall(r"\d+", version)]
    return tuple((numbers + [0, 0, 0])[:3])


# Compare numerically: as plain strings, "2.100.0" >= "2.69.0" is False.
assert _version_tuple(sagemaker.__version__) >= (2, 69, 0), (
    f"sagemaker>=2.69.0 is required, found {sagemaker.__version__}"
)

In [3]:
import sagemaker

# Optional bucket override for uploaded data, models and logs.
# Left as None, the session's default bucket is used instead
# (sagemaker will automatically create this bucket if it does not exist).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None and sess is not None:
    # no bucket name given -> fall back to the default bucket
    sagemaker_session_bucket = sess.default_bucket()

# execution role, plus a fresh session bound to the chosen bucket
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# summary of the resolved settings
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::416442423885:role/TeamRole
sagemaker bucket: sagemaker-eu-west-1-416442423885
sagemaker session region: eu-west-1


In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

# --- preprocessing configuration ---
tokenizer_name = "distilbert-base-uncased"  # tokenizer used in preprocessing
dataset_name = "emotion"                    # dataset used
s3_prefix = "samples/datasets/emotion"      # s3 key prefix for the data

In [5]:
# fetch the tokenizer named in the configuration
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the 'text' entries of a batch, padding to max length and truncating."""
    return tokenizer(batch['text'], padding='max_length', truncation=True)


# load the train and test splits
raw_train, raw_test = load_dataset(dataset_name, split=['train', 'test'])

# tokenize each split, then rename the target column to "labels"
train_dataset = raw_train.map(tokenize, batched=True)
test_dataset = raw_test.map(tokenize, batched=True)
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

# expose the listed columns in torch format
torch_columns = ['input_ids', 'attention_mask', 'labels']
for prepared_split in (train_dataset, test_dataset):
    prepared_split.set_format('torch', columns=torch_columns)

Using custom data configuration default
Reusing dataset emotion (/home/ec2-user/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# Uploading data to sagemaker_session_bucket

In [6]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# both splits live under the same bucket/prefix
s3_dataset_root = f"s3://{sess.default_bucket()}/{s3_prefix}"
training_input_path = f"{s3_dataset_root}/train"
test_input_path = f"{s3_dataset_root}/test"

# save train_dataset first, then test_dataset, to their s3 locations
for dataset_split, s3_uri in (
    (train_dataset, training_input_path),
    (test_dataset, test_input_path),
):
    dataset_split.save_to_disk(s3_uri, fs=s3)

In [7]:
from huggingface_hub import notebook_login

In [8]:
# Authenticate against the Hugging Face Hub from inside the notebook.
# NOTE(review): the recorded output reports the token being saved locally; presumably this is
# the token HfFolder.get_token() returns when the hyperparameters are built - confirm.
notebook_login()

Login successful
Your token has been saved to /home/ec2-user/.huggingface/token
[1m[31mAuthenticated through git-crendential store but this isn't the helper defined on your machine.
You will have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal to set it as the default

git config --global credential.helper store[0m


In [9]:
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder
import time

# Fail fast when no Hub token is stored locally: without it the job would only
# fail after the training instance has been started, when pushing to the Hub.
hub_token = HfFolder.get_token()
if not hub_token:
    raise RuntimeError(
        "No Hugging Face Hub token found. Run notebook_login() before creating the estimator."
    )

# hyperparameters, which are passed into the training job
# NOTE(review): the token travels as a plain hyperparameter - presumably it is visible in the
# training job description; confirm this is acceptable for your account.
hyperparameters={'epochs': 1,                                    # number of training epochs
                 'train_batch_size': 32,                         # batch size for training
                 'eval_batch_size': 64,                          # batch size for evaluation
                 'learning_rate': 3e-5,                          # learning rate used during training
                 'model_id':'distilbert-base-uncased',           # pre-trained model
                 'fp16': True,                                   # Whether to use 16-bit (mixed) precision training
                 'push_to_hub': True,                            # Defines if we want to push the model to the hub
                 'hub_model_id': 'sagemaker-distilbert-emotion', # The model id of the model to push to the hub
                 'hub_strategy': 'every_save',                   # The strategy to use when pushing the model to the hub
                 'hub_token': hub_token                          # HuggingFace token to have permission to push
                }

# define Training Job Name
job_name = f'push-to-hub-sample-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',        # fine-tuning script used in training job
    source_dir           = './scripts',       # directory where fine-tuning script is stored
    instance_type        = 'ml.p3.2xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # base name for the training job
    role                 = role,              # IAM role used in training job to access AWS resources, e.g. S3
    transformers_version = '4.12',            # the transformers version used in the training job
    pytorch_version      = '1.9',             # the pytorch version used in the training job
    py_version           = 'py38',            # the python version used in the training job
    hyperparameters      = hyperparameters,   # the hyperparameters used for running the training job
)

In [10]:
# map each input channel name to the s3 uri uploaded earlier
data = dict(train=training_input_path, test=test_input_path)

# launch the training job on the uploaded datasets;
# wait=False so that the HF token is not exposed
huggingface_estimator.fit(data, wait=False)

In [11]:
# block until the training job has completed or stopped
sagemaker_client = huggingface_estimator.sagemaker_session.sagemaker_client
latest_job_name = huggingface_estimator.latest_training_job.name

waiter = sagemaker_client.get_waiter('training_job_completed_or_stopped')
waiter.wait(TrainingJobName=latest_job_name)

# Accessing the model on hf.co/models


In [14]:
from huggingface_hub import HfApi

# look up the account name of the logged-in Hub user
hub_api = HfApi()
whoami = hub_api.whoami()
username = whoami['name']

# model page url: <account name>/<hub_model_id>
hub_model_url = f"https://huggingface.co/{username}/{hyperparameters['hub_model_id']}"
print(hub_model_url)

https://huggingface.co/marcelcastrobr/sagemaker-distilbert-emotion


## Deploying the model from Hugging Face to a SageMaker Endpoint


In [15]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

role = sagemaker.get_execution_role()

# Hub Model configuration. https://huggingface.co/models
hub_repo_id = f"{username}/{hyperparameters['hub_model_id']}"
hub = dict(HF_MODEL_ID=hub_repo_id, HF_TASK='text-classification')

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    env=hub,
    role=role,
    transformers_version='4.12',
    pytorch_version='1.9',
    py_version='py38',
)

# deploy model to SageMaker Inference
endpoint_settings = {
    'initial_instance_count': 1,      # number of instances
    'instance_type': 'ml.m5.xlarge',  # ec2 instance type
}
predictor = huggingface_model.deploy(**endpoint_settings)


-----!

In [16]:
# one example sentence wrapped in the request payload
sentiment_input = dict(inputs="Winter is coming and it will be dark soon.")

predictor.predict(sentiment_input)

[{'label': 'sadness', 'score': 0.3471243381500244}]

In [17]:
# Clean up: delete the endpoint that deploy() created for the predictions above
predictor.delete_endpoint()