In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import dotenv
import sagemaker
import boto3
import os
from datasets import load_dataset, load_from_disk

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/nasoungadoy/Library/Application Support/sagemaker/config.yaml


In [4]:
# Load variables from a local .env file into the process environment.
# Later cells read SAGEMAKER_ROLE_NAME and WANDB_TOKEN from os.environ.
dotenv.load_dotenv()

True

In [5]:
# Configure the default boto3 session used by the boto3 clients created in
# later cells. The profile name is taken from AWS_PROFILE when that variable
# is set (in the shell or via the .env file loaded above) and falls back to
# the original "ai-sandbox-sso" profile otherwise, so the notebook is not
# tied to one developer's local AWS configuration.
boto3.setup_default_session(
    profile_name=os.environ.get("AWS_PROFILE", "ai-sandbox-sso")
)

In [6]:
# Call STS once and discard the response.
# NOTE(review): presumably a fail-fast check that the selected profile has
# valid (non-expired) credentials before any SageMaker call — confirm.
_ = boto3.client("sts").get_caller_identity()

In [7]:
from typing import Literal

# ---------------------------------------------------------------------------
# USER SETTING: choose the T5 model family before running the notebook.
# T5_VARIANT is interpolated into the local tokenized-dataset path, the S3
# upload prefixes and the Hugging Face model id in later cells.
# ---------------------------------------------------------------------------
T5_VARIANT: Literal["mt5", "byt5"] = "byt5"

In [8]:
# Name (not ARN) of the IAM role handed to SageMaker; a later cell resolves
# it to an ARN with iam.get_role. Raises KeyError if the variable is unset.
role_name = os.environ["SAGEMAKER_ROLE_NAME"]

In [9]:
# Bucket SageMaker uses for uploaded data, models and logs.
# Leave as None to use the session's default bucket (SageMaker creates it
# automatically if it does not exist), or set an explicit bucket name.
sagemaker_session_bucket = None

# Build the session exactly once. Passing default_bucket=None is the same as
# passing nothing, so this covers both the "default" and "explicit" cases
# without constructing a throw-away session first.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
# Keep the variable populated with the bucket actually in use.
sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN by name through IAM.
# NOTE(review): this replaces sagemaker.get_execution_role(), presumably
# because the notebook runs outside a SageMaker-hosted environment — confirm.
iam = boto3.client("iam")
role = iam.get_role(RoleName=role_name)["Role"]["Arn"]

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/nasoungadoy/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/nasoungadoy/Library/Application Support/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::542301657622:role/service-role/AmazonSageMaker-ExecutionRole-20231002T234337
sagemaker bucket: sagemaker-us-east-1-542301657622
sagemaker session region: us-east-1


In [10]:
# Local directory holding the pre-tokenized dataset for the selected variant
# (path is relative to the notebook's working directory).
TOKENIZED_DATASET_PATH = f"../../datasets/mbay-translations-tokenized/{T5_VARIANT}"
# Load the dataset previously written with save_to_disk.
dataset = load_from_disk(TOKENIZED_DATASET_PATH)

In [11]:
# S3 destinations inside the session's default bucket:
#   - the full tokenized dataset (later passed to estimator.fit as training input)
#   - a small sample of it, stored under a separate "mbay-samples" prefix
input_dataset_s3_path = f"s3://{sess.default_bucket()}/processed/mbay/{T5_VARIANT}/"
sample_dataset_s3_path = f"s3://{sess.default_bucket()}/processed/mbay-samples/{T5_VARIANT}/"

In [12]:
from datasets import load_dataset

# Number of examples kept (per split) in the small sample dataset, and the
# seed that makes the sample reproducible across kernel restarts.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42

print("uploaded data to:")
# `dataset` was loaded from disk in an earlier cell; save_to_disk is given an
# s3:// URI here, so the data is written straight to the bucket.
dataset.save_to_disk(input_dataset_s3_path)
print(f"training dataset to: {input_dataset_s3_path}")

# Shuffle with a fixed seed, then keep the first SAMPLE_SIZE rows.
# An index-based filter is used (rather than select) so the same call works
# whether `dataset` is a single Dataset or a DatasetDict of several splits.
sampled_dataset = dataset.shuffle(seed=SAMPLE_SEED).filter(
    lambda _example, idx: idx < SAMPLE_SIZE, with_indices=True
)
sampled_dataset.save_to_disk(sample_dataset_s3_path)
print(f"sample dataset to: {sample_dataset_s3_path}")

uploaded data to:


Saving the dataset (0/1 shards):   0%|          | 0/34320 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4292 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4292 [00:00<?, ? examples/s]

training dataset to: s3://sagemaker-us-east-1-542301657622/processed/mbay/byt5/


Filter:   0%|          | 0/34320 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4292 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4292 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

sample dataset to: s3://sagemaker-us-east-1-542301657622/processed/mbay-samples/byt5/


In [13]:
import time
from mbay_nmt.training.core import training_job_name
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# Spot training is off; the estimator cell only sets max_wait when this is True.
use_spot_instances = False
# Hugging Face model id: the "large" checkpoint of the selected T5 variant.
model_id = f"google/{T5_VARIANT}-large"

# Derive the training job name from the model id using the project helper.
# NOTE(review): the displayed value ends in a short random-looking suffix, so
# the helper presumably returns a different name on every call — confirm.
job_name = training_job_name(model_id)
job_name

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'mbay-nmt-google-byt5-large-fb4rr'

In [14]:
# Weights & Biases token, forwarded to the training job in the hyperparameters
# dict below. Raises KeyError if the variable is unset.
WANDB_TOKEN = os.environ["WANDB_TOKEN"]

In [17]:
DAY = 24 * 60 * 60  # one day, in seconds (unit used by max_run / max_wait)

# S3 prefix for training checkpoints, namespaced by the job name computed above.
checkpoint_s3_uri = f"s3://{sess.default_bucket()}/{job_name}/checkpoints"


# Hyperparameters passed to the training job's entry point (run.py).
# NOTE(review): hf_token and wandb_token are secrets sent as plain
# hyperparameters; hyperparameters are stored with the training job
# description, so anyone who can describe the job can read them. Consider a
# different channel (e.g. a secrets manager) — requires changing run.py.
hyperparameters = {
    "model_id": model_id,  # pre-trained model to fine-tune
    "dataset_path": "/opt/ml/input/data/training",  # container path where SageMaker places the "training" input
    "epochs": 10,  # number of training epochs
    "per_device_train_batch_size": 32,  # batch size for training
    "lr": 1e-3,  # learning rate used during training
    "hf_token": HfFolder.get_token(),  # Hugging Face Hub token from the local login (may be None if not logged in — TODO confirm run.py handles that)
    "wandb_token": WANDB_TOKEN,  # Weights & Biases token read from the environment
    "merge_weights": True,  # whether to merge LoRA weights into the model (needs more memory). NOTE(review): the earlier comment said g5.2xlarge lacks the memory for this, yet the flag is True — confirm intent
    "output_dir": "/opt/ml/checkpoints",  # presumably the container directory synced to checkpoint_s3_uri — confirm
}


# Create the estimator that describes the training job.
huggingface_estimator = HuggingFace(
    entry_point="run.py",  # training script
    source_dir="../../projects/run_fine_tune_t5",  # directory containing all files needed for training
    instance_type="ml.g5.2xlarge",  # instance type used for the training job
    instance_count=1,  # number of instances used for training
    base_job_name=job_name,  # prefix of the training job name (a timestamp is appended on submit)
    role=role,  # IAM role used by the training job to access AWS resources, e.g. S3
    volume_size=200,  # size of the EBS volume in GB
    max_wait=10 * DAY if use_spot_instances else None,  # only set when spot training is enabled
    max_run=5 * DAY,  # maximum training run time, in seconds
    transformers_version="4.28",  # transformers version used in the training job
    pytorch_version="2.0",  # PyTorch version used in the training job
    py_version="py310",  # Python version used in the training job
    hyperparameters=hyperparameters,  # hyperparameters passed to the training job
    use_spot_instances=use_spot_instances,  # whether to use spot instances
    checkpoint_s3_uri=checkpoint_s3_uri,  # S3 location for checkpoints
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache"
    },  # set env variable so models are cached under /tmp
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/nasoungadoy/Library/Application Support/sagemaker/config.yaml


In [18]:
# Start the training job with the uploaded dataset as its input.
# NOTE(review): the "dataset_path" hyperparameter points at
# /opt/ml/input/data/training, so this relies on a plain S3 URI being exposed
# to the container as the "training" channel — confirm against the SDK docs.

# wait=False submits the job and returns without blocking until it completes.
huggingface_estimator.fit(input_dataset_s3_path, wait=False)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: mbay-nmt-google-byt5-large-fb4rr-2023-11-17-20-02-51-301
