In [None]:
import dotenv
import sagemaker
import boto3
import os

In [None]:
# Load variables from a local .env file into the process environment so that
# later cells can read them through os.getenv (e.g. SAGEMAKER_ROLE_NAME).
dotenv.load_dotenv()

In [None]:
# Sanity check: display the identity that STS reports for the active AWS
# credentials before any resources are touched.
# NOTE(review): the displayed output may include account details -- clear the
# cell output before sharing or committing the notebook.
sts_client = boto3.client("sts")
sts_client.get_caller_identity()

In [None]:
# Name of the IAM role to use for SageMaker, taken from the environment
# (None when SAGEMAKER_ROLE_NAME is not set).
role_name = os.environ.get("SAGEMAKER_ROLE_NAME")

In [None]:
# Fail fast with an actionable message: without a role name the IAM lookup
# below cannot succeed, and the failure would otherwise only surface after the
# default bucket has already been resolved.
if not role_name:
    raise ValueError(
        "SAGEMAKER_ROLE_NAME is not set; define it in the environment or in the "
        ".env file before running this cell."
    )

# Bootstrap session, used only to resolve the default bucket name.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. Leave as None to fall back
# to the session's default bucket (SageMaker creates it if it does not exist).
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the role ARN by looking the role up by name through IAM.
# NOTE(review): an earlier variant tried sagemaker.get_execution_role() first
# and only fell back to this lookup on ValueError; the lookup by name is now
# unconditional.
iam = boto3.client("iam")
role = iam.get_role(RoleName=role_name)["Role"]["Arn"]

# Session used by the rest of the notebook, pinned to the resolved bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# S3 URI of the training data inside the session's default bucket. Nothing is
# uploaded here; this cell only builds the URI.
# NOTE(review): presumably the processed dataset was uploaded to this prefix by
# an earlier preprocessing step -- confirm it exists before launching training.
bucket_name = sess.default_bucket()
training_input_path = f"s3://{bucket_name}/processed/mistral/dolly/train"

In [None]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# NOTE(review): this flag is currently not forwarded to the estimator (the
# `use_spot_instances` argument below is commented out), so it has no effect.
use_spot_instances = True

# Pre-trained model to fine-tune. It was previously referenced without ever
# being defined in this notebook (NameError on a fresh kernel), so it is now
# read from the environment / .env file like the role name.
# NOTE(review): the job name and script directory say "llama2" while the
# training data prefix says "mistral" -- confirm which model is intended.
model_id = os.getenv("MODEL_ID")
if not model_id:
    raise ValueError(
        "MODEL_ID is not set; define the Hugging Face model id to fine-tune in "
        "the environment or in the .env file before running this cell."
    )

# define Training Job Name (timestamped with the local time)
job_name = (
    f'llama2-huggingface-qlora-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'
)

# hyperparameters, which are passed into the training job
hyperparameters = {
    "model_id": model_id,  # pre-trained model
    "dataset_path": "/opt/ml/input/data/training",  # path where sagemaker will save training dataset
    "epochs": 3,  # number of training epochs
    "per_device_train_batch_size": 2,  # batch size for training
    "lr": 2e-4,  # learning rate used during training
    # NOTE(review): the token is sent as a plain hyperparameter and may be None
    # when no local Hugging Face login exists -- confirm this is acceptable.
    "hf_token": HfFolder.get_token(),  # huggingface token to access llama 2
    # NOTE(review): the original remark says g5.2xlarge does not have enough
    # memory for merging, yet the flag is True -- confirm the intended value.
    "merge_weights": True,  # whether to merge LoRA into the model (needs more memory)
}


# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point="run_clm.py",  # train script
    source_dir="../scripts/sagemaker-llama2-qlora/",  # directory which includes all the files needed for training
    instance_type="ml.g5.2xlarge",  # instance type used for the training job
    instance_count=1,  # the number of instances used for training
    base_job_name=job_name,  # base name of the training job
    role=role,  # IAM role used in training job to access AWS resources, e.g. S3
    volume_size=300,  # the size of the EBS volume in GB
    transformers_version="4.28",  # the transformers version used in the training job
    pytorch_version="2.0",  # the pytorch version used in the training job
    py_version="py310",  # the python version used in the training job
    hyperparameters=hyperparameters,  # the hyperparameters passed to the training job
    # use_spot_instances   =  use_spot_instances, # whether to use spot instances or not
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache"
    },  # set env variable to cache models in /tmp
)

In [None]:
# Map input channel names to S3 URIs. The "training" channel lines up with the
# dataset_path hyperparameter (/opt/ml/input/data/training).
data = dict(training=training_input_path)

# Launch the training job on the uploaded dataset and block until it finishes.
huggingface_estimator.fit(inputs=data, wait=True)