## Download Model

In [None]:
!pip install huggingface_hub

In [None]:
!mkdir -p /home/ec2-user/SageMaker/models

In [None]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

HF_MODEL_ID = "EleutherAI/gpt-j-6b"

# The local directory is named after the repo part of the model id.
model_name = HF_MODEL_ID.split("/")[-1]
model_tar_dir = Path(f"/home/ec2-user/SageMaker/models/{model_name}")

# The download is skipped whenever the target directory already exists.
# Only the directory itself is checked, so after an interrupted download
# remove the directory to force a fresh one.
if not model_tar_dir.is_dir():
    # parents=True lets this cell run even when the models root has not been created yet.
    model_tar_dir.mkdir(parents=True, exist_ok=True)
    # Download the model snapshot from Hugging Face into model_tar_dir.
    snapshot_download(
        HF_MODEL_ID,
        local_dir=str(model_tar_dir),
        local_dir_use_symlinks=False,
        cache_dir="/home/ec2-user/SageMaker/",
    )

In [None]:
import sagemaker
# SageMaker session used for S3 locations; every S3 path in this notebook is
# built from `bucket` and `prefix`.
sess = sagemaker.Session()
bucket = sess.default_bucket()  # default S3 bucket of this session
prefix = "demo-clm-finetune"

In [None]:
%%bash -s "$model_name" "$bucket" "$prefix"
# Upload the locally downloaded model to S3.
#   $1 = model name, $2 = S3 bucket, $3 = S3 key prefix
set -euo pipefail

DIR="/home/ec2-user/SageMaker/models/$1"
DEST="s3://$2/$3/huggingface-models/$1"

# The local copy is the upload source, so it must exist.
if [ ! -d "$DIR" ]; then
    echo "local model directory $DIR not found; run the download cell first" >&2
    exit 1
fi

# sync copies only files that are missing or different at the destination,
# so re-running this cell after a complete upload transfers nothing.
aws s3 sync "$DIR" "$DEST"

## Dataset

In [None]:
!aws s3 ls s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk/
!aws s3 ls s3://{bucket}/{prefix}/huggingface-models/gpt-j-6b/

## LoRA Training for Multiple GPUs

In [None]:
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
import boto3
import sagemaker

role = get_execution_role()

# Empty: no hyperparameters are passed to the entry point.
hyperparameters = {}

# The training job is pinned to us-east-1.
# NOTE(review): `bucket` comes from the session created earlier with default
# settings; confirm that bucket lives in the same region.
session = boto3.session.Session(region_name="us-east-1")
region = session.region_name
print(region)
sm_boto_client = boto3.client("sagemaker", region_name=region)
sagemaker_session = sagemaker.session.Session(boto_session=session)

based_job_mame = "gptj-lora-torchrun"
entry_point = "train-lora-torchrun.sh"
# Used both as the code upload location and as the job output path.
S3_OUTPUT = f"s3://{bucket}/{prefix}/sagemaker/gpt-j-6B-lora-torchrun"


def fast_file(x):
    """Return a TrainingInput for the S3 URI ``x`` using FastFile input mode."""
    return TrainingInput(x, input_mode="FastFile")


# https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/data_parallel/bert
estimator = PyTorch(
    sagemaker_session=sagemaker_session,
    base_job_name=based_job_mame,
    source_dir="./source_dir",
    entry_point=entry_point,
    framework_version="1.13.1",
    py_version="py39",
    role=role,
    code_location=S3_OUTPUT,
    output_path=S3_OUTPUT,
    max_run=5 * 24 * 60 * 60,  # 5 days, in seconds
    instance_count=1,
    instance_type="ml.g5.12xlarge",  # alternatives: ml.g5.48xlarge, ml.p3dn.24xlarge, ml.p4d.24xlarge
    volume_size=450,
    debugger_hook_config=False,
    # The parameter is `disable_profiler`; the misspelled `disable_profile`
    # is not a recognised argument, so the profiler was never disabled.
    disable_profiler=True,
    environment={"TRANSFORMERS_OFFLINE": "1", "HF_DATASETS_OFFLINE": "1", "S3_OUTPUT": S3_OUTPUT},
    keep_alive_period_in_seconds=1800,
    hyperparameters=hyperparameters,
)

# wait=False: fit() returns right after submitting the job instead of blocking until it ends.
wait = False
estimator.fit(
    {
        # Same S3 location the upload cell writes the model to.
        "pre-trained": fast_file(f"s3://{bucket}/{prefix}/huggingface-models/{model_name}/"),
        "train_data": fast_file(f"s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk"),
    },
    wait=wait,
)

In [None]:
!awslogs get --aws-region=us-east-1  -s1d /aws/sagemaker/TrainingJobs "gptj-ds-2023-05-25-11-07-23-459/algo-1-1685012935" &> ds-training.log