## Download Model

In [None]:
!pip install huggingface_hub

In [8]:
# Create the parent directory that will hold the downloaded model snapshots
# (-p: no error if it already exists).
!mkdir -p /home/ec2-user/SageMaker/models

In [9]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

HF_MODEL_ID = "EleutherAI/gpt-j-6b"

# Local target directory is named after the repo (the part after the org prefix).
model_name = HF_MODEL_ID.split("/")[-1]
model_tar_dir = Path(f"/home/ec2-user/SageMaker/models/{model_name}")

# parents=True so this cell does not rely on the separate `mkdir -p` cell having run.
model_tar_dir.mkdir(parents=True, exist_ok=True)

# Always call snapshot_download instead of skipping when the directory exists:
# a directory left behind by an interrupted download would otherwise never be
# completed. Files already present in cache_dir are reused rather than fetched
# from the Hub again.
snapshot_download(
    HF_MODEL_ID,
    local_dir=str(model_tar_dir),
    local_dir_use_symlinks=False,
    cache_dir="/home/ec2-user/SageMaker/",
)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Downloading (…)c10ad481a2/README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

Downloading (…)481a2/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)a2/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)0ad481a2/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Downloading (…)10ad481a2/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)10ad481a2/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)481a2/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading flax_model.msgpack:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

In [1]:
import sagemaker
# Key prefix under which every artifact of this demo is stored in the bucket.
prefix = "demo-clm-finetune"

# Default SageMaker session and its default S3 bucket.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [16]:
!aws s3 cp --recursive /home/ec2-user/SageMaker/models/gpt-j-6b s3://{bucket}/{prefix}/huggingface-models/gpt-j-6b

upload: ../../../models/gpt-j-6b/added_tokens.json to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/added_tokens.json
upload: ../../../models/gpt-j-6b/README.md to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/README.md
upload: ../../../models/gpt-j-6b/.gitattributes to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/.gitattributes
upload: ../../../models/gpt-j-6b/tokenizer_config.json to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/tokenizer_config.json
upload: ../../../models/gpt-j-6b/config.json to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/config.json
upload: ../../../models/gpt-j-6b/special_tokens_map.json to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/special_tokens_map.json
upload: ../../../models/gpt-j-6b/merges.txt to s3://sagemaker-us-east-1-783128296767/huggingface-models/gpt-j-6b/merges.txt
upload: ../../../models/gpt-j-6b/tokenizer.json to s3://sagema

## Dataset

In [2]:
# Sanity check: list the tokenized dataset and the uploaded model files that the
# training job below reads as its input channels.
!aws s3 ls s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk/
!aws s3 ls s3://{bucket}/{prefix}/huggingface-models/gpt-j-6b/

                           PRE test/
                           PRE train/
                           PRE validation/
2023-05-25 12:00:32         43 dataset_dict.json
2023-05-25 12:03:28        737 .gitattributes
2023-05-25 12:03:27      10990 README.md
2023-05-25 12:03:28       4039 added_tokens.json
2023-05-25 12:03:28        930 config.json
2023-05-25 12:03:27 24203541928 flax_model.msgpack
2023-05-25 12:03:28     456356 merges.txt
2023-05-25 12:03:28 24207819307 pytorch_model.bin
2023-05-25 12:03:28        357 special_tokens_map.json
2023-05-25 12:03:28 24203955064 tf_model.h5
2023-05-25 12:03:28    1373465 tokenizer.json
2023-05-25 12:03:28        619 tokenizer_config.json
2023-05-25 12:03:28     798156 vocab.json


## DeepSpeed Training

In [5]:
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
import boto3
import sagemaker

# IAM role the training job runs under (the notebook's execution role).
role = get_execution_role()

# No hyperparameters are forwarded to the entry point.
hyperparameters = {}

# Pin the region explicitly and build the SageMaker session on that boto3 session.
session = boto3.session.Session(region_name="us-east-1")
region = session.region_name
print(region)
sm_boto_client = boto3.client("sagemaker", region_name=region)
sagemaker_session = sagemaker.session.Session(boto_session=session)


base_job_name = "gptj-ds"
# Original (misspelled) variable name, kept as an alias for any cell still using it.
based_job_mame = base_job_name
entry_point = "train-deepspeed.sh"
# `bucket` and `prefix` are defined in the earlier session-setup cell.
S3_OUTPUT = f"s3://{bucket}/{prefix}/sagemaker/gpt-j-6B-ds"

# https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/data_parallel/bert
estimator = PyTorch(
    sagemaker_session=sagemaker_session,
    base_job_name=base_job_name,
    source_dir="./source_dir",
    entry_point=entry_point,
    framework_version="1.13.1",
    py_version="py39",
    role=role,
    code_location=S3_OUTPUT,
    output_path=S3_OUTPUT,
    max_run=5 * 24 * 60 * 60,  # 5 days, expressed in seconds
    instance_count=1,
    instance_type="ml.g5.24xlarge",  # ml.g5.48xlarge   ml.p3dn.24xlarge ml.p4d.24xlarge
    volume_size=500,
    debugger_hook_config=False,
    # The SDK keyword is `disable_profiler`; the previous spelling `disable_profile`
    # is not an Estimator argument, so the profiler was not being turned off.
    disable_profiler=True,
    # Offline flags for transformers/datasets, plus the output location exposed to
    # the training script as an environment variable.
    environment={"TRANSFORMERS_OFFLINE": "1", "HF_DATASETS_OFFLINE": "1", "S3_OUTPUT": S3_OUTPUT},
    keep_alive_period_in_seconds=1800,
    hyperparameters=hyperparameters,
)


def fast_file(x):
    """Return a TrainingInput for the S3 URI `x` that uses FastFile input mode."""
    return TrainingInput(x, input_mode="FastFile")


# wait=False: submit the job and return immediately instead of blocking on it.
wait = False
estimator.fit(
    {
        "pre-trained": fast_file(f"s3://{bucket}/{prefix}/huggingface-models/gpt-j-6b/"),
        "train_data": fast_file(f"s3://{bucket}/{prefix}/wiki-tokenized-dataset-chunk"),
    },
    wait=wait,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


us-east-1


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: gptj-ds-2023-06-23-11-22-09-796


In [24]:
# Dump the last day of CloudWatch logs for one training job's log stream into ds-training.log.
# NOTE(review): the job/stream name below is hard-coded and is not the job created by the
# estimator.fit() call above (that output shows gptj-ds-2023-06-23-11-22-09-796) — update
# it to the job you want to inspect before running.
!awslogs get --aws-region=us-east-1  -s1d /aws/sagemaker/TrainingJobs "gptj-ds-2023-05-25-11-07-23-459/algo-1-1685012935" &> ds-training.log