In [1]:
!pip install --upgrade pip --quiet
!pip install sagemaker --upgrade --quiet
# !pip install -r local-requirements.txt --quiet

# make sure updates to the python modules are imported
%load_ext autoreload
%autoreload 2

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.7.0 requires botocore<1.31.65,>=1.31.16, but you have botocore 1.34.121 which is incompatible.
amazon-sagemaker-jupyter-scheduler 3.0.7 requires jupyter-scheduler==2.5, but you have jupyter-scheduler 2.4.0 which is incompatible.[0m[31m
[0m

In [2]:
import os
import random
from datetime import datetime

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.s3 import S3Downloader, S3Uploader

# Optional override: set this to a bucket name to use it for the session and
# for every S3 path built below; None selects the session's default bucket.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sagemaker_session_bucket is None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fall back to looking the execution role up by name through IAM.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Take the bucket from the configured session (instead of a fresh Session) so
# the data paths below honour `sagemaker_session_bucket`.
output_bucket = sess.default_bucket()
data_s3_location = f"s3://{output_bucket}/tallrec-llm-evaluation"
dataset_s3_location = f"{data_s3_location}/datasets"

print(f"Default data location: {data_s3_location}")
print(f"Default train location: {dataset_s3_location}")

# Upload the local datasets folder with the same session used everywhere else.
S3Uploader.upload("./datasets/", dataset_s3_location, sagemaker_session=sess)



sagemaker role arn: arn:aws:iam::174671970284:role/service-role/AmazonSageMaker-ExecutionRole-20240216T153805
sagemaker bucket: sagemaker-us-east-1-174671970284
sagemaker session region: us-east-1
Default data location: s3://sagemaker-us-east-1-174671970284/tallrec-llm-evaluation
Default train location: s3://sagemaker-us-east-1-174671970284/tallrec-llm-evaluation/datasets


's3://sagemaker-us-east-1-174671970284/tallrec-llm-evaluation/datasets'

In [12]:

# Timestamp (YYYYmmddHHMMSS) identifying this run, and the S3 prefix built from it.
evaluation_exec_id = f"{datetime.now():%Y%m%d%H%M%S}"
s3_output_path = "s3://" + output_bucket + "/" + evaluation_exec_id + "/"


**Configure HuggingFace Training Estimator**

In [17]:
train_instance_type = "ml.g5.24xlarge"

# Hyperparameters handed to the training job; every value is given as a string.
train_hyperparameters = dict(
    model_id="baffo32/decapoda-research-llama-7B-hf",
    learning_rate="1e-4",
    lora_dropout="0.05",
    sample="-1",
    num_epochs="200",
    cutoff_len="512",
    micro_batch_size="4",
    batch_size="16",
    wandb_project="LLamaRecs",
    seed=str(random.randrange(100)),  # integer in [0, 99], drawn anew on every run
)

# S3 locations of the training and validation splits of the book dataset.
train_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/train.json"
valid_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/valid.json"

In [19]:
# Create the Estimator
huggingface_train_estimator = HuggingFace(
    entry_point          = 'tallrec_finetune.py',       # training script
    source_dir           = 'scripts/train_tallrec_g5',  # directory which includes all the files needed for training
    instance_type        = train_instance_type,         # instance type used for the training job
    instance_count       = 1,                           # the number of instances used for training
    base_job_name        = "tallrec-training",          # prefix of the training job name
    role                 = role,                        # IAM role used in the training job to access AWS resources, e.g. S3
    volume_size          = 300,                         # the size of the EBS volume in GB
    transformers_version = '4.36',                      # the transformers version used in the training job
    pytorch_version      = '2.1',                       # the pytorch version used in the training job
    py_version           = 'py310',                     # the python version used in the training job
    hyperparameters      = train_hyperparameters,
    environment          = {
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
        # Read the token from the notebook's environment so it never has to be
        # pasted into (and saved with) the notebook; the placeholder is kept
        # as the fallback when HF_TOKEN is not set.
        "HF_TOKEN": os.environ.get("HF_TOKEN", "<your_hf_token_here>"),
    },
)
# Set up the input channels for the training job
huggingface_train_estimator.fit(
    {
        'train': train_data_path,
        'valid': valid_data_path,
    }
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tallrec-training-2024-06-11-13-54-43-286


2024-06-11 13:54:43 Starting - Starting the training job...
2024-06-11 13:54:44 Pending - Training job waiting for capacity.........
2024-06-11 13:56:23 Pending - Preparing the instances for training...
2024-06-11 13:57:08 Downloading - Downloading the training image...............
2024-06-11 13:59:39 Training - Training image download completed. Training in progress.........bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-06-11 14:00:55,719 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-06-11 14:00:55,754 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-06-11 14:00:55,765 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-06-11 14:00:55,767 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-06-11 14:00:57,290 sagemaker-training-toolkit INFO     Installi

In [2]:
from sagemaker.s3 import S3Downloader

# Fetch the training job's model archive from S3 into the current directory.
S3Downloader.download(
    huggingface_train_estimator.model_data,  # S3 URI where the trained model is located
    '.',                                     # local directory the *.tar.gz is saved to
    sagemaker_session=sess,                  # SageMaker session used for the transfer
)

['./model.tar.gz']

**Unpack the merged model and adapter weights into a local directory, and then move the tokenizer files into the `merged_model` folder**

In [3]:
%%sh
# Stop at the first failing command so a failed extraction is not followed by
# an attempt to move files that were never unpacked.
set -e

# -p: do not fail when the directory is left over from an earlier run
sudo mkdir -p both_models_unpacked

sudo tar --warning=no-unknown-keyword -xzvf ./model.tar.gz -C both_models_unpacked

# The tokenizer files sit at the archive root; move them next to the merged weights
cd both_models_unpacked && sudo mv tokenizer_config.json tokenizer.model special_tokens_map.json merged_model/

tokenizer_config.json
checkpoint-1300/
checkpoint-1300/training_args.bin
checkpoint-1300/rng_state.pth
checkpoint-1300/optimizer.pt


checkpoint-1300/scaler.pt
checkpoint-1300/pytorch_model.bin
checkpoint-1300/trainer_state.json
checkpoint-1300/scheduler.pt
tokenizer.model
adapter_model.bin
runs/
runs/Jun11_14-04-31_algo-1/
runs/Jun11_14-04-31_algo-1/1718114671.4514046/
runs/Jun11_14-04-31_algo-1/1718114671.4514046/events.out.tfevents.1718114671.algo-1.99.1
runs/Jun11_14-04-31_algo-1/events.out.tfevents.1718114671.algo-1.99.0
adapter_config.json
merged_model/
merged_model/config.json
merged_model/model-00006-of-00007.safetensors
merged_model/model-00001-of-00007.safetensors
merged_model/model-00005-of-00007.safetensors
merged_model/model-00004-of-00007.safetensors
merged_model/model-00007-of-00007.safetensors
merged_model/model-00002-of-00007.safetensors
merged_model/generation_config.json
merged_model/model.safetensors.index.json
merged_model/model-00003-of-00007.safetensors
special_tokens_map.json


In [4]:
full_model_s3_location = huggingface_train_estimator.model_data[:-len("model.tar.gz")]

merged_model_location = f"{full_model_s3_location}merged_model"

print(f"Uploading the merged model and uploading to S3 location: {merged_model_location}")

S3Uploader.upload("both_models_unpacked/merged_model/", merged_model_location)

!sudo rm -rf both_models_unpacked
!sudo rm -rf model.tar.gz

Uploading the merged model and uploading to S3 location: s3://sagemaker-us-east-1-174671970284/tallrec-training-2024-06-11-13-54-43-286/output/merged_model


## 2.1 Configure Evaluation Estimator Job

In [5]:
# Alternative instance type: "ml.g5.24xlarge"
eval_instance_type = "ml.g5.48xlarge"

# Hyperparameters handed to the evaluation job.
eval_hyperparameters = dict(
    base_model="baffo32/decapoda-research-llama-7B-hf",
    # presumably where the 'test' input channel is mounted in the container — confirm
    test_data_path="/opt/ml/input/data/test/test.json",
)

# Inputs of the evaluation job: the training job's model archive and the test split.
pretrained_model_s3_uri = huggingface_train_estimator.model_data
test_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/test.json"


## 2.2 Run Evaluation Estimator Job

In [6]:
# create the Estimator
huggingface_eval_estimator = HuggingFace(
    entry_point          = 'evaluate.py',      # train script
    source_dir           = 'scripts/evaluate_tallrec_p4',         # directory which includes all the files needed for training
    instance_type        = eval_instance_type,   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = "tallrec-evaluation",          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.36',            # the transformers version used in the training job
    pytorch_version      = '2.1',            # the pytorch_version version used in the training job
    py_version           = 'py310',            # the python version used in the training job
    hyperparameters      =  eval_hyperparameters,
    environment          = {
                            "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
                            "HF_TOKEN": "<your_hf_token_here>"
                            }, # set env variable to cache models in /tmp
)

huggingface_eval_estimator.fit(
    {'test': test_data_path,
     'model': pretrained_model_s3_uri
    }
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tallrec-evaluation-2024-06-13-14-09-09-694


2024-06-13 14:09:10 Starting - Starting the training job...
2024-06-13 14:09:14 Pending - Training job waiting for capacity.........
2024-06-13 14:10:53 Pending - Preparing the instances for training......
2024-06-13 14:11:48 Downloading - Downloading input data................................................
2024-06-13 14:20:15 Training - Training image download completed. Training in progress..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-06-13 14:20:17,210 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-06-13 14:20:17,273 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-06-13 14:20:17,285 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-06-13 14:20:17,287 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-06-13 14:20:19,039 sagemaker-training-tool

The temp.json containing the model results can be found at the S3 location: 
 s3://sagemaker-us-east-1-174671970284/tallrec-evaluation-2024-06-10-15-38-21-196/output/model.tar.gz

In [8]:
print(f"The temp.json containing the model results can be found at the S3 location: \n {huggingface_eval_estimator.model_data}")

# Pull the evaluation job's output archive into the current directory.
fine_tuned_results = S3Downloader.download(
    huggingface_eval_estimator.model_data,  # S3 URI of the evaluation job's output archive
    '.',                                    # local directory the *.tar.gz is saved to
    sagemaker_session=sess,                 # SageMaker session used for the transfer
)

The temp.json containing the model results can be found at the S3 location: 
 s3://sagemaker-us-east-1-174671970284/tallrec-evaluation-2024-06-10-15-38-21-196/output/model.tar.gz
