# Fine-tune Llama2 using the TallRec framework for book recommendations

In [None]:
!pip install --upgrade pip --quiet
!pip install sagemaker --upgrade --quiet

# make sure updates to the python modules are imported
%load_ext autoreload
%autoreload 2

In [None]:
import os
import random
from datetime import datetime

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.s3 import S3Downloader, S3Uploader

# Bootstrap session, used only to resolve the default bucket when none is configured.
sess = sagemaker.Session()

# Set this to an existing bucket name to override the account's default SageMaker bucket.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used by the jobs. If get_execution_role() raises
# ValueError, fall back to looking the role up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Session bound to the selected bucket; reused for all S3 transfers in this cell.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Take the bucket from the configured session (instead of a brand-new Session())
# so that a custom `sagemaker_session_bucket` is honoured for the dataset paths.
output_bucket = sess.default_bucket()
data_s3_location = f"s3://{output_bucket}/tallrec-llm-evaluation"
dataset_s3_location = f"{data_s3_location}/datasets"

print(f"Default data location: {data_s3_location}")
print(f"Default train location: {dataset_s3_location}")

# Upload the local ./datasets/ directory under the dataset prefix using the same session.
S3Uploader.upload("./datasets/", dataset_s3_location, sagemaker_session=sess)



In [None]:

# Timestamp of this run (YYYYmmddHHMMSS), used as a unique S3 prefix.
evaluation_exec_id = f"{datetime.now():%Y%m%d%H%M%S}"

# S3 prefix in the output bucket dedicated to this run.
s3_output_path = f"s3://{output_bucket}/{evaluation_exec_id}/"


**Configure HuggingFace Training Estimator**

In [None]:
# Instance type on which the fine-tuning job runs.
train_instance_type = "ml.g5.24xlarge"

# Hyperparameters handed to the training entry point. All values are strings;
# the seed is drawn at random from 0..99 each time this cell runs.
train_hyperparameters = dict(
    model_id="baffo32/decapoda-research-llama-7B-hf",
    learning_rate="1e-4",
    lora_dropout="0.05",
    sample="-1",
    num_epochs="200",
    cutoff_len="512",
    micro_batch_size="4",
    batch_size="16",
    wandb_project="LLamaRecs",
    seed=str(random.randint(0, 99)),
)

# S3 locations of the book train/validation splits.
train_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/train.json"
valid_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/valid.json"

In [None]:
# Create the Estimator for the fine-tuning job
huggingface_train_estimator = HuggingFace(
    entry_point          = 'tallrec_finetune.py',      # train script
    source_dir           = 'scripts/train_tallrec_g5',         # directory which includes all the files needed for training
    instance_type        = train_instance_type,   # instance type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = "tallrec-training",          # the name prefix of the training job
    role                 = role,              # IAM role used in the training job to access AWS resources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.36',            # the transformers version used in the training job
    pytorch_version      = '2.1',            # the pytorch version used in the training job
    py_version           = 'py310',            # the python version used in the training job
    hyperparameters      =  train_hyperparameters,
    sagemaker_session    = sess,              # reuse the session (and bucket) configured earlier in the notebook
    environment          = {
                            "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
                            # Read the Hugging Face token from the notebook's HF_TOKEN environment
                            # variable so the secret does not have to be written into the notebook;
                            # the placeholder is kept as the fallback when the variable is unset.
                            "HF_TOKEN": os.environ.get("HF_TOKEN", "<your_hf_token_here>"),
                            },
)
# Start the training job; the dict keys are the input channel names
huggingface_train_estimator.fit(
    {'train': train_data_path,
     'valid': valid_data_path}
)

In [None]:
from sagemaker.s3 import S3Downloader

# Copy the training job's model artifact from S3 into the current working
# directory, using the SageMaker session configured earlier in the notebook.
S3Downloader.download(
    huggingface_train_estimator.model_data,  # S3 URI where the trained model is located
    '.',                                     # local path where the *.tar.gz is saved
    sagemaker_session=sess,
)

**Unpack the merged model and adapter weights into a local directory, and then move the tokenizer files into the `merged_model` folder**

In [None]:
%%sh
# Abort at the first failing command so a failed extraction is not followed by the move.
set -e

# -p: do not fail when the directory already exists (e.g. when this cell is re-run).
sudo mkdir -p both_models_unpacked

# Extract the downloaded training artifact into the target directory.
sudo tar --warning=no-unknown-keyword -xzvf ./model.tar.gz -C both_models_unpacked

# Move the tokenizer files next to the merged model weights.
cd both_models_unpacked && sudo mv tokenizer_config.json tokenizer.model special_tokens_map.json merged_model/

In [None]:
full_model_s3_location = huggingface_train_estimator.model_data[:-len("model.tar.gz")]

merged_model_location = f"{full_model_s3_location}merged_model"

print(f"Uploading the merged model and uploading to S3 location: {merged_model_location}")

S3Uploader.upload("both_models_unpacked/merged_model/", merged_model_location)

!sudo rm -rf both_models_unpacked
!sudo rm -rf model.tar.gz

## 2.1 Configure Evaluation Estimator Job

In [None]:
# Instance type for the evaluation job ("ml.g5.24xlarge" is the alternative option).
eval_instance_type = "ml.g5.48xlarge"

# Hyperparameters handed to the evaluation entry point.
eval_hyperparameters = dict(
    base_model="baffo32/decapoda-research-llama-7B-hf",
    test_data_path="/opt/ml/input/data/test/test.json",
)

# The evaluation job is given the artifact produced by the training job.
pretrained_model_s3_uri = huggingface_train_estimator.model_data

# S3 location of the book test split.
test_data_path = f"s3://{output_bucket}/tallrec-llm-evaluation/datasets/book/test.json"


## 2.2 Run Evaluation Estimator Job

In [None]:
# Create the Estimator for the evaluation job
huggingface_eval_estimator = HuggingFace(
    entry_point          = 'evaluate.py',      # evaluation script
    source_dir           = 'scripts/evaluate_tallrec_p4',         # directory which includes all the files needed for evaluation
    instance_type        = eval_instance_type,   # instance type used for the evaluation job
    instance_count       = 1,                 # the number of instances used for evaluation
    base_job_name        = "tallrec-evaluation",          # the name prefix of the evaluation job
    role                 = role,              # IAM role used in the job to access AWS resources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.36',            # the transformers version used in the job
    pytorch_version      = '2.1',            # the pytorch version used in the job
    py_version           = 'py310',            # the python version used in the job
    hyperparameters      =  eval_hyperparameters,
    sagemaker_session    = sess,              # reuse the session (and bucket) configured earlier in the notebook
    environment          = {
                            "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # set env variable to cache models in /tmp
                            # Read the Hugging Face token from the notebook's HF_TOKEN environment
                            # variable so the secret does not have to be written into the notebook;
                            # the placeholder is kept as the fallback when the variable is unset.
                            "HF_TOKEN": os.environ.get("HF_TOKEN", "<your_hf_token_here>"),
                            },
)

# Start the evaluation job; the dict keys are the input channel names
# ('test' carries the test set, 'model' the trained model artifact).
huggingface_eval_estimator.fit(
    {'test': test_data_path,
     'model': pretrained_model_s3_uri
    }
)

In [None]:
# Report where the evaluation job stored its output artifact.
print(f"The temp.json containing the model results can be found at the S3 location: \n {huggingface_eval_estimator.model_data}")

# Download that artifact into the current working directory, using the
# SageMaker session configured earlier in the notebook.
fine_tuned_results = S3Downloader.download(
    huggingface_eval_estimator.model_data,   # S3 URI where the evaluation output is located
    '.',                                     # local path where the *.tar.gz will be saved
    sagemaker_session=sess,
)