# Lab model deployment

Run all cells and open "day1-lab1-langchain_introductin.ipynb" to continue the lab

In [19]:
# Do the necessary installations
!pip install --upgrade sagemaker

[0m

In [38]:
import sagemaker, boto3, json, time
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = boto3.Session().region_name
sm_client = boto3.client('sagemaker', aws_region)
model_version = "1.0.0"


In [9]:
sm_client.list_endpoints()

{'Endpoints': [{'EndpointName': 'jumpstart-example-huggingface-llm-falco-2023-06-24-08-48-11-010',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint/jumpstart-example-huggingface-llm-falco-2023-06-24-08-48-11-010',
   'CreationTime': datetime.datetime(2023, 6, 24, 16, 48, 12, 145000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2023, 6, 24, 17, 38, 54, 66000, tzinfo=tzlocal()),
   'EndpointStatus': 'Failed'}],
 'ResponseMetadata': {'RequestId': '1b70dd73-bf25-46ec-a166-803f24fffba5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1b70dd73-bf25-46ec-a166-803f24fffba5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '322',
   'date': 'Sun, 25 Jun 2023 01:43:17 GMT'},
  'RetryAttempts': 0}}

In [41]:
sm_client.delete_endpoint(EndpointName="jumpstart-example-huggingface-llm-falco-2023-06-24-06-11-09-889")

ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Cannot update in-progress endpoint "arn:aws:sagemaker:us-east-1:591736166602:endpoint/jumpstart-example-huggingface-llm-falco-2023-06-24-06-11-09-889".

In [8]:
# Select to reload environment variables for existing endpoint, or deploy a new endpoint
reload_endpoint=False
ai21_endpoint_name = "jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305"
falcon_endpoint_name = "jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305"
embedding_endpoint_name = "jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305"

In [6]:
# Configure Falcon-40B Model
falcon_inference_model = "huggingface-llm-falcon-40b-instruct-bf16"
falcon_instance_type = "ml.g5.12xlarge"
falcon_number_of_gpu = 4
falcon_max_input_length = 1024
falcon_max_total_tokens = 2048
health_check_timeout = 300# Set to true to reload the environment variables for the model

# Configure AI Jurassic model
ai21_inference_model = "ju-grande"
ai21_instance_type = "ml.g5.48xlarge"
ai21_number_of_gpu = 4
ai21_max_input_length = 1024
ai21_max_total_tokens = 2048

# Configure Embedding model
embedding_model = "huggingface-llm-embedding-bf16"
embedding_model_instance_type = "ml.g5.2xlarge"

In [20]:
#models configuration
_MODEL_CONFIG_ = {
    ai21_inference_model:{
        "provider": "marketplace",
        "instance_type": ai21_instance_type,
        "endpoint_name": name_from_base(f"jumpstart-example-{ai21_inference_model.replace('/', '-')}"),
        "env": {
            'HF_MODEL_ID': ai21_inference_model,
            'SM_NUM_GPUS': json.dumps(ai21_number_of_gpu),
            'MAX_INPUT_LENGTH': json.dumps(ai21_max_input_length),
            'MAX_TOTAL_TOKENS': json.dumps(ai21_max_total_tokens),
        },
        "version":"1.0.0",
    },
    falcon_inference_model:{
        "provider": "jumpstart",
        "instance_type": falcon_instance_type,
        "endpoint_name": name_from_base(f"jumpstart-example-{falcon_inference_model.replace('/', '-')}"),
        "env": {
            'HF_MODEL_ID': falcon_inference_model,
            'SM_NUM_GPUS': json.dumps(falcon_number_of_gpu),
            'MAX_INPUT_LENGTH': json.dumps(falcon_max_input_length),
            'MAX_TOTAL_TOKENS': json.dumps(falcon_max_total_tokens),
        },
        "version":"1.0.0",
    },
    embedding_model: {
        "provider": "jumpstart",
        "instance_type": embedding_model_instance_type,
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
        "version":"*",
    }

}

In [22]:

if reload_endpoint:
    _MODEL_CONFIG_[ai21_inference_model]['endpoint_name'] = ai21_endpoint_name
    _MODEL_CONFIG_[falcon_inference_model]['endpoint_name'] = falcon_endpoint_name
    _MODEL_CONFIG_[embedding_model]['endpoint_name'] = embedding_endpoint_name
if not reload_endpoint:
    newline, bold, unbold = "\n", "\033[1m", "\033[0m"

    # Deploys all model endpoints in parallel
    for model_id in _MODEL_CONFIG_:
        endpoint_config = _MODEL_CONFIG_[model_id]
        inference_instance_type = endpoint_config["instance_type"]
        
        model_version = endpoint_config["version"]
        instance_type = endpoint_config["instance_type"]
        if endpoint_config["provider"] == "marketplace":
            deploy_ai21(model_id, endpoint_config)
        else:
            # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
            deploy_image_uri = image_uris.retrieve(
                region=None,
                framework=None,  # automatically inferred from model_id
                image_scope="inference",
                model_id=model_id,
                model_version=model_version,
                instance_type=instance_type,
            )
            # Retrieve the model uri.
            model_uri = model_uris.retrieve(
                model_id=model_id, model_version=model_version, model_scope="inference"
            )
            model_inference = Model(
                image_uri=deploy_image_uri,
                model_data=model_uri,
                role=aws_role,
                predictor_cls=Predictor,
                name=model_id,
                env=endpoint_config["env"],
            )
            # model_predictor_inference = model_inference.deploy(
            #     initial_instance_count=1,
            #     instance_type=inference_instance_type,
            #     predictor_cls=Predictor,
            #     name=endpoint_config['endpoint_name'],
            # )

    print(f"{bold}Model {model_id} has been created successfully.{unbold}{newline}")

    # Await completion of endpoint deployment
    # wait=False,
    # describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])

    # while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
    #     time.sleep(15)
    #     print('.', end='')
    #     describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])
    # print('\nEmbedding endpoint created')


KeyError: "Unable to find model manifest for 'ju-grande' with version '1.0.0'. Visit https://sagemaker.readthedocs.io/en/stable/doc_utils/pretrainedmodels.html for updated list of models. Did you mean to use model ID 'xgboost-regression-model'?"

We wait for the embedding model and the inference model to be created. it usually takes 5-10 mins notably for the Flan T5 to be deployed.

We store few variables to be used in the next notebook

In [9]:
%store _MODEL_CONFIG_
# %store embedding_model
%store inference_model

Stored '_MODEL_CONFIG_' (dict)
Stored 'inference_model' (str)
