# Lab model deployment

Run all cells, then open "day1-lab1-langchain_introductin.ipynb" to continue the lab.

In [20]:
import sagemaker, boto3, json, time
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Model version string handed to the image/model URI lookups in the deployment cell.
model_version = "1.0.0"

# SageMaker SDK session; the caller's identity ARN is used as the role for the model.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# Low-level SageMaker client bound to the region of the default boto3 session.
aws_region = boto3.Session().region_name
sm_client = boto3.client(service_name="sagemaker", region_name=aws_region)


In [None]:
# Show the SageMaker endpoints visible to this client; the 'EndpointName' values
# in the response are candidates for `endpoint_name` when reusing an endpoint.
sm_client.list_endpoints()

{'Endpoints': [{'EndpointName': 'jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint/jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305',
   'CreationTime': datetime.datetime(2023, 6, 23, 8, 23, 47, 854000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2023, 6, 23, 8, 35, 11, 141000, tzinfo=tzlocal()),
   'EndpointStatus': 'InService'}],
 'ResponseMetadata': {'RequestId': '57d59e7a-c3c5-4d3a-a6f8-7dd90cf36ea1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '57d59e7a-c3c5-4d3a-a6f8-7dd90cf36ea1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '325',
   'date': 'Fri, 23 Jun 2023 00:46:20 GMT'},
  'RetryAttempts': 0}}

In [36]:
# Name of an already-deployed endpoint; only used when `reload_endpoint` is True.
endpoint_name = "jumpstart-example-tiiuae-falcon-40b-ins-2023-06-23-00-23-45-305"

# True  -> point the model configuration at the existing endpoint named above.
# False -> deploy a brand-new endpoint in the deployment cell.
reload_endpoint = True

In [27]:
# Identifier of the model to serve; also used as the key of the model configuration.
inference_model = "tiiuae/falcon-40b-instruct"

# Hosting hardware: instance type and the GPU count exported as SM_NUM_GPUS.
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4

# Token limits exported to the container as MAX_INPUT_LENGTH / MAX_TOTAL_TOKENS.
max_input_length = 1024
max_total_tokens = 2048

# NOTE(review): presumably the container start-up health-check timeout in seconds;
# it is not referenced by the cells visible in this notebook -- confirm before relying on it.
health_check_timeout = 300

In [33]:

# Per-model deployment configuration, keyed by model identifier.
#
# Each entry holds:
#   "instance type" - hosting instance type (taken from `instance_type`)
#   "endpoint_name" - unique endpoint name generated with `name_from_base`
#   "env"           - environment variables passed to the serving container;
#                     numeric values are JSON-encoded because env values must be strings
#   "version"       - model version (taken from `model_version`)
#
# The values are derived from the parameter cells above instead of being repeated
# as literals, so editing `instance_type` or `model_version` actually takes effect.
_MODEL_CONFIG_ = {
    inference_model: {
        "instance type": instance_type,
        "endpoint_name": name_from_base(f"jumpstart-example-{inference_model.replace('/', '-')}"),
        "env": {
            "HF_MODEL_ID": inference_model,
            "SM_NUM_GPUS": json.dumps(number_of_gpu),
            "MAX_INPUT_LENGTH": json.dumps(max_input_length),
            "MAX_TOTAL_TOKENS": json.dumps(max_total_tokens),
        },
        "version": model_version,
    },
}

In [37]:

# Either reuse an existing endpoint or deploy a new one, depending on `reload_endpoint`.
if reload_endpoint:
    # Reuse: only record the existing endpoint's name in the shared configuration.
    _MODEL_CONFIG_[inference_model]['endpoint_name'] = endpoint_name
else:
    newline, bold, unbold = "\n", "\033[1m", "\033[0m"

    inference_instance_type = _MODEL_CONFIG_[inference_model]["instance type"]
    new_endpoint_name = _MODEL_CONFIG_[inference_model]['endpoint_name']

    # Retrieve the inference container uri for the selected model.
    # NOTE(review): `inference_model` looks like a Hugging Face hub id, while the
    # `image_uris` / `model_uris` lookups take a `model_id` -- confirm the id is
    # one these lookups accept.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=inference_model,
        model_version=model_version,
        instance_type=inference_instance_type,
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=inference_model, model_version=model_version, model_scope="inference"
    )
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=new_endpoint_name,
        env=_MODEL_CONFIG_[inference_model]["env"],
    )
    # `endpoint_name` makes the endpoint carry the name stored in the configuration,
    # which is the name polled below and handed to the next notebook.
    # `wait=False` returns immediately; readiness is tracked by the polling loop.
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=1,
        instance_type=inference_instance_type,
        predictor_cls=Predictor,
        endpoint_name=new_endpoint_name,
        wait=False,
    )

    # Await completion of endpoint deployment, printing a dot every 15 seconds.
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=new_endpoint_name)

    while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
        time.sleep(15)
        print('.', end='')
        describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=new_endpoint_name)

    # Do not report success for an endpoint that left 'Creating' in a failed state.
    endpoint_status = describe_inference_endpoint_response["EndpointStatus"]
    if endpoint_status != 'InService':
        failure_reason = describe_inference_endpoint_response.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Endpoint {new_endpoint_name} ended in status {endpoint_status}: {failure_reason}"
        )

    print('\nInference endpoint created')
    print(f"{bold}Model {inference_model} has been created successfully.{unbold}{newline}")


We wait for the inference model endpoint to be created. It usually takes 5-10 minutes for the model to be deployed.

We store a few variables to be used in the next notebook.

In [38]:
# Persist the model configuration and the selected model id with IPython's
# %store magic so that another notebook can restore them with `%store -r`.
# (Comments are kept on their own lines: text after a %store line is parsed as arguments.)
%store _MODEL_CONFIG_
# %store embedding_model
%store inference_model

Stored '_MODEL_CONFIG_' (dict)
Stored 'inference_model' (str)
