# Lab model deployment

Run all cells and open "day2-lab1-qa_langchain_jumpstart_bootcamp.ipynb" to continue the lab

In [25]:
import sagemaker, boto3, json, time
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

sagemaker_session = Session()
sm_client = boto3.client('sagemaker')
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = boto3.Session().region_name
model_version = "*"


In [26]:
#model used for its question & answering capabilities
inference_model = "huggingface-text2text-flan-t5-xxl-fp16"

In [27]:
#models configuration
_MODEL_CONFIG_ = {
    inference_model: {
        "instance type": "ml.g5.12xlarge",
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1"},
    }
}

In [28]:
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

for model_id in _MODEL_CONFIG_:
    endpoint_name = name_from_base(f"raglc-{model_id}")
    inference_instance_type = _MODEL_CONFIG_[model_id]["instance type"]

    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=inference_instance_type,
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model_id, model_version=model_version, model_scope="inference"
    )
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=Predictor,
        name=endpoint_name,
        env=_MODEL_CONFIG_[model_id]["env"],
    )
    # model_predictor_inference = model_inference.deploy(
    #     initial_instance_count=1,
    #     instance_type=inference_instance_type,
    #     predictor_cls=Predictor,
    #     endpoint_name=endpoint_name,
    #     wait=False,
    # )

    print(f"{bold}Model {model_id} has been created successfully.{unbold}{newline}")
    _MODEL_CONFIG_[model_id]["endpoint_name"] = endpoint_name

[1mModel huggingface-text2text-flan-t5-xxl-fp16 has been created successfully.[0m



In [29]:
sm_client.list_endpoints()

{'Endpoints': [{'EndpointName': 'raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint/raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847',
   'CreationTime': datetime.datetime(2023, 6, 21, 15, 42, 46, 723000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2023, 6, 21, 15, 49, 52, 840000, tzinfo=tzlocal()),
   'EndpointStatus': 'InService'},
  {'EndpointName': 'raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-06-41-14-772',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint/raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-06-41-14-772',
   'CreationTime': datetime.datetime(2023, 6, 21, 14, 41, 16, 568000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2023, 6, 21, 14, 48, 27, 643000, tzinfo=tzlocal()),
   'EndpointStatus': 'InService'},
  {'EndpointName': 'js-sd-20230609012305',
   'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint

In [30]:
_MODEL_CONFIG_[inference_model]['endpoint_name']

'raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-09-08-58-793'

In [20]:
sm_client.describe_endpoint(EndpointName="raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847")

{'EndpointName': 'raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847',
 'EndpointArn': 'arn:aws:sagemaker:us-east-1:591736166602:endpoint/raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847',
 'EndpointConfigName': 'raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-07-42-44-847',
 'ProductionVariants': [{'VariantName': 'AllTraffic',
   'DeployedImages': [{'SpecifiedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:1.13.1-transformers4.26.0-gpu-py39-cu117-ubuntu20.04',
     'ResolvedImage': '763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference@sha256:0af3a86c1f433bafe64a03db1375b84789b20eec51f628bbff090863d6ccd9c0',
     'ResolutionTime': datetime.datetime(2023, 6, 21, 15, 42, 47, 246000, tzinfo=tzlocal())}],
   'CurrentWeight': 1.0,
   'DesiredWeight': 1.0,
   'CurrentInstanceCount': 1,
   'DesiredInstanceCount': 1}],
 'EndpointStatus': 'InService',
 'CreationTime': datetime.datetime(2023, 6, 21, 15, 42, 

We wait for the embedding model and the inference model to be created. it usually takes 5-10 mins notably for the Flan T5 to be deployed.

In [31]:
describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])

while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])
print('\nEmbedding endpoint created')

ClientError: An error occurred (ValidationException) when calling the DescribeEndpoint operation: Could not find endpoint "arn:aws:sagemaker:us-east-1:591736166602:endpoint/raglc-huggingface-text2text-flan-t5-xxl-2023-06-21-09-08-58-793".

We store few variables to be used in the next notebook

In [25]:
%store _MODEL_CONFIG_
%store embedding_model
%store inference_model

Stored '_MODEL_CONFIG_' (dict)
Stored 'embedding_model' (str)
Stored 'inference_model' (str)
