# Lab model deployment

Run all cells, then open "day2-lab1-qa_langchain_jumpstart_bootcamp.ipynb" to continue the lab.

In [18]:
import sagemaker, boto3, json, time
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

# Version selector handed to the JumpStart image/model URI lookups.
# NOTE(review): "*" presumably selects the latest available version - confirm.
model_version = "*"

# Region of the default boto3 session, and a low-level SageMaker client bound to it
# (used later to poll endpoint status).
aws_region = boto3.Session().region_name
sm_client = boto3.client("sagemaker", region_name=aws_region)

# SageMaker session and the caller-identity ARN that is passed as the model role.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# Second SageMaker session object, kept under its original name.
sess = sagemaker.Session()

In [19]:
# JumpStart model id of the embedding model: turns document chunks and the
# search query into embedding vectors.
embedding_model = "huggingface-textembedding-gpt-j-6b-fp16"

# JumpStart model id of the text2text model: used for question answering.
inference_model = "huggingface-text2text-flan-t5-xxl-fp16"

In [20]:
# Per-model deployment settings, keyed by model id.
#   "instance type" - instance type the model's endpoint is deployed on
#   "env"           - environment variables passed to the model
# The deployment cell later adds an "endpoint_name" entry to each model's settings.
_MODEL_CONFIG_ = {}

# Question-answering model.
_MODEL_CONFIG_[inference_model] = {
    "instance type": "ml.g5.12xlarge",
    "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1", "SAGEMAKER_MODEL_SERVER_WORKERS": "1"},
}

# Embedding model.
_MODEL_CONFIG_[embedding_model] = {
    "instance type": "ml.g5.2xlarge",
    "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
}

In [21]:
# ANSI escape sequences used to print the per-model status line in bold.
newline = "\n"
bold = "\033[1m"
unbold = "\033[0m"

# Deploy one endpoint for every model listed in _MODEL_CONFIG_.
for model_id, model_config in _MODEL_CONFIG_.items():
    inference_instance_type = model_config["instance type"]
    endpoint_name = name_from_base(f"raglc-{model_id}")

    # Look up the model uri for inference.
    model_uri = model_uris.retrieve(
        model_scope="inference",
        model_id=model_id,
        model_version=model_version,
    )
    # Look up the inference container image uri for this model and instance type.
    deploy_image_uri = image_uris.retrieve(
        model_id=model_id,
        model_version=model_version,
        image_scope="inference",
        instance_type=inference_instance_type,
        framework=None,  # inferred from model_id
        region=None,
    )

    # The model object is named after the endpoint it will back.
    model_inference = Model(
        name=endpoint_name,
        role=aws_role,
        image_uri=deploy_image_uri,
        model_data=model_uri,
        env=model_config["env"],
        predictor_cls=Predictor,
    )
    # wait=False: the following cells poll the endpoint status themselves.
    model_predictor_inference = model_inference.deploy(
        endpoint_name=endpoint_name,
        instance_type=inference_instance_type,
        initial_instance_count=1,
        predictor_cls=Predictor,
        wait=False,
    )

    print(f"{bold}Model {model_id} has been created successfully.{unbold}{newline}")

    # Record the endpoint name so later cells (and the stored config) can find it.
    model_config["endpoint_name"] = endpoint_name

[1mModel huggingface-text2text-flan-t5-xxl-fp16 has been created successfully.[0m

[1mModel huggingface-textembedding-gpt-j-6b-fp16 has been created successfully.[0m



We wait for the embedding model and the inference model endpoints to be created. This usually takes 5-10 minutes, notably for the Flan-T5 model to be deployed.

In [22]:
# Poll the embedding endpoint every 15 seconds until it leaves the 'Creating' state,
# printing a dot per poll as a progress indicator.
describe_embedding_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[embedding_model]['endpoint_name'])

while describe_embedding_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_embedding_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[embedding_model]['endpoint_name'])

# The loop also exits when creation fails, so only report success for 'InService';
# any other final status stops the notebook with the reported failure reason.
if describe_embedding_endpoint_response["EndpointStatus"] != 'InService':
    raise RuntimeError(
        "Embedding endpoint "
        f"{_MODEL_CONFIG_[embedding_model]['endpoint_name']} ended in status "
        f"{describe_embedding_endpoint_response['EndpointStatus']}: "
        f"{describe_embedding_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )
print('\nEmbedding endpoint created')

..................................
Embedding endpoint created


In [23]:
# Poll the inference endpoint every 15 seconds until it leaves the 'Creating' state,
# printing a dot per poll as a progress indicator.
describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])

while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[inference_model]['endpoint_name'])

# The loop also exits when creation fails, so only report success for 'InService';
# any other final status stops the notebook with the reported failure reason.
if describe_inference_endpoint_response["EndpointStatus"] != 'InService':
    raise RuntimeError(
        "Inference endpoint "
        f"{_MODEL_CONFIG_[inference_model]['endpoint_name']} ended in status "
        f"{describe_inference_endpoint_response['EndpointStatus']}: "
        f"{describe_inference_endpoint_response.get('FailureReason', 'no failure reason reported')}"
    )
# This cell waits on the inference endpoint, so the message names it
# (the original text said "Embedding").
print('\nInference endpoint created')


Embedding endpoint created


We store a few variables to be used in the next notebook.

In [25]:
# Persist the variables the next notebook needs, using IPython's %store magic.
# _MODEL_CONFIG_ carries each model's instance type, env and endpoint name.
%store _MODEL_CONFIG_
# Model ids, which are also the keys into _MODEL_CONFIG_.
%store embedding_model
%store inference_model

Stored '_MODEL_CONFIG_' (dict)
Stored 'embedding_model' (str)
Stored 'inference_model' (str)
