# Lab model deployment
This notebook deploys the required endpoints for the upcoming labs in Day1 and Day2.


First:
Go to the SageMaker playground and subscribe to the AI21 Jurassic-2 Mid model

https://us-east-1.console.aws.amazon.com/sagemaker/playground?region=us-east-1#/foundation-models/playground/prodview-bzjpjkgd542au

Instructions: 
- Select Data Science 3.0 kernel
- Select ml.m5.xlarge
- Run all cells, wait 10-15min and open "day2-lab1-qa_langchain_jumpstart_bootcamp.ipynb" to continue the lab

Run all cells and open "day1-lab1-langchain_introductin.ipynb" to continue the lab

In [None]:
!pip install --upgrade pip --quiet --disable-pip-version-check --root-user-action=ignore
!pip install --upgrade sagemaker --quiet --root-user-action=ignore
!pip install -U ai21 --quiet --root-user-action=ignore

In [None]:
import sagemaker, boto3, json, time
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Shared SageMaker session; its caller identity ARN is used as the role for the deployed models.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()

# Region of the default boto3 session, and a low-level SageMaker client bound to that region.
aws_region = boto3.Session().region_name
sm_client = boto3.client("sagemaker", region_name=aws_region)


In [None]:
# Uncomment to list your deployed endpoints (e.g. to find the endpoint names to reuse below)
# sm_client.list_endpoints()

### Loading existing endpoints (skipped by default)
If you need to change only a subset of the endpoints, e.g. you are migrating an endpoint to a different instance, you can set `load_endpoint = True` and specify the names of the endpoints you wish to retain. A new endpoint will then be deployed for every model whose endpoint name is left as an empty string.

In [18]:
# Toggle: True reuses the endpoints named below, False deploys a new endpoint for every model.
load_endpoint = False

# Names of existing endpoints to reuse when `load_endpoint` is True.
# A model whose name is left as an empty string gets a new endpoint deployed.
falcon_endpoint_name, embedding_endpoint_name, ai21_endpoint_name = "", "", ""

### Deploying endpoints with configuration
Configuration and deployment of the required endpoints using Amazon SageMaker Jumpstart

In [19]:
# Configure Falcon-40B Model
falcon_inference_model = "huggingface-llm-falcon-40b-instruct-bf16"  # JumpStart model id
# Passed to the container as HF_MODEL_ID; must name the 40B instruct model configured above.
falcon_model_id = "tiiuae/falcon-40b-instruct"
falcon_instance_type = "ml.g5.12xlarge"
falcon_number_of_gpu = 4
falcon_max_input_length = 1024
falcon_max_total_tokens = 2048
# NOTE(review): presumably a container health-check timeout in seconds; it is not
# referenced by the deployment code visible in this notebook - confirm before relying on it.
health_check_timeout = 300

# Configure Embedding model
embedding_model = "huggingface-textembedding-gpt-j-6b-fp16"  # JumpStart model id
embedding_model_instance_type = "ml.g5.2xlarge"

# Configure AI Jurassic model (disabled by default; uncomment to enable)
# ai21_inference_model = "j2-grande"
# ai21_instance_type = "ml.g5.48xlarge"
# ai21_number_of_gpu = 4
# ai21_max_input_length = 1024
# ai21_max_total_tokens = 2048

In [None]:
# Deployment settings for every model, keyed by model id.
# Each entry carries the provider, the instance type, the container environment
# variables and the model version that the deployment cell reads.
_MODEL_CONFIG_ = {
    falcon_inference_model: dict(
        provider="jumpstart",
        instance_type=falcon_instance_type,
        env={
            "HF_MODEL_ID": falcon_model_id,
            # Numeric settings are serialized to strings for the container environment.
            "SM_NUM_GPUS": json.dumps(falcon_number_of_gpu),
            "MAX_INPUT_LENGTH": json.dumps(falcon_max_input_length),
            "MAX_TOTAL_TOKENS": json.dumps(falcon_max_total_tokens),
        },
        version="1.0.0",
    ),
    embedding_model: dict(
        provider="jumpstart",
        instance_type=embedding_model_instance_type,
        env={"TS_DEFAULT_WORKERS_PER_MODEL": "1"},
        version="*",
    ),
    # AI21 entry (disabled by default; uncomment together with the AI21 settings above)
    # ai21_inference_model: dict(
    #     provider="marketplace",
    #     instance_type=ai21_instance_type,
    #     env={
    #         "HF_MODEL_ID": ai21_inference_model,
    #         "SM_NUM_GPUS": json.dumps(ai21_number_of_gpu),
    #         "MAX_INPUT_LENGTH": json.dumps(ai21_max_input_length),
    #         "MAX_TOTAL_TOKENS": json.dumps(ai21_max_total_tokens),
    #     },
    #     version="1.0.0",
    # ),
}

# When reusing endpoints, attach the user-supplied endpoint names to the matching entries.
if load_endpoint:
    _MODEL_CONFIG_[falcon_inference_model].update(endpoint_name=falcon_endpoint_name)
    _MODEL_CONFIG_[embedding_model].update(endpoint_name=embedding_endpoint_name)
    # _MODEL_CONFIG_[ai21_inference_model].update(endpoint_name=ai21_endpoint_name)

In [None]:
from sagemaker import ModelPackage, get_execution_role

def deploy_ai21(model_id: str, config: dict):
    """Start deployment of the AI21 j2-grande marketplace model package.

    Args:
        model_id: Used as the name of the SageMaker endpoint that is created.
        config: Model configuration; only ``config['instance_type']`` is read.

    Returns:
        Whatever ``ModelPackage.deploy`` returns. The deployment is started with
        ``wait=False``, so the endpoint may still be creating when this returns.

    Raises:
        ValueError: If the region of the default boto3 session has no entry in
            the model package map below.
    """
    model_package_map = {
        "us-east-1": "arn:aws:sagemaker:us-east-1:865070037744:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "us-east-2": "arn:aws:sagemaker:us-east-2:057799348421:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "us-west-1": "arn:aws:sagemaker:us-west-1:382657785993:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "us-west-2": "arn:aws:sagemaker:us-west-2:594846645681:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ca-central-1": "arn:aws:sagemaker:ca-central-1:470592106596:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "eu-central-1": "arn:aws:sagemaker:eu-central-1:446921602837:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "eu-west-1": "arn:aws:sagemaker:eu-west-1:985815980388:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "eu-west-2": "arn:aws:sagemaker:eu-west-2:856760150666:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "eu-west-3": "arn:aws:sagemaker:eu-west-3:843114510376:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "eu-north-1": "arn:aws:sagemaker:eu-north-1:136758871317:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ap-southeast-1": "arn:aws:sagemaker:ap-southeast-1:192199979996:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ap-southeast-2": "arn:aws:sagemaker:ap-southeast-2:666831318237:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ap-northeast-2": "arn:aws:sagemaker:ap-northeast-2:745090734665:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ap-northeast-1": "arn:aws:sagemaker:ap-northeast-1:977537786026:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "ap-south-1": "arn:aws:sagemaker:ap-south-1:077584701553:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39",
        "sa-east-1": "arn:aws:sagemaker:sa-east-1:270155090741:model-package/j2-grande-v1-0-43-6964668ff5aa38edbffbbb3f57e83d39"
    }

    # Fail fast with a real exception: raising a plain string is itself a TypeError.
    region = boto3.Session().region_name
    if region not in model_package_map:
        raise ValueError(
            f"UNSUPPORTED REGION: {region!r} has no AI21 model package "
            f"(supported regions: {', '.join(sorted(model_package_map))})"
        )
    model_package_arn = model_package_map[region]

    # Resolve the session and role locally so the function does not depend on notebook globals.
    sagemaker_session = sagemaker.Session()
    role = get_execution_role(sagemaker_session)

    # Create a deployable model from the model package.
    model = ModelPackage(
        role=role,
        model_package_arn=model_package_arn,
        sagemaker_session=sagemaker_session,
    )

    # Start the deployment without blocking; callers poll the endpoint status afterwards.
    predictor = model.deploy(
        1,
        config['instance_type'],
        endpoint_name=model_id,
        model_data_download_timeout=3600,
        container_startup_health_check_timeout=600,
        wait=False,
    )
    return predictor

In [None]:
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

# Start the deployment of every configured model. Deployments are started with
# wait=False so the endpoints are created in parallel; completion is polled afterwards.
for model_id in _MODEL_CONFIG_:
    endpoint_config = _MODEL_CONFIG_[model_id]

    # Reuse an existing endpoint when requested and a name was supplied for this model.
    if load_endpoint and endpoint_config.get("endpoint_name", ""):
        print(f"{bold}Loading endpoint {endpoint_config['endpoint_name']}{unbold}{newline}")
        continue

    # Otherwise deploy a new endpoint.
    if endpoint_config["provider"] == "marketplace":
        deploy_ai21(model_id, endpoint_config)
        # deploy_ai21 names the endpoint after the model id, so record that exact name.
        endpoint_name = model_id
    else:
        endpoint_name = name_from_base(f"{model_id}")
        instance_type = endpoint_config["instance_type"]
        model_version = endpoint_config["version"]
        print(f"Deploying model {model_id} to SageMaker Enpdoint on a {instance_type} instance type")
        # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
        deploy_image_uri = image_uris.retrieve(
            region=None,
            framework=None,  # automatically inferred from model_id
            image_scope="inference",
            model_id=model_id,
            model_version=model_version,
            instance_type=instance_type,
        )
        # Retrieve the model uri.
        model_uri = model_uris.retrieve(
            model_id=model_id, model_version=model_version, model_scope="inference"
        )
        model_inference = Model(
            image_uri=deploy_image_uri,
            model_data=model_uri,
            role=aws_role,
            predictor_cls=Predictor,
            name=model_id,
            env=endpoint_config["env"],
        )
        # The endpoint name must be passed as `endpoint_name`; otherwise the SDK generates
        # its own name, which would not match the one recorded in _MODEL_CONFIG_ below.
        model_predictor_inference = model_inference.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            predictor_cls=Predictor,
            endpoint_name=endpoint_name,
            wait=False
        )

    print(f"{bold}Deployment of model {model_id} has been initialized.{unbold}{newline}")
    # Record the endpoint name so the cells that follow can poll and store it.
    endpoint_config["endpoint_name"] = endpoint_name



We wait for the embedding model and the selected inference models to be created. It usually takes 5-10 minutes.

In [None]:
# Poll the embedding endpoint every 15 seconds until it leaves the 'Creating' state.
# The endpoint name is always read from _MODEL_CONFIG_, never from variables left
# over from the deployment loop.
describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[embedding_model]['endpoint_name'])
while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[embedding_model]['endpoint_name'])

# Do not report success when the creation ended in failure.
if describe_inference_endpoint_response["EndpointStatus"] == 'Failed':
    raise RuntimeError(
        "Embedding endpoint creation failed: "
        + describe_inference_endpoint_response.get("FailureReason", "no failure reason reported")
    )
print('\nEmbedding endpoint created')

In [None]:
# Poll the Falcon endpoint every 15 seconds until it leaves the 'Creating' state.
describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[falcon_inference_model]['endpoint_name'])
while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
    time.sleep(15)
    print('.', end='')
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[falcon_inference_model]['endpoint_name'])

# Do not report success when the creation ended in failure.
if describe_inference_endpoint_response["EndpointStatus"] == 'Failed':
    raise RuntimeError(
        "Falcon40B inference endpoint creation failed: "
        + describe_inference_endpoint_response.get("FailureReason", "no failure reason reported")
    )
print('\nFalcon40B inference endpoint created')

In [None]:
# The AI21 configuration is commented out by default, so `ai21_inference_model` may be
# undefined. Only wait when the model was actually configured; otherwise skip the cell
# so that "Run All" completes.
ai21_model_key = globals().get("ai21_inference_model")
if ai21_model_key in _MODEL_CONFIG_:
    # Poll the AI21 endpoint every 15 seconds until it leaves the 'Creating' state.
    describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[ai21_model_key]['endpoint_name'])
    while describe_inference_endpoint_response["EndpointStatus"] == 'Creating':
        time.sleep(15)
        print('.', end='')
        describe_inference_endpoint_response = sm_client.describe_endpoint(EndpointName=_MODEL_CONFIG_[ai21_model_key]['endpoint_name'])

    # Do not report success when the creation ended in failure.
    if describe_inference_endpoint_response["EndpointStatus"] == 'Failed':
        raise RuntimeError(
            "AI21 inference endpoint creation failed: "
            + describe_inference_endpoint_response.get("FailureReason", "no failure reason reported")
        )
    print('\nAI21 Jurrasic Grande inference endpoint created')
else:
    print('AI21 model is not configured; skipping the wait for its endpoint')

We store a few variables to be used in the next notebook.

In [None]:
# Persist the model configuration (including endpoint names) and the model ids
# with IPython's %store magic, for use in the next notebook.
%store _MODEL_CONFIG_
%store falcon_inference_model
%store embedding_model
# Uncomment when the AI21 model is enabled in the configuration cells.
# %store ai21_inference_model