# Inference API

In [3]:
import os

## Amazon SageMaker

### Mistral on my own ECR

In [None]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# IAM role ARN handed to HuggingFaceModel(role=...). It is pinned on purpose:
# replace the <REPLACE WITH ACCOUNT ID> placeholder with your AWS account ID
# before running.
#
# The previous try/except wrapper was dropped because a plain string
# assignment cannot raise ValueError, so its handler could never execute.
# To resolve the role dynamically instead, use:
#     try:
#         role = sagemaker.get_execution_role()
#     except ValueError:
#         iam = boto3.client('iam')
#         role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
role = 'arn:aws:iam::<REPLACE WITH ACCOUNT ID>:role/service-role/AmazonSageMaker-ExecutionRole-20231220T180882'

# Environment variables passed to the serving container via env=hub.
# Model IDs can be browsed at https://huggingface.co/models
hub = dict(
    HF_MODEL_ID='mistralai/Mistral-7B-Instruct-v0.1',
    # json.dumps(1) produces the string '1', so every value in the dict is a str.
    SM_NUM_GPUS=json.dumps(1),
)



In [None]:
# Describe the model to SageMaker: the container image to run, the environment
# handed to that container, and the IAM role used for the deployment.
# Alternative image: get_huggingface_llm_image_uri("huggingface", version="1.1.0")
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    image_uri='public.ecr.aws/p4k4c4t9/huggingface-pytorch-inference-extended',
)

# Create the endpoint on a single ml.g4dn.2xlarge instance, with a startup
# health-check timeout of 300.
# NOTE(review): nothing in this cell tears the endpoint down again; it keeps
# running until it is removed explicitly (e.g. predictor.delete_endpoint()).
predictor = huggingface_model.deploy(
    instance_type="ml.g4dn.2xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

# Send one request; as the last expression of the cell, the reply is displayed.
predictor.predict(dict(inputs="My name is Julien and I like to"))



### Llama 2 7B Chat

In [None]:
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# IAM role ARN handed to HuggingFaceModel(role=...). It is pinned on purpose:
# replace the <REPLACE WITH ACCOUNT ID> placeholder with your AWS account ID
# before running.
#
# The previous try/except wrapper was dropped because a plain string
# assignment cannot raise ValueError, so its handler could never execute.
# To resolve the role dynamically instead, use:
#     try:
#         role = sagemaker.get_execution_role()
#     except ValueError:
#         iam = boto3.client('iam')
#         role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
role = 'arn:aws:iam::<REPLACE WITH ACCOUNT ID>:role/service-role/AmazonSageMaker-ExecutionRole-20231220T180882'

# Environment variables passed to the serving container via env=hub.
# Model IDs can be browsed at https://huggingface.co/models
#
# The Hub access token is read from the HUGGING_FACE_HUB_TOKEN environment
# variable so that it does not have to be pasted into (and saved with) the
# notebook. When the variable is unset or empty, the placeholder is used and
# the check below stops the cell.
hub = {
    'HF_MODEL_ID': 'meta-llama/Llama-2-7b-chat-hf',
    # json.dumps(1) produces the string '1'.
    'SM_NUM_GPUS': json.dumps(1),
    'HUGGING_FACE_HUB_TOKEN': os.environ.get('HUGGING_FACE_HUB_TOKEN') or '<REPLACE WITH YOUR TOKEN>',
}

# Stop here, before the model is created and deployed, if no token was supplied.
assert hub['HUGGING_FACE_HUB_TOKEN'] != '<REPLACE WITH YOUR TOKEN>', "You have to provide a token."

# Describe the model to SageMaker: the Hugging Face LLM container image
# (version 1.1.0), the environment handed to that container, and the IAM role
# used for the deployment.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    image_uri=get_huggingface_llm_image_uri("huggingface", version="1.1.0"),
)

# Create the endpoint on a single ml.g4dn.2xlarge instance, with a startup
# health-check timeout of 300.
# NOTE(review): nothing in this cell tears the endpoint down again; it keeps
# running until it is removed explicitly (e.g. predictor.delete_endpoint()).
predictor = huggingface_model.deploy(
    instance_type="ml.g4dn.2xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

# Send one request; as the last expression of the cell, the reply is displayed.
predictor.predict(dict(inputs="My name is Julien and I like to"))



## Hugging Face Inference API

In [29]:
import requests

# API token taken from the INFERENCE_API_KEY environment variable, so it never
# appears in the notebook itself. A missing variable raises KeyError here.
api_key = os.environ["INFERENCE_API_KEY"]

# Bearer-token header and target URL; query() reads both at call time.
headers = dict(Authorization=f"Bearer {api_key}")
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-70b-chat-hf"

def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    ``API_URL`` and ``headers`` are module-level globals looked up when the
    function is called, so re-assigning them retargets later calls.

    Args:
        payload: JSON-serialisable request body; in this notebook, a dict
            with an ``"inputs"`` prompt and optional extra keys.

    Returns:
        The value produced by decoding the response body as JSON.
    """
    return requests.post(API_URL, headers=headers, json=payload).json()
	
# Send an open-ended prompt without any explicit generation parameters.
output = query(dict(inputs="Can you please let us know more details about your "))

# Bare expression so the notebook renders the raw response.
output



[{'generated_text': '2019 trip to Japan?\nWe are planning a trip to Japan in 2019 and would love to hear more about your experience.\nWe are a family of 4 (2 adults and 2 children aged 10 and 12) and are interested in visiting Tokyo, Kyoto, Osaka and Hiroshima.\nWe would appreciate any information you can share about your itinerary, accommodation, transportation and any tips or'}]

In [5]:
# Same prompt again, this time with explicit generation parameters and
# request options.
output = query(dict(
    inputs="Can you please let us know more details about your ",
    parameters=dict(max_new_tokens=250, max_time=60.0, return_full_text=False),
    options=dict(wait_for_model=True),
))

# Display only the generated text of the first item in the response.
output[0]['generated_text']



'2018-2019 season?\n\nWe are a small, but very active club. We have 15 members, 10 of which are active. We have 2 meetings per month, one is a business meeting and the other is a social meeting. We have a lot of fun and we are very active in the community. We have a lot of fundraisers and we donate to many different organizations. We have a lot of fun and we are very active in the community.\n\nWhat are some of the highlights of your 2018-2019 season?\n\nWe had a lot of fun and we are very active in the community. We have a lot of fundraisers and we donate to many different organizations. We have a lot of fun and we are very active in the community.\n\nWhat are some of the challenges you faced this year?\n\nWe had a lot of fun and we are very active in the community. We have a lot of fundraisers and we donate to many different organizations. We have a lot of fun and we are very active in the community.\n\nWhat are some of the things you are looking'

In [28]:
import requests

# Retarget the client at the Llama 2 13B chat model. The header is rebuilt
# from the api_key defined earlier in the notebook.
headers = dict(Authorization=f"Bearer {api_key}")
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-13b-chat-hf"

# NOTE(review): this definition has the same body as the `query` defined
# earlier in the notebook. Because API_URL and headers are read at call time,
# re-assigning those globals is enough and the redefinition is redundant.
def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body; in this notebook, a dict
            with an ``"inputs"`` prompt.

    Returns:
        The value produced by decoding the response body as JSON.
    """
    return requests.post(API_URL, headers=headers, json=payload).json()
	
# Ask the 13B model a question, without explicit generation parameters.
output = query(dict(inputs="What are some problems with monolithic ML pipelines? "))

# Bare expression so the notebook renders the raw response.
output


[{'generated_text': ' What are some potential solutions?\n\nMonolithic ML pipelines, where all the data is processed in a single pipeline, can be challenging to maintain and scale.  Here are some potential problems and solutions:\n\nProblems:\n\n1. Data heterogeneity: Monolithic pipelines can struggle with handling diverse data types and sizes, leading to inefficiencies and errors.\n\nSolution: Use modular, microservices-based architectures'}]

In [27]:
# Request headers: the bearer token (api_key is defined earlier in the
# notebook) plus an explicit JSON content type.
headers = dict(Authorization=f"Bearer {api_key}")
headers["Content-Type"] = "application/json"

# Dedicated endpoint URL; substitute your own endpoint's subdomain for the
# <your_code> placeholder before running.
API_URL = "https://<your_code>.us-east-1.aws.endpoints.huggingface.cloud"

# NOTE(review): this definition has the same body as the `query` defined
# earlier in the notebook. Because API_URL and headers are read at call time,
# re-assigning those globals is enough and the redefinition is redundant.
def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body; in this notebook, a dict
            with an ``"inputs"`` prompt.

    Returns:
        The value produced by decoding the response body as JSON.
    """
    return requests.post(API_URL, headers=headers, json=payload).json()

# Send the same question to the dedicated endpoint configured above.
output = query(dict(inputs="What are some problems with monolithic ML pipelines? "))

# Bare expression so the notebook renders the raw response.
output


[{'generated_text': ' What are some potential solutions?\n\nMonolithic machine learning (ML) pipelines are a common approach to building and deploying ML models. However, they can have some limitations and potential problems. Here are some of the issues you may encounter with monolithic ML pipelines and some potential solutions:\n\n1. Inflexibility: Monolithic pipelines can be inflexible and difficult to modify once they are built. This can make it challenging to adapt'}]