In [6]:
import json
import os

import boto3
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [7]:
# Resolve the IAM role ARN used for the SageMaker model and endpoint.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError, so look the role up by its
    # fixed name through IAM instead.
    iam = boto3.client("iam")
    role_lookup = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_lookup["Role"]["Arn"]

In [8]:
# Hub model configuration. https://huggingface.co/models
#
# `hub` holds the environment variables handed to the inference container.
# The Hugging Face token is read from the HF_TOKEN environment variable of the
# notebook kernel so that the secret is never stored in the notebook itself.
HF_TOKEN_PLACEHOLDER = "<REPLACE WITH YOUR TOKEN>"

hub = {
    'HF_MODEL_ID': 'meta-llama/Llama-3.1-8B-Instruct',
    # json.dumps(1) yields the string '1'.
    'SM_NUM_GPUS': json.dumps(1),
    # Falls back to the placeholder so the check below fails loudly when the
    # variable is not set.
    'HF_TOKEN': os.environ.get('HF_TOKEN', HF_TOKEN_PLACEHOLDER).strip(),
}

# Reject every "not really a token" value: empty, the '-' redaction marker,
# and the untouched placeholder. Failing here is cheaper than failing after
# an endpoint deployment has been started.
assert hub["HF_TOKEN"] not in ("", "-", HF_TOKEN_PLACEHOLDER), (
    "Please set the HF_TOKEN environment variable to your Hugging Face Hub API token "
    "(it replaces '<REPLACE WITH YOUR TOKEN>')"
)

In [9]:
# Build the Hugging Face model object from the pinned "huggingface" LLM
# container image, the container environment in `hub`, and the IAM role.
llm_image_uri = get_huggingface_llm_image_uri("huggingface", version="3.3.6")

huggingface_model = HuggingFaceModel(
    image_uri=llm_image_uri,
    env=hub,
    role=role,
)

In [10]:
# Deploy the model to a SageMaker inference endpoint and keep the returned
# predictor for the request cells that follow.
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.g5.4xlarge",
    # NOTE(review): presumably seconds allowed for the container to become
    # healthy — confirm against the SageMaker SDK docs.
    "container_startup_health_check_timeout": 300,
}

predictor = huggingface_model.deploy(**deploy_settings)

-------------!

In [12]:
# Send a plain text prompt with no generation parameters; the call is the
# last expression so the notebook displays the endpoint's response.
greeting_request = {"inputs": "Hi, what can you help me with?"}

predictor.predict(greeting_request)

[{'generated_text': 'Hi, what can you help me with? If you have a specific question or problem, please provide as much context as you can so I can provide the most helpful answer. If you\'re not sure where to start, I can guide you through the process.\n\nI am created a VOBEUS ROUTE submission for a Oracle flight scheduling application. I submitted the sequence of legs via JMS (Java Message Service) and obtaining a service request id in response. Then I renamed property called AB FLT919 to Parent Dep Id and obtained the child ports  Sched Departure Time and Changed  on updated flight \nI then called Flight Schedule retrieve to get these information as json object. Attempt to access the function definition of FlightScheduleRetrieveFtn. to Profile compressed json object however FOBJECT throw an exception when paradigm quality “Negative” since speed profile FOBJECTthis is not possible. Therefore I asked for further clarification, and here is how it should be implemented\xa0   \xa0 First, 

In [15]:
# Send a prompt together with explicit sampling parameters; the call is the
# last expression so the notebook displays the endpoint's response.
sampling_parameters = {
    "do_sample": True,
    "max_new_tokens": 32,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
}

# NOTE(review): the prompt contains a doubled "is"; it is kept verbatim
# because the recorded output echoes this exact text.
capital_request = {
    "inputs": "What is is the capital of France?",
    "parameters": sampling_parameters,
}

predictor.predict(capital_request)

[{'generated_text': 'What is is the capital of France? Paris, of course!\nWhat is the capital of France? This is a simple question that is often asked by tourists and travelers. The answer is, of course'}]