In [1]:
%pip install huggingface_hub 

Note: you may need to restart the kernel to use updated packages.


In [3]:
import sagemaker

# Your IAM role that provides access to SageMaker and S3. 
# See https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-ex-role.html if running on a SageMaker notebook
iam_role = sagemaker.get_execution_role()  # execution role for the endpoint

# manages interactions with the sagemaker apis
sagemaker_session = sagemaker.session.Session()

# region is needed to retrieve the lmi container
region = sagemaker_session._region_name

# get the lmi image uri
# available frameworks: "djl-lmi" (for vllm), "djl-tensorrtllm" (for tensorrt-llm)
container_uri = sagemaker.image_uris.retrieve(framework="djl-lmi", version="0.30.0", region=region)

# create a unique endpoint name
endpoint_name = sagemaker.utils.name_from_base("my-lmi-endpoint")

In [4]:
container_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.30.0-lmi12.0.0-cu124'

In [5]:
endpoint_name

'my-lmi-endpoint-2025-11-21-10-21-36-572'

In [6]:
from huggingface_hub import snapshot_download
import boto3, os

hf_token = "YOUR_HF_TOKEN"
model_id = "meta-llama/Llama-3.2-1B"

local_dir = "./my-lmi-model"

snapshot_path = snapshot_download(
    repo_id=model_id,
    local_dir=local_dir,
    token=hf_token,
)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

In [7]:
bucket = "YOUR_BUCKET_NAME"
prefix = "my-lmi-model"     # this is the exact folder name you wanted in S3

s3 = boto3.client("s3")

for root, dirs, files in os.walk(local_dir):
    for file in files:
        full_path = os.path.join(root, file)
        relative = os.path.relpath(full_path, local_dir)
        key = f"{prefix}/{relative}"
        s3.upload_file(full_path, bucket, key)

In [8]:
%%writefile serving.properties
engine=Python
option.entryPoint=djl_python.huggingface
option.rolling_batch=vllm
option.dtype=bf16
option.tensor_parallel_degree=max
option.max_rolling_batch_size=256

Overwriting serving.properties


In [9]:
serving_file = "./serving.properties"     # wherever your file is located locally
serving_key  = f"{prefix}/serving.properties"

s3.upload_file(serving_file, bucket, serving_key)

In [10]:
s3 = boto3.client("s3")

response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

if "Contents" in response:
    for obj in response["Contents"]:
        print(obj["Key"])
else:
    print("No objects found under this prefix.")

my-lmi-model/.cache/huggingface/.gitignore
my-lmi-model/.cache/huggingface/download/.gitattributes.lock
my-lmi-model/.cache/huggingface/download/.gitattributes.metadata
my-lmi-model/.cache/huggingface/download/LICENSE.txt.lock
my-lmi-model/.cache/huggingface/download/LICENSE.txt.metadata
my-lmi-model/.cache/huggingface/download/README.md.lock
my-lmi-model/.cache/huggingface/download/README.md.metadata
my-lmi-model/.cache/huggingface/download/USE_POLICY.md.lock
my-lmi-model/.cache/huggingface/download/USE_POLICY.md.metadata
my-lmi-model/.cache/huggingface/download/config.json.lock
my-lmi-model/.cache/huggingface/download/config.json.metadata
my-lmi-model/.cache/huggingface/download/generation_config.json.lock
my-lmi-model/.cache/huggingface/download/generation_config.json.metadata
my-lmi-model/.cache/huggingface/download/model.safetensors.lock
my-lmi-model/.cache/huggingface/download/model.safetensors.metadata
my-lmi-model/.cache/huggingface/download/original/consolidated.00.pth.lock
my

In [11]:
model_data = {
    "S3DataSource": {
        "S3Uri": "s3://YOUR_BUCKET_NAME/my-lmi-model/",
        'S3DataType': 'S3Prefix',
        'CompressionType': 'None'
    }
} 

In [15]:
instance_type = "ml.g5.12xlarge"

In [16]:
model = sagemaker.Model(image_uri=container_uri, model_data=model_data, role=iam_role)
# deploy your model
model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    endpoint_name=endpoint_name,
)

----------!

In [17]:
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [18]:
outputs = predictor.predict({
    "inputs": "The meaning of life is", 
    "parameters": {"do_sample": True, "max_new_tokens": 256}
})

In [19]:
print(outputs)

{'generated_text': " death, but in a merry fashion. Anita Ruyer examines that phase of existence in which the two are at least equal.\nThe meaning of life is death, but with a certain merry flavour. Anita Ruyer discusses that phase of existence in which the two are at least equal and unveils different theories about what it should consist of. The author draws an impressionistic outline and fills in the blanks with her distinctive style. On the basis of this, the reader will form her or his own individual meaning. She gives the formula 'live and let live' and, with characteristic sparkle, adds, 'a little humility does not go amiss,' which is generally held as an important principle.\nSample Book 1-3 PSD Logo High â‚¬18.23 $19.00 n/a"}


In [20]:
sagemaker_session.delete_endpoint(endpoint_name)
sagemaker_session.delete_endpoint_config(endpoint_name)
model.delete_model()