# Prepare PaliGemma for deployment

Run the cells below and follow the instructions to deploy the model to the endpoint. You should set the PROCESSING_DIR variable to a directory on your machine which is not git-tracked.  

In [1]:
PROCESSING_DIR = "./TEMPS" 

In [2]:
pwd

'/home/sagemaker-user/tmp/sample-genai-edge-assistant/backend/backend/sagemaker/sagemakerendpoint/prepare_model'

In [3]:
!mkdir -p {PROCESSING_DIR}/code

In [4]:
%%writefile {PROCESSING_DIR}/code/requirements.txt
accelerate
bitsandbytes
git+https://github.com/huggingface/transformers.git@v4.41.2
Pillow

Writing ./TEMPS/code/requirements.txt


In [5]:
%%writefile {PROCESSING_DIR}/code/inference.py
from transformers import AutoTokenizer, PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
from PIL import Image
import base64
from io import BytesIO

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def process_image(img_bytes):
    buffer = BytesIO(img_bytes)
    img = Image.open(buffer)
    return img.convert("RGB")

def initialize_model(model_path):
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_path, 
        torch_dtype=torch.bfloat16
    ).to(device)
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map=device,
        revision="bfloat16",
    ).eval()
    processor = PaliGemmaProcessor.from_pretrained(model_path)
    return model, processor

def generate_response(input_data, model_and_processor):
    model, processor = model_and_processor
    
    text_prompt = input_data.get("prompt", "")
    image_data = base64.b64decode(input_data.get("image", ""))
    
    processed_image = process_image(image_data)

    model_input = processor(
        text=text_prompt, 
        images=processed_image, 
        padding="longest", 
        do_convert_rgb=True, 
        return_tensors="pt"
    ).to(device, dtype=model.dtype)

    with torch.no_grad():
        output_ids = model.generate(**model_input, max_length=496)
        output_text = processor.decode(output_ids[0], skip_special_tokens=True)
    
    return {"response": output_text}

def model_fn(model_dir):
    return initialize_model(model_dir)

def predict_fn(data, model_process):
    return generate_response(data, model_process)

Writing ./TEMPS/code/inference.py


## Fetch the model

Alternatively to the following code, you can download the model by yourself (using Huggingface CLI, ...)

In [7]:
import random
from pathlib import Path
from shutil import copytree
from huggingface_hub import snapshot_download

MODEL_IDENTIFIER = "google/paligemma-3b-mix-224"

# Prerequisite: Accept the Gemma terms and conditions: https://huggingface.co/google/paligemma-3b-mix-224
# Enter your HuggingFace token: https://huggingface.co/settings/tokens
user_token = input("Enter your HuggingFace token: ")
if not user_token:
    raise ValueError("HuggingFace token is required.")

# Fetch model snapshot
snapshot_path = snapshot_download(
    repo_id=MODEL_IDENTIFIER,
    use_auth_token=user_token,
    local_dir=Path(PROCESSING_DIR, "hf_download")
)

# Create model directory with random name
unique_id = random.getrandbits(16)
model_directory = Path(PROCESSING_DIR, f"model-{unique_id}")
model_directory.mkdir(exist_ok=True)

# Copy snapshot to model directory
copytree(snapshot_path, str(model_directory), dirs_exist_ok=True)

# Copy code/ to model directory
copytree(Path(PROCESSING_DIR, "code"), model_directory.joinpath("code"), dirs_exist_ok=True)

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

PosixPath('TEMPS/model-47549/code')

## Compress the model

In [8]:
!tar -czvf {PROCESSING_DIR}/model.tar.gz -C {model_directory} .

./
./.cache/
./.cache/huggingface/
./.cache/huggingface/.gitignore
./.cache/huggingface/download/
./.cache/huggingface/download/added_tokens.json.lock
./.cache/huggingface/download/model-00001-of-00003.safetensors.lock
./.cache/huggingface/download/model-00002-of-00003.safetensors.lock
./.cache/huggingface/download/model-00003-of-00003.safetensors.lock
./.cache/huggingface/download/config.json.lock
./.cache/huggingface/download/.gitattributes.lock
./.cache/huggingface/download/generation_config.json.lock
./.cache/huggingface/download/README.md.lock
./.cache/huggingface/download/added_tokens.json.metadata
./.cache/huggingface/download/model.safetensors.index.json.lock
./.cache/huggingface/download/config.json.metadata
./.cache/huggingface/download/preprocessor_config.json.lock
./.cache/huggingface/download/README.md.metadata
./.cache/huggingface/download/.gitattributes.metadata
./.cache/huggingface/download/special_tokens_map.json.lock
./.cache/huggingface/download/tokenizer.json.lock
.

./model-00002-of-00003.safetensors
./model-00001-of-00003.safetensors
./code/
./code/requirements.txt
./code/inference.py


In [9]:
from sagemaker.s3 import S3Uploader

# Upload model.tar.gz to s3
account_id="" #input("Please fill in your AWS account id: ")
s3_model_uri=S3Uploader.upload(local_path=f"{PROCESSING_DIR}/model.tar.gz", desired_s3_uri=f"s3://vis-assis-sagemaker-endpoint-model-{account_id}/paligemma")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
