In [None]:
!pip install transformers --upgrade
!pip uninstall bitsandbytes -y
!pip install bitsandbytes --upgrade
!pip install sagemaker --upgrade

## Run Image Reasoning LLaVA-NeXT model Locally

If you are running on a g4dn instance and need to quantize the model so that it fits in GPU memory, you might need to run:

```bash
cd /opt/conda/lib/python3.10/site-packages/bitsandbytes/
cp libbitsandbytes_cuda117.so libbitsandbytes_cpu.so
```

and then restart this kernel.

In [None]:
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch
from PIL import Image
import requests

# Checkpoint and device shared by the processor, the model and the model inputs.
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
DEVICE = "cuda:0"

processor = LlavaNextProcessor.from_pretrained(MODEL_ID)

# Load the checkpoint with 4-bit quantization. `device_map` already places the
# model on the GPU, so no separate `model.to(...)` call is needed afterwards.
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map=DEVICE,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# prepare image and text prompt, using the appropriate prompt template
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
# A timeout keeps the cell from hanging forever on a stalled connection, and the
# status check fails here instead of handing an HTTP error body to PIL.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

# Pass text and image by keyword: the positional order of the processor's
# __call__ is not the same across transformers releases, and this notebook
# installs the latest one.
inputs = processor(text=prompt, images=image, return_tensors="pt").to(DEVICE)

# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=100)

print(processor.decode(output[0], skip_special_tokens=True))

## Deploy to Amazon SageMaker real-time endpoint with custom inference code

In [None]:
# package_code
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

!tar czvf model.tar.gz code/
code_artifact = sagemaker_session.upload_data("model.tar.gz", bucket, 'model_artifacts')
print(f"S3 Code or Model tar ball uploaded to --- > {code_artifact}")
!rm model.tar.gz

In [None]:
import sagemaker
from sagemaker.huggingface.model import HuggingFaceModel

# Execution role used for both the SageMaker model and the endpoint.
role = sagemaker.get_execution_role()

# Hub settings for the checkpoint. They are defined but not handed to the model
# below, because the `env` argument is left disabled.
HF_MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
hub = dict(
    HF_MODEL_ID=HF_MODEL_ID,
    HF_TASK="visual-question-answering",
)

# creating SageMaker Model
# `model_data` points at the archive uploaded to S3 by the packaging cell.
huggingface_model = HuggingFaceModel(
    name="llava-v16",
    role=role,
    model_data=code_artifact,
    transformers_version="4.37",
    pytorch_version="2.1",
    py_version="py310",
    #env=hub,
)

# Create the real-time endpoint on a single instance, with extended download
# and startup health-check timeouts.
predictor = huggingface_model.deploy(
    endpoint_name="sm-endpoint",
    instance_type="ml.g5.xlarge",
    initial_instance_count=1,
    role=role,
    tags={},
    model_data_download_timeout=3600,
    container_startup_health_check_timeout=1200,
)

In [None]:
# Test Inference
# Request payload: an image URL together with a question about that image.
image_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
question = "What is the species of the cat depicted in the picture?"
inputs = {"inputs": {"image": image_url, "question": question}}

# Call the deployed endpoint and print the `answer` field of the first item
# in the response.
response = predictor.predict(inputs)
first_result = response[0]
print(first_result['answer'])

## Cleanup

In [None]:
# Tear down the resources created by the deployment. The model deletion sits in
# `finally` so it is still attempted when deleting the endpoint raises;
# otherwise a failed endpoint deletion would leave the model resource behind.
try:
    predictor.delete_endpoint()
finally:
    huggingface_model.delete_model()