## Image and video understanding using Amazon Nova

### Set up helper methods and imports
A few helper methods to interact with the Bedrock API and parse inputs/outputs.

In [93]:
import boto3
import os
import json
import base64
import re

# Bedrock runtime client that invoke_model() sends its requests through.
bedrock = boto3.client("bedrock-runtime", region_name="us-west-2")

# Model used for every request in this notebook. The commented-out line is
# the alternative model id; uncomment it (and comment the other) to switch.
# MODEL_ID = "us.amazon.nova-lite-v1:0"
MODEL_ID = "us.amazon.nova-pro-v1:0"


def encode_file_to_base64(file_path):
    """Read a file and return its base64 text together with its media format.

    Args:
        file_path: Path to the image or video file to encode.

    Returns:
        A ``(encoded, file_type)`` tuple. ``encoded`` is the file content as a
        base64 string. ``file_type`` is the lower-cased file extension without
        the leading dot, with ``jpg`` reported as ``jpeg``; it is an empty
        string when the file name has no extension.
    """
    # Derive the format from the real file extension. Lower-casing it and
    # mapping "jpg" to "jpeg" keeps the value usable as the request's
    # "format" field for files such as "photo.JPG" or "photo.jpg".
    file_type = os.path.splitext(file_path)[1].lstrip(".").lower()
    if file_type == "jpg":
        file_type = "jpeg"

    with open(file_path, "rb") as file:
        encoded_string = base64.b64encode(file.read())

    return encoded_string.decode("utf-8"), file_type


def is_path_to_video(path):
    """Return True when ``path`` is an existing file with an ``mp4`` extension.

    The extension is compared case-insensitively, the same way
    ``is_path_to_image`` does, so ``clip.MP4`` is recognised as a video
    instead of being treated as prompt text.

    Args:
        path: Candidate file path as a string.

    Returns:
        True if the path points to an existing file whose extension is
        ``mp4`` in any letter case, otherwise False.
    """
    return os.path.isfile(path) and path.split(".")[-1].lower() == "mp4"


def is_path_to_image(path):
    """Tell whether ``path`` is an existing file with an image extension.

    Accepted extensions, compared case-insensitively, are ``jpeg``, ``jpg``
    and ``png``.
    """
    if not os.path.isfile(path):
        return False

    # Everything after the last dot, lower-cased.
    extension = path.rpartition(".")[2].lower()
    return extension in ("jpeg", "jpg", "png")


def get_prompt_payload(args, prefill=None):
    """Assemble the JSON request body for a single user turn.

    Each entry in ``args`` is either a path to an existing image/video file,
    which is base64-encoded and attached as a media part, or any other value,
    which is added as a text part. Parts keep the order in which they were
    given, so pass the media first and the prompt that refers to it last.

    Args:
        args: Iterable of media file paths and/or prompt strings.
        prefill: Optional text added as an assistant turn after the user
            turn, so that the model continues from it.

    Returns:
        The request body serialized as a JSON string.
    """
    user_parts = []

    for item in args:
        if is_path_to_video(item):
            media_kind = "video"
        elif is_path_to_image(item):
            media_kind = "image"
        else:
            # Not a media file: treat the argument as prompt text.
            user_parts.append({"text": item})
            continue

        print(f"** {media_kind} provided")
        encoded_file, file_type = encode_file_to_base64(item)
        media_part = {"format": file_type, "source": {"bytes": encoded_file}}
        user_parts.append({media_kind: media_part})

    messages = [{"role": "user", "content": user_parts}]

    if prefill:
        print('** Prefilling the reponse')
        assistant_turn = {"role": "assistant", "content": [{"text": prefill}]}
        messages.append(assistant_turn)

    # No "system" entry is sent on purpose: when working with images/videos
    # it is best to put such instructions in the final prompt instead of in a
    # system prompt (which is added to the beginning of the prompt payload).
    request = {
        "inferenceConfig": {"temperature": 0},
        "messages": messages,
    }

    print('\n')
    return json.dumps(request)


def invoke_model(*args, prefill=None):
    """Send media paths and prompts to the configured model and return its reply.

    Args:
        *args: Media file paths and/or prompt strings, forwarded in order to
            ``get_prompt_payload``.
        prefill: Optional start of the assistant's answer, forwarded to
            ``get_prompt_payload``.

    Returns:
        The response body parsed from JSON.
    """
    request_body = get_prompt_payload(args, prefill=prefill)
    raw_response = bedrock.invoke_model(modelId=MODEL_ID, body=request_body)

    # The body is read in full and then decoded as JSON.
    body_stream = raw_response.get("body")
    return json.loads(body_stream.read())


def get_completion_from_response(response):
    """Return the text of the first content part of the model's reply.

    Args:
        response: Parsed response as returned by ``invoke_model``.

    Returns:
        The ``text`` value of the first entry in the reply message's content.
    """
    message = response["output"]["message"]
    first_part = message["content"][0]
    return first_part["text"]


def extract_and_load_json(text):
    """Extract and parse the first fenced ``json`` block found in ``text``.

    The block must open with a fence immediately followed by ``json`` and a
    newline, and close with the same fence. Two fences are recognised: the
    ``'''`` fence and the standard markdown triple-backtick fence.

    Args:
        text: Text that contains a fenced JSON block, possibly surrounded by
            other content.

    Returns:
        The Python object obtained by parsing the content of the block.

    Raises:
        ValueError: If no fenced block is found, or if its content is not
            valid JSON (the ``json.JSONDecodeError`` is kept as the cause).
    """
    # Group 1 captures the opening fence so that \1 requires the matching
    # closing fence; group 2 is the (non-greedy) block content.
    match = re.search(r"('''|```)json\n(.*?)\1", text, re.DOTALL)

    if match is None:
        raise ValueError("No JSON object found in the string")

    try:
        return json.loads(match.group(2))
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {e}") from e

### Set the assets directory (`assets_dir`)

Note that this may require updating! Validate that the path printed below points to your assets directory before continuing.

In [94]:
from pathlib import Path

# The assets are looked up in ./image-video-understanding/assets, relative to
# the current working directory.
current_dir = Path.cwd()
assets_dir = current_dir / 'image-video-understanding' / 'assets'

# Print the resolved location so it can be checked before running the rest.
print('\n *** VERIFY THAT THIS IS CORRECT BEFORE CONTINUING ***\n')
print('The assest are located in:', assets_dir)


 *** VERIFY THAT THIS IS CORRECT BEFORE CONTINUING ***

The assest are located in: /home/sagemaker-user/playground/image-video-understanding/assets


### Simple prompt

In [95]:
# The image path is passed first and the prompt last, so "above" in the
# prompt refers to the image.
prompt = """
Describe what's happening in the image above.
"""

response = invoke_model(f"{assets_dir}/image.png", prompt)
completion = get_completion_from_response(response)
print(completion)


** image provided


The image depicts a snowy landscape viewed from a window. A person is riding a snowmobile on a snow-covered path. The scene is set in a winter environment with mountains in the background.


In [96]:
# The video path is passed first and the prompt last, so "above" in the
# prompt refers to the video.
prompt = """
Describe what's happening in the video above.
"""

response = invoke_model(f"{assets_dir}/video.mp4", prompt)
completion = get_completion_from_response(response)
print(completion)

** video provided


The video shows a person riding a snowmobile across a snowy field. A dog follows behind the snowmobile.


### Add more structure to the prompt

In [97]:
# Structured prompt: an instructions section followed by explicit rules.
prompt = """
## Instructions
You are an image anlysis tool. 
Given the image above, you provide clear and comprehensive descriptions of the image, following the rules below.

## Rules
- You MUST describe the image in an engaging way, as if you are a sports commentator.
- You MUST answer in a json object

"""

# The image is passed first, the prompt that refers to it last.
response = invoke_model(f"{assets_dir}/image.png", prompt)
completion = get_completion_from_response(response)
print(completion)


** image provided


{
  "description": "Welcome, folks, to what looks like a thrilling winter sports scene! We're looking at a snowy landscape, possibly a frozen lake or a wide, open field blanketed in white. In the foreground, there's a wooden deck or balcony, partially covered in snow, giving us a perfect vantage point. To the right, a person is riding a snowmobile, cutting through the snow with precision. The snowmobile is leaving a trail behind, indicating some serious speed and skill. In the background, we see a line of trees, their branches dusted with snow, and beyond them, a range of mountains, also capped with snow. The sky is overcast, suggesting it might be a chilly day. This image captures the essence of winter adventure and the joy of exploring the great outdoors on a snowmobile."
}


In [98]:
# Structured prompt: an instructions section followed by explicit rules.
prompt = """
## Instructions
You are a video anlysis tool. 
Given the video above, you provide clear and comprehensive descriptions of the video, following the rules below.

## Rules
- You MUST describe the video in an engaging way, as if you are a sports commentator.
- You MUST answer in a json object

"""

# The video is passed first, the prompt that refers to it last.
response = invoke_model(f"{assets_dir}/video.mp4", prompt)
completion = get_completion_from_response(response)
print(completion)


** video provided


{
  "description": "Welcome to an exhilarating display of winter sports! We're witnessing a thrilling snowmobile race set against the breathtaking backdrop of a snow-covered landscape. The scene unfolds from a vantage point, offering a panoramic view of the snowy terrain. As the race begins, a snowmobiler emerges, skillfully navigating the winding track. The snowmobiler maneuvers through the snow with precision, leaving a trail of excitement in their wake. The crisp air and pristine snow create the perfect setting for this adrenaline-fueled event. Spectators watch in awe as the snowmobiler speeds through the course, showcasing their prowess and determination. It's a true testament to the thrill of winter sports and the beauty of nature's winter wonderland."
}


### Entity extraction with response prefilling
Let's specify a desired structured response format.
To be certain we always get the response in a structured format, let's prefill the assistant's response.

In [99]:
# Fields to extract; every value is requested as a string.
desired_info = {
    "num_people": "string",
    "num_dogs": "string",
    "num_snowmobiles": "string",
    "num_cars": "string",
    "num_horses": "string",
    "scene_description": "string",
}

# Show the schema to the model as real JSON (double quotes) rather than as a
# Python dict repr, since the answer itself must be parseable JSON.
schema_json = json.dumps(desired_info, indent=2)

# The prompt talks about a video because video.mp4 is what gets passed below.
prompt = f'''
## Instructions
You are a video analysis tool.
Given the video above, you analyse the video carefully, and extract the desired information on the schema provided below.
You always follow the rules below.

## Schema
{schema_json}

## Rules
- You MUST analyse the video carefully and provide the desired information.
- You MUST answer with a valid and parseable JSON object inside markdown tags

Analyse the video according to the instructions
'''

# Start the assistant's response ourselves, forcing the model to continue
# from what we provide below.
prefill = """
'''json
{
"""

# Pass in the prefill in addition to the video and the prompt.
response = invoke_model(f'{assets_dir}/video.mp4', prompt, prefill=prefill)

# Add the prefill back in front of the generated text to get the full answer.
complete_generation = prefill + get_completion_from_response(response)

print(complete_generation)

** video provided
** Prefilling the reponse



'''json
{
"num_people": "1",
  "num_dogs": "1",
  "num_snowmobiles": "1",
  "num_cars": "0",
  "num_horses": "0",
  "scene_description": "A person is riding a snowmobile in a snowy field, with a dog following behind. The scene is viewed from a window with a wooden frame, showing a vast snowy landscape with trees and mountains in the distance."
}
'''


#### Access the structured response

In [100]:
# Parse the fenced JSON block out of the completed generation.
parsed_json = extract_and_load_json(complete_generation)
# Trailing bare expression so the notebook displays the extracted value.
parsed_json.get('num_dogs')

'1'