In [None]:
!pip install -qU langchain==0.1.16 langchain-core==0.1.42 langchain-openai==0.1.3 langgraph==0.0.37 langchainhub==0.1.15

In [None]:
import os
import configparser

from langchain.chat_models import ChatOpenAI


def credential_init(credential_file="credentials.ini"):
    """Populate ``os.environ['OPENAI_API_KEY']`` for the OpenAI client.

    The key is looked up in this order:

    1. option ``api_key`` in section ``[openai]`` of ``credential_file``;
    2. the ``OPENAI`` environment variable;
    3. an ``OPENAI_API_KEY`` environment variable that is already set.

    Args:
        credential_file: Path of the INI file to read. Defaults to
            ``credentials.ini`` in the current working directory.

    Raises:
        KeyError: If none of the sources provides a non-empty key.
    """
    api_key = None

    if os.path.exists(credential_file):
        credentials = configparser.ConfigParser()
        credentials.read(credential_file)
        # fallback=None: a missing section/option must not raise here, and must
        # never end up as `os.environ[...] = None` (which is a TypeError).
        api_key = credentials.get('openai', 'api_key', fallback=None)

    # Fall back to the environment when the file is absent or has no key.
    api_key = api_key or os.environ.get('OPENAI') or os.environ.get('OPENAI_API_KEY')

    if not api_key:
        raise KeyError(
            f"No OpenAI API key found: add 'api_key' to the [openai] section of "
            f"{credential_file!r} or set the OPENAI environment variable"
        )

    os.environ['OPENAI_API_KEY'] = api_key

# Make sure the API key is in the environment before the client is built.
credential_init()


# GPT-4o chat client (temperature=0) shared by the tools and agents below.
chat_model = ChatOpenAI(
    model_name="gpt-4o-2024-05-13",
    temperature=0,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Name of the BLIP captioning checkpoint to load.
hf_model = "Salesforce/blip-image-captioning-large"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The processor prepares model inputs; the model is moved to the chosen device.
processor = BlipProcessor.from_pretrained(hf_model)
model = BlipForConditionalGeneration.from_pretrained(hf_model)
model = model.to(device)

In [None]:
# Import the IPython display class under an alias: the name `Image` is used for
# PIL's `Image` module elsewhere in this notebook (`Image.open(...)`), and
# re-running this cell must not shadow it.
from IPython.display import Image as IPyImage

# Render the sample picture inline (last expression of the cell is displayed).
IPyImage('yghzBMOFHZRKGvRuw6AM6.png', width=500, height=750)

In [None]:
from PIL import Image

# Open the sample picture, then normalise it to 3-channel RGB.
image = Image.open("yghzBMOFHZRKGvRuw6AM6.png")
image = image.convert('RGB')

In [None]:
# Unconditional captioning: only the image is preprocessed, no text prompt.
inputs = processor(image, return_tensors="pt")
# Move the prepared tensors to the same device as the model.
inputs = inputs.to(device)

In [None]:
# Short caption: generation is capped at 20 new tokens.
out = model.generate(max_new_tokens=20, **inputs)
print(
    processor.decode(out[0], skip_special_tokens=True)
)

In [None]:
# Display the raw result of the previous `model.generate` call, i.e. the value
# whose first element `processor.decode` turns into caption text.
out

In [None]:
# Same image, but allow up to 150 new tokens for the caption.
out = model.generate(max_new_tokens=150, **inputs)
print(
    processor.decode(out[0], skip_special_tokens=True)
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.tools import BaseTool
from langchain.agents import AgentExecutor, create_react_agent


class ImageCaptionTool(BaseTool):
    """Agent tool that captions a local image file with the BLIP model.

    Relies on the notebook-level ``processor``, ``model`` and ``device``
    objects created in an earlier cell.
    """

    name = "Image captioner"
    description = """
    use this tool when given the file of an image that you'd like to be
    described. It will return a simple caption describing the image
    """

    def _run(self, filename):
        """Open ``filename``, run BLIP on it and return the decoded caption."""
        # Trim surrounding whitespace from the path before opening the file.
        image_path = filename.strip()
        rgb_image = Image.open(image_path).convert('RGB')

        # Turn the picture into model inputs on the selected device.
        model_inputs = processor(rgb_image, return_tensors="pt").to(device)

        # Inference only: gradients are not needed while generating tokens.
        with torch.no_grad():
            generated = model.generate(**model_inputs, max_new_tokens=20)

        # Decode the first generated sequence into plain text.
        return processor.decode(generated[0], skip_special_tokens=True)

    def _arun(self, query: str):
        """Asynchronous execution is not implemented for this tool."""
        raise NotImplementedError("This tool does not support async")

tools = [ImageCaptionTool()]


In [None]:
# ReAct-style prompt text. It contains four placeholders: {tools} and
# {tool_names} (tool listing), {input} (the user's question) and
# {agent_scratchpad} (the Thought/Action/Observation trace so far).
# NOTE(review): the placeholders are presumably filled in by
# `create_react_agent` / `AgentExecutor` below - confirm against LangChain docs.
prompt_template = """
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer

Thought: you should always think about what to do

Action: the action to take, should be one of [{tool_names}]

Action Input: the input to the action

Observation: the result of the action

... (this Thought/Action/Action Input/Observation can repeat N times)

Thought: I now know the final answer

Final Answer: the final answer to the original input question

Begin!

Question: {input}

Thought:{agent_scratchpad}
"""

# Turn the raw template text into a LangChain prompt object.
prompt = PromptTemplate.from_template(prompt_template)

# ReAct agent wired to the chat model, the current tool list and the prompt.
zero_shot_agent = create_react_agent(
    llm=chat_model,
    tools=tools,
    prompt=prompt,
)

# handle_parsing_errors=True: a reply that does not follow the
# Thought/Action/Action Input format is reported back to the model as an
# observation so it can retry, instead of aborting the whole run with an
# OutputParserException.
agent_executor = AgentExecutor(
    agent=zero_shot_agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
)

In [None]:
filename = "yghzBMOFHZRKGvRuw6AM6.png"

# Build the question from `filename` so the image path is defined in one place.
agent_executor.invoke({"input": f"What is the caption for this image? {filename}"})

The death of Elaine Herzberg (August 2, 1968 – March 18, 2018) was the first recorded case of a pedestrian fatality involving a self-driving car, after a collision that occurred late in the evening of March 18, 2018. Herzberg was pushing a bicycle across a four-lane road in Tempe, Arizona, United States, when she was struck by an Uber test vehicle, which was operating in self-driving mode with a human safety backup driver sitting in the driving seat. Herzberg was taken to the local hospital, where she died of her injuries.


伊萊恩·赫茲伯格（Elaine Herzberg，1968年8月2日－2018年3月18日）的死亡是首例涉及自動駕駛車輛的行人死亡案例。這起碰撞事故發生於2018年3月18日的深夜，赫茲伯格當時正在亞利桑那州坦佩市推著一輛自行車穿越一條四車道的道路，結果被一輛Uber測試車輛撞上。該車輛當時正處於自動駕駛模式，車內有一位人類安全備用駕駛員坐在駕駛座上。赫茲伯格被送往當地醫院，最終因傷勢過重去世。

## Use GPT-4o's multimodal input for image captioning

In [None]:
import io
import base64

from langchain.prompts import ChatPromptTemplate
from langchain_core.messages.human import HumanMessage


class ImageCaptionTool(BaseTool):
    """Agent tool that asks the multimodal chat model what a local image shows.

    The image is re-encoded as a base64 JPEG data URL and sent to the
    notebook-level ``chat_model`` together with a fixed question.
    """

    name = "Image captioner"
    description = """
    use this tool when given the file of an image that you'd like to be
    described. It will return a simple caption describing the image
    """

    def _run(self, filename: str) -> str:
        """Ask the chat model what is in the image stored at ``filename``.

        Args:
            filename: Path of the image file; surrounding whitespace is ignored.

        Returns:
            The text content of the model's reply.
        """
        image_str = self.image_to_base64(filename.strip())

        # One human message carrying both the question and the inline image.
        human_message = HumanMessage(content=[
            {'type': 'text',
             'text': 'What is in this image?'},
            {'type': 'image_url',
             'image_url': {'url': f"data:image/jpeg;base64,{image_str}"}},
        ])
        # The message is already complete, so the chain is invoked with no variables.
        prompt = ChatPromptTemplate.from_messages([human_message])
        chain = prompt | chat_model

        return chain.invoke(input={}).content

    def _arun(self, query: str):
        """Asynchronous execution is not implemented for this tool."""
        raise NotImplementedError("This tool does not support async")

    def image_to_base64(self, image_path: str) -> str:
        """Encode the image at ``image_path`` as a base64 JPEG string.

        The picture is converted to RGB before saving: JPEG cannot store an
        alpha channel or a palette, so an RGBA / P mode image (common for PNG
        files) would otherwise make ``save`` raise ``OSError``.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The JPEG bytes, base64-encoded and decoded to a ``str``.
        """
        buffered = io.BytesIO()
        with Image.open(image_path) as image:
            image.convert('RGB').save(buffered, format="JPEG")

        return base64.b64encode(buffered.getvalue()).decode('utf-8')

tools = [ImageCaptionTool()]

In [None]:
# ReAct-style prompt text. It contains four placeholders: {tools} and
# {tool_names} (tool listing), {input} (the user's question) and
# {agent_scratchpad} (the Thought/Action/Observation trace so far).
# NOTE(review): the placeholders are presumably filled in by
# `create_react_agent` / `AgentExecutor` below - confirm against LangChain docs.
prompt_template = """
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer

Thought: you should always think about what to do

Action: the action to take, should be one of [{tool_names}]

Action Input: the input to the action

Observation: the result of the action

... (this Thought/Action/Action Input/Observation can repeat N times)

Thought: I now know the final answer

Final Answer: the final answer to the original input question

Begin!

Question: {input}

Thought:{agent_scratchpad}
"""

# Turn the raw template text into a LangChain prompt object.
prompt = PromptTemplate.from_template(prompt_template)

# ReAct agent wired to the chat model, the current tool list and the prompt.
zero_shot_agent = create_react_agent(
    llm=chat_model,
    tools=tools,
    prompt=prompt,
)

# handle_parsing_errors=True: a reply that does not follow the
# Thought/Action/Action Input format is reported back to the model as an
# observation so it can retry, instead of aborting the whole run with an
# OutputParserException.
agent_executor = AgentExecutor(
    agent=zero_shot_agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
)

In [None]:
filename = "yghzBMOFHZRKGvRuw6AM6.png"

# Build the question from `filename` so the image path is defined in one place.
agent_executor.invoke({"input": f"What is the caption for this image? {filename}"})

In [None]:
from langchain_core.messages.system import SystemMessage


class ImageCaptionTool(BaseTool):
    """Agent tool that asks the multimodal chat model for a detailed image description.

    The image is re-encoded as a base64 JPEG data URL and sent to the
    notebook-level ``chat_model`` with a system instruction and a short request.
    """

    name = "Image captioner"
    description = """
    Use this tool when given the file of an image that should be described.
    It will return a detailed description of the image.
    """

    def _run(self, filename: str) -> str:
        """Ask the chat model to describe the image stored at ``filename``.

        Args:
            filename: Path of the image file; surrounding whitespace is ignored.

        Returns:
            The text content of the model's reply.
        """
        image_str = self.image_to_base64(filename.strip())

        system_message = SystemMessage(content="""
                      You are a helpful AI assistant describing content
                      of an image in great details.""")
        # One human message carrying both the request and the inline image.
        human_message = HumanMessage(content=[
            {'type': 'text',
             'text': 'describe this image:'},
            {'type': 'image_url',
             'image_url': {'url': f"data:image/jpeg;base64,{image_str}"}},
        ])
        # Both messages are already complete, so the chain is invoked with no variables.
        prompt = ChatPromptTemplate.from_messages([
            system_message,
            human_message,
        ])
        chain = prompt | chat_model

        return chain.invoke(input={}).content

    def _arun(self, query: str):
        """Asynchronous execution is not implemented for this tool."""
        raise NotImplementedError("This tool does not support async")

    def image_to_base64(self, image_path: str) -> str:
        """Encode the image at ``image_path`` as a base64 JPEG string.

        The picture is converted to RGB before saving: JPEG cannot store an
        alpha channel or a palette, so an RGBA / P mode image (common for PNG
        files) would otherwise make ``save`` raise ``OSError``.

        Args:
            image_path: Path of the image file to read.

        Returns:
            The JPEG bytes, base64-encoded and decoded to a ``str``.
        """
        buffered = io.BytesIO()
        with Image.open(image_path) as image:
            image.convert('RGB').save(buffered, format="JPEG")

        return base64.b64encode(buffered.getvalue()).decode('utf-8')

tools = [ImageCaptionTool()]

# ReAct-style prompt text. It contains four placeholders: {tools} and
# {tool_names} (tool listing), {input} (the user's question) and
# {agent_scratchpad} (the Thought/Action/Observation trace so far).
# NOTE(review): the placeholders are presumably filled in by
# `create_react_agent` / `AgentExecutor` below - confirm against LangChain docs.
prompt_template = """
Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer

Thought: you should always think about what to do

Action: the action to take, should be one of [{tool_names}]

Action Input: the input to the action

Observation: the result of the action

... (this Thought/Action/Action Input/Observation can repeat N times)

Thought: I now know the final answer

Final Answer: the final answer to the original input question

Begin!

Question: {input}

Thought:{agent_scratchpad}
"""

# Turn the raw template text into a LangChain prompt object.
prompt = PromptTemplate.from_template(prompt_template)

# ReAct agent wired to the chat model, the current tool list and the prompt.
zero_shot_agent = create_react_agent(
    llm=chat_model,
    tools=tools,
    prompt=prompt,
)

# handle_parsing_errors=True: a reply that does not follow the
# Thought/Action/Action Input format is reported back to the model as an
# observation so it can retry, instead of aborting the whole run with an
# OutputParserException.
agent_executor = AgentExecutor(
    agent=zero_shot_agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
)

In [None]:
filename = "yghzBMOFHZRKGvRuw6AM6.png"

# Build the question from `filename` so the image path is defined in one place.
agent_executor.invoke({"input": f"What is the caption for this image? {filename}"})