In [1]:
! pip install -q langchain_google_genai langchain_openai langchain

In [2]:
import base64
import mimetypes

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI

from langchain.schema.messages import HumanMessage
from langchain.output_parsers import PydanticOutputParser

from pydantic import BaseModel, Field

from dotenv import load_dotenv
load_dotenv() 

True

In [3]:
def load_client(provider="openai", model_id="gpt-4o"):
    """Build a LangChain chat model for the requested provider.

    Args:
        provider: Backend to use; either "openai" or "google".
        model_id: Model name passed to the chat model constructor as `model`.

    Returns:
        A ChatOpenAI instance for "openai" or a ChatGoogleGenerativeAI
        instance for "google".

    Raises:
        ValueError: If `provider` is not one of the supported names.
    """
    if provider == "google":
        return ChatGoogleGenerativeAI(model=model_id)

    if provider == "openai":
        return ChatOpenAI(model=model_id)

    # Fail here rather than returning None, which would only surface later
    # as an AttributeError when the caller tries to invoke the model.
    raise ValueError(
        f"Unsupported provider {provider!r}; expected 'openai' or 'google'."
    )

In [4]:
def encodeimg(img_path):
    """Read a file and return its contents as a base64 `data:` URL.

    Args:
        img_path: Path to the image, as a string or an os.PathLike object.

    Returns:
        A string of the form "data:<mime>;base64,<payload>". The MIME type
        is guessed from the file extension and falls back to "image/png"
        when the extension is missing or unknown.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # guess_type initialises the mimetypes database on first use and accepts
    # path-like objects, unlike a manual split('.') on the path string.
    mime_type, _ = mimetypes.guess_type(img_path)
    if mime_type is None:
        mime_type = 'image/png'

    with open(img_path, "rb") as file:
        b64 = base64.b64encode(file.read()).decode('utf-8')

    return f"data:{mime_type};base64,{b64}"

In [5]:
# Structured-output schema with a single string field, `caption`.
# ImageCaptionGenerator passes this class to PydanticOutputParser as
# `pydantic_object`.
# NOTE(review): documented with `#` comments rather than a class docstring,
# because pydantic includes a model docstring in the JSON schema, which would
# alter the format instructions sent to the model.
class CaptionSchema(BaseModel):
    caption: str = Field(description="Image caption")

In [6]:
class ImageCaptionGenerator:
    """Generate a short caption for an image with a multimodal chat model.

    The prompt combines a fixed instruction, the parser's format
    instructions, and the image as a base64 data URL. The model reply is
    parsed into a CaptionSchema.
    """

    def __init__(self, provider="openai", model_id="gpt-4o"):
        """Set up the prompt text, chat model and output parser.

        Args:
            provider: Backend name forwarded to `load_client`.
            model_id: Model name forwarded to `load_client`.
        """
        # Runtime prompt text; kept exactly as-is because it is sent to the model.
        self.template = """
    You are an image caption generator. You will be provided with an image, generate a caption for it
    in not more than 10 words.
"""
        self.model = load_client(provider=provider, model_id=model_id)
        self.parser = PydanticOutputParser(pydantic_object=CaptionSchema)

    def generate_caption(self, inputs):
        """Send the prompt and image to the model and return its raw reply.

        Args:
            inputs: Mapping with an "img_path" key naming the image file.

        Returns:
            The `content` attribute of the model's response message.
        """
        content = [
            {
                "type": "text",
                "text": self.template
            },
            {
                # Tells the model which output format the parser expects.
                "type": "text",
                "text": self.parser.get_format_instructions()
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": encodeimg(inputs['img_path'])
                }
            }
        ]

        result = self.model.invoke([HumanMessage(content=content)])

        return result.content

    def invoke_chain(self, img_path):
        """Caption the image at `img_path` and return the parsed result.

        Args:
            img_path: Path to the image file.

        Returns:
            The parsed CaptionSchema dumped to a dict, e.g. {"caption": "..."}.
        """
        # Piping the bound method into the parser relies on LangChain
        # coercing a plain callable into a runnable step.
        chain = self.generate_caption | self.parser
        result = chain.invoke(
            {
                "img_path": img_path
            }
        )
        return result.model_dump()

In [7]:
# Demo: caption a sample image with the default client (provider "openai",
# model "gpt-4o").
# NOTE(review): presumably needs API credentials loaded from .env by
# load_dotenv() — confirm before running.
img_path = "../assets/sample_img.jpeg"
caption_generator = ImageCaptionGenerator()
# Bare last expression so the notebook displays the returned dict.
caption_generator.invoke_chain(img_path)

{'caption': 'Two playful puppies sitting among orange flowers.'}