In [4]:
from dotenv import load_dotenv


load_dotenv()
import getpass
import os

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

In [14]:
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
import base64, httpx
from PIL import Image
# Images to extract data from

model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
image = open("test.jpg", "rb").read()
image_data = base64.b64encode(image).decode("utf-8")
# image = Image.open("test.jpg")
# image_data = base64.b64encode(image.convert("RGB").tobytes()).decode("utf-8")# Download and encode the image

# Create a message with the image
message = HumanMessage(
    content=[
        {"type": "text", "text": "describe the fruit in this image"},
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
        },
    ],
)

# Invoke the model with the message
response = model.invoke([message])

# Print the model's response
print(response.content)

This image does not contain a fruit. It shows the Eiffel Tower in Paris, France.


In [7]:
from pydantic import BaseModel, Field

class Location(BaseModel):
    """Location of an photo with reasoning."""

    latitude: str = Field(description="The latitude of the location")
    longitude: str = Field(description="The longitude of the location")
    reasoning: str = Field(description="Reasoning about the location of the photo")

In [22]:
import json
import base64
from typing import Dict, List, Any
from langgraph.graph import StateGraph, END
from langchain.tools import Tool
from langchain_core.pydantic_v1 import BaseModel
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from PIL import Image

# Mock tool implementations
def country_prediction(image: Image) -> str:
    return json.dumps({"country": "France"})

def reverse_image_search(image: Image) -> str:
    return json.dumps({"similar_images": ["eiffel_tower.jpg", "paris_street.jpg"]})

def google_search(image: Image) -> str:
    return json.dumps({"results": ["Eiffel Tower is located at 48.8584° N, 2.2945° E"]})

def wikimapia_search(image: Image) -> str:
    return json.dumps({"location": "Eiffel Tower", "coordinates": "48.8584° N, 2.2945° E"})

# def exif_metadata_extraction(image_url: str) -> str:
#     return json.dumps({"gps": {"latitude": 48.8584, "longitude": 2.2945}})

# Define the tools
tools = [
    Tool(name="country_prediction", func=country_prediction, description="Predicts the country of an image"),
    Tool(name="reverse_image_search", func=reverse_image_search, description="Finds similar images"),
    Tool(name="google_search", func=google_search, description="Searches Google for information"),
    Tool(name="wikimapia_search", func=wikimapia_search, description="Searches Wikimapia for location information"),
    #Tool(name="exif_metadata_extraction", func=exif_metadata_extraction, description="Extracts EXIF metadata from an image"),
]
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Define the state
class State(BaseModel):
    image: str
    tools_results: Dict[str, Any] = {}
    final_coordinates: Dict[str, float] = {}


def reason_with_tools(state: State) -> str:
    prompt = f"""
    Given the current state of the analysis and the given photo:
    {json.dumps(state.tools_results, indent=2)}

    Reason about the location of a photo. Try to get as many accurate and relevant information as possible.
    """
    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url":  f"data:image/jpeg;base64,{state.image}"}},
        ],
    )
    llm_with_tools = llm.bind_tools(tools)
    response = llm_with_tools.invoke([message])
    print(response)
    return response.content

def generate_coordinates(state: State) -> State:
    prompt = f"""
    Based on all the information gathered:
    {json.dumps(state.tools_results, indent=2)}

    Generate the most likely coordinates for the image location.
    """
    message = HumanMessage(
        content=[
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url":  f"data:image/jpeg;base64,{state.image}"}},
        ],
    )
    structured_llm = llm.with_structured_output(Location)
    response = structured_llm.invoke([message])
    state.final_coordinates = json.loads(response.content)
    return state

# Create the graph
workflow = StateGraph(State)

# Add nodes
workflow.add_node("reason_with_tools", reason_with_tools)
#workflow.add_node("generate_coordinates", generate_coordinates)

# Add edges
#workflow.add_edge("reason_with_tools", "generate_coordinates")
workflow.add_edge("reason_with_tools", END)

# Set entry point
workflow.set_entry_point("reason_with_tools")

# Compile the graph
app = workflow.compile()

image = open("test.jpg", "rb").read()
image_data = base64.b64encode(image).decode("utf-8")
# Run the app
initial_state = State(image=image_data)
final_state = app.invoke(initial_state)



content='' additional_kwargs={'function_call': {'name': 'country_prediction', 'arguments': '{"__arg1": "https://storage.googleapis.com/ai-community-images/eiffel_tower.jpg"}'}} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}]} id='run-4d2d0714-7869-4b77-bc36-d77cc3a013f8-0' tool_calls=[{'name': 'country_prediction', 'args': {'__arg1': 'https://storage.googleapis.com/ai-community-images/eiffel_tower.jpg'}, 'id': '41df36a9-d778-4a3a-8ab1-72dd8686856f', 'type': 'tool_call'}] usage_metadata={'input_tokens': 472, 'output_tokens': 35, 'total_tokens': 50

InvalidUpdateError: Expected dict, got 