## Modele multimodalne

In [None]:
# Notebook shell command: quietly (-q) install the packages imported below.
!pip install -q langchain langchain-openai python-dotenv


In [None]:
import os
import base64
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Pull settings from a .env file into the process environment
# (presumably this is where OPENAI_API_KEY comes from -- confirm).
load_dotenv()
# Shared chat-model handle reused by the cells below.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


In [None]:
def encode_image(path):
    """Read the file at ``path`` and return its bytes as a Base64 text string.

    The file is opened in binary mode, so any file type can be encoded.
    The Base64 output is decoded as UTF-8 to yield a ``str`` that can be
    embedded in a ``data:`` URL or a JSON payload.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


### Sterowanie pojazdem na podstawie obrazu

In [None]:
# Ask the vision model for a steering decision based on one front-camera frame.
image_b64 = encode_image("data/lane_navigation/img.png")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "This is image from front camera of autonomous vehicle prototype. "
                                     "Tell me if next step of a car should be moving forward, turn left or turn right? "
                                     "Return only single word: forward, left, or right"},
            # The file is a PNG, so the data URL must declare image/png (not image/jpeg).
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }
]

# In langchain-openai, ``llm.client`` is already the SDK's ``chat.completions``
# resource, so ``llm.client.chat`` raises AttributeError. ``llm.root_client`` is
# the underlying ``openai.OpenAI`` instance and exposes the full API path.
response = llm.root_client.chat.completions.create(model="gpt-4o-mini", messages=messages)
# ``content`` is optional in the SDK response type; fall back to "" before stripping.
decision = response.choices[0].message.content or ""
print("Model decision:", decision.strip())


### Zliczanie samochodów na zdjęciu satelitarnym

In [None]:
# Ask the vision model to count the cars visible in a satellite image.
image_b64 = encode_image("data/satellite/cars.png")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Count cars visible on this satellite image. Return only the number."},
            # The file is a PNG, so the data URL must declare image/png (not image/jpeg).
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }
]

# In langchain-openai, ``llm.client`` is already the SDK's ``chat.completions``
# resource, so ``llm.client.chat`` raises AttributeError. ``llm.root_client`` is
# the underlying ``openai.OpenAI`` instance and exposes the full API path.
response = llm.root_client.chat.completions.create(model="gpt-4o-mini", messages=messages)
# ``content`` is optional in the SDK response type; fall back to "" before stripping.
car_count = response.choices[0].message.content or ""
print("Number of cars:", car_count.strip())


### Rozpoznawanie pojazdu uprzywilejowanego na podstawie dźwięku

In [None]:
# Classify the type of emergency vehicle from a recorded siren clip.
audio_path = "data/audio/siren.wav"

# The API expects the audio as a Base64 *string*; raw bytes are not
# JSON-serialisable. Reading inside ``with`` also closes the file handle.
with open(audio_path, "rb") as audio_file:
    audio_b64 = base64.b64encode(audio_file.read()).decode("utf-8")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Listen to this siren sound and classify the emergency vehicle type. "
                                     "Possible answers: ambulance, fire truck, police car. Return only one word."},
            {"type": "input_audio", "input_audio": {
                "data": audio_b64,
                "format": "wav"
            }},
        ],
    }
]

# In langchain-openai, ``llm.client`` is already the SDK's ``chat.completions``
# resource, so ``llm.client.chat`` raises AttributeError. ``llm.root_client`` is
# the underlying ``openai.OpenAI`` instance; the audio-capable model is
# selected per call via ``model=``.
response = llm.root_client.chat.completions.create(model="gpt-4o-mini-audio-preview", messages=messages)
# ``content`` is optional in the SDK response type; fall back to "" before stripping.
detected = response.choices[0].message.content or ""
print("Detected vehicle:", detected.strip())
