# Day 5 - Multi-modal AI Assistant

We'll now bring together what we've learned to make an AI Assistant that can generate images using DALL-E 3!

In [None]:
import gradio as gr
import os
import json
import base64
from io import BytesIO
from PIL import Image
from dotenv import load_dotenv
from openai import OpenAI

In [None]:
# Load environment variables from a .env file; override=True lets values from
# the file replace variables that are already set in the process environment.
load_dotenv(override=True)

# API keys and endpoint settings
openai_api_key = os.getenv('OPENAI_API_KEY')
ollama_base_url = os.getenv('OLLAMA_BASE_URL')
ollama_api_key = os.getenv('OLLAMA_API_KEY')
ollama_model = os.getenv('OLLAMA_MODEL', 'qwen3-coder:480b-cloud')

# Verify API keys (only a short prefix of the key is echoed)
if openai_api_key:
    print(f"OpenAI API Key loaded: {openai_api_key[:8]}...")
else:
    print("OpenAI API Key not set")

if ollama_base_url:
    # Drop trailing slashes so the "/v1" suffix below never becomes "//v1".
    ollama_base_url = ollama_base_url.rstrip('/')
else:
    # Without this fallback the client would be built with the literal
    # base URL "None/v1". 11434 is Ollama's default local port.
    ollama_base_url = 'http://localhost:11434'
    print("OLLAMA_BASE_URL not set; falling back to the default local Ollama endpoint")

print(f"Ollama configured at: {ollama_base_url}")
print(f"Ollama Model is : {ollama_model}")

# Initialize Clients
openai_client = OpenAI(api_key=openai_api_key)
ollama_client = OpenAI(
    base_url=f"{ollama_base_url}/v1",
    # When api_key is None the OpenAI SDK falls back to the OPENAI_API_KEY
    # environment variable, which would send the real OpenAI key to the Ollama
    # endpoint. Use a harmless placeholder when no Ollama key is configured.
    api_key=ollama_api_key or "ollama"
)

print("All clients initialized successfully")

In [None]:
def artist(city):
    """Generate a pop-art vacation image for a city and save it as a PNG file.

    The image is requested from the ``dall-e-3`` model through ``openai_client``
    as base64 data, decoded, and written to ``<city>_vacation.png`` in the
    current working directory.

    Args:
        city: Name of the city to illustrate. The value normally comes from a
            model-generated tool call, so it is validated and sanitised here.

    Returns:
        A status string: a success message naming the saved file, or a message
        starting with ``"Error generating image:"`` when the city is missing or
        any step fails. Errors are returned rather than raised so the text can
        be handed back to the model as the tool result.
    """
    # Reject missing/blank input before spending an image-generation request.
    if not isinstance(city, str) or not city.strip():
        return "Error generating image: no city was provided."
    city = city.strip()

    print(f"Generating image for {city}...")
    try:
        image_response = openai_client.images.generate(
                model="dall-e-3",
                prompt=f"An image representing a vacation in {city}, showing tourist spots and everything unique about {city}, in a vibrant pop-art style",
                size="1024x1024",
                n=1,
                response_format="b64_json",
            )
        image_base64 = image_response.data[0].b64_json
        image_data = base64.b64decode(image_base64)
        img = Image.open(BytesIO(image_data))

        # Save to file. Path separators and other special characters are
        # replaced so the file always lands in the working directory and the
        # name is valid; ordinary city names are left unchanged.
        safe_city = "".join(
            ch if ch.isalnum() or ch in " -_." else "_" for ch in city
        ).strip(" .") or "city"
        filename = f"{safe_city}_vacation.png"
        img.save(filename)
        return f"I have generated an image of {city} and saved it to {filename}."
    except Exception as e:
        # Deliberately broad: any failure is reported to the caller as text.
        return f"Error generating image: {str(e)}"

In [None]:
# Function-calling schema for the artist tool: one required string argument,
# "city", and no additional properties allowed.
artist_function = dict(
    name="artist",
    description="Generate an image of a city in a pop-art style.",
    parameters=dict(
        type="object",
        properties=dict(
            city=dict(
                type="string",
                description="The city to generate an image for",
            ),
        ),
        required=["city"],
        additionalProperties=False,
    ),
)

# Tool list passed to the chat completion request.
tools = [dict(type="function", function=artist_function)]

In [None]:
# System prompt sent as the first message of every conversation; it tells the
# model when to call the artist tool. The leading and trailing newlines are
# part of the prompt text.
system_message = (
    "\n"
    "You are a helpful assistant that can generate images of cities using the artist tool.\n"
    "If the user asks to see a city or for an image of a city, use the artist tool.\n"
)

In [None]:
def handle_tool_calls(message):
    """Run the tool calls requested in an assistant message.

    Args:
        message: Assistant message object exposing ``tool_calls``; each call
            provides ``id``, ``function.name`` and ``function.arguments``
            (a JSON-encoded string).

    Returns:
        A list with one ``{"role": "tool", "content", "tool_call_id"}`` dict per
        requested call, in request order. Every call gets an entry, including
        unknown tools and calls with unusable arguments, because the follow-up
        chat request needs a tool message for each ``tool_call_id``.
    """
    responses = []
    # tool_calls may be None when the message requested no tools.
    for tool_call in message.tool_calls or []:
        tool_name = tool_call.function.name
        if tool_name == "artist":
            try:
                arguments = json.loads(tool_call.function.arguments)
            except (TypeError, ValueError):
                # Model produced missing or malformed JSON arguments.
                arguments = None
            city = arguments.get("city") if isinstance(arguments, dict) else None
            if city:
                result = artist(city)
            else:
                result = "Error: the artist tool requires a 'city' argument."
        else:
            result = f"Error: unknown tool '{tool_name}'."
        responses.append({
            "role": "tool",
            "content": result,
            "tool_call_id": tool_call.id
        })
    return responses

In [None]:
def chat(message, history):
    """Gradio chat callback: answer the user, running the artist tool if requested.

    Args:
        message: The latest user message text.
        history: Earlier conversation turns as dicts with "role" and "content"
            keys; any other keys are dropped before the request is built.

    Yields:
        The reply text. When a tool was used, the follow-up answer is streamed
        and each yield is the text accumulated so far; otherwise the complete
        reply is yielded once.
    """
    history = [{"role": h["role"], "content": h["content"]} for h in history]
    messages = [{"role": "system", "content": system_message}] + history + [{"role": "user", "content": message}]

    # First call to LLM to check for tool calls
    response = ollama_client.chat.completions.create(
        model=ollama_model,
        messages=messages,
        tools=tools
    )
    assistant_message = response.choices[0].message

    # Detect tool use from the message itself rather than finish_reason:
    # OpenAI-compatible backends do not all report "tool_calls" there.
    if not assistant_message.tool_calls:
        # No tool call, just return the response (content may be None).
        yield assistant_message.content or ""
        return

    # Handle tool calls
    tool_responses = handle_tool_calls(assistant_message)

    # Append assistant's tool call message and tool responses to history
    messages.append(assistant_message)
    messages.extend(tool_responses)

    # Second call to LLM to get final response
    stream = ollama_client.chat.completions.create(
        model=ollama_model,
        messages=messages,
        stream=True
    )

    response_content = ""
    for chunk in stream:
        # Some stream chunks carry no choices (e.g. metadata-only chunks).
        if not chunk.choices:
            continue
        response_content += chunk.choices[0].delta.content or ""
        yield response_content

In [None]:
# Close any Gradio apps still running (e.g. from earlier runs of this cell),
# then build the chat UI around `chat` and launch it.
# share=True additionally requests a public share link from Gradio.
gr.close_all()
chat_ui = gr.ChatInterface(fn=chat, type="messages")
chat_ui.launch(share=True)