### Install libraries from the terminal

Open your terminal in the project's root directory and run:

`pip install -r requirements.txt` 

This will install all the libraries listed in requirements.txt in your project's environment.

### Install libraries from this notebook

In [1]:
# Install the packages listed in requirements.txt; the %pip magic targets the environment of the running kernel.
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Setup API Key

In [1]:
import os
from dotenv import load_dotenv
from image_processing import ImageProcessor, ImageSource
from audio_processing import AudioProcessor
from IPython.display import Markdown, display
import re
import gradio as gr

def setup_openai_api_key():
    """Load the OpenAI API key from the environment.

    ``load_dotenv()`` is called first so that a key defined in a local
    ``.env`` file is available, then ``OPENAI_API_KEY`` is read from
    ``os.environ``.

    Returns:
        str: The value of ``OPENAI_API_KEY``, exactly as stored.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set, or is set to an empty
            or whitespace-only string.
    """
    load_dotenv()

    try:
        openai_api_key = os.environ["OPENAI_API_KEY"]
    except KeyError as e:
        # Chain the KeyError so the traceback still shows the root cause.
        raise ValueError(f"OPENAI_API_KEY is required: {e}") from e

    # A blank value would otherwise be handed on as if it were a real key and
    # only fail later, at the first API call, with a less obvious error.
    if not openai_api_key.strip():
        raise ValueError("OPENAI_API_KEY is required: the variable is set but empty")

    return openai_api_key


### Convert a String to Markdown

In [2]:
def to_markdown(text):
    """Wrap ``text`` in a ``Markdown`` object, block-quoting everything except code.

    Steps:
      1. Bullet characters (•) become Markdown list markers.
      2. Each ```python fenced block is re-emitted with a newline before the
         opening fence and after the closing fence, so fences start a line.
      3. Lines outside a fence are stripped and prefixed with "> "; blank
         ones are dropped. Fence lines and lines inside a fence are kept
         verbatim.

    Args:
        text: The raw string to format.

    Returns:
        Markdown: The formatted text wrapped for rich display.
    """
    fence = "```"

    normalized = re.sub(
        r"```python\n(.*?)\n```",
        lambda found: "\n```python\n" + found.group(1) + "\n```\n",
        text.replace('•', '  * '),
        flags=re.DOTALL,
    )

    rendered = []
    in_code = False
    for raw_line in normalized.split("\n"):
        if raw_line.startswith(fence):
            # A fence line toggles code mode and is itself emitted untouched.
            in_code = not in_code
            rendered.append(raw_line)
        elif in_code:
            rendered.append(raw_line)
        else:
            content = raw_line.strip()
            if content:
                rendered.append("> " + content)

    return Markdown("\n".join(rendered))

### Image analysis

In [3]:
def image_analyze(openai_api_key, image_input, threshold_input):
    """Run crowd analysis on an uploaded image and return the resulting message.

    Args:
        openai_api_key: API key handed to ``ImageProcessor``.
        image_input: Image supplied by the UI, or ``None`` when nothing was
            uploaded.
        threshold_input: Crowd-density threshold, forwarded unchanged to
            ``ImageProcessor.analyze_crowd``.

    Returns:
        str: ``result.message`` from the analysis, or a short notice when no
        image was provided or the image could not be saved.
    """
    # Validate first so no processor is built when there is nothing to analyze.
    if image_input is None:
        return "Please upload an image."

    image_processor = ImageProcessor(openai_api_key)

    # The analysis works from a file path, so the upload is saved first.
    image_path = image_processor.save_image(image_input)
    if not image_path:
        return "Failed to save image."

    result = image_processor.analyze_crowd(image_path, threshold_input)
    return result.message

### Voice query + Assistant response + TTS

In [4]:
def audio_interaction(openai_api_key, audio_input, crowd_analysis_output):
    """Answer a spoken question, using the crowd analysis as extra context.

    The recording is saved and transcribed, the transcription is combined
    with ``crowd_analysis_output`` into a prompt, the assistant's reply is
    displayed, and the reply is converted to speech and played.

    Args:
        openai_api_key: API key handed to ``AudioProcessor``.
        audio_input: Recording supplied by the UI, or ``None`` when nothing
            was recorded.
        crowd_analysis_output: Text of the earlier crowd analysis; may be
            empty or ``None``.

    Returns:
        tuple: ``(text, speech_file_path)``. ``text`` is the assistant's reply,
        or a short notice when an early step failed. ``speech_file_path`` is
        ``None`` whenever no speech file was produced.
    """
    # Same up-front validation as the image flow: nothing recorded, nothing to do.
    if audio_input is None:
        return "Please record audio.", None

    audio_processor = AudioProcessor(openai_api_key)

    # Voice input
    audio_path = audio_processor.save_audio(audio_input)
    if not audio_path:
        return "Failed to save audio.", None

    text = audio_processor.voice_input_to_text(audio_path)
    if not text:
        # Formatting a missing transcription would put the literal "None"
        # (or only the crowd context) into the prompt.
        return "Failed to transcribe audio.", None

    crowd_context = crowd_analysis_output if crowd_analysis_output else ""
    prompt = f"{text} {crowd_context}".strip()  # no trailing space when there is no context
    display(to_markdown(f"### Prompt: {prompt}"))

    # Query
    query_response_text = audio_processor.get_openai_response(prompt)
    display(to_markdown(query_response_text))

    # Text-to-speech
    speech_file_path = audio_processor.text_to_voice(query_response_text)
    if not speech_file_path:
        print("Text-to-speech conversion failed.")
        return query_response_text, None

    print(f"Audio file saved at: {speech_file_path}")
    audio_processor.play_speech(speech_file_path)
    return query_response_text, speech_file_path

### Generate an image from the recommended action

In [5]:
def generate_image_recommendation(openai_api_key, crowd_analysis_output, transcription_output):
    """Generate an event-hall image informed by the crowd analysis and the conversation.

    Args:
        openai_api_key: API key handed to ``ImageProcessor``.
        crowd_analysis_output: Text of the crowd analysis; may be empty or ``None``.
        transcription_output: Text from the audio interaction; may be empty or ``None``.

    Returns:
        Whatever ``ImageProcessor.generate_image`` returns; the UI wires it to
        an image component.
    """
    dalle_model = "dall-e-2"
    image_size = "256x256"
    # OpenAI documents a 1000-character prompt limit for dall-e-2.
    max_prompt_chars = 1000

    # Empty UI components may arrive as None; do not interpolate the literal "None".
    crowd_text = str(crowd_analysis_output).strip() if crowd_analysis_output else ""
    transcription_text = str(transcription_output).strip() if transcription_output else ""

    # Built line by line so source-code indentation is not sent as part of
    # the prompt and does not eat into the character budget.
    template = "\n".join([
        "A spacious event hall with well-organized seating and ample walking space.",
        "The layout includes round tables if needed, clear pathways, and a stage for presentations.",
        "The venue accommodates the attendees comfortably, avoiding overcrowding.",
        "The ceiling has good lighting, and the walls are decorated with elegant banners.",
        "The image must take into consideration:",
        "",
        "Crowd Analysis:",
        "{crowd}",
        "",
        "Transcription:",
        "{transcription}",
        "",
        "Based on this information, the image should...",
    ])

    # Trim only the variable parts so the fixed instructions always survive:
    # the crowd analysis gets first call on the budget, the transcription the rest.
    budget = max(max_prompt_chars - len(template.format(crowd="", transcription="")), 0)
    crowd_text = crowd_text[:budget]
    transcription_text = transcription_text[:budget - len(crowd_text)]
    prompt = template.format(crowd=crowd_text, transcription=transcription_text)

    image_processor = ImageProcessor(openai_api_key)
    image_url = image_processor.generate_image(
        model=dalle_model,
        prompt=prompt,
        size=image_size,
    )
    return image_url

### Main App

In [None]:

# Build and launch the Gradio UI: three columns, one per step of the workflow
# (image analysis -> audio interaction -> image generation). Each button passes
# the API key plus the outputs of earlier steps to its handler.
openai_api_key = setup_openai_api_key()

with gr.Blocks() as demo:
    # The API key travels to the handlers as a State component rather than a global.
    openai_api_key_state = gr.State(openai_api_key)
    with gr.Row():  # Title
        gr.Markdown("# Multimodal AI App")
    with gr.Row():  # Description
        gr.Markdown("## Analyze images, ask and respond using audio, and generate an image based on a response")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Analyze an image")
            image_input = gr.Image(type="pil", label="Upload Image for Crowd Analysis")
            threshold_input = gr.Number(label="Crowd Density Threshold")
            crowd_analysis_output = gr.Markdown(label="Crowd Analysis Warning")
            analyze_button = gr.Button("Analyze")
            analyze_button.click(
                image_analyze,
                inputs=[openai_api_key_state, image_input, threshold_input],
                outputs=crowd_analysis_output,
            )
        with gr.Column():
            gr.Markdown("### Audio interaction")
            # Microphone is the only source, so the label says "Record", not "Upload".
            audio_input = gr.Audio(sources=["microphone"], label="Record Audio")
            audio_output = gr.Audio(type="filepath", label="The audio response from the agent")
            transcription_output = gr.Textbox(label="Audio Transcription")
            transcribe_button = gr.Button("Transcribe Audio")
            transcribe_button.click(
                audio_interaction,
                # The crowd analysis from the first column is passed in as context.
                inputs=[openai_api_key_state, audio_input, crowd_analysis_output],
                outputs=[transcription_output, audio_output],
            )

        with gr.Column():
            gr.Markdown("### Generate image based on the interaction")
            generated_image_output = gr.Image(label="Generated Image", type="filepath")
            generate_button = gr.Button("Generate image")
            generate_button.click(
                generate_image_recommendation,
                inputs=[openai_api_key_state, crowd_analysis_output, transcription_output],
                outputs=generated_image_output,
            )

demo.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Audio saved at: audios/audio-9a9476c9-1592-463b-891b-76f23c1d6513.mp3


> ### Prompt: Horofizdat!
> The crowd is estimated to be around 300 people. It's quite crowded, so ensure safety measures are in place.

> With an estimated crowd of 300 people, ensure appropriate safety measures are implemented due to the high attendance.

Audio file saved at: audios/audio-1eb7c9a9-81f6-4a3f-bc0e-d7cd3f0f07cb.mp3


Audio saved at: audios/audio-f238423a-cc68-4057-b229-22511022b4f5.mp3


> ### Prompt: how to fix that issue.
> The crowd exceeds the threshold of 200 with an estimated 500 people. The space appears too crowded!

> To address overcrowding, consider: adjusting layout to maximize space, limiting entry, utilizing nearby overflow areas, enhancing crowd control with signage and barriers, optimizing staff deployment, coordinating with security for safety, and communicating with attendees about waiting times or alternative options.

Audio file saved at: audios/audio-29d00d6b-c8d5-4598-8d9c-e73d36c2ce7c.mp3
