In [None]:
# --- System Prompt and JSON Format ---
from pydantic import BaseModel, Field, conlist, ValidationError
from typing import List, Literal, Optional

# Schema for the vegan verdict on a product: one label from a fixed set plus a
# free-text justification. Both fields are declared with `...`, i.e. required.
class VeganStatus(BaseModel):
    # Closed set of allowed labels (enforced through typing.Literal).
    determination: Literal[
        "Vegan",
        "Likely Vegan",
        "Not Vegan",
        "Contains Non-Vegan Ingredients",
        "Potentially Non-Vegan",
        "Undetermined"
    ] = Field(..., description="The determined vegan status of the product.")
    # Short explanation of the evidence behind the chosen label.
    reasoning: str = Field(..., description="Brief justification for the vegan determination, citing evidence or lack thereof.")

# Schema for everything extracted about the primary product. Text fields fall
# back to "N/A" and list fields to an empty list when omitted; only
# `vegan_status` has no default and must be supplied.
class PrimaryProductAnalysis(BaseModel):
    product_name: str = Field(
        default="N/A",
        description="Name of the product as extracted from images, or N/A if unreadable/not identified."
    )
    product_description: str = Field(
        default="N/A",
        description="Factual description from packaging (e.g., 'Instant Coffee'), or N/A if unreadable."
    )
    # default_factory=list gives every instance its own fresh list.
    product_composition: List[str] = Field(
        default_factory=list,
        description="List of ingredients extracted from the product packaging."
    )
    allergen_information_explicit: List[str] = Field(
        default_factory=list,
        description="List of allergens explicitly stated on the packaging (e.g., in a 'Contains:' section)."
    )
    allergen_information_inferred_from_ingredients: List[str] = Field(
        default_factory=list,
        description="List of allergens inferred from the product_composition list."
    )
    vegan_status: VeganStatus = Field(
        ..., # required: Ellipsis means the field has no default
        description="Detailed vegan status determination and reasoning."
    )
    other_extracted_text: List[str] = Field(
        default_factory=list,
        description="Any other relevant text snippets from product packaging not fitting elsewhere (e.g., net weight, best before date)."
    )

# Top-level schema of the model's reply: the audio transcription, the spoken
# answer for the user, and the nested product analysis. Only the transcription
# has a default ("N/A"); the other two fields are required.
class GeminiProductResponse(BaseModel):
    user_audio_transcription: str = Field(
        default="N/A",
        description="Transcription of the user's audio request, or N/A if no/inaudible audio."
    )
    ai_response_to_user: str = Field(
        ..., # required: Ellipsis means the field has no default
        description="The AI's concise, direct, audible-style answer to the user's request or a summary, focusing on the primary product."
    )
    primary_product_analysis: PrimaryProductAnalysis = Field(
        ..., # required: Ellipsis means the field has no default
        description="Detailed analysis of the primary product identified in the images."
    )

# Instruction prompt describing the assistant's tasks (transcribe audio, analyse
# the primary product, derive allergen and vegan information, phrase a spoken
# answer) and the JSON layout it must return. The field names in the JSON
# example match GeminiProductResponse / PrimaryProductAnalysis / VeganStatus.
# NOTE(review): the ```json fence opened in the "Output Format" section is never
# closed before the end of the string — confirm whether that is intentional.
PROMPT_FOR_GEMINI_MULTI_IMAGE_AUDIO = """
You are a highly specialized AI assistant designed to help Visually Impaired Persons (VIPs) understand product information, with a strong focus on allergen and vegan details. You will be provided with:

1.  A recorded audio of a user's request (if available). The user is likely holding the product they are asking about.
2.  Multiple images of a product (if available). These images will primarily feature the product the user is holding, which should be considered the **primary product**.

Your primary goal is to provide clear, concise, and actionable information to the VIP user. DO NOT ask the user to look at the product themselves or refer to visual elements as if they can see them (e.g., avoid "as you can see").

**Your Tasks:**

1.  **Transcribe User Audio:**
    *   If audio is provided, accurately transcribe the user's request.
    *   If no audio is provided or the audio is inaudible, this field in the JSON should be "N/A".

2.  **Analyze Primary Product Images:**
    *   Focus your analysis on the most prominent product in the images, presumed to be the one the user is holding (the **primary product**).
    *   Extract as much textual information as possible from the packaging of this primary product.
    *   Identify product name, description, composition (ingredients), explicitly stated allergen information, and any vegan indicators (certifications, logos, explicit statements).

3.  **Determine Allergen Information for the Primary Product:**
    *   Look for dedicated "Allergen Information" sections (e.g., "Contains: wheat, soy, milk", "May contain traces of nuts"). List these under `allergen_information_explicit`.
    *   If no explicit section is found, or to supplement it, carefully review the `product_composition` (ingredients list) for common allergens (e.g., milk, eggs, fish, shellfish, tree nuts, peanuts, wheat, soy, sesame, celery, mustard, lupin, sulphites). List these inferred allergens under `allergen_information_inferred_from_ingredients`. If an ingredient *is* an allergen, list the allergen itself.

4.  **Determine Vegan Status for the Primary Product:**
    *   **Explicit Indicators:** Prioritize finding vegan certification logos (e.g., "Certified Vegan") or explicit textual statements like "Suitable for Vegans."
    *   **Ingredient Analysis:** If no explicit indicators, analyze the `product_composition`. Identify common non-vegan ingredients (e.g., meat, poultry, fish, dairy (milk, cheese, whey, casein, lactose), eggs, honey, gelatin, carmine, lanolin, shellac, isinglass). Also note ingredients that are often, but not always, non-vegan and may require further checking if the user had that ability (though you cannot ask them to do this).
    *   **Determination Categories for `vegan_status.determination`:**
        *   **"Vegan"**: If a clear vegan certification/statement is present.
        *   **"Not Vegan"**: If definitive non-vegan ingredients are listed (e.g., milk, eggs, meat).
        *   **"Contains Non-Vegan Ingredients"**: Similar to "Not Vegan," use if specific non-vegan items are identified.
        *   **"Likely Vegan"**: If no explicit certification, but ingredients strongly suggest it (e.g., a product named "Plant-Based Burger" with no obvious animal products). State the basis.
        *   **"Potentially Non-Vegan"**: If ingredients are ambiguous or include items that are often animal-derived but not always (e.g., "natural flavors," some emulsifiers without source specified), and no vegan certification is present. State the basis.
        *   **"Undetermined"**: If very little information is available to make a call.
    *   **Reasoning:** Always provide a brief, clear `reasoning` for the vegan status determination, mentioning key evidence (e.g., "Vegan certified logo present.", "Contains: milk powder, egg yolk.", "Ingredient list suggests vegan, but no certification found.").

5.  **Formulate AI Response to User:**
    *   This is the most crucial part for the user to hear. It should be a direct, audible-style answer.
    *   If there was a `user_audio_transcription` with a specific question, answer that question directly using the information gathered from the `primary_product_analysis`.
    *   If the user's request was general (e.g., "Tell me about this") or if there was no audio, provide a concise summary of the primary product, prioritizing its name, key allergen information, and vegan status.
    *   If information is scarce for the primary product (e.g., blurry image, little text), and you have to make an *inferred assessment* (especially for vegan status or allergens), clearly state the basis for your inference and the level of uncertainty. For example: "Based on the product name 'Soy Milk' and the partial ingredient 'soybeans' I could identify, it is likely vegan, but I cannot confirm this without a full ingredient list or certification." or "I could only read 'wheat flour' in the ingredients, so it contains wheat. I cannot determine other allergens or its vegan status from the available information."
    *   If no primary product can be reasonably analyzed from the images, state that you were unable to analyze the product visually.

**Output Format:**

Return a **single JSON object** strictly following this structure:

```json
{
  "user_audio_transcription": "string (transcription of user's audio, or N/A if no/inaudible audio)",
  "ai_response_to_user": "string (your concise, direct, audible-style answer to the user's request or a summary, focusing on allergens and vegan status of the primary product)",
  "primary_product_analysis": {
    "product_name": "string (from images, or N/A if no product clearly identified as primary or name unreadable)",
    "product_description": "string (factual description from packaging, e.g., 'Instant Coffee', 'Milk Chocolate Bar', or N/A if unreadable)",
    "product_composition": [
      "string (each listed ingredient)",
      ...
    ],
    "allergen_information_explicit": [
      "string (each explicitly stated allergen, e.g., 'Contains wheat, soy.')",
      ...
    ],
    "allergen_information_inferred_from_ingredients": [
      "string (each allergen inferred from the composition list, e.g., 'milk', 'egg')",
      ...
    ],
    "vegan_status": {
      "determination": "Vegan" | "Likely Vegan" | "Not Vegan" | "Contains Non-Vegan Ingredients" | "Potentially Non-Vegan" | "Undetermined",
      "reasoning": "string (brief justification for the vegan determination)"
    },
    "other_extracted_text": [
      "string (any other relevant text snippets from the product packaging if not fitting elsewhere, e.g., 'Net Wt 100g', 'Best before 12/2025')",
      ...
    ]
  }
}
"""

In [None]:
import cv2
import os
import shutil
import time
import datetime
from ultralytics import YOLO
from google import genai as google_genai_SDK
from google.genai import types as google_genai_types_SDK
import typing
import numpy as np
import json
import collections
import threading
import torch
from concurrent.futures import ThreadPoolExecutor, as_completed
import pyaudio
from scipy import signal
import wave 
from TTS.api import TTS
import playsound 

# --- Configuration & Constants ---
# -- User-configurable items (prompted for at startup, otherwise these defaults are used) --
VIDEO_DEVICE_DEFAULT = "/dev/video0"            # Default camera device
VAD_SELECTED_DEVICE_INDEX_DEFAULT = 6           # Default audio input device index
GEMINI_API_KEY: typing.Optional[str] = None     # Asked for when the script is run

# -- General --
TARGET_FPS = 30                                 # Target FPS for YOLO, the frame buffer, etc.
BUFFER_DURATION_SECONDS = 8                     # Seconds of frames kept in the buffer
RECORD_DURATION_SECONDS = 6                     # Seconds of buffered frames that get processed
FRAME_BUFFER_MAXLEN = int(TARGET_FPS * BUFFER_DURATION_SECONDS)         # Maximum number of frames held in the buffer
OUTPUT_BASE_PATH = '/media/gamedisk/!KULIAHH/!Skripsi/github_output'    # Base path for debug photos, audio and JSON
MAX_WORKERS_CPU_BOUND = os.cpu_count() or 4     # Thread-pool size; falls back to 4 when cpu_count() returns None
DISPLAY_MAX_WIDTH = 1280
DISPLAY_WINDOW_NAME = "Real-time Vision-Audio Assistant - SPACE: Manual | Q: Quit"
NUM_CLEAREST_CROPS_PER_SECOND_TO_SELECT = 1     # Number of crops per second selected as the clearest

# -- Camera Settings (set True if using webcam) --
WEBCAM_SET_PROPERTIES = False                   # Master switch to attempt setting custom camera properties
WEBCAM_DESIRED_WIDTH = 1920                     # Desired camera width
WEBCAM_DESIRED_HEIGHT = 1080                    # Desired camera height
WEBCAM_DESIRED_FPS = 30                         # Desired camera frame rate
WEBCAM_FOURCC_CODEC = 'MJPG'                    # Desired codec (e.g., 'MJPG', 'YUYV')

# -- TTS --
AI_AUDIO_OUTPUT_ENABLED = True                  # Master switch for audio output
TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"

# -- YOLO --
YOLO_MODEL_PATH = 'detection_model_02.pt'
YOLO_INFERENCE_SIZE_WIDTH = 640

# -- JPEG Buffer --
JPEG_QUALITY = 85                               # Higher = more detail, but larger files and longer upload/processing time

# -- Gemini --
GEMINI_MODEL_NAME = 'gemini-2.0-flash'          # Alternative: 'gemini-2.0-flash-lite', 'gemini-2.0-flash', 'gemini-1.5-flash'


# -- Silero VAD & Audio Recording --
VAD_ENABLED = True                              # Master switch for VAD functionality
VAD_DEVICE_CHANNELS = 2                         # Common: 1 or 2. Must match your device.
VAD_DEVICE_FORMAT = pyaudio.paInt16             # Common: paInt16, paInt32. Must match.
VAD_DEVICE_SAMPLE_RATE_HZ = 48000               # Common: 48000, 44100, 16000. Try device default.

SILERO_VAD_TARGET_SAMPLE_RATE = 16000           # Silero VAD expects 16kHz
SILERO_VAD_CHUNK_SAMPLES = 512                  # Samples per VAD chunk (32ms @ 16kHz)

VAD_SPEECH_CONFIDENCE_THRESHOLD = 0.5           # Confidence above which a chunk counts as speech
MIN_SPEECH_DURATION_S = 0.1                     # Minimum duration of speech (seconds) before recording starts
MIN_SILENCE_AFTER_SPEECH_S = 2                  # Minimum duration of silence (seconds) that ends the recording


# --- Global Shared State ---
# Bounded buffer of recent frames; the oldest entries drop off once maxlen is reached.
# presumably holds (jpeg_bytes, bboxes, timestamp) tuples — verify against the producer.
frame_buffer = collections.deque(maxlen=FRAME_BUFFER_MAXLEN)
processing_active = False                       # Read by the VAD thread to pause itself while the pipeline runs
gui_status_message = "Standby"                  # Status text, also updated by the TTS function
last_ai_response_for_display = ""
shared_data_lock = threading.Lock()             # Guards the shared flags/status accessed from several threads

# Handles that start as None and are filled in by the initialize_* / configure_* functions.
yolo_model_global: typing.Optional[YOLO] = None
gemini_client_global: typing.Optional[google_genai_SDK.client.Client] = None
vad_model_global: typing.Any = None
pyaudio_instance: typing.Optional[pyaudio.PyAudio] = None
audio_stream: typing.Optional[pyaudio.Stream] = None
# Parameters of the opened audio stream ('rate', 'channels', 'format', 'width_bytes', 'frames_per_buffer').
actual_audio_stream_params: typing.Dict[str, typing.Any] = {}

# State owned by the VAD monitor thread.
vad_thread_running = True                       # Loop condition of the VAD thread
vad_confidence_current = 0.0                    # Latest speech confidence from the VAD model
vad_is_currently_speaking = False               # True when the latest confidence exceeds the threshold
vad_is_recording_audio = False                  # True while raw audio chunks are being collected
vad_audio_chunks_for_processing = []            # Raw audio byte chunks of the current recording
vad_trigger_processing_after_speech = False     # Set by the VAD thread once a recording has ended

tts_model_global: typing.Optional[TTS] = None
tts_active_lock = threading.Lock()              # Lock to prevent multiple TTS outputs playing at once

# --- Helper Functions ---
def calculate_sharpness(image: np.ndarray) -> float:
    """Return the variance of the Laplacian of `image` as a sharpness score.

    A 3-dimensional array is converted from BGR to grayscale first; a
    2-dimensional array is used as it is.

    Raises:
        ValueError: if the array is neither 2- nor 3-dimensional.
    """
    dims = image.ndim
    if dims not in (2, 3):
        raise ValueError("Unsupported image for sharpness.")
    grayscale = image if dims == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()

def initialize_yolo_model(model_path: str) -> YOLO:
    """Load the YOLO weights at `model_path` and report the device they sit on.

    The cuda/cpu choice computed here is only printed; the model is not moved
    to that device explicitly by this function.

    Returns:
        The loaded YOLO model.

    Raises:
        Exception: any error raised while loading is printed and re-raised.
    """
    print(f"Loading YOLO: {model_path}")
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"  PyTorch CUDA available: {torch.cuda.is_available()}. YOLO will attempt to use: {device}")
    try:
        model = YOLO(model_path)
        # Peek at the first parameter (if any) to find out where the weights live.
        first_param = next(model.parameters(), None)
        if first_param is None:
            print(f"  Could not confirm YOLO model device, assuming {device} if PyTorch reports CUDA.")
        else:
            print(f"  YOLO model parameters are on device: {first_param.device}")
        print("YOLO model loaded.")
        return model
    except Exception as e:
        print(f"Error loading YOLO: {e}")
        raise

def configure_gemini(api_key_to_use: str) -> google_genai_SDK.client.Client:
    """Build a Gemini API client from the given API key.

    Raises:
        ValueError: if `api_key_to_use` is empty or otherwise falsy.
    """
    if api_key_to_use:
        gemini_client = google_genai_SDK.Client(api_key=api_key_to_use)
        print("Gemini API client configured.")
        return gemini_client
    raise ValueError("Gemini API key is not set. Please provide it at script startup.")

def setup_output_directories(base_path: str) -> tuple[str, str, str, str, str]:
    """Create a fresh, timestamped run directory with its four sub-directories.

    The run directory is named after the current local time (YYYYmmdd_HHMMSS).
    Any directory that already exists at one of the target paths is deleted
    before it is recreated.

    Returns:
        (run_dir, decoded_frames_dir, user_audio_dir, yolo_crops_dir, clear_sel_dir)
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(base_path, stamp)
    frames_dir = os.path.join(run_dir, "01_decoded_frames")
    audio_dir = os.path.join(run_dir, "00_user_audio")
    crops_dir = os.path.join(run_dir, "02_yolo_crops")
    clearest_dir = os.path.join(run_dir, "03_clear_sel")

    for target in (run_dir, frames_dir, audio_dir, crops_dir, clearest_dir):
        if os.path.exists(target):
            # Wipe leftovers from an earlier run that used the same path.
            shutil.rmtree(target)
        os.makedirs(target)

    print(f"Created output dirs for run: {run_dir}")
    return run_dir, frames_dir, audio_dir, crops_dir, clearest_dir

# --- Audio Processing Utilities ---
def convert_int_to_float_audio(audio_data_int: np.ndarray, input_format: int) -> np.ndarray:
    """Scale integer PCM samples to float32.

    paInt16 input is divided by 32768.0, paInt32 input by the int32 maximum.

    Raises:
        ValueError: for any other `input_format`.
    """
    if input_format == pyaudio.paInt16:
        divisor = 32768.0
    elif input_format == pyaudio.paInt32:
        divisor = np.iinfo(np.int32).max
    else:
        raise ValueError("Unsupported input format for audio conversion")
    return audio_data_int.astype(np.float32) / divisor

def resample_audio_scipy(audio_float: np.ndarray, original_rate: int, target_rate: int) -> np.ndarray:
    """Resample float audio from `original_rate` to `target_rate` with SciPy.

    The input is returned untouched when the rates already match or when it is
    empty. If the computed output length is zero, an empty float32 array is
    returned; otherwise the resampled signal is returned as float32.
    """
    if original_rate == target_rate:
        return audio_float
    sample_count = len(audio_float)
    if sample_count == 0:
        return audio_float

    output_length = int(sample_count * target_rate / original_rate)
    if output_length != 0:
        resampled = signal.resample(audio_float, output_length)
        return resampled.astype(np.float32)
    return np.array([], dtype=np.float32)

def save_audio_to_wav(audio_frames_list: list, filepath: str, channels: int, sample_rate: int, sample_width_bytes: int) -> bool:
    """Write a list of raw audio byte chunks to `filepath` as one WAV file.

    Returns:
        True when the file was written; False when the list is empty or when
        writing failed (the error is printed rather than raised).
    """
    saved = False
    if audio_frames_list:
        try:
            with wave.open(filepath, 'wb') as wav_out:
                wav_out.setnchannels(channels)
                wav_out.setsampwidth(sample_width_bytes)
                wav_out.setframerate(sample_rate)
                # The chunks are concatenated and written as a single payload.
                wav_out.writeframes(b''.join(audio_frames_list))
            print(f"Audio saved to {filepath}")
            saved = True
        except Exception as e:
            print(f"Error saving WAV file: {e}")
    else:
        print("Warning: No audio frames to save.")
    return saved

# --- VAD Model and PyAudio Initialization ---
def initialize_vad_and_audio_stream(
    selected_device_idx: int,
    req_channels: int,
    req_format: int,
    req_sample_rate_hz: typing.Optional[int]
    ) -> bool:
    """Load the Silero VAD model and open the PyAudio input stream.

    On success the module-level model, PyAudio instance and stream are set and
    the stream parameters are recorded in `actual_audio_stream_params`. On any
    failure the error is printed, `VAD_ENABLED` is switched off and False is
    returned.

    Args:
        selected_device_idx: index of the input device to open.
        req_channels: channel count requested for the stream.
        req_format: PyAudio sample format requested for the stream.
        req_sample_rate_hz: requested sample rate; when falsy the device's
            default sample rate is used instead.

    Returns:
        True if both the model and the stream are ready, False otherwise
        (including when VAD is disabled by configuration).
    """
    global vad_model_global, pyaudio_instance, audio_stream, VAD_ENABLED, actual_audio_stream_params

    if not VAD_ENABLED:
        print("VAD is disabled by configuration.")
        return False

    print("Initializing Silero VAD and PyAudio...")
    try:
        torch.set_num_threads(1)
        vad_model_global, _hub_utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad',
            model='silero_vad',
            force_reload=False,
            onnx=False,
        )
        print("VAD model loaded.")
    except Exception as load_err:
        print(f"Error loading VAD model: {load_err}. VAD will be disabled.")
        VAD_ENABLED = False
        return False

    pyaudio_instance = pyaudio.PyAudio()

    try:
        device_info = pyaudio_instance.get_device_info_by_index(selected_device_idx)
        print(f"Using VAD Device: {device_info['name']} (Index: {selected_device_idx})")
    except Exception as info_err:
        print(f"Error getting device info for index {selected_device_idx}: {info_err}")
        pyaudio_instance.terminate()
        VAD_ENABLED = False
        return False

    stream_rate = req_sample_rate_hz or int(device_info['defaultSampleRate'])

    # Size the PyAudio buffer so one read spans the same duration as one VAD chunk.
    chunk_seconds = SILERO_VAD_CHUNK_SAMPLES / SILERO_VAD_TARGET_SAMPLE_RATE
    frames_per_buffer = int(stream_rate * chunk_seconds)

    try:
        audio_stream = pyaudio_instance.open(
            format=req_format,
            channels=req_channels,
            rate=stream_rate,
            input=True,
            input_device_index=selected_device_idx,
            frames_per_buffer=frames_per_buffer,
        )
        format_label = 'Int32' if req_format == pyaudio.paInt32 else 'Int16'
        print(f"Audio stream opened: {stream_rate}Hz, {req_channels}ch, "
              f"Format {format_label}, "
              f"Buffer {frames_per_buffer} frames.")

        # Remember what the stream was really opened with so audio can be saved correctly later.
        actual_audio_stream_params['rate'] = stream_rate
        actual_audio_stream_params['channels'] = req_channels
        actual_audio_stream_params['format'] = req_format
        actual_audio_stream_params['width_bytes'] = pyaudio_instance.get_sample_size(req_format)
        actual_audio_stream_params['frames_per_buffer'] = frames_per_buffer

        return True
    except Exception as open_err:
        print(f"Error opening PyAudio stream: {open_err}. VAD will be disabled.")
        VAD_ENABLED = False
        if pyaudio_instance:
            pyaudio_instance.terminate()
        return False

# --- VAD Monitor Thread Helpers ---
def _process_vad_audio_chunk(raw_bytes: bytes, stream_format: int, stream_channels: int, stream_rate: int, target_rate: int) -> np.ndarray:
    """Turn one raw PyAudio chunk into resampled mono float audio for the VAD.

    The bytes are read as int32 for paInt32 and as int16 otherwise. For
    multi-channel input every `stream_channels`-th sample, starting with the
    first, is kept. An empty chunk yields an empty float32 array.
    """
    sample_dtype = np.int32 if stream_format == pyaudio.paInt32 else np.int16
    samples = np.frombuffer(raw_bytes, dtype=sample_dtype)
    if samples.size == 0:
        return np.array([], dtype=np.float32)

    if stream_channels > 1:
        samples = samples[::stream_channels]
    samples_float = convert_int_to_float_audio(samples, stream_format)
    return resample_audio_scipy(samples_float, stream_rate, target_rate)

def _prepare_vad_input_tensor(audio_resampled: np.ndarray, silero_chunk_samples: int) -> torch.Tensor:
    """Fit resampled audio to exactly `silero_chunk_samples` samples as a tensor.

    Shorter input is zero-padded at the end, longer input is cut off, and empty
    input produces an all-zero float32 tensor.
    """
    if audio_resampled.size == 0:
        # Nothing to analyse: hand back zeros so the caller does not fail.
        return torch.zeros(silero_chunk_samples, dtype=torch.float32)

    shortfall = silero_chunk_samples - len(audio_resampled)
    if shortfall > 0:
        fitted = np.pad(audio_resampled, (0, shortfall), 'constant')
    else:
        fitted = audio_resampled[:silero_chunk_samples]
    return torch.from_numpy(fitted)

# --- VAD Monitor Thread ---
def vad_monitor_thread_function():
    """Thread body: read microphone audio, run the VAD and record speech segments.

    Returns immediately when VAD is disabled or the stream/model is missing.
    Otherwise it loops while `vad_thread_running` is true and, per chunk:
      * discards the audio and resets its local state while `processing_active`;
      * publishes the speech confidence and speaking flag under the shared lock;
      * starts recording once speech has lasted MIN_SPEECH_DURATION_S without a
        silent chunk in between;
      * ends the recording after MIN_SILENCE_AFTER_SPEECH_S of silence and sets
        `vad_trigger_processing_after_speech`;
      * appends the raw chunk to `vad_audio_chunks_for_processing` while recording.
    When the loop ends, the stream is stopped and closed and PyAudio is terminated.
    """
    global vad_thread_running, vad_confidence_current, vad_is_currently_speaking
    global vad_is_recording_audio, vad_audio_chunks_for_processing, vad_trigger_processing_after_speech
    global shared_data_lock, gui_status_message, processing_active, actual_audio_stream_params

    if not VAD_ENABLED or not audio_stream or not vad_model_global:
        print("VAD thread: VAD not enabled or stream/model not initialized. Exiting.")
        return

    print("🎤 VAD Monitor Thread Started.")
    # A value of 0 means "no speech/silence period is currently being timed".
    speech_start_time, silence_start_time = 0, 0
    speech_detected_in_current_segment = False
    vad_paused_due_to_processing = False

    # Stream parameters used by PyAudio (configured defaults if none were recorded)
    pyaudio_rate = actual_audio_stream_params.get('rate', VAD_DEVICE_SAMPLE_RATE_HZ)
    pyaudio_channels = actual_audio_stream_params.get('channels', VAD_DEVICE_CHANNELS)
    pyaudio_format = actual_audio_stream_params.get('format', VAD_DEVICE_FORMAT)
    pyaudio_frames_per_chunk = actual_audio_stream_params.get('frames_per_buffer', 512)


    while vad_thread_running:
        try:
            # The chunk is read even while paused, so audio captured during processing is dropped.
            raw_audio_chunk_bytes = audio_stream.read(pyaudio_frames_per_chunk, exception_on_overflow=False)

            # Pause VAD logic while the main processing pipeline is running
            with shared_data_lock: is_pipeline_processing = processing_active
            
            if is_pipeline_processing:
                # Clear the published VAD values only once, on entering the paused state.
                if not vad_paused_due_to_processing:
                    with shared_data_lock:
                        vad_confidence_current = 0.0
                        vad_is_currently_speaking = False
                vad_paused_due_to_processing = True
                speech_start_time, silence_start_time = 0, 0
                speech_detected_in_current_segment = False
                time.sleep(0.01); continue
            
            if vad_paused_due_to_processing and not is_pipeline_processing:
                vad_paused_due_to_processing = False

            # Process audio chunk for VAD
            audio_resampled = _process_vad_audio_chunk(raw_audio_chunk_bytes, pyaudio_format, pyaudio_channels, int(pyaudio_rate), SILERO_VAD_TARGET_SAMPLE_RATE)
            if audio_resampled.size == 0: continue
            
            vad_input_tensor = _prepare_vad_input_tensor(audio_resampled, SILERO_VAD_CHUNK_SAMPLES)
            current_confidence = vad_model_global(vad_input_tensor, SILERO_VAD_TARGET_SAMPLE_RATE).item()
            
            with shared_data_lock: 
                vad_confidence_current = current_confidence
                vad_is_currently_speaking = current_confidence > VAD_SPEECH_CONFIDENCE_THRESHOLD

            # Decide when to start and stop recording: start after MIN_SPEECH_DURATION_S of speech,
            # stop after MIN_SILENCE_AFTER_SPEECH_S of silence
            is_speaking_now = current_confidence > VAD_SPEECH_CONFIDENCE_THRESHOLD
            if is_speaking_now:
                silence_start_time = 0 
                if speech_start_time == 0: speech_start_time = time.time()
                
                if (time.time() - speech_start_time) >= MIN_SPEECH_DURATION_S:
                    with shared_data_lock:
                        # Start a new recording only if none is running and the pipeline is idle.
                        if not vad_is_recording_audio and not processing_active:
                            vad_is_recording_audio = True 
                            vad_audio_chunks_for_processing.clear() 
                            speech_detected_in_current_segment = True
            else: # Silence
                if speech_detected_in_current_segment:
                    if silence_start_time == 0: silence_start_time = time.time()
                    if (time.time() - silence_start_time) >= MIN_SILENCE_AFTER_SPEECH_S:
                        with shared_data_lock:
                            if vad_is_recording_audio and not processing_active:
                                vad_trigger_processing_after_speech = True 
                                vad_is_recording_audio = False 
                        speech_detected_in_current_segment = False 
                        silence_start_time = 0 
                # Any silent chunk restarts the speech-duration timer.
                speech_start_time = 0 

            # Append the audio chunk to the buffer if recording is active
            with shared_data_lock:
                if vad_is_recording_audio:
                    vad_audio_chunks_for_processing.append(raw_audio_chunk_bytes)

        except IOError as e:
            if e.errno == pyaudio.paInputOverflowed: 
                print("VAD Thread: PyAudio Input Overflowed.")
            else: 
                print(f"VAD Thread IOError: {e}")
            time.sleep(0.01)
        except Exception as e:
            # Errors are printed and the loop keeps running after a short pause.
            print(f"VAD Thread Error: {e}")
            time.sleep(0.1) 

    # Loop ended: release the audio resources.
    if audio_stream: audio_stream.stop_stream(); audio_stream.close()
    if pyaudio_instance: pyaudio_instance.terminate()
    print("🎤 VAD Monitor Thread Stopped.")

# --- Audio Output Functions ---
def initialize_tts_model(model_name: str) -> typing.Optional[TTS]:
    """Load the Coqui TTS model named `model_name` onto CUDA if available, else CPU.

    Returns:
        The loaded model, or None when audio output is disabled by
        configuration or loading failed. A load failure is printed and also
        switches `AI_AUDIO_OUTPUT_ENABLED` off.
    """
    global AI_AUDIO_OUTPUT_ENABLED

    if not AI_AUDIO_OUTPUT_ENABLED:
        print("AI audio output is disabled by configuration.")
        return None

    try:
        print(f"Loading TTS model: {model_name}...")
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        print(f"  TTS will attempt to use device: {device}")
        tts_instance = TTS(model_name=model_name)
        tts_instance = tts_instance.to(device)
        print("TTS model loaded.")
        return tts_instance
    except Exception as e:
        print(f"Error loading TTS model: {e}. AI audio output will be disabled.")
        # A failed load turns speech output off.
        AI_AUDIO_OUTPUT_ENABLED = False
        return None
    
def generate_and_play_speech_threaded(text_to_speak: str, output_audio_filepath: str):
    """Synthesize `text_to_speak` to a WAV file and play it back.

    Meant to run in its own thread. The request is skipped when no TTS model is
    loaded, when audio output is disabled, or when another speech request
    already holds `tts_active_lock`. Progress and failures are reflected in
    `gui_status_message`; errors are printed instead of raised.
    """
    global gui_status_message, shared_data_lock, tts_model_global

    def _publish_status(message: str) -> None:
        # Update the GUI status text under the shared lock.
        global gui_status_message
        with shared_data_lock:
            gui_status_message = message

    if not (tts_model_global and AI_AUDIO_OUTPUT_ENABLED):
        print("TTS model not available or AI audio output disabled. Skipping speech.")
        return

    # Non-blocking acquire: give up right away if speech is already in progress.
    lock_acquired = tts_active_lock.acquire(blocking=False)
    if not lock_acquired:
        print("TTS is already active. Skipping new speech request.")
        return

    try:
        _publish_status("AI: Generating speech...")
        print(f"TTS: Generating speech for: \"{text_to_speak[:50]}...\"")

        # Render the speech into a WAV file first, then play that file.
        tts_model_global.tts_to_file(text=text_to_speak, file_path=output_audio_filepath)
        print(f"TTS: Speech saved to {output_audio_filepath}")

        _publish_status("AI: Speaking...")
        playsound.playsound(output_audio_filepath)

        # Reset the status only if nothing else changed it during playback.
        with shared_data_lock:
            if gui_status_message == "AI: Speaking...":
                gui_status_message = "Standby (AI spoken)"
    except AttributeError as ae:
        print(f"TTS Error (Attribute): {ae}. Speech generation/playback failed.")
        _publish_status("AI: Speech Error (Attr)")
    except Exception as e:
        print(f"TTS Error: {e}. Speech generation/playback failed.")
        _publish_status("AI: Speech output failed.")
    finally:
        # Always free the lock so the next request can be processed.
        tts_active_lock.release()

# --- Core Processing Logic Functions ---
def decode_jpeg_and_save_frame(jpeg_bytes_bboxes_idx_ts_fname):
    """Decode one buffered JPEG frame and write it to disk.

    Args:
        jpeg_bytes_bboxes_idx_ts_fname: 5-tuple of
            (original_idx, jpeg_bytes, bboxes, capture_timestamp, output_path).

    Returns:
        (decoded_frame, original_idx, bboxes, capture_timestamp, output_path)
        on success, or (None, original_idx, None, None, None) when the bytes
        cannot be decoded or an error occurs. Failures are printed instead of
        being swallowed silently, so dropped frames show up in the log.
    """
    original_idx, jpeg_bytes, bboxes_for_this_frame, abs_capture_timestamp, frame_storage_path = jpeg_bytes_bboxes_idx_ts_fname
    failure_result = (None, original_idx, None, None, None)
    try:
        decoded_frame_array = cv2.imdecode(np.frombuffer(jpeg_bytes, np.uint8), cv2.IMREAD_COLOR)
        if decoded_frame_array is None:
            return failure_result
        # imwrite signals failure through its return value; the decoded frame is
        # still usable, so only warn and carry on.
        if not cv2.imwrite(frame_storage_path, decoded_frame_array):
            print(f"Warning: could not write decoded frame to {frame_storage_path}")
        return decoded_frame_array, original_idx, bboxes_for_this_frame, abs_capture_timestamp, frame_storage_path
    except Exception as e:
        print(f"Error decoding/saving frame {original_idx}: {e}")
        return failure_result

def save_segment_and_prepare_data_for_cropping(
    frames_snapshot_from_buffer: list[tuple[bytes, list, float]], current_trigger_time: float,
    segment_duration_sec: int, all_frames_storage_dir: str
) -> list[tuple[np.ndarray, list, int, float]]:
    """Decode the most recent buffered frames in parallel and prepare them for cropping.

    Only frames whose timestamp falls within `segment_duration_sec` before
    `current_trigger_time` are kept. Each kept frame is decoded and written
    into `all_frames_storage_dir` by `decode_jpeg_and_save_frame`.

    Returns:
        A list of (frame, bboxes, index, relative_timestamp) ordered by index,
        where relative_timestamp is measured from the first successfully
        decoded frame. Empty when no frame qualifies or none could be decoded.
    """
    cutoff_time = current_trigger_time - segment_duration_sec
    recent_frames = [
        (jpeg_bytes, bboxes_list, abs_ts)
        for jpeg_bytes, bboxes_list, abs_ts in frames_snapshot_from_buffer
        if abs_ts >= cutoff_time
    ]
    decode_tasks = [
        (seq, jpeg_bytes, bboxes_list, abs_ts,
         os.path.join(all_frames_storage_dir, f"decoded_frame_{seq:04d}.jpg"))
        for seq, (jpeg_bytes, bboxes_list, abs_ts) in enumerate(recent_frames)
    ]
    if not decode_tasks:
        return []

    decoded_records = []
    started_at = time.perf_counter()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_CPU_BOUND) as pool:
        pending = [pool.submit(decode_jpeg_and_save_frame, task) for task in decode_tasks]
        for finished in as_completed(pending):
            frame, frame_idx, boxes, captured_at, _saved_path = finished.result()
            if frame is not None:
                decoded_records.append((frame, boxes, frame_idx, captured_at))
    print(f"  JPEG decoding & saving segment took {time.perf_counter() - started_at:.2f}s.")

    if not decoded_records:
        return []

    # Workers finish in arbitrary order; restore the original frame order.
    decoded_records.sort(key=lambda record: record[2])
    first_capture_time = decoded_records[0][3]
    final_data = [
        (frame, boxes, frame_idx, captured_at - first_capture_time)
        for frame, boxes, frame_idx, captured_at in decoded_records
    ]
    print(f"Decoded frames saved. Prepared {len(final_data)} for cropping.")
    return final_data

def crop_products_from_frames(
    data_for_cropping: list[tuple[np.ndarray, list, int, float]],
    cropped_products_storage_dir: str
) -> list[tuple[np.ndarray, int, int, float, str]]:
    """Cut every detected box out of its frame and write each crop as a JPEG.

    Args:
        data_for_cropping: ``(frame, bboxes, frame_index, relative_timestamp)``
            tuples; each bbox is a 6-tuple whose first four items are
            ``x1, y1, x2, y2``.
        cropped_products_storage_dir: Directory the crop files are written to.

    Returns:
        One ``(crop_copy, frame_index, crop_index_in_frame, relative_timestamp,
        file_path)`` tuple per crop saved. Frames without boxes and boxes
        that collapse after clamping to the frame are skipped.
    """
    if not data_for_cropping:
        return []

    print(f"Cropping products from {len(data_for_cropping)} frames...")
    crop_records = []
    for frame, boxes, frame_idx, rel_ts in data_for_cropping:
        if not boxes:
            continue
        frame_h, frame_w = frame.shape[:2]
        kept_in_frame = 0  # numbers only the crops actually saved for this frame
        for left, top, right, bottom, _label, _score in boxes:
            left = max(0, left)
            top = max(0, top)
            right = min(frame_w - 1, right)
            bottom = min(frame_h - 1, bottom)
            if left >= right or top >= bottom:
                continue
            region = frame[top:bottom, left:right]
            if region.size == 0:
                continue
            out_path = os.path.join(
                cropped_products_storage_dir,
                f"dec_f{frame_idx:04d}_p{kept_in_frame:02d}.jpg",
            )
            cv2.imwrite(out_path, region)
            # Store a copy so the record does not keep a view into the full frame.
            crop_records.append((region.copy(), frame_idx, kept_in_frame, rel_ts, out_path))
            kept_in_frame += 1

    print(f"Cropping done. Saved {len(crop_records)} crops.")
    return crop_records

def calculate_sharpness_for_item(item_with_idx_data):
    """Score one crop with ``calculate_sharpness``; thread-pool friendly wrapper.

    Args:
        item_with_idx_data: ``(position, crop_image, frame_index,
            crop_index_in_frame, relative_timestamp)``.

    Returns:
        ``(position, sharpness, crop_image, frame_index, crop_index_in_frame,
        relative_timestamp)`` - the input with the score inserted after the
        position so the caller can match results back to their tasks.
    """
    position, crop_image, frame_index, crop_index, rel_ts = item_with_idx_data
    return position, calculate_sharpness(crop_image), crop_image, frame_index, crop_index, rel_ts

def select_and_save_clearest_crops(
    all_cropped_products_info: list[tuple[np.ndarray, int, int, float, str]],
    clearest_selected_storage_dir: str, num_to_select_per_second: int
) -> list[str]:
    """Keep the sharpest crops of each one-second bucket and write them to disk.

    Sharpness is computed for every crop on a thread pool. Crops are then
    bucketed by the whole second of their relative timestamp, and the
    ``num_to_select_per_second`` highest-scoring crops of each bucket are
    saved.

    Args:
        all_cropped_products_info: ``(crop, frame_index, crop_index_in_frame,
            relative_timestamp, source_path)`` tuples; the source path is not
            used here.
        clearest_selected_storage_dir: Directory the selected crops are written to.
        num_to_select_per_second: Maximum number of crops kept per bucket.

    Returns:
        Paths of the written files, ordered by bucket and then by rank.
    """
    if not all_cropped_products_info:
        return []

    num_crops = len(all_cropped_products_info)
    print(f"Calculating sharpness for {num_crops} crops in parallel (Max workers: {MAX_WORKERS_CPU_BOUND})...")
    jobs = [
        (position, record[0], record[1], record[2], record[3])
        for position, record in enumerate(all_cropped_products_info)
    ]

    # Results arrive in completion order; key them by position so the
    # original crop order can be restored afterwards.
    scored_by_position = {}
    started_at = time.perf_counter()
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_CPU_BOUND) as pool:
        pending = [pool.submit(calculate_sharpness_for_item, job) for job in jobs]
        for finished in as_completed(pending):
            position, score, image, frame_idx, crop_idx, rel_ts = finished.result()
            scored_by_position[position] = (score, image, frame_idx, crop_idx, rel_ts)
    print(f"  Sharpness calculation took {time.perf_counter() - started_at:.2f} seconds.")

    # Group detected crops by the second in which they appear
    per_second = collections.defaultdict(list)
    for position in sorted(scored_by_position):
        score, image, frame_idx, crop_idx, rel_ts = scored_by_position[position]
        per_second[int(rel_ts // 1.0)].append((score, image, frame_idx, crop_idx))

    selected_paths = []
    for second in sorted(per_second):
        # Stable descending sort: equal scores keep their original crop order.
        ranked = sorted(per_second[second], key=lambda entry: entry[0], reverse=True)
        for rank, (_score, image, frame_idx, crop_idx) in enumerate(ranked[:num_to_select_per_second]):
            out_name = f"sel_sec{second:02d}_rank{rank:02d}_orig{frame_idx:04d}_crop{crop_idx:02d}.jpg"
            out_path = os.path.join(clearest_selected_storage_dir, out_name)
            cv2.imwrite(out_path, image)
            selected_paths.append(out_path)

    print(f"Saved {len(selected_paths)} clearest crops to {clearest_selected_storage_dir}.")
    return selected_paths

def _create_gemini_part_from_file(filepath: str, mime_type: str) -> typing.Optional[google_genai_types_SDK.Part]:
    """Load a file and wrap its bytes in a Gemini ``Part``.

    Args:
        filepath: Path of the file to read.
        mime_type: MIME type attached to the part (e.g. ``'image/jpeg'``).

    Returns:
        The part, or ``None`` (after printing the reason) when the file is
        missing, empty, or when reading it or building the part raises.
    """
    try:
        rejection = None
        payload = b""
        if not os.path.exists(filepath):
            rejection = f"Err: File does not exist: {filepath}. Skip."
        elif os.path.getsize(filepath) == 0:
            rejection = f"Err: File {os.path.basename(filepath)} is 0 bytes. Skip."
        else:
            with open(filepath, 'rb') as handle:
                payload = handle.read()
            if not payload:
                rejection = f"Err: Read 0 bytes from file {os.path.basename(filepath)}. Skip."

        if rejection is not None:
            print(rejection)
            return None
        return google_genai_types_SDK.Part.from_bytes(data=payload, mime_type=mime_type)
    except Exception as exc:
        print(f"Err reading/processing file {filepath}. Type: {type(exc).__name__}, Error: {exc}. Skip.")
        return None

def process_images_and_audio_with_gemini( 
    gemini_client: google_genai_SDK.client.Client,
    image_paths: list[str],
    audio_filepath: typing.Optional[str],
    gemini_model_name: str,
    prompt_text: str, 
    run_output_dir_for_results_json: str
):
    """Send the prompt plus the selected images/audio to Gemini and record the outcome.

    The request asks for JSON matching ``GeminiProductResponse``. On success
    the model's ``ai_response_to_user`` is published through
    ``last_ai_response_for_display``; on any failure that global is set to a
    string starting with ``"Error:"``. When a request is actually attempted, a
    one-element JSON list summarising it is written to
    ``gemini_processing_results.json`` in ``run_output_dir_for_results_json``.
    The two early-abort paths (no inputs given / none could be loaded) write
    no file.

    Args:
        gemini_client: Client used for ``models.generate_content``.
        image_paths: JPEG files to attach; unreadable ones are skipped.
        audio_filepath: Optional WAV file to attach.
        gemini_model_name: Model identifier passed to the API.
        prompt_text: Text prompt placed first in the request contents.
        run_output_dir_for_results_json: Directory for the results JSON.
    """
    global last_ai_response_for_display, shared_data_lock
    
    if not image_paths and not audio_filepath:
        print("No images or audio to send to Gemini.")
        with shared_data_lock: last_ai_response_for_display = "Error: No input for Gemini."
        return

    print(f"\nPreparing request for Gemini ({gemini_model_name})...")
    contents = [prompt_text] 
    # Record exactly which files were attached, so the summary cannot drift
    # from the request (e.g. when a file exists but could not be read).
    sent_image_names = []
    if image_paths:
        print(f"  Processing {len(image_paths)} image paths for Gemini...")
        for img_path in image_paths:
            part = _create_gemini_part_from_file(img_path, 'image/jpeg')
            if part:
                contents.append(part)
                sent_image_names.append(os.path.basename(img_path))
    num_images_to_send = len(sent_image_names)
    print(f"  Added {num_images_to_send} image(s) to Gemini request.")

    processed_audio_filepath = None
    if audio_filepath:
        part = _create_gemini_part_from_file(audio_filepath, 'audio/wav')
        if part:
            contents.append(part)
            processed_audio_filepath = audio_filepath
            print(f"  Added audio file {os.path.basename(audio_filepath)} to Gemini request.")
        else:
            print(f"Proceeding without audio due to processing error for: {audio_filepath}")

    if num_images_to_send == 0 and not processed_audio_filepath:
        print("No valid images or audio to send after attempting to load. Aborting Gemini call.")
        with shared_data_lock: last_ai_response_for_display = "Error: Failed to load inputs."
        return

    gemini_request_config = {
        'response_mime_type': 'application/json',
        'response_schema': GeminiProductResponse 
    }

    gemini_result_summary = {
        "image_files_sent": sent_image_names,
        "audio_file_sent": os.path.basename(processed_audio_filepath) if processed_audio_filepath else None,
        "gemini_response_raw_text": None, "gemini_response_parsed_dict": None,
        "sdk_parsed_successfully": False, "error": None
    }

    try:
        # contents[0] is the text prompt, hence the -1 for the media count.
        print(f"--- Sending request to Gemini ({len(contents)-1} media parts) ---")
        response = gemini_client.models.generate_content(
            model=gemini_model_name, contents=contents, config=gemini_request_config
        )
        gemini_result_summary["gemini_response_raw_text"] = response.text

        parsed_response_object = response.parsed
        if parsed_response_object and isinstance(parsed_response_object, GeminiProductResponse):
            gemini_result_summary["gemini_response_parsed_dict"] = parsed_response_object.model_dump()
            gemini_result_summary["sdk_parsed_successfully"] = True
            print("\nGemini Output Parsed by SDK (Pydantic object):\n", json.dumps(parsed_response_object.model_dump(), indent=2))
            with shared_data_lock: last_ai_response_for_display = parsed_response_object.ai_response_to_user
        else: 
            print(f"Error: Gemini SDK's response.parsed was None or not GeminiProductResponse. Type: {type(parsed_response_object)}")
            with shared_data_lock: last_ai_response_for_display = "Error: Gemini output can't be parsed."

    except Exception as e_api:
        # Best-effort boundary: any API/SDK failure is recorded in the summary
        # and surfaced to the user instead of killing the worker thread.
        error_msg = f"Gemini API call error: {e_api}"
        print(error_msg); gemini_result_summary["error"] = error_msg
        with shared_data_lock: last_ai_response_for_display = "Error: Gemini API call failed."
    
    results_filepath = os.path.join(run_output_dir_for_results_json, "gemini_processing_results.json")
    with open(results_filepath, 'w') as f_json: json.dump([gemini_result_summary], f_json, indent=2)
    print(f"\nGemini processing results saved to: {results_filepath}")

# --- Processing Thread Function ---
def processing_pipeline_thread_function(
    frames_snapshot_to_process: list[tuple[bytes, list, float]],
    trigger_time: float,
    raw_vad_audio_data: typing.Optional[tuple[list, dict]]
):
    """Run one full capture-to-answer pipeline; intended as a thread target.

    Steps: create the run's output directories, save the VAD audio (if any)
    as a WAV file, decode the buffered frames, crop the detected products,
    keep the sharpest crops, send crops and audio to Gemini, and finally
    start a speech thread for a non-error reply.

    ``processing_active`` is always cleared on exit, including when a step
    raises, so a failed run cannot block every later trigger. Exceptions are
    not swallowed; they still propagate out of the thread.

    Args:
        frames_snapshot_to_process: ``(jpeg_bytes, bboxes, capture_timestamp)``
            entries copied from the frame buffer at trigger time.
        trigger_time: Timestamp of the trigger, used to select the segment.
        raw_vad_audio_data: ``(audio_chunks, stream_params)`` or ``None``;
            ``stream_params`` must provide ``'channels'``, ``'rate'`` and
            ``'width_bytes'``.
    """
    global processing_active, gui_status_message, gemini_client_global, last_ai_response_for_display

    pipeline_start_time = time.perf_counter()
    try:
        with shared_data_lock: gui_status_message = "Proc: Setup..."
        run_dir, all_decoded_s_dir, audio_s_dir, cropped_s_dir, clearest_s_dir = setup_output_directories(OUTPUT_BASE_PATH)

        permanent_audio_filepath = None

        # Save the captured VAD audio chunks to the run's audio directory
        if raw_vad_audio_data:
            vad_chunks, vad_params = raw_vad_audio_data
            if vad_chunks and vad_params:
                audio_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                user_audio_filename = f"user_audio_vad_{audio_timestamp}.wav"
                path_to_save_vad_audio = os.path.join(audio_s_dir, user_audio_filename)

                if save_audio_to_wav(vad_chunks, path_to_save_vad_audio,
                                     vad_params['channels'], vad_params['rate'], vad_params['width_bytes']):
                    print(f"VAD audio saved directly to: {path_to_save_vad_audio}")
                    permanent_audio_filepath = path_to_save_vad_audio
                else:
                    print(f"Failed to save VAD audio to {path_to_save_vad_audio}. Proceeding without VAD audio.")
            else:
                print("VAD data provided, but chunks or params were empty.")

        with shared_data_lock: gui_status_message = "Proc: Decode & Prep Imgs..."
        data_for_cropping = save_segment_and_prepare_data_for_cropping(
            frames_snapshot_to_process, trigger_time, RECORD_DURATION_SECONDS, all_decoded_s_dir
        )

        clearest_image_paths = []
        if data_for_cropping:
            with shared_data_lock: gui_status_message = "Proc: Cropping Imgs..."
            cropped_details = crop_products_from_frames(data_for_cropping, cropped_s_dir)
            if cropped_details:
                with shared_data_lock: gui_status_message = "Proc: Calc Sharpness..."
                clearest_image_paths = select_and_save_clearest_crops(cropped_details, clearest_s_dir, NUM_CLEAREST_CROPS_PER_SECOND_TO_SELECT)
            else: print("No valid product crops from images.")
        else: print("No frames decoded from buffer for image processing.")

        if not clearest_image_paths and not permanent_audio_filepath:
            print("No images and no audio to send to Gemini. Aborting.")
            # Flag and status change together so the GUI never shows
            # "Standby" while the pipeline is still marked active.
            with shared_data_lock:
                processing_active = False
                gui_status_message = "Standby (No input for Gemini)"
            return

        with shared_data_lock: gui_status_message = "Proc: Gemini AI..."
        gemini_start_time = time.perf_counter()

        process_images_and_audio_with_gemini(
            gemini_client_global, clearest_image_paths, permanent_audio_filepath,
            GEMINI_MODEL_NAME, PROMPT_FOR_GEMINI_MULTI_IMAGE_AUDIO, run_dir
        )
        gemini_end_time = time.perf_counter()
        print(f"  Gemini processing took {gemini_end_time - gemini_start_time:.2f} seconds.")

        # After getting a response from Gemini, trigger the TTS output thread
        with shared_data_lock:
            ai_text_for_speech = last_ai_response_for_display

        if AI_AUDIO_OUTPUT_ENABLED and tts_model_global and ai_text_for_speech and not ai_text_for_speech.startswith("Error:"):
            ai_audio_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            ai_speech_filename = f"ai_response_{ai_audio_timestamp}.wav"
            ai_speech_filepath = os.path.join(audio_s_dir, ai_speech_filename)

            print(f"Preparing to generate AI speech output to: {ai_speech_filepath}")
            tts_thread = threading.Thread(target=generate_and_play_speech_threaded,
                                          args=(ai_text_for_speech, ai_speech_filepath))
            tts_thread.daemon = True
            tts_thread.start()
        elif ai_text_for_speech and ai_text_for_speech.startswith("Error:"):
            print("AI response was an error, not generating speech.")

        # NOTE(review): gui_status_message is left at "Proc: Gemini AI..." on
        # this path. The original had an empty conditional here (checking the
        # TTS lock and an "AI:" status prefix), presumably a placeholder for a
        # reset to standby - confirm the intended status text.
        run_name = os.path.basename(run_dir)
        print(f"Processing pipeline finished for run: {run_name}. Total time: {time.perf_counter() - pipeline_start_time:.2f}s.")
    finally:
        # Must run on every exit path: the main loop ignores all triggers
        # while this flag is set.
        with shared_data_lock:
            processing_active = False

# --- Display Update Function ---
def update_display_overlays(
    display_frame_to_update: np.ndarray,
    current_gui_status: str,
    current_ai_response: str,
    buffer_current_len: int,
    buffer_max_len: int, 
    current_display_fps: float,
    avg_jpeg_encode_time_ms: float,
    avg_yolo_infer_time_ms: float,
    is_vad_enabled: bool, 
    current_vad_confidence: float,
    is_vad_speaking_raw: bool,
    is_main_collecting_audio: bool,
    is_pipeline_processing: bool,
    ):
    """Draw the text overlays onto ``display_frame_to_update`` in place.

    Rows, top to bottom: GUI status, buffer fill, display FPS, average JPEG
    encode time, average YOLO inference time, the VAD state (only when
    ``is_vad_enabled``), then the AI response one line per ``'\\n'``-separated
    row until the bottom of the frame is reached. Bounding boxes are not
    drawn here.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    status_colour = (0, 255, 0)
    response_colour = (0, 200, 255)

    def draw_row(text, baseline_y, scale, thickness):
        # All status rows share the left margin, font and colour.
        cv2.putText(display_frame_to_update, text, (10, baseline_y), font, scale, status_colour, thickness)

    cursor_y = 30
    draw_row(current_gui_status, cursor_y, 0.7, 2)
    cursor_y += 30
    draw_row(f"Buf:{buffer_current_len}/{buffer_max_len}", cursor_y, 0.7, 2)
    cursor_y += 30

    draw_row(f"DispFPS:{current_display_fps:.1f}", cursor_y, 0.6, 1)
    cursor_y += 25
    draw_row(f"JPEGEnc:{avg_jpeg_encode_time_ms:.1f}ms", cursor_y, 0.6, 1)
    cursor_y += 25
    draw_row(f"YOLOInf:{avg_yolo_infer_time_ms:.1f}ms", cursor_y, 0.6, 1)
    cursor_y += 25

    if is_vad_enabled:
        if is_pipeline_processing:
            vad_label = "Standby (Processing)"
        else:
            vad_label = "SPEAKING" if is_vad_speaking_raw else "SILENT"
            if is_main_collecting_audio:
                vad_label += " (REC)"
        draw_row(f"VAD:{current_vad_confidence:.2f} [{vad_label}]", cursor_y, 0.6, 1)
        cursor_y += 25

    if current_ai_response:
        lowest_baseline = display_frame_to_update.shape[0] - 10
        line_y = cursor_y
        for response_line in current_ai_response.split('\n'):
            if line_y > lowest_baseline:
                break  # no vertical room left on the frame
            cv2.putText(display_frame_to_update, response_line, (10, line_y), font, 0.5, response_colour, 1)
            line_y += 20

# --- Main Application Loop ---
def main_interactive_loop():
    """Interactive entry point: configure, initialise, then run the capture loop.

    Prompts for the Gemini API key, the video device and (when VAD is enabled)
    the audio input device; initialises the YOLO, Gemini, TTS and VAD
    components; then loops over camera frames. Each frame is run through
    YOLO, JPEG-encoded into ``frame_buffer`` together with its "product"
    boxes, and shown with status overlays. Pressing SPACE or a VAD
    end-of-speech flag spawns ``processing_pipeline_thread_function`` on a
    snapshot of the buffer; pressing ``q`` quits.

    The camera, the display window and the VAD thread are released on every
    exit path once initialisation has succeeded, including when an exception
    escapes the loop.
    """
    global frame_buffer, processing_active, gui_status_message, last_ai_response_for_display
    global yolo_model_global, gemini_client_global, GEMINI_API_KEY
    global vad_thread_running, vad_is_recording_audio, vad_audio_chunks_for_processing, vad_trigger_processing_after_speech
    global vad_confidence_current, vad_is_currently_speaking, actual_audio_stream_params
    global tts_model_global

    # --- User Configuration Input ---
    print("--- Initial Configuration ---")
    GEMINI_API_KEY = input("Enter your Gemini API Key: ").strip()
    if not GEMINI_API_KEY:
        print("Warning: Gemini API Key not provided. Gemini features will fail.")

    video_device_to_use = input(f"Enter video device path (e.g., /dev/video0) [{VIDEO_DEVICE_DEFAULT}]: ").strip() or VIDEO_DEVICE_DEFAULT

    selected_audio_device_idx_to_use = VAD_SELECTED_DEVICE_INDEX_DEFAULT
    if VAD_ENABLED:
        # Temporary PyAudio instance used only to list the input devices.
        temp_pa = pyaudio.PyAudio()
        try:
            print("\nAvailable audio input devices:")
            found_devices = False
            for i in range(temp_pa.get_device_count()):
                dev_info = temp_pa.get_device_info_by_index(i)
                if dev_info['maxInputChannels'] > 0:
                    found_devices = True
                    print(f"  Idx: {i}, Name: {dev_info['name']}, Channels: {dev_info['maxInputChannels']}, Rate: {int(dev_info['defaultSampleRate'])}")
            if not found_devices:
                print("No audio input devices found. VAD may not function.")
            else:
                try:
                    choice = input(f"Enter audio device index for VAD [{VAD_SELECTED_DEVICE_INDEX_DEFAULT}]: ").strip()
                    selected_audio_device_idx_to_use = int(choice) if choice else VAD_SELECTED_DEVICE_INDEX_DEFAULT
                except ValueError:
                    print(f"Invalid input, using default device index {VAD_SELECTED_DEVICE_INDEX_DEFAULT}.")
        finally:
            temp_pa.terminate()
        print("-" * 30)

    vad_thread = None
    print("Initializing models and audio...")
    try:
        yolo_model_global = initialize_yolo_model(YOLO_MODEL_PATH)
        gemini_client_global = configure_gemini(GEMINI_API_KEY)
        tts_model_global = initialize_tts_model(TTS_MODEL_NAME)

        vad_setup_success = False
        if VAD_ENABLED:
            vad_setup_success = initialize_vad_and_audio_stream(
                selected_audio_device_idx_to_use, VAD_DEVICE_CHANNELS,
                VAD_DEVICE_FORMAT, VAD_DEVICE_SAMPLE_RATE_HZ
            )
            if vad_setup_success:
                vad_thread = threading.Thread(target=vad_monitor_thread_function, daemon=True)
                vad_thread.start()
            else:
                print("VAD initialization failed. Will proceed without VAD.")
        else:
            print("VAD is disabled in configuration.")

    except Exception as e:
        print(f"Fatal: Initialization error: {e}")
        return

    cap = None
    window_created = False
    try:
        print(f"Attempting to open camera: {video_device_to_use}")
        cap = cv2.VideoCapture(video_device_to_use)
        if not cap.isOpened():
            print(f"Error opening video: {video_device_to_use}.")
            return

        # Attempt to set custom camera config
        if WEBCAM_SET_PROPERTIES:
            print("\n--- Applying Custom Camera Settings ---")
            fourcc = cv2.VideoWriter_fourcc(*WEBCAM_FOURCC_CODEC)
            if cap.set(cv2.CAP_PROP_FOURCC, fourcc):
                print(f"  Successfully requested codec: {WEBCAM_FOURCC_CODEC}")
            else:
                print(f"  Warning: Could not set codec to {WEBCAM_FOURCC_CODEC}.")

            cap.set(cv2.CAP_PROP_FRAME_WIDTH, WEBCAM_DESIRED_WIDTH)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, WEBCAM_DESIRED_HEIGHT)
            cap.set(cv2.CAP_PROP_FPS, WEBCAM_DESIRED_FPS)
            print(f"  Requested: {WEBCAM_DESIRED_WIDTH}x{WEBCAM_DESIRED_HEIGHT} @ {WEBCAM_DESIRED_FPS} FPS")
            print("---------------------------------------")

        # Verify and print the actual camera settings being used
        cam_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cam_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cam_fps = cap.get(cv2.CAP_PROP_FPS)
        cam_codec_int = int(cap.get(cv2.CAP_PROP_FOURCC))
        # Unpack the four codec characters from the integer, low byte first.
        cam_codec_str = "".join([chr((cam_codec_int >> 8 * i) & 0xFF) for i in range(4)])

        print(f"Cam using: {cam_w}x{cam_h} @ {cam_fps:.1f} FPS (Codec: {cam_codec_str})")
        if cam_w == 0:
            print("Cam width is 0. Exiting.")
            return

        print(f"Buffer: JPEG bytes (Max: {FRAME_BUFFER_MAXLEN} for {BUFFER_DURATION_SECONDS}s), Q:{JPEG_QUALITY}")
        if VAD_ENABLED and vad_setup_success:
            print("VAD Active. Speak to trigger processing.")
        else:
            print("VAD Disabled or Failed. Use SPACEBAR to trigger processing.")

        cv2.namedWindow(DISPLAY_WINDOW_NAME, cv2.WINDOW_AUTOSIZE)
        window_created = True
        last_successful_bboxes = []
        last_display_time = time.time()
        # Rolling windows of the most recent timings, used for the overlay averages.
        yolo_infer_times = collections.deque(maxlen=TARGET_FPS)
        jpeg_encode_times = collections.deque(maxlen=TARGET_FPS)

        while True:
            ret, raw_frame_4k = cap.read()
            if not ret or raw_frame_4k is None:
                print("Error: Can't receive frame. Exiting.")
                break
            current_capture_time = time.time()

            # --- Real-time YOLO ---
            yolo_start_time = time.perf_counter()
            orig_h, orig_w = raw_frame_4k.shape[:2]
            frame_for_yolo, scale_x, scale_y = raw_frame_4k, 1.0, 1.0  # Default if no resize
            if YOLO_INFERENCE_SIZE_WIDTH and orig_w > YOLO_INFERENCE_SIZE_WIDTH:
                sfw = YOLO_INFERENCE_SIZE_WIDTH / orig_w
                inf_h = int(orig_h * sfw)
                # interpolation must be a keyword: the third positional
                # parameter of cv2.resize is `dst`, not the interpolation flag.
                frame_for_yolo = cv2.resize(
                    raw_frame_4k, (YOLO_INFERENCE_SIZE_WIDTH, inf_h), interpolation=cv2.INTER_AREA
                )
                scale_x, scale_y = orig_w / YOLO_INFERENCE_SIZE_WIDTH, orig_h / inf_h

            yolo_results = yolo_model_global.predict(frame_for_yolo, verbose=False)
            current_frame_bboxes_for_buffer = []
            if yolo_results and yolo_results[0].boxes:
                for box in yolo_results[0].boxes:
                    if yolo_model_global.names[int(box.cls[0])] == "product":  # Check class name
                        x1s, y1s, x2s, y2s = map(int, box.xyxy[0])
                        cnf = float(box.conf[0])
                        # Map from inference-size coordinates back to the full
                        # frame, then clamp to the frame bounds.
                        ox1, oy1 = int(x1s * scale_x), int(y1s * scale_y)
                        ox2, oy2 = int(x2s * scale_x), int(y2s * scale_y)
                        ox1 = max(0, ox1)
                        oy1 = max(0, oy1)
                        ox2 = min(orig_w - 1, ox2)
                        oy2 = min(orig_h - 1, oy2)
                        if ox1 < ox2 and oy1 < oy2:
                            current_frame_bboxes_for_buffer.append((ox1, oy1, ox2, oy2, "product", cnf))
            last_successful_bboxes = current_frame_bboxes_for_buffer
            yolo_infer_times.append((time.perf_counter() - yolo_start_time) * 1000)

            # --- JPEG Encode & Buffer ---
            jpeg_s_time = time.perf_counter()
            s, jpeg_bytes = cv2.imencode('.jpg', raw_frame_4k, [cv2.IMWRITE_JPEG_QUALITY, JPEG_QUALITY])
            jpeg_encode_times.append((time.perf_counter() - jpeg_s_time) * 1000)
            if s:
                frame_buffer.append((jpeg_bytes.tobytes(), current_frame_bboxes_for_buffer, current_capture_time))
            else:
                # The frame is only left out of the buffer; display and key
                # polling still run so the user can quit while encoding fails.
                print("Warn: JPEG enc failed.")

            # --- Display Logic ---
            display_frame = raw_frame_4k.copy()
            if display_frame.shape[1] > DISPLAY_MAX_WIDTH:  # Resize for display
                s_ = DISPLAY_MAX_WIDTH / display_frame.shape[1]
                nh_ = int(display_frame.shape[0] * s_)
                display_frame = cv2.resize(display_frame, (DISPLAY_MAX_WIDTH, nh_), interpolation=cv2.INTER_AREA)

            d_h, d_w = display_frame.shape[:2]
            d_sx, d_sy = d_w / orig_w, d_h / orig_h
            for (x1, y1, x2, y2, _, cnf) in last_successful_bboxes:  # Draw bboxes
                dx1, dy1 = int(x1 * d_sx), int(y1 * d_sy)
                dx2, dy2 = int(x2 * d_sx), int(y2 * d_sy)
                cv2.rectangle(display_frame, (dx1, dy1), (dx2, dy2), (0, 255, 0), 2)
                cv2.putText(display_frame, f"prod {cnf:.2f}", (dx1, dy1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            l_td = current_capture_time - last_display_time
            d_fps = 1.0 / l_td if l_td > 0 else 0
            last_display_time = current_capture_time
            avg_jpeg_ms = sum(jpeg_encode_times) / len(jpeg_encode_times) if jpeg_encode_times else 0
            avg_yolo_ms = sum(yolo_infer_times) / len(yolo_infer_times) if yolo_infer_times else 0

            # Copy the shared state once under the lock, then draw without holding it.
            with shared_data_lock:
                status_disp, ai_resp_disp = gui_status_message, last_ai_response_for_display
                vad_conf_disp, vad_speak_disp_raw, vad_rec_main_disp = vad_confidence_current, vad_is_currently_speaking, vad_is_recording_audio
                pipeline_active_disp = processing_active

            update_display_overlays(display_frame, status_disp, ai_resp_disp, len(frame_buffer), FRAME_BUFFER_MAXLEN,
                                    d_fps, avg_jpeg_ms, avg_yolo_ms,
                                    VAD_ENABLED and vad_setup_success,
                                    vad_conf_disp, vad_speak_disp_raw, vad_rec_main_disp, pipeline_active_disp)

            cv2.imshow(DISPLAY_WINDOW_NAME, display_frame)
            key = cv2.waitKey(1) & 0xFF

            # --- Trigger Processing ---
            should_trigger_now, trigger_source = False, None
            if key == ord('q'):
                print("Q pressed, exiting...")
                vad_thread_running = False
                break
            if key == ord(' '):
                should_trigger_now, trigger_source = True, "manual"

            # A pending VAD trigger takes precedence over a manual one and is consumed here.
            with shared_data_lock:
                if vad_trigger_processing_after_speech:
                    should_trigger_now, trigger_source = True, "vad"
                    vad_trigger_processing_after_speech = False

            if should_trigger_now:
                with shared_data_lock:
                    if processing_active:
                        print("Processing already active. Trigger ignored.")
                    else:
                        # Require at least half of the nominal segment's frames.
                        min_frames_needed = int(TARGET_FPS * RECORD_DURATION_SECONDS * 0.5)
                        if len(frame_buffer) < min_frames_needed:
                            print(f"Buffering... ({len(frame_buffer)}/{min_frames_needed})")
                            gui_status_message = f"Buffering... ({len(frame_buffer)})"
                        else:
                            processing_active = True
                            gui_status_message = f"Triggered ({trigger_source})! Preparing data..."
                            frames_snapshot, trigger_ts = list(frame_buffer), time.time()

                            # Package the raw VAD audio chunks to be saved by the pipeline
                            raw_vad_audio_data_for_pipeline: typing.Optional[tuple[list, dict]] = None
                            if trigger_source == "vad" and actual_audio_stream_params:
                                audio_chunks_to_save = list(vad_audio_chunks_for_processing)
                                vad_audio_chunks_for_processing.clear()

                                if audio_chunks_to_save:
                                    raw_vad_audio_data_for_pipeline = (
                                        audio_chunks_to_save,
                                        dict(actual_audio_stream_params)  # Pass a copy
                                    )
                                    print(f"Collected {len(audio_chunks_to_save)} VAD audio chunks for processing.")
                                else:
                                    print("VAD triggered, but no audio chunks were collected.")

                            print(f"Spawning processing thread (VAD audio chunks: {'Yes' if raw_vad_audio_data_for_pipeline else 'No'})...")
                            thread = threading.Thread(target=processing_pipeline_thread_function,
                                                      args=(frames_snapshot, trigger_ts, raw_vad_audio_data_for_pipeline))
                            thread.daemon = True
                            thread.start()
    finally:
        # Signal all running threads to stop, then release camera and window.
        # Runs on early returns and exceptions too, so the devices are not
        # left held by a long-lived interpreter.
        vad_thread_running = False
        if vad_thread is not None and vad_thread.is_alive():
            print("Waiting for VAD thread to join...")
            vad_thread.join(timeout=2.0)

        if cap is not None:
            cap.release()
        if window_created:
            cv2.destroyAllWindows()
        print("Application closed.")

if __name__ == "__main__":
    # Make sure the output root exists before anything is written to it.
    # exist_ok=True already covers the "already there" case, so no separate
    # existence check is needed.
    try:
        os.makedirs(OUTPUT_BASE_PATH, exist_ok=True)
    except OSError as e:
        print(f"Error creating output path {OUTPUT_BASE_PATH}: {e}")
        # SystemExit instead of the site-provided exit() helper, which is
        # not guaranteed to exist in every interpreter setup.
        raise SystemExit(1)
    main_interactive_loop()

playsound is relying on another python subprocess. Please use `pip install pygobject` if you want playsound to run more efficiently.


--- Initial Configuration ---


ALSA lib pcm_direct.c:2048:(snd1_pcm_direct_parse_open_conf) The field ipc_gid must be a valid group (create group audio)
ALSA lib pcm_direct.c:2048:(snd1_pcm_direct_parse_open_conf) The field ipc_gid must be a valid group (create group audio)
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:404:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:404:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:481:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:481:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib pcm_direct.c:2048:(snd1_pcm_direct_parse_open_conf) The field ipc_gid must be a valid group (create gr


Available audio input devices:
  Idx: 4, Name: HD-Audio Generic: ALC245 Analog (hw:1,0), Channels: 2, Rate: 44100
  Idx: 6, Name: pulse, Channels: 32, Rate: 44100
  Idx: 7, Name: default, Channels: 32, Rate: 44100
------------------------------
Initializing models and audio...
Loading YOLO: detection_model_02.pt
  PyTorch CUDA available: True. YOLO will attempt to use: cuda
  YOLO model parameters are on device: cpu
YOLO model loaded.
Gemini API client configured.
Loading TTS model: tts_models/en/ljspeech/tacotron2-DDC...
  TTS will attempt to use device: cuda
 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_li

Using cache found in /home/quintuplestuffed/.var/app/com.visualstudio.code/cache/torch/hub/snakers4_silero-vad_master
ALSA lib pcm_direct.c:2048:(snd1_pcm_direct_parse_open_conf) The field ipc_gid must be a valid group (create group audio)
ALSA lib pcm_direct.c:2048:(snd1_pcm_direct_parse_open_conf) The field ipc_gid must be a valid group (create group audio)
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_oss.c:404:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:404:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:481:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:481:(_snd_pcm_usb_stream_open) Invalid card 

VAD model loaded.
Using VAD Device: pulse (Index: 6)
Audio stream opened: 48000Hz, 2ch, Format Int16, Buffer 1536 frames.
🎤 VAD Monitor Thread Started.
Attempting to open camera: /dev/video0
Cam using: 3840x2160 @ 30.0 FPS (Codec: YU12)
Buffer: JPEG bytes (Max: 240 for 8s), Q:85
VAD Active. Speak to trigger processing.


Qt: Session management error: Could not open network socket


Q pressed, exiting...
Waiting for VAD thread to join...
🎤 VAD Monitor Thread Stopped.
Application closed.
