In [1]:
# Complete AI Services with SAM Segmentation - OCR, Segmentation, and Text-to-Speech
# This is a complete solution using Segment Anything Model instead of YOLO
# Run this in Google Colab for instant public access

# Install required packages
!pip install gradio pytesseract pillow gtts pyttsx3 edge-tts opencv-python-headless numpy requests asyncio
!pip install segment-anything torch torchvision
!pip install 'git+https://github.com/facebookresearch/segment-anything.git'

# Additional system dependencies for Tesseract
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-eng espeak espeak-data libespeak1 libespeak-dev

# Download SAM models
print("📥 Downloading SAM model checkpoints...")
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
print("✅ SAM models downloaded successfully!")

import gradio as gr
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
from gtts import gTTS
import tempfile
import os
import base64
import io
import json
import threading
import time
import asyncio
import subprocess
import sys
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor

# Module-wide state shared by the functions in this file
current_sam_model = "sam_vit_b"  # SAM variant selected by default
sam_models = dict()  # model name -> loaded SAM objects
# Per-variant "has been loaded" flags, all False until a load succeeds
models_loaded = dict.fromkeys(("sam_vit_b", "sam_vit_l", "sam_vit_h"), False)
# Scratch directory created once at import time
temp_dir = tempfile.mkdtemp()
# Availability flag per TTS engine name
tts_engines = dict([("gtts", True), ("pyttsx3", False), ("edge-tts", False)])

def initialize_tts_engines():
    """Probe the TTS back-ends and report which ones can be used.

    Updates the module-level ``tts_engines`` flags in place and prints one
    line per optional engine.

    Returns:
        A multi-line status string listing Google TTS, Pyttsx3 and Edge TTS.
    """
    global tts_engines

    # gTTS is flagged as usable without any probing.
    tts_engines["gtts"] = True

    # Offline engine: usable only if it imports and initialises cleanly.
    try:
        import pyttsx3
        probe = pyttsx3.init()
        probe.stop()  # Stop any existing engine
        tts_engines["pyttsx3"] = True
        print("✅ Pyttsx3 TTS engine available")
    except Exception as e:
        tts_engines["pyttsx3"] = False
        print(f"⚠️ Pyttsx3 TTS not available: {e}")

    # Edge TTS: being importable is the only check performed.
    try:
        import edge_tts
        tts_engines["edge-tts"] = True
        print("✅ Edge TTS engine available")
    except Exception as e:
        tts_engines["edge-tts"] = False
        print(f"⚠️ Edge TTS not available: {e}")

    def _status_line(label, usable):
        mark = '✅' if usable else '❌'
        state = 'Available' if usable else 'Not Available'
        return f"{mark} {label}: {state}"

    return "\n".join([
        "TTS Engines Status:",
        "✅ Google TTS: Available",
        _status_line("Pyttsx3", tts_engines['pyttsx3']),
        _status_line("Edge TTS", tts_engines['edge-tts']),
    ])

def initialize_sam_models(model_name="sam_vit_b"):
    """Load one SAM variant from its local checkpoint and register it.

    On success the model, a ``SamPredictor`` and a
    ``SamAutomaticMaskGenerator`` built from it are stored under
    ``sam_models[model_name]`` and ``models_loaded[model_name]`` becomes True.
    On a load error the flag is set to False.

    Args:
        model_name: "sam_vit_b", "sam_vit_l" or "sam_vit_h".

    Returns:
        A status string: a success summary, or a message starting with "❌".
        Load failures are reported through the string, not raised.
    """
    global sam_models, models_loaded

    # model name -> (checkpoint file, registry key, short description)
    catalog = {
        "sam_vit_b": ("sam_vit_b_01ec64.pth", "vit_b", "Base model - Fast and efficient"),
        "sam_vit_l": ("sam_vit_l_0b3195.pth", "vit_l", "Large model - Better accuracy"),
        "sam_vit_h": ("sam_vit_h_4b8939.pth", "vit_h", "Huge model - Best accuracy"),
    }

    entry = catalog.get(model_name)
    if entry is None:
        return f"❌ Unknown SAM model: {model_name}"

    checkpoint_path, registry_key, description = entry

    try:
        print(f"🔄 Loading {model_name} model...")

        # The checkpoint must already be on disk; nothing is downloaded here.
        if not os.path.exists(checkpoint_path):
            return f"❌ Model checkpoint not found: {checkpoint_path}. Please download it first."

        # Prefer the GPU when torch reports one.
        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda"

        build_sam = sam_model_registry[registry_key]
        sam = build_sam(checkpoint=checkpoint_path)
        sam.to(device=device)

        sam_models[model_name] = {
            "model": sam,
            "predictor": SamPredictor(sam),
            "mask_generator": SamAutomaticMaskGenerator(sam),
            "device": device
        }
        models_loaded[model_name] = True

        summary_lines = (
            f"✅ {model_name.upper()} loaded successfully!",
            f"📊 Model: Segment Anything {registry_key.upper()}",
            f"🎯 Description: {description}",
            f"💾 Device: {device.upper()}",
            "🔧 Status: Ready for segmentation",
        )

        print(f"✅ {model_name} model loaded successfully on {device}!")
        return "\n".join(summary_lines)

    except Exception as e:
        models_loaded[model_name] = False
        failure = f"❌ Error loading {model_name}: {str(e)}"
        print(failure)
        return failure

def load_all_sam_models():
    """Load the configured SAM models and return a combined Markdown report.

    Only the base model ("sam_vit_b") is in the list at present, so despite
    the name this loads a single model (kept that way for speed).
    """
    to_load = ("sam_vit_b",)  # Start with base model only for speed

    sections = [
        f"**{name.upper()}:**\n{initialize_sam_models(name)}\n\n"
        for name in to_load
    ]
    return "🚀 Loading all SAM models...\n\n" + "".join(sections)

def perform_ocr(image, language="eng"):
    """Extract text from an image using Tesseract OCR.

    Args:
        image: Image object handed straight to pytesseract, or None.
        language: Tesseract language code passed as ``lang``.

    Returns:
        Tuple ``(status, text, word_count, avg_confidence)`` where ``text`` is
        the stripped OCR output and ``avg_confidence`` is the mean of the
        positive per-word confidences rounded to one decimal (0 when there
        are none). OCR failures are reported through ``status``; nothing is
        raised.
    """
    if image is None:
        return "❌ No image provided", "", 0, 0

    try:
        # Perform OCR
        extracted_text = pytesseract.image_to_string(image, lang=language).strip()

        # Second pass returns per-word data, used here only for confidences
        ocr_data = pytesseract.image_to_data(image, lang=language, output_type=pytesseract.Output.DICT)

        # Confidence entries may arrive as ints, floats or numeric strings
        # such as "96.5" depending on the pytesseract/Tesseract version, so
        # go through float() first; a plain int("96.5") would raise and turn
        # the whole OCR call into an error.
        confidences = []
        for raw_conf in ocr_data['conf']:
            try:
                conf = int(float(raw_conf))
            except (TypeError, ValueError):
                continue  # not a number: ignore this entry
            if conf > 0:  # non-positive values are excluded from the mean
                confidences.append(conf)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0

        # Count words ("".split() is empty, so blank text gives 0)
        word_count = len(extracted_text.split())

        if extracted_text:
            status = f"✅ OCR completed successfully! Found {word_count} words with {avg_confidence:.1f}% confidence"
        else:
            status = "⚠️ No text found in the image"

        return status, extracted_text, word_count, round(avg_confidence, 1)

    except Exception as e:
        return f"❌ OCR Error: {str(e)}", "", 0, 0

def _as_rgb_array(image):
    """Return *image* as a contiguous H×W×3 NumPy array."""
    if hasattr(image, "convert"):
        # PIL-style image: let it resolve palette / alpha / grayscale modes.
        image = image.convert("RGB")
    pixels = np.array(image)
    if pixels.ndim == 2:
        # Grayscale array: replicate the single channel three times.
        pixels = np.stack((pixels,) * 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[2] == 4:
        # RGBA array: drop the alpha channel.
        pixels = pixels[..., :3]
    return np.ascontiguousarray(pixels)


def _covered_percentage(masks, height, width):
    """Percentage of the image covered by the union of the given masks."""
    if not height or not width:
        return 0.0
    covered = np.zeros((height, width), dtype=bool)
    for mask in masks:
        np.logical_or(covered, mask['segmentation'], out=covered)
    return float(covered.sum()) / covered.size * 100


def segment_objects_with_sam(image, model_name="sam_vit_b", min_area=500, max_segments=50):
    """Segment an image with a loaded SAM model and summarise the result.

    Args:
        image: Input image (PIL image or array), or None.
        model_name: Key of a model previously stored in ``sam_models``.
        min_area: Masks whose 'area' is below this pixel count are dropped.
        max_segments: Keep at most this many masks, largest first.

    Returns:
        Tuple ``(status, result_text, segmented_image, overlay_image,
        segment_count, segment_count)``. When no image is given, the model is
        not loaded, segmentation fails or nothing passes the area filter, both
        image slots carry the input unchanged. Errors are reported through
        ``status`` and are not raised.
    """
    if image is None:
        return "❌ No image provided", "", image, image, 0, 0

    # .get() so a model missing from the flag table counts as "not loaded"
    # instead of raising KeyError outside the try block.
    if model_name not in sam_models or not models_loaded.get(model_name):
        return f"❌ SAM model {model_name} not loaded. Please load the model first.", "", image, image, 0, 0

    try:
        mask_generator = sam_models[model_name]["mask_generator"]

        # The visualisation step writes 3-channel colours, so RGBA and
        # grayscale inputs are normalised to RGB up front.
        image_array = _as_rgb_array(image)

        # Generate masks automatically
        print(f"🔄 Generating masks with {model_name}...")
        masks = mask_generator.generate(image_array)

        # Keep masks that are large enough, largest first, capped in number
        filtered_masks = sorted(
            (mask for mask in masks if mask['area'] >= min_area),
            key=lambda mask: mask['area'],
            reverse=True,
        )[:max_segments]

        heading = f"🤖 **{model_name.upper()} Segmentation Results**\n\n"

        if not filtered_masks:
            result_text = "".join([
                heading,
                "🔍 No segments found with the current area threshold.\n\n",
                "💡 **Try:**\n",
                "- Lowering the minimum area threshold\n",
                "- Using a different SAM model\n",
                "- Using an image with more distinct objects",
            ])
            status = f"⚠️ {model_name.upper()}: No segments found with current threshold"
            return status, result_text, image, image, 0, 0

        # Create visualizations
        segmented_image, overlay_image = create_sam_visualizations(image_array, filtered_masks)

        # Masks may overlap, so coverage is measured on their union; summing
        # the individual areas could report more than 100%.
        coverage = _covered_percentage(filtered_masks, image_array.shape[0], image_array.shape[1])

        segment_count = len(filtered_masks)
        lines = [
            heading,
            f"🎯 **Found {segment_count} segments**\n\n",
            "📊 **Segmentation Statistics:**\n",
            f"• **Total Segments**: {segment_count}\n",
            f"• **Image Coverage**: {coverage:.1f}%\n",
            f"• **Largest Segment**: {filtered_masks[0]['area']:,} pixels\n",
            f"• **Smallest Segment**: {filtered_masks[-1]['area']:,} pixels\n\n",
            "📋 **Detailed Segment List:**\n",
        ]

        for i, mask in enumerate(filtered_masks[:10], 1):  # Show top 10
            area = mask['area']
            stability_score = mask.get('stability_score', 0)
            bbox = mask['bbox']  # [x, y, w, h]
            lines.append(
                f"{i}. **Segment {i}** 📐 Area: {area:,}px | "
                f"📊 Stability: {stability_score:.3f} | "
                f"📦 Box: ({int(bbox[0])}, {int(bbox[1])}) {int(bbox[2])}×{int(bbox[3])}\n"
            )

        if segment_count > 10:
            lines.append(f"... and {segment_count - 10} more segments\n")

        status = f"✅ {model_name.upper()} segmentation completed! Found {segment_count} segments"
        return status, "".join(lines), segmented_image, overlay_image, segment_count, segment_count

    except Exception as e:
        error_msg = f"❌ {model_name.upper()} Segmentation Error: {str(e)}"
        return error_msg, "", image, image, 0, 0

def create_sam_visualizations(image_array, masks):
    """Render segmentation masks as a flat-colour image and a blended overlay.

    Args:
        image_array: Source image as a NumPy array; 3-channel colours are
            written into it, so it is expected to be H×W×3.
        masks: Sequence of mask dicts; each 'segmentation' entry is used as a
            boolean index into the image.

    Returns:
        Tuple ``(segmented_pil, overlay_pil)`` of PIL images. In the first,
        every segment is painted in a solid palette colour; in the second the
        colour is blended 50/50 with the image and a segment-count label is
        drawn in the top-left corner.
    """
    # Work on copies so the caller's array is left untouched.
    segmented_image = image_array.copy()
    overlay_image = image_array.copy()

    # Color palette for segments (reused cyclically)
    colors = [
        [255, 0, 0], [0, 255, 0], [0, 0, 255], [255, 255, 0], [255, 0, 255],
        [0, 255, 255], [128, 0, 0], [0, 128, 0], [0, 0, 128], [128, 128, 0],
        [128, 0, 128], [0, 128, 128], [255, 128, 0], [255, 0, 128], [128, 255, 0],
        [192, 192, 192], [128, 128, 128], [255, 165, 0], [255, 20, 147], [0, 191, 255]
    ]
    alpha = 0.5  # blend weight of the segment colour in the overlay

    for i, mask in enumerate(masks):
        color = np.array(colors[i % len(colors)])
        m = mask['segmentation']

        # Solid colour on the segmented image
        segmented_image[m] = color

        # Blend all channels at once; later masks blend on top of earlier ones
        overlay_image[m] = alpha * color + (1 - alpha) * overlay_image[m]

    # Convert to PIL Images
    segmented_pil = Image.fromarray(segmented_image.astype(np.uint8))
    overlay_pil = Image.fromarray(overlay_image.astype(np.uint8))

    # Add text overlay showing the segment count
    draw = ImageDraw.Draw(overlay_pil)
    try:
        font = ImageFont.load_default()
    except Exception:
        font = None  # draw.text is then called without an explicit font

    text = f"SAM Segments: {len(masks)}"
    # Shadow first, label second: drawing the black shadow last would paint
    # it on top of the white text and hide most of it.
    draw.text((11, 11), text, fill=(0, 0, 0), font=font)  # Shadow
    draw.text((10, 10), text, fill=(255, 255, 255), font=font)

    return segmented_pil, overlay_pil

def compare_sam_models(image, min_area=500):
    """Segment one image with every loaded SAM model (at most three).

    Args:
        image: Input image, or None.
        min_area: Minimum segment area forwarded to the segmentation call.

    Returns:
        Tuple ``(markdown_report, img1, img2, img3)``. The image slots hold
        the overlay of each compared model; unused slots, and slots of models
        that raised, carry the input image.
    """
    if image is None:
        return "❌ No image provided", image, image, image

    # Names of the models whose "loaded" flag is set
    loaded_names = [name for name in models_loaded if models_loaded[name]]

    if not loaded_names:
        return "❌ Need at least 1 SAM model loaded for segmentation", image, image, image

    report = [
        "# 🔥 **SAM Model Comparison Results**\n\n",
        f"**Image analyzed with {len(loaded_names)} different SAM models:**\n\n",
    ]
    previews = []

    for name in loaded_names[:3]:  # Compare up to 3 models
        try:
            status, _, _, overlay_img, total_segments, _ = segment_objects_with_sam(
                image, name, min_area
            )

            report.append(f"## 🤖 {name.upper()}\n")
            report.append(f"- **Segments Found:** {total_segments}\n")
            report.append(f"- **Model Type:** {name.replace('sam_vit_', 'ViT-').upper()}\n")
            # Only the part after the last colon of the status is shown
            report.append(f"- **Status:** {status.split(':')[-1].strip() if ':' in status else status}\n\n")

            previews.append(overlay_img)

        except Exception as e:
            report.append(f"## ❌ {name.upper()}\n")
            report.append(f"- **Error:** {str(e)}\n\n")
            previews.append(image)

    # Fill the remaining slots with the original image
    previews.extend([image] * (3 - len(previews)))

    report.extend([
        "\n💡 **Tips for SAM Model Selection:**\n",
        "- **SAM ViT-B**: Fastest, good for real-time applications\n",
        "- **SAM ViT-L**: Balanced speed and accuracy\n",
        "- **SAM ViT-H**: Most accurate, slower processing\n",
        "- **Area Threshold**: Lower values detect smaller segments\n",
    ])

    first, second, third = previews[0], previews[1], previews[2]
    return "".join(report), first, second, third

def _select_pyttsx3_voice(voices, voice_option):
    """Return the id of the first voice matching *voice_option*, or None."""
    if voice_option == "male":
        def matches(name):
            # "female" contains "male", so it has to be ruled out explicitly;
            # otherwise a female voice could be picked for the male option.
            return ('male' in name and 'female' not in name) or 'david' in name
    elif voice_option == "female":
        def matches(name):
            return 'female' in name or 'zira' in name or 'susan' in name
    else:
        return None

    for voice in voices or ():
        if matches((voice.name or "").lower()):
            return voice.id
    return None


def _run_coroutine_blocking(coro_factory):
    """Run the coroutine built by *coro_factory* to completion.

    ``asyncio.run`` cannot be used in a thread whose event loop is already
    running (it raises RuntimeError there), so in that case the coroutine is
    run on a fresh loop in a short-lived worker thread instead.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: asyncio.run is safe.
        return asyncio.run(coro_factory())

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        # .result() re-raises whatever the coroutine raised.
        return pool.submit(lambda: asyncio.run(coro_factory())).result()


def text_to_speech_multi_engine(text, language="en", tts_engine="gtts", voice_option="default"):
    """Convert text to speech with the selected TTS engine.

    Args:
        text: Text to synthesise. None, non-string and blank values are
            reported as "no text" instead of raising.
        language: Two-letter language code; unknown codes fall back to English.
        tts_engine: "gtts", "pyttsx3" or "edge-tts". The engine must also be
            flagged as available in ``tts_engines``.
        voice_option: Engine-specific option: a regional accent for gTTS, a
            voice or speed for pyttsx3, a voice for Edge TTS.

    Returns:
        Tuple ``(status, audio_file)``; ``audio_file`` is the path of the
        generated file inside ``temp_dir``, or None on failure. Errors are
        reported through ``status`` and are not raised.
    """
    if not isinstance(text, str) or not text.strip():
        return "❌ No text provided", None

    try:
        audio_file = None

        if tts_engine == "gtts" and tts_engines["gtts"]:
            # Google Text-to-Speech (online)
            try:
                # Language codes accepted for gTTS; anything else becomes "en"
                gtts_lang_map = {
                    "en": "en", "es": "es", "fr": "fr", "de": "de", "it": "it",
                    "pt": "pt", "ru": "ru", "zh": "zh", "ja": "ja", "ko": "ko",
                    "hi": "hi", "ar": "ar", "nl": "nl", "sv": "sv", "no": "no"
                }
                gtts_lang = gtts_lang_map.get(language, "en")

                # Different voice options for gTTS (using different TLDs for variety)
                tld_options = {
                    "default": "com",
                    "uk": "co.uk",
                    "australia": "com.au",
                    "india": "co.in",
                    "canada": "ca"
                }
                tld = tld_options.get(voice_option, "com")

                tts = gTTS(text=text, lang=gtts_lang, slow=False, tld=tld)
                audio_file = os.path.join(temp_dir, f"gtts_{hash(text)}.mp3")
                tts.save(audio_file)

                status = f"✅ Google TTS ({voice_option}) generated successfully! ({len(text)} characters)"

            except Exception as e:
                return f"❌ Google TTS Error: {str(e)}", None

        elif tts_engine == "pyttsx3" and tts_engines["pyttsx3"]:
            # Pyttsx3 (offline TTS)
            try:
                import pyttsx3

                engine = pyttsx3.init()

                selected_voice = _select_pyttsx3_voice(engine.getProperty('voices'), voice_option)
                if selected_voice:
                    engine.setProperty('voice', selected_voice)

                # Speed options map to a rate; every other option uses 200
                rate_map = {"slow": 150, "default": 200, "fast": 250}
                engine.setProperty('rate', rate_map.get(voice_option, 200))
                engine.setProperty('volume', 0.9)

                audio_file = os.path.join(temp_dir, f"pyttsx3_{hash(text)}.wav")
                engine.save_to_file(text, audio_file)
                engine.runAndWait()

                status = f"✅ Pyttsx3 TTS ({voice_option}) generated successfully! ({len(text)} characters)"

            except Exception as e:
                return f"❌ Pyttsx3 TTS Error: {str(e)}", None

        elif tts_engine == "edge-tts" and tts_engines["edge-tts"]:
            # Microsoft Edge TTS (high quality)
            try:
                import edge_tts

                # Voice mapping for Edge TTS
                edge_voices = {
                    "en": {
                        "female": "en-US-AriaNeural",
                        "male": "en-US-GuyNeural",
                        "child": "en-US-AnaNeural",
                        "default": "en-US-JennyNeural"
                    },
                    "es": {
                        "female": "es-ES-ElviraNeural",
                        "male": "es-ES-AlvaroNeural",
                        "default": "es-ES-ElviraNeural"
                    },
                    "fr": {
                        "female": "fr-FR-DeniseNeural",
                        "male": "fr-FR-HenriNeural",
                        "default": "fr-FR-DeniseNeural"
                    },
                    "de": {
                        "female": "de-DE-KatjaNeural",
                        "male": "de-DE-ConradNeural",
                        "default": "de-DE-KatjaNeural"
                    }
                }

                # Unknown language -> English voices; unknown option -> that
                # language's default voice.
                language_voices = edge_voices.get(language, edge_voices["en"])
                voice_id = language_voices.get(voice_option, language_voices["default"])

                audio_file = os.path.join(temp_dir, f"edge_{hash(text)}.mp3")

                async def generate_edge_tts():
                    communicate = edge_tts.Communicate(text, voice_id)
                    await communicate.save(audio_file)

                _run_coroutine_blocking(generate_edge_tts)

                status = f"✅ Edge TTS ({voice_option}) generated successfully! ({len(text)} characters)"

            except Exception as e:
                return f"❌ Edge TTS Error: {str(e)}", None
        else:
            return f"❌ TTS Engine '{tts_engine}' not available", None

        return status, audio_file

    except Exception as e:
        return f"❌ TTS Error: {str(e)}", None

def get_tts_voice_options(tts_engine):
    """Return the ``(label, value)`` voice choices for a TTS engine.

    Unknown engine names yield a single "Default" choice.
    """
    catalogue = (
        ("gtts", [
            ("Default", "default"),
            ("UK English", "uk"),
            ("Australian", "australia"),
            ("Indian", "india"),
            ("Canadian", "canada"),
        ]),
        ("pyttsx3", [
            ("Default Speed", "default"),
            ("Male Voice", "male"),
            ("Female Voice", "female"),
            ("Slow Speed", "slow"),
            ("Fast Speed", "fast"),
        ]),
        ("edge-tts", [
            ("Default", "default"),
            ("Female Voice", "female"),
            ("Male Voice", "male"),
            ("Child Voice", "child"),
        ]),
    )

    # The table is rebuilt on every call, so callers get a fresh list.
    for engine_name, options in catalogue:
        if tts_engine == engine_name:
            return options

    return [("Default", "default")]

def process_image_complete(image, ocr_lang, min_area, tts_lang, selected_model, tts_engine, voice_option):
    """Run the whole pipeline on one image: OCR, SAM segmentation, then TTS.

    Args:
        image: Input image, or None.
        ocr_lang: Language code for the OCR step.
        min_area: Minimum segment area for the segmentation step.
        tts_lang: Language code for speech synthesis.
        selected_model: Name of the SAM model to use.
        tts_engine: Name of the TTS engine to use.
        voice_option: Voice option forwarded to the TTS step.

    Returns:
        An 11-tuple: overall status (Markdown), extracted text, segmentation
        report, integrated report, audio file path or None, segmented image,
        overlay image, word count, OCR confidence, and the segment count
        twice.
    """
    if image is None:
        return "❌ Please upload an image first", "", "", "", None, None, None, 0, 0, 0, 0

    # Step 1: OCR
    ocr_status, extracted_text, word_count, ocr_confidence = perform_ocr(image, ocr_lang)

    # Step 2: SAM segmentation with visualisations
    (segment_status, segmentation_results, segmented_image,
     overlay_image, total_segments, _) = segment_objects_with_sam(image, selected_model, min_area)

    # Step 3: combined text + segment report
    integrated_results = create_integrated_results_sam(
        extracted_text, segmentation_results, total_segments, selected_model, tts_engine
    )

    # Step 4: speech, only when OCR produced some text
    if extracted_text.strip():
        tts_status, audio_file = text_to_speech_multi_engine(extracted_text, tts_lang, tts_engine, voice_option)
    else:
        tts_status, audio_file = "⚠️ No text found for speech synthesis", None

    # Combined status, one Markdown section per step
    overall_status = "\n".join([
        "## 🎉 **Processing Complete!**",
        "",
        "### 📝 **OCR Results:**",
        f"{ocr_status}",
        "",
        f"### 🧩 **SAM Segmentation ({selected_model.upper()}):**",
        f"{segment_status}",
        "",
        f"### 🔊 **Text-to-Speech ({tts_engine.upper()}):**",
        f"{tts_status}",
        "",
        "---",
        "**💡 Tip:** Check the segmented images to see individual object masks!",
    ])

    return (
        overall_status,
        extracted_text,
        segmentation_results,
        integrated_results,
        audio_file,
        segmented_image,
        overlay_image,
        word_count,
        ocr_confidence,
        total_segments,
        total_segments,
    )

def create_integrated_results_sam(extracted_text, segmentation_results, total_segments, model_name, tts_engine):
    """Build the combined Markdown report for OCR text and SAM segments.

    Args:
        extracted_text: Text returned by OCR ("" or None when none was found).
        segmentation_results: Markdown segment report, embedded verbatim when
            at least one segment was found.
        total_segments: Number of segments found.
        model_name: SAM model name, shown upper-cased.
        tts_engine: TTS engine name, shown upper-cased.

    Returns:
        The Markdown report with these sections: quick summary, extracted
        text, segmentation, audio, visual results, and insights. The
        "nothing detected" suggestions appear only when neither text nor
        segments were found.
    """
    text = extracted_text or ""
    segment_report = segmentation_results or ""
    has_text = bool(text.strip())
    has_segments = total_segments > 0
    model_label = model_name.upper()
    engine_label = tts_engine.upper()

    parts = ["# 🔍 **Complete Analysis Results**\n\n"]

    # Summary section with model info
    parts += [
        "## 📊 **Quick Summary**\n",
        f"- 📝 **Text**: {len(text.split())} words extracted\n",
        f"- 🧩 **Segments**: {total_segments} segments detected\n",
        f"- 🤖 **SAM Model**: {model_label}\n",
        f"- 🔊 **TTS Engine**: {engine_label}\n",
        f"- 🎵 **Audio**: {'Available' if has_text else 'No text to convert'}\n\n",
    ]

    # Text section
    parts.append("## 📝 **Extracted Text**\n")
    if has_text:
        parts.append(f"```\n{text}\n```\n\n")
    else:
        # Only the text is missing here; segments may still have been found,
        # so the "nothing detected" guidance is left to the insights section.
        parts.append("*No text detected in this image*\n\n")

    # Segments section with model info
    parts.append(f"## 🧩 **Image Segmentation ({model_label})**\n")
    if has_segments:
        parts.append(segment_report + "\n\n")
    else:
        parts.append("*No segments detected with current area threshold*\n\n")

    # Audio section with engine info
    parts.append(f"## 🔊 **Audio Generation ({engine_label})**\n")
    if has_text:
        parts.append(f"✅ Speech generated using **{engine_label}** engine\n")
        engine_notes = {
            "gtts": "🌐 Google TTS: Natural-sounding online synthesis\n",
            "pyttsx3": "💻 Pyttsx3: Fast offline synthesis\n",
            "edge-tts": "🎤 Edge TTS: High-quality Microsoft neural voices\n",
        }
        if tts_engine in engine_notes:
            parts.append(engine_notes[tts_engine])
        parts.append("🎵 Click the audio player to listen to the extracted text\n\n")
    else:
        parts.append("⚠️ No text available for speech synthesis\n\n")

    # Visual indication
    parts += [
        "## 👁️ **Visual Results**\n",
        "✅ Check the **'Segmented Image'** and **'Overlay'** tabs to see different visualizations\n",
        "🎨 **Segmented**: Each segment shown in different colors\n",
        "🎭 **Overlay**: Segments blended with original image\n",
        f"🤖 SAM model: **{model_label}** with {total_segments} segments\n\n",
    ]

    # Combined insights
    parts.append("## 💡 **Insights & Tips**\n")
    if has_text and has_segments:
        parts += [
            "✅ This image contains both **text content** and **segmentable objects**\n",
            f"🔊 Listen to the extracted text using **{engine_label}** synthesis\n",
            "🧩 Try different SAM models to compare segmentation quality\n",
            "🎤 Try different TTS engines for varied audio quality and voices\n",
            "📊 Adjust area threshold to control segment sensitivity\n",
        ]
    elif has_text:
        parts += [
            "📝 This image primarily contains **text content**\n",
            f"🔊 Listen to the text using **{engine_label}** engine\n",
            "🎤 Try different TTS engines for different voice options\n",
            "🧩 Consider using images with more distinct objects for segmentation demos\n",
        ]
    elif has_segments:
        parts += [
            "🧩 This image contains **segmentable objects** but no readable text\n",
            "📷 Try using images with text for OCR and TTS functionality\n",
            f"🤖 **{model_label}** successfully segmented {total_segments} regions\n",
            "🔄 Try different SAM models to see how segmentation results vary\n",
        ]
    else:
        # Neither text nor segments: this is where the troubleshooting hints
        # belong (the branch used to append the placeholder word "success").
        parts += [
            "⚠️ No text or segments detected in this image\n",
            "💡 **Suggestions:**\n",
            "- Try using a clearer, higher-resolution image\n",
            "- Lower the area threshold for more segments\n",
            "- Try a different SAM model (some are more sensitive)\n",
            "- Ensure the image contains recognizable objects or text\n",
        ]

    return "".join(parts)

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the OCR / SAM segmentation / TTS demo.

    The function only constructs the layout and wires the event handlers; it
    does not launch the server. Handlers delegate to functions defined
    elsewhere in this file (``initialize_tts_engines``,
    ``initialize_sam_models``, ``load_all_sam_models``,
    ``process_image_complete``, ``compare_sam_models``, ``perform_ocr``,
    ``segment_objects_with_sam``, ``text_to_speech_multi_engine`` and
    ``get_tts_voice_options``).

    Returns:
        The assembled ``gr.Blocks`` object, ready for ``.launch()``.
    """
    with gr.Blocks(title="🤖 AI Services Hub - SAM Edition", theme=gr.themes.Soft()) as interface:

        gr.Markdown("""
        # 🤖 AI Services Hub - SAM Edition
        ### OCR Text Extraction • SAM Segmentation • Text-to-Speech

        Upload an image to extract text, segment objects with pixel-perfect precision, and generate speech from the extracted text!
        """)

        # Model loading and selection
        with gr.Row():
            with gr.Column(scale=2):
                model_status = gr.Textbox(
                    label="🤖 SAM Model Status",
                    value="Click buttons below to load Segment Anything models...",
                    interactive=False,
                    lines=3
                )
            with gr.Column(scale=1):
                gr.Markdown("### 🚀 **Load SAM Models**")
                with gr.Row():
                    load_sam_b_btn = gr.Button("📦 SAM ViT-B (Fast)", variant="secondary", size="sm")
                    load_sam_l_btn = gr.Button("⚡ SAM ViT-L (Better)", variant="secondary", size="sm")
                with gr.Row():
                    load_sam_h_btn = gr.Button("🎯 SAM ViT-H (Best)", variant="secondary", size="sm")
                    load_all_sam_btn = gr.Button("🔥 Load All SAM", variant="primary", size="sm")

        # TTS Engine Status and Loading
        with gr.Row():
            with gr.Column(scale=2):
                tts_status = gr.Textbox(
                    label="🔊 TTS Engine Status",
                    value="Click button to initialize TTS engines...",
                    interactive=False,
                    lines=3
                )
            with gr.Column(scale=1):
                gr.Markdown("### 🎤 **Initialize TTS Engines**")
                init_tts_btn = gr.Button("🔊 Initialize TTS Engines", variant="secondary", size="lg")

        with gr.Row():
            # Left column - Input
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Input")

                image_input = gr.Image(
                    label="📷 Upload Image",
                    type="pil",
                    height=300
                )

                gr.Markdown("### ⚙️ Settings")

                # SAM model selection dropdown
                model_selector = gr.Dropdown(
                    choices=[
                        ("SAM ViT-B (Fastest)", "sam_vit_b"),
                        ("SAM ViT-L (Better Quality)", "sam_vit_l"),
                        ("SAM ViT-H (Best Quality)", "sam_vit_h")
                    ],
                    value="sam_vit_b",
                    label="🧩 Select SAM Model",
                    info="Different models offer different speed/quality tradeoffs"
                )

                # TTS Engine Selection
                tts_engine_selector = gr.Dropdown(
                    choices=[
                        ("Google TTS (Online)", "gtts"),
                        ("Pyttsx3 (Offline)", "pyttsx3"),
                        ("Edge TTS (High Quality)", "edge-tts")
                    ],
                    value="gtts",
                    label="🎤 Select TTS Engine",
                    info="Different engines offer different voice qualities and options"
                )

                # Voice Options (dynamic based on TTS engine)
                voice_selector = gr.Dropdown(
                    choices=[("Default", "default")],
                    value="default",
                    label="🎵 Voice Options",
                    info="Available options vary by TTS engine"
                )

                with gr.Row():
                    ocr_lang = gr.Dropdown(
                        choices=[
                            ("English", "eng"),
                            ("Spanish", "spa"),
                            ("French", "fra"),
                            ("German", "deu"),
                            ("Italian", "ita"),
                            ("Portuguese", "por"),
                            ("Russian", "rus"),
                            ("Chinese (Simplified)", "chi_sim"),
                            ("Japanese", "jpn"),
                            ("Korean", "kor")
                        ],
                        value="eng",
                        label="🔤 OCR Language"
                    )

                    tts_lang = gr.Dropdown(
                        choices=[
                            ("English", "en"),
                            ("Spanish", "es"),
                            ("French", "fr"),
                            ("German", "de"),
                            ("Italian", "it"),
                            ("Portuguese", "pt"),
                            ("Russian", "ru"),
                            ("Chinese", "zh"),
                            ("Japanese", "ja"),
                            ("Korean", "ko")
                        ],
                        value="en",
                        label="🔊 TTS Language"
                    )

                min_area_threshold = gr.Slider(
                    minimum=100,
                    maximum=5000,
                    step=100,
                    value=500,
                    label="🧩 Minimum Segment Area (pixels)",
                    info="Lower = more small segments, Higher = only large segments"
                )

                process_btn = gr.Button(
                    "🚀 Process Image (All Services)",
                    variant="primary",
                    size="lg"
                )

            # Right column - Output
            with gr.Column(scale=1):
                gr.Markdown("### 📤 Results")

                status_output = gr.Markdown(label="📊 Status")

                # Statistics
                with gr.Row():
                    word_count_out = gr.Number(label="📝 Words Found", interactive=False)
                    ocr_confidence_out = gr.Number(label="🎯 OCR Confidence %", interactive=False)
                    segments_count_out = gr.Number(label="🧩 Segments Found", interactive=False)
                    coverage_out = gr.Number(label="📊 Coverage %", interactive=False)

        # SAM Model Information Panel
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                ### 🧩 **SAM Model Comparison**

                | Model | Speed | Quality | Memory | Use Case |
                |-------|-------|---------|--------|----------|
                | **SAM ViT-B** | 🌟🌟🌟🌟🌟 | 🌟🌟🌟 | Low | Real-time, mobile |
                | **SAM ViT-L** | 🌟🌟🌟 | 🌟🌟🌟🌟 | Medium | Balanced applications |
                | **SAM ViT-H** | 🌟🌟 | 🌟🌟🌟🌟🌟 | High | Best quality, research |

                **SAM vs YOLO**: SAM provides pixel-perfect segmentation masks while YOLO gives bounding boxes with object classification.
                """)

        # Image Results Section (Full Width) - Updated for SAM
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🖼️ **Visual Results - Original vs Segmented**")
                with gr.Tab("📷 Original Image"):
                    original_display = gr.Image(label="Original Image", interactive=False)
                with gr.Tab("🧩 Segmented Image"):
                    segmented_output = gr.Image(
                        label="Segmented Image (Color-coded)",
                        interactive=False,
                        height=400
                    )
                with gr.Tab("🎭 Overlay Image"):
                    overlay_output = gr.Image(
                        label="Overlay (Segments + Original)",
                        interactive=False,
                        height=400
                    )

        # Model Comparison Section - Updated for SAM
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🔥 **SAM Model Comparison** (Compare Different SAM Models)")
                compare_btn = gr.Button("⚔️ Compare All Loaded SAM Models", variant="secondary", size="lg")

                with gr.Row():
                    with gr.Tab("SAM ViT-B Result"):
                        comparison_img1 = gr.Image(label="ViT-B Segmentation", interactive=False)
                    with gr.Tab("SAM ViT-L Result"):
                        comparison_img2 = gr.Image(label="ViT-L Segmentation", interactive=False)
                    with gr.Tab("SAM ViT-H Result"):
                        comparison_img3 = gr.Image(label="ViT-H Segmentation", interactive=False)

                comparison_results = gr.Markdown(label="📊 SAM Comparison Results")

        # Integrated Results Section (Full Width)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🔍 **Integrated Results - Text & Segments Together**")
                integrated_output = gr.Markdown(
                    label="Complete Analysis",
                    value="Upload an image and click 'Process Image' to see integrated results here..."
                )

        # Audio and Individual Results (Side by Side)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🔊 **Generated Audio**")
                audio_out = gr.Audio(
                    label="🎵 Listen to Extracted Text",
                    interactive=False
                )

                # TTS Testing Section
                gr.Markdown("### 🎤 **Test TTS Engines**")
                test_text = gr.Textbox(
                    label="Test Text",
                    value="Hello, this is a test of the text-to-speech system.",
                    lines=2
                )
                test_tts_btn = gr.Button("🎵 Test Selected TTS Engine", variant="secondary")

            with gr.Column(scale=1):
                gr.Markdown("### 📝 **Raw Text Output**")
                extracted_text_out = gr.Textbox(
                    label="Extracted Text (Raw)",
                    lines=4,
                    max_lines=8,
                    interactive=False,
                    placeholder="Extracted text will appear here..."
                )

        # Segmentation Results (Full Width)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🧩 **Segmentation Details**")
                segmentation_out = gr.Textbox(
                    label="Segmentation Results (Detailed View)",
                    lines=6,
                    max_lines=10,
                    interactive=False,
                    placeholder="Segmentation results will appear here..."
                )

        # Individual service buttons
        with gr.Row():
            ocr_btn = gr.Button("📝 OCR Only", variant="secondary")
            segment_btn = gr.Button("🧩 Segment Only", variant="secondary")
            tts_btn = gr.Button("🔊 TTS Only", variant="secondary")

        # Event handlers for TTS engine initialization
        init_tts_btn.click(
            fn=initialize_tts_engines,
            outputs=[tts_status]
        )

        # Update voice options when TTS engine changes
        def update_voice_options(engine):
            """Return a dropdown update listing the voices of the chosen engine."""
            # gr.Dropdown.update() was removed in Gradio 4; the generic
            # gr.update() is available on both Gradio 3.x and 4+.
            return gr.update(choices=get_tts_voice_options(engine), value="default")

        tts_engine_selector.change(
            fn=update_voice_options,
            inputs=[tts_engine_selector],
            outputs=[voice_selector]
        )

        # Event handlers for SAM model loading
        load_sam_b_btn.click(
            fn=lambda: initialize_sam_models("sam_vit_b"),
            outputs=[model_status]
        )

        load_sam_l_btn.click(
            fn=lambda: initialize_sam_models("sam_vit_l"),
            outputs=[model_status]
        )

        load_sam_h_btn.click(
            fn=lambda: initialize_sam_models("sam_vit_h"),
            outputs=[model_status]
        )

        load_all_sam_btn.click(
            fn=load_all_sam_models,
            outputs=[model_status]
        )

        # Main processing event handler
        process_btn.click(
            fn=process_image_complete,
            inputs=[image_input, ocr_lang, min_area_threshold, tts_lang, model_selector, tts_engine_selector, voice_selector],
            outputs=[status_output, extracted_text_out, segmentation_out, integrated_output, audio_out,
                    segmented_output, overlay_output, word_count_out, ocr_confidence_out, segments_count_out, coverage_out]
        )

        # Update original image display when new image is uploaded
        image_input.change(
            fn=lambda img: img,
            inputs=[image_input],
            outputs=[original_display]
        )

        # Model comparison handler
        compare_btn.click(
            fn=compare_sam_models,
            inputs=[image_input, min_area_threshold],
            outputs=[comparison_results, comparison_img1, comparison_img2, comparison_img3]
        )

        # Individual service handlers
        ocr_btn.click(
            fn=lambda img, lang: perform_ocr(img, lang),
            inputs=[image_input, ocr_lang],
            outputs=[status_output, extracted_text_out, word_count_out, ocr_confidence_out]
        )

        segment_btn.click(
            fn=lambda img, model, area: segment_objects_with_sam(img, model, area),
            inputs=[image_input, model_selector, min_area_threshold],
            outputs=[status_output, segmentation_out, segmented_output, overlay_output, segments_count_out, coverage_out]
        )

        # "TTS Only" speaks whatever is currently in the raw-text box.
        tts_btn.click(
            fn=lambda text, lang, engine, voice: text_to_speech_multi_engine(text, lang, engine, voice),
            inputs=[extracted_text_out, tts_lang, tts_engine_selector, voice_selector],
            outputs=[status_output, audio_out]
        )

        # Test TTS handler (same engine call, but fed from the free-form test box)
        test_tts_btn.click(
            fn=lambda text, lang, engine, voice: text_to_speech_multi_engine(text, lang, engine, voice),
            inputs=[test_text, tts_lang, tts_engine_selector, voice_selector],
            outputs=[status_output, audio_out]
        )

        # Add examples
        # NOTE(review): the first field of the example row feeds image_input but
        # is a descriptive caption, not an image file path — confirm whether a
        # real sample image should be referenced here.
        gr.Examples(
            examples=[
                ["Sample image with text and objects", "eng", 500, "en", "sam_vit_b", "gtts", "default"]
            ],
            inputs=[image_input, ocr_lang, min_area_threshold, tts_lang, model_selector, tts_engine_selector, voice_selector],
            label="💡 Try these examples (upload your own images)"
        )

        gr.Markdown("""
        ### 📖 How to Use:
        1. **Initialize TTS engines** using the TTS initialization button
        2. **Load SAM models** using the buttons above (start with SAM ViT-B for speed)
        3. **Upload an image** using the image input
        4. **Select your preferred models** from the dropdowns (SAM for segmentation, TTS for speech)
        5. **Choose voice options** based on your selected TTS engine
        6. **Adjust area threshold** for segment sensitivity
        7. **Click "Process Image"** to run all AI services at once
        8. **View visual results** in the Original, Segmented, and Overlay tabs
        9. **Compare models** using the SAM comparison feature
        10. **Test TTS engines** using the test section with custom text

        ### 🧩 SAM Model Information:
        - **SAM ViT-B (Base)**: Fastest processing, good for real-time applications, ~375MB
        - **SAM ViT-L (Large)**: Balanced speed and quality, good all-around choice, ~1.25GB
        - **SAM ViT-H (Huge)**: Highest quality, slower processing, best for detailed analysis, ~2.4GB

        ### 🎯 SAM vs YOLO Comparison:

        | Feature | YOLO | SAM |
        |---------|------|-----|
        | **Output** | Bounding boxes + labels | Pixel-perfect masks |
        | **Speed** | Very fast | Moderate |
        | **Precision** | Object-level | Pixel-level |
        | **Use Case** | Object detection/counting | Image segmentation/editing |
        | **Classes** | Predefined (80 classes) | Class-agnostic (any object) |

        ### 🎤 TTS Engine Details:

        #### **Google TTS (gTTS)**
        - 🌐 **Online synthesis** with natural voices
        - 🗺️ **Regional accents**: US, UK, Australian, Indian, Canadian
        - 🎯 **Best for**: Natural-sounding speech, multiple language support
        - ⚡ **Speed**: Moderate (requires internet)

        #### **Pyttsx3 (Offline TTS)**
        - 💻 **Offline synthesis** - no internet required
        - 🎚️ **Voice control**: Male/Female voices, speed adjustment
        - 🎯 **Best for**: Privacy, offline use, fast processing
        - ⚡ **Speed**: Very fast (local processing)

        #### **Edge TTS (Microsoft)**
        - 🧠 **Neural voices** with high quality synthesis
        - 👥 **Multiple personas**: Female, Male, Child voices per language
        - 🎯 **Best for**: Highest quality speech, professional applications
        - ⚡ **Speed**: Fast (requires internet)

        ### 🛠️ Individual Services:
        - **OCR Only**: Extract text from the image
        - **Segment Only**: Generate pixel-perfect object masks using SAM
        - **TTS Only**: Convert the extracted text to speech using selected engine
        - **Test TTS**: Try different engines with custom text
        - **Compare SAM Models**: See how different SAM versions perform on the same image

        ### ✨ Key Features:
        - 🧩 **Pixel-Perfect Segmentation**: SAM provides precise object boundaries, not just boxes
        - 🤖 **Multiple SAM Models**: Compare SAM ViT-B, ViT-L, and ViT-H variants
        - 🎨 **Dual Visualizations**: Color-coded segments and overlay views
        - 🎤 **Multiple TTS Engines**: Choose from Google TTS, Pyttsx3, or Edge TTS
        - 🎵 **Voice Variety**: Different voice options for each TTS engine
        - 🔍 **Model Comparison**: Side-by-side results from different SAM models
        - 📊 **Detailed Segment Info**: Area, stability scores, and bounding boxes
        - 🎭 **Visual Overlays**: See segments blended with original image
        - 🔊 **Instant Audio**: Generated speech plays automatically
        - 📊 **Real-time Stats**: Live word count, confidence, and segment counts
        - 🌍 **Multi-language**: Support for 10+ languages in both OCR and TTS
        - 📱 **Mobile-friendly**: Works perfectly on phones and tablets

        ### 💡 Tips:
        - **Start with SAM ViT-B** for fast results, then try ViT-L or ViT-H for better quality
        - **Lower area threshold** (100-300) to catch smaller segments, higher (1000+) for only large objects
        - **Compare SAM models** on the same image to see how segmentation quality varies
        - **Try different TTS engines** to find your preferred voice quality and style
        - **Google TTS** for natural voices, **Pyttsx3** for offline use, **Edge TTS** for highest quality
        - **Use high-resolution images** for better text OCR and more precise segmentation
        - **Check both visualizations**: Segmented shows pure masks, Overlay shows context
        - **SAM is class-agnostic**: It finds object boundaries without knowing what objects are
        - **Test TTS engines** with your own text to compare voice quality
        - **SAM works best** on images with clear object boundaries and good contrast
        """)

    return interface

def find_free_gradio_port(start_port=7860, max_attempts=50):
    """Return the first port in the scan window that can currently be bound.

    Ports ``start_port`` .. ``start_port + max_attempts - 1`` are probed in
    order by binding a throw-away TCP socket on all interfaces. The probe
    socket is closed before returning, so the port is only known to have been
    free at the moment of the check.

    Args:
        start_port: First port number to try.
        max_attempts: How many consecutive ports to probe.

    Returns:
        The first port number on which the bind succeeded.

    Raises:
        RuntimeError: If no port in the window could be bound, including when
            the window lies outside the valid 0-65535 port range.
    """
    import socket

    for port in range(start_port, start_port + max_attempts):
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(('', port))
                return port
        except (OSError, OverflowError):
            # OSError: port already in use / not permitted.
            # OverflowError: bind() rejects port numbers outside 0-65535; treat
            # those as unavailable so callers only ever see RuntimeError.
            continue
    raise RuntimeError(f"Could not find a free port after {max_attempts} attempts starting from {start_port}")

# Main execution: build the UI, pick a free port, and launch Gradio.
print("🚀 Starting Complete AI Services with SAM Segmentation...")

# Create and launch interface
interface = create_interface()

# Launch with automatic port detection
print("🌐 Launching Gradio interface with SAM integration...")

try:
    available_port = find_free_gradio_port()
    print(f"🔍 Found available port: {available_port}")
except RuntimeError as e:
    print(f"❌ Port error: {e}")
    available_port = None

# Tracks whether launch() returned without raising, so the readiness banner
# below is not printed after a failed start.
gradio_launched = False

if available_port:
    try:
        interface.launch(
            share=True,  # Creates public link automatically
            server_name="0.0.0.0",
            server_port=available_port,
            show_error=True,
            quiet=False,
            prevent_thread_lock=True  # Allows notebook to remain interactive
        )
    except Exception as e:
        # Top-level boundary: report the failure instead of aborting the cell.
        print(f"❌ Failed to launch Gradio: {e}")
        print("🔧 Try restarting your Colab runtime and re-running the cell")
    else:
        gradio_launched = True
        print("🎉 Gradio interface is now live!")
        print(f"🌐 Running on port: {available_port}")
        print("📱 You can access it from the public URL shown above")
        print("🧩 SAM-powered segmentation interface ready!")
        print("🔗 The interface provides OCR, SAM segmentation, and TTS services")

else:
    print("❌ Could not find available port for Gradio")
    print("🔧 Please restart your Colab runtime and try again")

# Only announce readiness and usage steps when the interface actually started.
if gradio_launched:
    print("\n🚀 **Complete SAM-Powered AI Services Ready!**")
    print("🧩 Features: OCR + Segment Anything + Multi-TTS")
    print("💡 Click 'Initialize TTS Engines' and 'Load SAM Models' in the interface first!")
    print("📖 Instructions:")
    print("1. Initialize TTS engines")
    print("2. Load SAM ViT-B model (fastest)")
    print("3. Upload an image")
    print("4. Adjust area threshold (500 is good start)")
    print("5. Click 'Process Image' to run all services")
    print("6. View results in Original/Segmented/Overlay tabs")
    print("7. Listen to generated audio from extracted text")

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting pyttsx3
  Downloading pyttsx3-2.99-py3-none-any.whl.metadata (6.2 kB)
Collecting edge-tts
  Downloading edge_tts-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting asyncio
  Downloading asyncio-4.0.0-py3-none-any.whl.metadata (994 bytes)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading pyttsx3-2.99-py3-none-any.whl (32 kB)
Downloading edge_tts-7.2.0-py3-none-any.whl (30 kB)
Downloading asyncio-4.0.0-py3-none-any.whl (5.6 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyttsx3, pytesseract, 

🎉 Gradio interface is now live!
🌐 Running on port: 7860
📱 You can access it from the public URL shown above
🧩 SAM-powered segmentation interface ready!
🔗 The interface provides OCR, SAM segmentation, and TTS services

🚀 **Complete SAM-Powered AI Services Ready!**
🧩 Features: OCR + Segment Anything + Multi-TTS
💡 Click 'Initialize TTS Engines' and 'Load SAM Models' in the interface first!
📖 Instructions:
1. Initialize TTS engines
2. Load SAM ViT-B model (fastest)
3. Upload an image
4. Adjust area threshold (500 is good start)
5. Click 'Process Image' to run all services
6. View results in Original/Segmented/Overlay tabs
7. Listen to generated audio from extracted text
