# Image Captioning with Generative AI
### Using BLIP Model with Conceptual Captions Approach

This notebook implements an advanced image captioning system using the BLIP (Bootstrapping Language-Image Pre-training) model, which incorporates training methodologies similar to those used with the Conceptual Captions dataset.

## 1. Install Required Dependencies

In [None]:
!pip install torch transformers gradio Pillow requests numpy

## 2. Import Libraries

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr
import requests
from io import BytesIO
import numpy as np
import warnings

# Suppress warnings for cleaner output.
# NOTE: no category or module filter is given, so this ignores every warning
# raised after this point, including deprecation notices from the libraries
# imported above.
warnings.filterwarnings('ignore')

# Environment report: confirm the imports ran and show the PyTorch version and
# whether a CUDA device is visible to PyTorch.
print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 3. Load Pre-trained BLIP Model

We use the pre-trained Salesforce BLIP captioning model (`Salesforce/blip-image-captioning-base`), which incorporates a Conceptual Captions-style training methodology for strong image understanding and caption generation.

In [None]:
def load_model(model_name="Salesforce/blip-image-captioning-base"):
    """
    Load the BLIP model and processor for image captioning.

    Args:
        model_name (str): Model identifier handed to ``from_pretrained`` for
            both the processor and the model. Defaults to the BLIP base
            captioning checkpoint.

    Returns:
        tuple: (processor, model, device) - the loaded BLIP processor, the
        model switched to evaluation mode and moved to ``device``, and the
        ``torch.device`` that was selected. If anything fails, the error is
        printed and (None, None, None) is returned instead of raising.
    """
    try:
        print("Loading BLIP model and processor...")

        # Processor and model must come from the same checkpoint
        processor = BlipProcessor.from_pretrained(model_name)
        model = BlipForConditionalGeneration.from_pretrained(model_name)

        # Set model to evaluation mode for inference
        model.eval()

        # Move to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f"Model loaded successfully on {device}!")
        return processor, model, device

    except Exception as e:
        # Best-effort loader: callers check for None rather than catching
        print(f"Error loading model: {str(e)}")
        return None, None, None

# Load the model once at module level. load_model() returns
# (None, None, None) on failure, so all three names may be None here;
# generate_caption() checks model and processor before using them.
processor, model, device = load_model()

## 4. Image Preprocessing and Caption Generation Functions

In [None]:
def preprocess_image(image_input, timeout=10):
    """
    Preprocess image for the BLIP model.

    Args:
        image_input: PIL Image, numpy array, local file path, or
            http(s) URL of an image
        timeout (float): Seconds to wait for the server when
            ``image_input`` is a URL

    Returns:
        PIL.Image: The image in RGB mode, or None if it could not be
        loaded (the error is printed rather than raised)
    """
    try:
        if isinstance(image_input, str):
            # If it's a URL
            if image_input.startswith(('http://', 'https://')):
                # Bound the wait so a stalled server cannot hang the caller,
                # and fail on HTTP errors instead of decoding an error page
                response = requests.get(image_input, timeout=timeout)
                response.raise_for_status()
                image = Image.open(BytesIO(response.content))
            else:
                # If it's a local file path
                image = Image.open(image_input)
        elif isinstance(image_input, np.ndarray):
            # If it's a numpy array (from Gradio)
            image = Image.fromarray(image_input)
        else:
            # If it's already a PIL Image
            image = image_input

        # Convert to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')

        return image

    except Exception as e:
        # Best-effort: callers treat None as "could not process the image"
        print(f"Error preprocessing image: {str(e)}")
        return None

def generate_caption(image_input, max_length=50, num_beams=5):
    """
    Produce a text caption for an image with the loaded BLIP model.

    Args:
        image_input: Image to caption; forwarded unchanged to preprocess_image
        max_length (int): Value passed to the model as ``max_length``
        num_beams (int): Value passed to the model as ``num_beams``

    Returns:
        str: The decoded caption, or a message starting with "Error" when the
        model is not loaded, the image cannot be processed, or generation fails
    """
    # Inference needs both the model and its processor
    if not (model is not None and processor is not None):
        return "Error: Model not loaded properly"

    try:
        pil_image = preprocess_image(image_input)
        if pil_image is None:
            return "Error: Could not process the image"

        # Encode the image as tensors on the same device as the model
        model_inputs = processor(pil_image, return_tensors="pt").to(device)

        generation_options = {
            "max_length": max_length,
            "num_beams": num_beams,
            "early_stopping": True,
            "do_sample": False,
        }

        # No gradients are needed for inference
        with torch.no_grad():
            output_ids = model.generate(**model_inputs, **generation_options)

        # Only the first generated sequence is decoded
        return processor.decode(output_ids[0], skip_special_tokens=True)

    except Exception as err:
        return f"Error generating caption: {str(err)}"

# Confirmation only: reaching this line means the definitions above executed.
# It does not check that the model itself loaded.
print("Caption generation functions defined successfully!")

## 5. Test Caption Generation

In [None]:
# Test with a sample image URL
# The image is fetched over HTTP, so this cell needs network access.
# generate_caption() returns a string starting with "Error" instead of raising,
# so a failure appears in the printed output below rather than as a traceback.
test_image_url = "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500"
test_caption = generate_caption(test_image_url)
print(f"Test Caption: {test_caption}")

## 6. Create Advanced Gradio Interface

In [None]:
def create_gradio_interface():
    """
    Build the Gradio UI that wraps the captioning function.

    Returns:
        gr.Interface: Interface with an image upload, two generation
        sliders and a text output
    """

    def caption_with_options(image, max_length, num_beams):
        """
        Gradio callback: caption the uploaded image using the slider values.
        """
        # Slider values are cast to int before being used as generation settings
        if image is not None:
            return generate_caption(
                image,
                max_length=int(max_length),
                num_beams=int(num_beams),
            )
        return "Please upload an image first."

    # Input widgets, in the order the callback receives its arguments
    image_upload = gr.Image(type="numpy", label="Upload Image")
    length_slider = gr.Slider(
        minimum=20, maximum=100, value=50, step=5, label="Max Caption Length"
    )
    beam_slider = gr.Slider(
        minimum=1, maximum=10, value=5, step=1, label="Number of Beams"
    )

    # Output widget
    caption_box = gr.Textbox(label="Generated Caption", lines=3)

    return gr.Interface(
        fn=caption_with_options,
        inputs=[image_upload, length_slider, beam_slider],
        outputs=caption_box,
        title="🖼️ AI Image Captioning with BLIP",
        description="Upload an image and get an AI-generated caption using the BLIP model trained on Conceptual Captions methodology.",
        article="### How it works: Upload your image, adjust parameters if needed, and get your AI-generated caption instantly!",
        theme=gr.themes.Soft(),
        allow_flagging="never",
    )

# Create the interface
# This only builds the Interface object and binds it to `demo`; the server is
# started separately by demo.launch().
demo = create_gradio_interface()
print("Gradio interface created successfully!")

## 7. Launch the Application

In [None]:
# Launch the Gradio interface
# The guard keeps the server from starting when this file is imported as a
# module; it is true when the code runs as a script or notebook cell.
# NOTE(review): share=True asks Gradio for a publicly reachable share link and
# server_name="0.0.0.0" listens on all network interfaces - confirm this
# exposure is intended before running outside a sandbox.
if __name__ == "__main__":
    print("Launching Image Captioning Application...")
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )