In [None]:
!pip install torch  transformers pillow

In [None]:
"""
This script generates a caption for an image using the BLIP (Bootstrapped Language-Image Pretraining) model.
It employs the Hugging Face `transformers` library and the `PIL` library for image handling.

### Overall Process:
1. Load the BLIP processor and model (`Salesforce/blip-image-captioning-base`), which is a pretrained vision-language model.
2. Open and preprocess the image using PIL, ensuring it's in RGB format.
3. Convert the image into tensors using the processor, making it compatible with the model.
4. Pass the processed image to the BLIP model to generate a caption.
5. Decode the generated caption and print the result.

### Libraries Used:
- `transformers`: Provides BLIP's processor and model for image captioning.
- `PIL (Pillow)`: Handles image loading and processing.
- `torch`: Supports tensor operations required for model input and output.

"""

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Settings a reader is most likely to change.
BLIP_MODEL_ID = "Salesforce/blip-image-captioning-base"
IMAGE_PATH = "/content/20250326_232107.jpg"  # Change to your image path

# Run on the GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BLIP Processor & Model (this is the original BLIP checkpoint, not BLIP-2)
processor = BlipProcessor.from_pretrained(BLIP_MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(BLIP_MODEL_ID).to(device)
model.eval()

# Load Image (the context manager closes the file once the RGB copy exists)
with Image.open(IMAGE_PATH) as img:
    image = img.convert("RGB")

# Preprocess Image and place the tensors on the same device as the model
inputs = processor(images=image, return_tensors="pt").to(device)

# Generate Caption
caption_ids = model.generate(**inputs)
caption = processor.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)

In [None]:
# NOTE: this is a very large model; it requires a GPU with more than 15 GB of memory.
# If that is not available, use a smaller checkpoint such as "Salesforce/blip2-opt-2.7b".
"""
This script generates a highly detailed caption for an image using BLIP-2 (Bootstrapped Language-Image Pretraining 2).
It utilizes the `transformers` library from Hugging Face and the `PIL` library for image handling.

### Overall Process:
1. Load the BLIP-2 processor and model (`Salesforce/blip2-opt-6.7b`), a vision-language model that integrates image and text processing.
2. Open and preprocess the image using PIL, ensuring it's in RGB format.
3. Convert the image into tensors and provide a text prompt to guide the caption generation.
4. Generate with beam search (`num_beams=5`), capped at `max_length=100`.
5. Decode the generated output and print the final caption.

### Libraries Used:
- `transformers`: Provides BLIP-2's processor and model for advanced image captioning.
- `PIL (Pillow)`: Handles image loading and processing.
- `torch`: Supports tensor operations required for model input and output.

### Notes:
- The `text` parameter allows for prompt-based captioning, enabling more control over the generated description.
- `max_length=100` is an upper bound on the generated length; it leaves room for a detailed caption but does not force one.
- `num_beams=5` improves caption quality using beam search.
- `top_p` is intentionally not passed: nucleus sampling only takes effect when `do_sample=True`,
  and this cell uses deterministic beam search.
- On a GPU the weights are loaded in float16 to halve memory use; on CPU float32 is used.

"""

from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Settings a reader is most likely to change.
BLIP2_MODEL_ID = "Salesforce/blip2-opt-6.7b"  # Smaller alternative: "Salesforce/blip2-opt-2.7b"
IMAGE_PATH = "/content/20250326_232107.jpg"  # Change to your image path

# Run on the GPU in half precision when one is available; otherwise CPU in float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load BLIP-2 Processor & Model (Pretrained)
processor = Blip2Processor.from_pretrained(BLIP2_MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_MODEL_ID, torch_dtype=dtype)
model.to(device)
model.eval()

# Load Image (the context manager closes the file once the RGB copy exists)
with Image.open(IMAGE_PATH) as img:
    image = img.convert("RGB")

# Preprocess Image with a Descriptive Prompt.
# `.to(device, dtype)` moves every tensor to the device and casts only the floating-point ones.
inputs = processor(
    images=image,
    text="A highly detailed description of the image:",
    return_tensors="pt",
).to(device, dtype)

# Generate Caption with beam search
caption_ids = model.generate(**inputs, max_length=100, num_beams=5)
caption = processor.tokenizer.decode(caption_ids[0], skip_special_tokens=True)

print("Generated Caption:", caption)


In [None]:
# =========================
# 🔧 INSTALL DEPENDENCIES
# =========================
!pip install -q transformers accelerate timm

# =========================
# 📦 IMPORT LIBRARIES
# =========================
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from google.colab import files
from io import BytesIO

# =========================
# ⚙️ SET DEVICE (T4 GPU FRIENDLY)
# =========================
# Half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    device, dtype = torch.device("cuda"), torch.float16
else:
    device, dtype = torch.device("cpu"), torch.float32

# =========================
# 🧠 LOAD BLIP-2 MODEL (FLAN-T5-XL + ViT-G/14)
# =========================
print("⏳ Loading BLIP-2 FLAN-T5-XL with ViT-G/14...")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
# device_map="auto" spreads the weights across the available devices;
# torch_dtype follows the precision chosen in the device section (float16 on GPU).
# .eval() returns the model itself, so the load and the mode switch can be chained.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", device_map="auto", torch_dtype=dtype
).eval()
print("✅ Model loaded.")

# =========================
# 🖼️ UPLOAD IMAGE
# =========================
print("📤 Upload an image...")
# `uploaded` maps each uploaded file name to its raw bytes.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields an empty mapping; fail with a clear message
    # instead of an opaque IndexError on the lookup below.
    raise RuntimeError("No image was uploaded; re-run this cell and choose a file.")

# Only the first uploaded file is captioned.
image_path = next(iter(uploaded))
image = Image.open(BytesIO(uploaded[image_path])).convert("RGB")

# =========================
# ✏️ ASK FOR MAX TOKEN LENGTH
# =========================
# Any unusable answer (non-numeric text, zero/negative number, or a front end
# without stdin) falls back to 30 tokens. `except Exception` is used instead of a
# bare `except` so that interrupting the kernel at the prompt still stops the cell.
try:
    max_tokens = int(input("Enter max caption length (e.g., 20, 30, 50): "))
    if max_tokens <= 0:
        # The value is later passed as `max_new_tokens`, which must be positive.
        raise ValueError("max caption length must be a positive integer")
except Exception:
    max_tokens = 30
    print("⚠️ Invalid input, defaulting to 30 tokens.")

# =========================
# 📝 GENERATE CAPTION
# =========================
print("🧠 Generating caption...")
# Turn the image into model tensors, then move them to the chosen device/precision.
inputs = processor(images=image, return_tensors="pt")
inputs = inputs.to(device, dtype)

# The user-supplied limit bounds the number of newly generated tokens.
generated_ids = model.generate(max_new_tokens=max_tokens, **inputs)

# Decode the first (only) sequence in the batch back into text.
caption = processor.tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)
print("🖋️ Caption:", caption)
