In [None]:
# ============== [CELL 1] ULTRA-OPTIMIZED SETUP ==============
!pip install -q --no-cache-dir ultralytics torchvision Pillow matplotlib accelerate bitsandbytes transformers>=4.40


In [None]:
import torch
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageOps,ImageFont
import matplotlib.pyplot as plt
import numpy as np
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration, BitsAndBytesConfig
from matplotlib.patches import Rectangle
import io

In [None]:
# Hardware optimization: process-wide PyTorch settings, applied once before
# any model is created.
# Enable cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True
# Select the 'high' float32 matmul precision setting.
torch.set_float32_matmul_precision('high')
print("setup complete")


In [None]:

# ============== [CELL 2] TURBO IMAGE LOADING ==============
def load_image(image_path, max_size=1024):
    """Load an image as RGB, honour its EXIF orientation and cap its size.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        max_size: Upper bound, in pixels, for the longest side. Larger images
            are downscaled with LANCZOS resampling, keeping the aspect ratio.

    Returns:
        A ``PIL.Image.Image`` in RGB mode, or ``None`` when the image could
        not be loaded (the error is printed rather than raised so callers can
        skip the file).
    """
    try:
        # The context manager closes the underlying file as soon as the pixel
        # data has been decoded, instead of waiting for garbage collection.
        with Image.open(image_path) as source:
            # exif_transpose handles every format that carries EXIF data and
            # returns a plain copy when there is no orientation tag, so no
            # format-specific guard is needed.
            img = ImageOps.exif_transpose(source).convert("RGB")

        if max(img.size) > max_size:
            img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        print(f"Image loaded .Size: {img.size}")
        return img
    except Exception as e:
        # Best-effort loader: report and signal failure with None.
        print(f"Error loading image: {str(e)}")
        return None




In [None]:
# ============== [CELL 3] LIGHTNING OBJECT DETECTION ==============
def init_detector(model_size='n'):
    """Build a YOLOv8 detector for the requested size variant.

    Args:
        model_size: Suffix inserted into the weights file name
            (``yolov8<model_size>.pt``).

    Returns:
        The YOLO model with fused layers, moved to CUDA when a GPU is
        available.
    """
    weights_name = "yolov8{}.pt".format(model_size)
    yolo = YOLO(weights_name, verbose=False)
    yolo.fuse()  # Fuse layers before inference
    if not torch.cuda.is_available():
        return yolo
    yolo.cuda()
    return yolo

# Build the detector a single time at cell execution; the resulting
# module-level `detector` is reused by every later detect_objects() call.
detector = init_detector('n')  # 'n' selects the yolov8n.pt weights

# Turbo detection function
def detect_objects(model, image_path):
    """Run the detector on one image and return its single result.

    Args:
        model: Callable detector (e.g. the YOLO model from ``init_detector``)
            that accepts a source plus ``imgsz``/``stream``/``half``/``device``
            keyword arguments and returns a sequence of results.
        image_path: Source passed straight to the model.

    Returns:
        The first (and, for a single source, only) result object.
    """
    use_cuda = torch.cuda.is_available()
    # A non-streaming call lets the predictor run to completion; the previous
    # stream=True + next() pattern abandoned the generator after one item.
    results = model(image_path,
                    imgsz=640,
                    stream=False,
                    half=use_cuda,  # FP16 only makes sense on the GPU
                    device=0 if use_cuda else 'cpu')
    return results[0]




In [None]:
def _scale_boxes_to_image(boxes_xyxy, source_shape, image_size):
    """Map xyxy boxes from the detector's source resolution onto the drawn image.

    Args:
        boxes_xyxy: Array-like of shape (N, 4) with x1, y1, x2, y2 per row.
        source_shape: (height, width) of the image the detector saw, or None.
        image_size: (width, height) of the image that will be drawn on.

    Returns:
        (scaled_boxes, scale_x, scale_y). The boxes are always a fresh float
        copy, so the detector's own data is never modified.
    """
    scaled = np.array(boxes_xyxy, dtype=float)  # np.array copies its input
    if not source_shape:
        return scaled, 1.0, 1.0

    src_height, src_width = source_shape[:2]
    if src_width <= 0 or src_height <= 0:
        return scaled, 1.0, 1.0

    img_width, img_height = image_size
    scale_x = img_width / src_width
    scale_y = img_height / src_height
    scaled[:, [0, 2]] *= scale_x  # x coordinates
    scaled[:, [1, 3]] *= scale_y  # y coordinates
    return scaled, scale_x, scale_y


def draw_boxes(original_image, detections, conf_threshold=0.3):
    """Draw labelled bounding boxes for the detections on a copy of the image.

    The detector may have run on a different resolution than
    ``original_image`` (for example the full-size file while the displayed
    image was downscaled). Box coordinates are therefore rescaled using the
    result's ``orig_shape`` (height, width) when it is available.

    Args:
        original_image: ``PIL.Image.Image`` to annotate; it is not modified.
        detections: Result object exposing ``boxes`` (with ``xyxy``, ``conf``,
            ``cls``), ``names`` and optionally ``orig_shape``.
        conf_threshold: Boxes with a confidence below this value are skipped.

    Returns:
        The annotated copy. If anything fails, the error is printed and
        ``original_image`` is returned unchanged.
    """
    try:
        # ===== 1. INPUT VALIDATION =====
        if not isinstance(original_image, Image.Image):
            raise ValueError("Input must be PIL Image")

        orig_width, orig_height = original_image.size
        print(f"Original dimensions: {orig_width}x{orig_height}")

        # ===== 2. CREATE WORKING COPY =====
        working_image = original_image.copy()
        draw = ImageDraw.Draw(working_image)

        # ===== 3. COORDINATE EXTRACTION =====
        boxes = detections.boxes
        if len(boxes) == 0:
            print("⚠️ No detections to draw!")
            return working_image

        raw_boxes = boxes.xyxy.cpu().numpy()
        print(f"First box coordinates (raw): {raw_boxes[0]}")

        # ===== 4. SCALE TO THE DRAWN IMAGE =====
        # The factors come from the two image resolutions, not from any single
        # box, so every box lands in the right place after a resize.
        boxes_np, box_scale_x, box_scale_y = _scale_boxes_to_image(
            raw_boxes,
            getattr(detections, "orig_shape", None),
            (orig_width, orig_height),
        )

        if box_scale_x != 1.0 or box_scale_y != 1.0:
            print(f"⚠️ Scaling required: X={box_scale_x:.2f}, Y={box_scale_y:.2f}")
            print(f"Scaled first box: {boxes_np[0]}")
        else:
            print("✅ Coordinates match image dimensions")

        # ===== 5. DRAWING PARAMETERS =====
        # Sizes grow with the image so labels stay readable on large inputs.
        font_size = max(12, int(orig_height / 50))
        box_width = max(2, int(orig_height / 300))

        try:
            font = ImageFont.truetype("arial.ttf", font_size)
        except (OSError, ImportError):
            # Font file missing or FreeType unavailable: use the built-in font.
            font = ImageFont.load_default()

        confidences = boxes.conf.cpu().numpy()
        class_ids = boxes.cls.cpu().numpy().astype(int)

        # ===== 6. DRAWING LOOP =====
        for box, conf, cls in zip(boxes_np, confidences, class_ids):
            if conf < conf_threshold:
                continue

            x1, y1, x2, y2 = map(int, box)

            draw.rectangle([x1, y1, x2, y2],
                           outline=(255, 0, 0),
                           width=box_width)

            label = f"{detections.names[cls]} {conf:.2f}"
            text_bbox = draw.textbbox((0, 0), label, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            # Keep the label inside the image: above the box when there is
            # room, otherwise pinned to the top/left/right edge.
            label_x = min(max(x1, 5), orig_width - text_width - 5)
            label_y = max(y1 - text_height - 5, 5)

            # Label background
            draw.rectangle(
                [label_x - 2, label_y - 2,
                 label_x + text_width + 2,
                 label_y + text_height + 2],
                fill=(0, 0, 0)
            )

            draw.text((label_x, label_y), label, fill=(255, 255, 255), font=font)

        print("✅ Drawing completed successfully")
        return working_image

    except Exception as e:
        print(f"🚨 Critical error: {str(e)}")
        import traceback
        traceback.print_exc()
        return original_image  # Return original if drawing fails

In [None]:
# ============== [CELL 5] HYPER-PROMPT ENGINEERING ==============
def generate_prompt(results, conf_threshold=0.35):
    """Build a captioning instruction from the confidently detected classes.

    Args:
        results: Detection result exposing ``boxes`` (with ``cpu()``, ``conf``
            and ``cls``) and a ``names`` mapping from class id to label.
        conf_threshold: Only detections with a confidence strictly above this
            value contribute to the prompt.

    Returns:
        A prompt string. A generic prompt is returned when there are no
        detections or none of them clears the threshold.
    """
    default_prompt = "Describe this image in detail."

    if getattr(results, 'boxes', None) is None or len(results.boxes) == 0:
        return default_prompt

    boxes = results.boxes.cpu()
    high_conf = boxes.conf > conf_threshold
    names = [results.names[int(cls)] for cls in boxes.cls[high_conf]]
    if not names:
        # Every detection was below the threshold: fall back to the generic
        # prompt instead of asking about "0 objects".
        return default_prompt

    unique_objs = np.unique(names)  # sorted, de-duplicated labels
    print(unique_objs)

    if len(unique_objs) == 1:
        return f"Detail the {unique_objs[0]}'s appearance and surroundings concisely."
    if len(unique_objs) == 2:
        return f"Describe the relationship between the {unique_objs[0]} and {unique_objs[1]} in this image."

    objects_str = ", ".join(unique_objs[:-1]) + f", and {unique_objs[-1]}"
    return f"Describe this image containing {objects_str}. Include appearance, position, and interaction."



In [None]:
# ============== [CELL 6] BLIP-ROCKET CAPTIONING ==============
# Quantization settings handed to the captioning model's from_pretrained()
# call: 4-bit loading, float16 compute dtype, double quantization enabled,
# and the "nf4" quantization type.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Load the InstructBLIP processor and model from the same checkpoint. Both are
# module-level globals that generate_caption() reads.
processor = InstructBlipProcessor.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl",
    truncation_side="left"  # Important for instruction prompts
)
# The model is loaded with the quantization settings from `quant_config`;
# device_map="auto" leaves device placement to the library.
model = InstructBlipForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    device_map="auto",
    # attn_implementation="flash_attention_2"  # If available and compatible
)

In [None]:
@torch.inference_mode()
def generate_caption(image, prompt):
    """Caption an image with the module-level InstructBLIP model.

    Args:
        image: Image handed to the processor.
        prompt: Instruction text that steers the caption.

    Returns:
        The decoded caption with surrounding whitespace removed, or the
        placeholder "Could not generate description." when any step fails
        (the error is printed).
    """
    # Decoding settings: beam search combined with sampling.
    generation_options = dict(
        max_new_tokens=150,
        num_beams=3,
        temperature=0.7,
        top_p=0.9,                # nucleus sampling
        repetition_penalty=1.25,  # discourage repeated phrases
        early_stopping=True,
        do_sample=True,           # sample for more varied captions
        use_cache=True,           # reuse the KV cache between steps
    )

    try:
        encoded = processor(
            images=image,
            text=prompt,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        encoded = encoded.to(model.device)

        token_ids = model.generate(**encoded, **generation_options)
        caption_text = processor.decode(token_ids[0], skip_special_tokens=True)
        return caption_text.strip()

    except Exception as e:
        print(f"Caption generation failed: {str(e)}")
        return "Could not generate description."




In [None]:


# Single-image demo: load -> detect -> annotate -> prompt -> caption.
image_path = "/content/images/000000000139.jpg"  # <--- CHANGE THIS
image = load_image(image_path)

if image is None:
    # load_image() already printed the reason; running detection, drawing and
    # captioning without an image would only produce follow-up errors.
    print(f"Skipping analysis: could not load {image_path}")
else:
    plt.imshow(image)
    plt.axis('off')
    plt.show()

    detections = detect_objects(detector, image_path)

    result_image = draw_boxes(image, detections)

    # Compare against None explicitly rather than relying on the truthiness
    # of a PIL image object.
    if result_image is not None:
        # Size the figure to the image so the notebook shows no extra whitespace
        plt.figure(figsize=(result_image.width/100, result_image.height/100), dpi=100)
        plt.imshow(result_image)
        plt.axis('off')
        plt.tight_layout(pad=0)  # Critical for removing whitespace
        plt.show()
    else:
        print("Failed to generate result image")

    prompt = generate_prompt(detections)
    print(f"Optimized Prompt: {prompt}")

    caption = generate_caption(image, prompt)
    print("\nTurbo Caption:")
    print("=" * 60)
    print(caption)
    print("=" * 60)




In [None]:
!pip install fpdf2  # For PDF generation

In [None]:
import os
from PIL import Image
from fpdf import FPDF
import matplotlib.pyplot as plt
import numpy as np

# Directory setup for the batch report: images are read from `image_dir`, and
# the PDF plus its temporary image files are written to `output_dir`.
image_dir = "/content/images/test"  # <-- Your input directory
output_dir = "/content/results"  # <-- Output directory
# Create the output directory if needed; an existing one is not an error.
os.makedirs(output_dir, exist_ok=True)

class PDFReport(FPDF):
    """PDF report with a fixed title header, page-number footer, and helpers
    for adding image and text sections.

    All text is rendered with a built-in core font, which only covers Latin-1;
    characters outside that range are replaced with '?' instead of aborting
    the whole report.
    """

    # Core PDF font. "Arial" is only an alias for it in fpdf2 and triggers a
    # deprecation warning on every call.
    FONT_FAMILY = "Helvetica"

    @staticmethod
    def _to_latin1(text):
        """Return ``text`` with characters the core fonts cannot encode replaced by '?'."""
        return str(text).encode("latin-1", "replace").decode("latin-1")

    @staticmethod
    def _fit_within(img_width, img_height, max_width, max_height):
        """Scale (img_width, img_height) to fit inside the bounds, keeping the aspect ratio.

        The factor is capped at 1.0 so small images are never enlarged beyond
        one page unit per pixel, matching the previous sizing for images that
        already fit.
        """
        scale = min(max_width / img_width, max_height / img_height, 1.0)
        return img_width * scale, img_height * scale

    def header(self):
        """Draw the centred report title at the top of every page."""
        self.set_font(self.FONT_FAMILY, 'B', 16)
        self.cell(0, 10, 'Image Analysis Comprehensive Report',
                  align='C', new_x="LMARGIN", new_y="NEXT")
        self.ln(10)

    def add_image_section(self, title, image_path, max_width=160, max_height=120):
        """Add a titled, horizontally centred image followed by a divider line.

        Args:
            title: Heading printed above the image.
            image_path: Path of the image file to embed.
            max_width: Maximum rendered width in page units.
            max_height: Maximum rendered height in page units.

        Both limits are enforced for every orientation while the aspect ratio
        is preserved.
        """
        self.set_font(self.FONT_FAMILY, 'B', 12)
        self.cell(0, 8, self._to_latin1(title), new_x="LMARGIN", new_y="NEXT")

        # Only the pixel dimensions are needed here; close the file right away.
        with Image.open(image_path) as img:
            img_width, img_height = img.size

        width, height = self._fit_within(img_width, img_height, max_width, max_height)

        # Centre on the actual page width rather than assuming A4.
        x_pos = (self.w - width) / 2

        self.image(image_path,
                   x=x_pos,
                   w=width,
                   h=height,
                   keep_aspect_ratio=True)

        # Slightly more breathing room after tall images
        self.ln(8 if height < 80 else 12)

        # Subtle light-gray divider spanning the printable width
        self.set_draw_color(200, 200, 200)
        self.line(self.l_margin, self.get_y(), self.w - self.r_margin, self.get_y())
        self.ln(5)

    def add_text_section(self, title, body):
        """Add a bold heading followed by a wrapped paragraph of body text."""
        self.set_font(self.FONT_FAMILY, 'B', 12)
        self.cell(0, 10, self._to_latin1(title), new_x="LMARGIN", new_y="NEXT")
        self.set_font(self.FONT_FAMILY, '', 10)
        self.multi_cell(0, 5, self._to_latin1(body), new_x="LMARGIN", new_y="NEXT")
        self.ln(8)

    def footer(self):
        """Draw the centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font(self.FONT_FAMILY, 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', align='C')

# Initialize single PDF report
pdf = PDFReport()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

# Select the image files up front so that unrelated files in the directory
# cannot influence page breaks, headings or temp-file numbering.
image_filenames = [
    name for name in sorted(os.listdir(image_dir))
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
]
if not image_filenames:
    print(f"⚠️ No images found in {image_dir}")

sections_written = 0

# Process each image
for idx, filename in enumerate(image_filenames):
    image_path = os.path.join(image_dir, filename)
    print(f"\n{'='*60}\nProcessing: {filename}\n{'='*60}")

    # Load first: an unreadable file must not leave an empty page behind.
    image = load_image(image_path)
    if image is None:
        continue

    # Every image gets its own heading; only images after the first start a
    # new page (the first one uses the page created above).
    if sections_written > 0:
        pdf.add_page()
    # The core font only covers Latin-1, so other characters in the file name
    # are replaced rather than aborting the report.
    safe_filename = filename.encode('latin-1', 'replace').decode('latin-1')
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, f"Image Analysis: {safe_filename}", 0, 1)
    pdf.ln(5)
    sections_written += 1

    temp_paths = []
    try:
        # Save the (possibly resized) image temporarily so it can be embedded
        original_path = os.path.join(output_dir, f"temp_original_{idx}.jpg")
        temp_paths.append(original_path)
        image.save(original_path)
        pdf.add_image_section("Original Image:", original_path)

        # Run detection
        detections = detect_objects(detector, image_path)
        print(f"⚡ Detected {len(detections.boxes)} objects")

        # Generate detection list
        detected_objects = "\n".join(
            f"- {detections.names[int(cls)]}: {conf:.2f} confidence"
            for cls, conf in zip(detections.boxes.cls.cpu().numpy(),
                                 detections.boxes.conf.cpu().numpy())
        )
        pdf.add_text_section(
            f"Detected Objects ({len(detections.boxes)}):",
            detected_objects or "No objects detected."
        )

        # Generate and add annotated image
        annotated_img = draw_boxes(image, detections)
        if annotated_img is not None:
            annotated_path = os.path.join(output_dir, f"temp_annotated_{idx}.jpg")
            temp_paths.append(annotated_path)
            annotated_img.save(annotated_path)
            pdf.add_image_section("Annotated Image:", annotated_path)

        # Generate and add prompt
        prompt = generate_prompt(detections)
        pdf.add_text_section("Generated Prompt:", prompt)

        # Generate and add caption
        caption = generate_caption(image, prompt)
        pdf.add_text_section("Generated Caption:", caption)
    finally:
        # Remove temp files even when a step above raised.
        for temp_path in temp_paths:
            if os.path.exists(temp_path):
                os.remove(temp_path)

# Save final report
report_path = os.path.join(output_dir, "comprehensive_report.pdf")
pdf.output(report_path)
print(f"\n{'='*60}\n✅ Saved comprehensive report to: {report_path}\n{'='*60}")