In [None]:
!pip install 'git+https://github.com/facebookresearch/detectron2.git'


In [None]:
!pip install numpy

In [None]:
# Detectron2 (Mask R-CNN with FPN and ResNet-50 Backbone)

"""
# Object Detection and Matching Pipeline using Detectron2 and CLIP

## Overview:
This script performs object detection on an image using Detectron2's **Mask R-CNN** model and matches detected objects to text descriptions using **CLIP (Contrastive Language-Image Pretraining)**. The goal is to find objects in an image that match the words extracted from a given caption.

## Steps:
1. **Load Dependencies**
   - OpenCV (cv2) for image processing and visualization
   - Detectron2 for object detection (Mask R-CNN with FPN and ResNet-50 backbone)
   - CLIP from Hugging Face Transformers for text-image similarity
   - spaCy for extracting object-related words from captions

2. **Load Pre-trained Models**
   - Detectron2 model (`mask_rcnn_R_50_FPN_3x`) for instance segmentation
   - CLIP model (`openai/clip-vit-base-patch32`) for text-image similarity
   - `spaCy` NLP model (`en_core_web_sm`) for extracting nouns from text

3. **Process Image & Caption**
   - Convert input caption into object-related keywords using spaCy
   - Run Detectron2 model on the image to detect objects and extract bounding boxes
   - Use CLIP to compute the similarity between detected objects and caption words

4. **Draw Results**
   - Draw bounding boxes around matched objects
   - Display the final image with detected and matched objects

## Notes:
- **Detectron2** is used for object detection, which provides bounding boxes and object labels.
- **CLIP** helps to match detected objects to textual descriptions based on image-text similarity.
- The pipeline aims to identify objects from a caption and visually highlight them in an image.

"""

import cv2
import torch
import spacy
import numpy as np
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# Load the spaCy English pipeline; extract_objects() uses its part-of-speech
# tags to pick the nouns out of a caption.
nlp = spacy.load("en_core_web_sm")

# Load the CLIP model and its matching processor (same checkpoint name for
# both) used by clip_similarity() to relate caption words to the image.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Build the Detectron2 configuration. The model-zoo YAML is merged first so
# that the overrides below are applied on top of it.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# Test-time score threshold for the ROI heads (presumably detections scoring
# below 0.3 are discarded -- confirm against the Detectron2 config reference).
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
# Pre-trained weights for the same model-zoo entry as the config file above.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
# Device is hard-coded to the CPU.
cfg.MODEL.DEVICE = "cpu"

# Predictor built from the finished config; detect_objects() calls it on images.
predictor = DefaultPredictor(cfg)


def extract_objects(text):
    """Return the nouns found in a caption, in order of appearance.

    The caption is parsed with the module-level spaCy pipeline ``nlp``; every
    token whose part-of-speech tag is ``NOUN`` contributes its surface text.
    """
    nouns = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            nouns.append(token.text)
    return nouns


def detect_objects(image_path):
    """Detect objects in an image with the module-level Detectron2 predictor.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list with one ``(class_name, box, score)`` tuple per detected
        instance. ``box`` is the instance's row of the predicted-boxes array
        (consumed elsewhere in this file as ``x1, y1, x2, y2``) and ``score``
        is the matching entry of the predictor's scores.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    # Local import: this cell does not import ``os`` at module level.
    import os

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"[ERROR] Image not found at path: {image_path}")

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising, which would otherwise fail later inside the predictor.
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    instances = predictor(image)["instances"]
    boxes = instances.pred_boxes.tensor.cpu().numpy()
    scores = instances.scores.cpu().numpy()
    labels = instances.pred_classes.cpu().numpy()

    # Class ids are translated to names through the metadata of the first
    # training dataset named in the config.
    metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

    return [
        (metadata.thing_classes[label], box, score)
        for box, label, score in zip(boxes, labels, scores)
    ]


def clip_similarity(image_path, detected_objects, text_objects):
    """Match every caption word to the detection whose image crop fits it best.

    Each detection's bounding box is cropped out of the image and scored
    against every caption word with CLIP. For each word the detection with
    the highest text-to-image score is selected.

    Args:
        image_path: Path of the image the detections were computed on.
        detected_objects: Sequence of ``(label, box, score)`` tuples, where
            ``box`` holds ``(x1, y1, x2, y2)`` pixel coordinates.
        text_objects: Sequence of caption words to look for.

    Returns:
        A list with one ``(label, box, score)`` tuple per entry of
        ``text_objects`` (same order), copied from the best-matching
        detection. The same detection may appear more than once. An empty
        list is returned when there are no detections or no caption words.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    # Nothing to match: avoid running CLIP (and indexing an empty list).
    if not detected_objects or not text_objects:
        return []

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    # OpenCV loads images as BGR; the CLIP processor expects RGB input.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    height, width = rgb_image.shape[:2]

    # One crop per detection, clamped to the image and at least 1x1 pixel.
    crops = []
    for _, box, _ in detected_objects:
        x1, y1, x2, y2 = (int(round(float(value))) for value in box)
        x1 = min(max(x1, 0), width - 1)
        y1 = min(max(y1, 0), height - 1)
        x2 = min(max(x2, x1 + 1), width)
        y2 = min(max(y2, y1 + 1), height)
        crops.append(np.ascontiguousarray(rgb_image[y1:y2, x1:x2]))

    inputs = clip_processor(
        text=list(text_objects),
        images=crops,
        return_tensors="pt",
        padding=True
    )

    # Inference only, so no gradients need to be tracked.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Rows follow text_objects, columns follow detected_objects.
    similarity_matrix = outputs.logits_per_text.cpu().numpy()

    matched = []
    for word_scores in similarity_matrix:
        best_match_idx = int(np.argmax(word_scores))
        label, box, score = detected_objects[best_match_idx]
        matched.append((label, box, score))

    return matched


def draw_boxes(image_path, matches):
    """Draw labelled bounding boxes for the matched objects and show the image.

    Args:
        image_path: Path of the image to annotate; it is re-read from disk.
        matches: Iterable of ``(label, box, score)`` tuples, where ``box``
            holds ``(x1, y1, x2, y2)`` pixel coordinates.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    green = (0, 255, 0)
    for obj_name, bbox, prob in matches:
        x1, y1, x2, y2 = map(int, bbox)
        cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)
        # Place the caption just above the box, but keep it inside the frame
        # when the box touches the top edge of the image.
        label_y = max(y1 - 10, 15)
        cv2.putText(image, f"{obj_name} ({prob:.2f})", (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    # Matplotlib expects RGB while OpenCV images are BGR.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()


# --- RUN PIPELINE ---
# Inputs: a hard-coded image from a local kagglehub cache of the Flickr8k
# dataset and the caption whose nouns should be located in that image.
image_path = "/root/.cache/kagglehub/datasets/adityajn105/flickr8k/versions/1/Images/109738916_236dc456ac.jpg"
caption = "person and motorcycles"

# 1) caption -> nouns, 2) image -> detections, 3) nouns -> matched detections,
# 4) draw the matched boxes and display the result.
text_objects = extract_objects(caption)
detected_objects = detect_objects(image_path)
matched_objects = clip_similarity(image_path, detected_objects, text_objects)
draw_boxes(image_path, matched_objects)


In [None]:
import os
import cv2
import torch
import spacy
import numpy as np
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# Load the spaCy English pipeline; extract_objects() uses its part-of-speech
# tags to pick the nouns out of a caption.
nlp = spacy.load("en_core_web_sm")

# Load the CLIP model and its matching processor (same checkpoint name for
# both) used by clip_similarity() to relate caption words to the image.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Build the Detectron2 configuration. The model-zoo YAML is merged first so
# that the overrides below are applied on top of it.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# Test-time score threshold for the ROI heads (presumably detections scoring
# below 0.3 are discarded -- confirm against the Detectron2 config reference).
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3
# Pre-trained weights for the same model-zoo entry as the config file above.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
# Device is hard-coded to the CPU.
cfg.MODEL.DEVICE = "cpu"

# Predictor built from the finished config; detect_objects() calls it on images.
predictor = DefaultPredictor(cfg)


def extract_objects(text):
    """Return the nouns found in a caption, in order of appearance.

    The caption is parsed with the module-level spaCy pipeline ``nlp``; every
    token whose part-of-speech tag is ``NOUN`` contributes its surface text.
    """
    nouns = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            nouns.append(token.text)
    return nouns


def detect_objects(image_path):
    """Detect objects in an image with the module-level Detectron2 predictor.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list with one ``(class_name, box, score)`` tuple per detected
        instance, in the order the predictor reports them.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot load it.
    """
    print(f"[INFO] Checking image path: {image_path}")
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"[ERROR] Image not found at path: {image_path}")

    frame = cv2.imread(image_path)
    if frame is None:
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    predictions = predictor(frame)["instances"]

    # Pull the prediction fields out as NumPy arrays.
    box_array = predictions.pred_boxes.tensor.cpu().numpy()
    score_array = predictions.scores.cpu().numpy()
    class_ids = predictions.pred_classes.cpu().numpy()

    # Class ids are translated to names through the metadata of the first
    # training dataset named in the config.
    metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

    return [
        (metadata.thing_classes[class_id], box, score)
        for box, class_id, score in zip(box_array, class_ids, score_array)
    ]


def clip_similarity(image_path, detected_objects, text_objects):
    """Match every caption word to the detection whose image crop fits it best.

    Each detection's bounding box is cropped out of the image and scored
    against every caption word with CLIP. For each word the detection with
    the highest text-to-image score is selected.

    Args:
        image_path: Path of the image the detections were computed on.
        detected_objects: Sequence of ``(label, box, score)`` tuples, where
            ``box`` holds ``(x1, y1, x2, y2)`` pixel coordinates.
        text_objects: Sequence of caption words to look for.

    Returns:
        A list with one ``(label, box, score)`` tuple per entry of
        ``text_objects`` (same order), copied from the best-matching
        detection. The same detection may appear more than once. An empty
        list is returned when there are no detections or no caption words.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    # Nothing to match: avoid running CLIP (and indexing an empty list).
    if not detected_objects or not text_objects:
        return []

    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    # OpenCV loads images as BGR; the CLIP processor expects RGB input.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    height, width = rgb_image.shape[:2]

    # One crop per detection, clamped to the image and at least 1x1 pixel.
    crops = []
    for _, box, _ in detected_objects:
        x1, y1, x2, y2 = (int(round(float(value))) for value in box)
        x1 = min(max(x1, 0), width - 1)
        y1 = min(max(y1, 0), height - 1)
        x2 = min(max(x2, x1 + 1), width)
        y2 = min(max(y2, y1 + 1), height)
        crops.append(np.ascontiguousarray(rgb_image[y1:y2, x1:x2]))

    inputs = clip_processor(
        text=list(text_objects),
        images=crops,
        return_tensors="pt",
        padding=True
    )

    # Inference only, so no gradients need to be tracked.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Rows follow text_objects, columns follow detected_objects.
    similarity_matrix = outputs.logits_per_text.cpu().numpy()

    matched = []
    for word_scores in similarity_matrix:
        best_match_idx = int(np.argmax(word_scores))
        label, box, score = detected_objects[best_match_idx]
        matched.append((label, box, score))

    return matched


def draw_boxes(image_path, matches):
    """Draw labelled bounding boxes for the matched objects and show the image.

    Args:
        image_path: Path of the image to annotate; it is re-read from disk.
        matches: Iterable of ``(label, box, score)`` tuples, where ``box``
            holds ``(x1, y1, x2, y2)`` pixel coordinates.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None.
        raise ValueError(f"[ERROR] Failed to load image at path: {image_path}")

    green = (0, 255, 0)
    for obj_name, bbox, prob in matches:
        x1, y1, x2, y2 = map(int, bbox)
        cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)
        # Place the caption just above the box, but keep it inside the frame
        # when the box touches the top edge of the image.
        label_y = max(y1 - 10, 15)
        cv2.putText(image, f"{obj_name} ({prob:.2f})", (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    # Matplotlib expects RGB while OpenCV images are BGR.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()


# --- RUN PIPELINE ---

# Inputs: a hard-coded image path (it must exist and be loadable, otherwise
# detect_objects() raises) and the caption whose nouns should be located.
image_path = "/content/20250326_232148.jpg"
caption = "person and laptop , bottle ,bed , mat , chair , table"

# 1) caption -> nouns, 2) image -> detections, 3) nouns -> matched detections,
# 4) draw the matched boxes and display the result.
text_objects = extract_objects(caption)
detected_objects = detect_objects(image_path)
matched_objects = clip_similarity(image_path, detected_objects, text_objects)
draw_boxes(image_path, matched_objects)
