In [None]:
!pip install torch torchvision torchaudio
!pip install transformers opencv-python
!pip install timm


In [None]:
# DETR (DEtection TRansformer) by Meta AI.
"""
# DETR (DEtection TRansformer) - Object Detection with Meta AI's DETR ResNet-101

## Overview:
This script performs **object detection** using **DETR (DEtection TRansformer)**, an end-to-end detector by **Meta AI**, loaded from the `facebook/detr-resnet-101` checkpoint. It leverages a **ResNet-101 backbone** for feature extraction and **transformers for end-to-end object detection**.

## Steps:
1. **Load Dependencies & DETR Model**
   - Uses **PyTorch** and **Hugging Face Transformers** to load `facebook/detr-resnet-101`.
   - Runs on **GPU (CUDA) if available, otherwise uses CPU**.

2. **Preprocess Input Image**
   - Loads and converts the image to **RGB** using PIL.
   - Uses **DETR's image processor** to prepare the image tensor for inference.

3. **Object Detection**
   - Passes the processed image through the **DETR model** for inference.
   - Extracts bounding boxes, object labels, and confidence scores.

4. **Post-Processing & Visualization**
   - Filters results based on a confidence `threshold`.
   - Draws bounding boxes and object labels on the original image.
   - Saves and displays the **annotated output image** using OpenCV & Matplotlib.

## Notes:
- **DETR (DEtection TRansformer)** eliminates the need for hand-crafted anchors and post-processing steps like **NMS (Non-Maximum Suppression)**.
- **Naming note:** the checkpoint used here is **DETR**. It should not be confused with **DINOv2**, Meta AI's self-supervised image-feature model, which is not an object detector.
- **Adjustable detection threshold** helps control false positives and precision.

"""

import torch
import torchvision.transforms as T
import numpy as np
import cv2
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection
import matplotlib.pyplot as plt
# Load the DETR (ResNet-101 backbone) detector and its matching image processor.
# The checkpoint name is defined once so the processor and the model can never
# be loaded from two different checkpoints by accident.
MODEL_CHECKPOINT = "facebook/detr-resnet-101"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = DetrImageProcessor.from_pretrained(MODEL_CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(MODEL_CHECKPOINT).to(device)

# Image Preprocessing
def load_image(image_path):
    """Open the image at ``image_path`` and return it as an RGB PIL image.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A new ``PIL.Image.Image`` in "RGB" mode.

    The file is opened inside a ``with`` block so its handle is released as
    soon as the pixels have been converted, instead of whenever the lazily
    loaded image object happens to be garbage collected.
    """
    with Image.open(image_path) as source:
        # convert() returns an independent image, so it outlives the closed file.
        return source.convert("RGB")

def detect_objects(image_path, threshold=0.2, output_path="output.jpg"):
    """Run DETR on an image, draw the detections, and save the annotated copy.

    Args:
        image_path: Path of the image to analyse.
        threshold: Minimum confidence a detection needs in order to be drawn.
        output_path: Where the annotated image is written. Defaults to
            "output.jpg", the location the original implementation used.

    Returns:
        The path the annotated image was written to.

    Raises:
        OSError: If OpenCV reports that the annotated image could not be written.
    """
    image = load_image(image_path)
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); the post-processor expects (height, width).
    target_sizes = torch.tensor([image.size[::-1]], device=device)
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    # Draw on the very pixels the model saw. Re-reading the file with
    # cv2.imread can apply EXIF rotation that PIL did not, which would leave
    # the boxes misaligned on photos taken with a phone.
    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    # The post-processor has already discarded detections at or below
    # `threshold`, so no second score check is needed here.
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        x_min, y_min, x_max, y_max = (int(v) for v in box.tolist())
        text = f"{model.config.id2label[label.item()]}: {score.item():.2f}"
        cv2.rectangle(image_cv, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        # Keep the label on the canvas when the box touches the top edge.
        text_y = y_min - 10 if y_min >= 20 else y_min + 15
        cv2.putText(image_cv, text, (x_min, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # cv2.imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(output_path, image_cv):
        raise OSError(f"Error: Unable to write annotated image to {output_path}.")
    return output_path

# Example Usage
image_path =  "/content/20250326_232148.jpg"
output_image_ = detect_objects(image_path)

# cv2.imread returns None instead of raising when the file cannot be read;
# fail with a clear message rather than a cryptic cvtColor error.
output_image = cv2.imread(output_image_)
if output_image is None:
    raise FileNotFoundError(f"Error: Unable to load image at {output_image_}. Check if the file exists.")

# OpenCV stores pixels as BGR, Matplotlib expects RGB.
output_image = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(10, 6))
plt.imshow(output_image)
plt.axis("off")  # Hide axes
plt.show()


In [None]:
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK resources; a later cell imports
# `nltk.corpus.wordnet` to look up synonyms.
nltk.download('wordnet')
nltk.download('omw-1.4')  # This is also required for wordnet to work properly


In [None]:
"""
# Object Detection & NLP-based Object Matching using DETR (DEtection TRansformer) and NLP
----------------------------------------------------------------------------------------
## Overview:
This script performs object detection in images using Meta AI's **DETR (ResNet-101 backbone)** model and
matches the detected objects with user-specified textual descriptions. It leverages **Natural Language Processing (NLP)**
for extracting object names from text and **WordNet** for finding synonyms. Fuzzy string matching is also used to improve matching accuracy.

## Key Technologies & Libraries Used:
1. **PyTorch & Hugging Face Transformers**:
   - Loads and runs the **DETR ResNet-101** object detection model.
   - Uses `AutoProcessor` for preprocessing images before feeding them into the model.
   - Uses `AutoModelForObjectDetection` for detecting objects in the image.

2. **OpenCV (cv2)**:
   - Reads and processes images.
   - Draws bounding boxes and labels on detected objects.

3. **spaCy (Natural Language Processing)**:
   - Extracts nouns from a user-provided text description of objects to be found in the image.

4. **WordNet (NLTK)**:
   - Generates synonyms for detected and user-provided object names to improve matching accuracy.

5. **difflib (Fuzzy Matching for Object Names)**:
   - Matches detected object names with caption words using `difflib.get_close_matches` with a similarity cutoff of 0.7.

## Process Flow:
1. **Load DETR Object Detection Model**:
   - Load the **DETR ResNet-101** model for object detection.
   - Load the **Hugging Face AutoProcessor** for image preprocessing.

2. **Extract Object Names from Text Input (Caption Matching)**:
   - Parse the text input using `spaCy` to extract **nouns** (e.g., "chair", "bottle").
   - Retrieve synonyms for each word using `WordNet`.

3. **Perform Object Detection on Image**:
   - Load and preprocess the image.
   - Pass the image through the **DETR model** to obtain detected objects and bounding boxes.

4. **Match Detected Objects with Caption Words**:
   - Compare detected object names with caption words using **synonym matching (WordNet)**.
   - Apply **fuzzy matching (difflib)** to detect close matches.

5. **Draw Bounding Boxes on Matched Objects**:
   - If an object from the image matches a user-provided object name, draw a bounding box on the image.
   - Save and display the processed image.

## Expected Output:
- An image with bounding boxes drawn around detected objects that match the user-provided caption.
- A printed list of detected objects and matched objects.
"""


import cv2
import torch
import numpy as np
import spacy
import difflib
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.corpus import wordnet as wn
from transformers import AutoProcessor, AutoModelForObjectDetection

# Load the spaCy English pipeline used to extract nouns from the caption
nlp = spacy.load("en_core_web_sm")

# Load the DETR detector and its image processor from a single checkpoint name
# (switch to "facebook/detr-resnet-50" for the smaller ResNet-50 backbone).
MODEL_CHECKPOINT = "facebook/detr-resnet-101"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForObjectDetection.from_pretrained(MODEL_CHECKPOINT).to(device)

def extract_objects_from_caption(caption):
    """Return the lemma of every token spaCy tags as a noun, in caption order."""
    noun_lemmas = []
    for token in nlp(caption):
        if token.pos_ == "NOUN":
            noun_lemmas.append(token.lemma_)
    return noun_lemmas

def get_synonyms(word):
    """Collect WordNet synonyms for ``word``.

    Args:
        word: A single word or a space-separated phrase such as "cell phone".

    Returns:
        A set of lower-cased lemma names with underscores shown as spaces.
        The set is empty when the word is blank or unknown to WordNet.
    """
    # WordNet spells multi-word lemmas with underscores ("cell_phone"), while
    # detector labels use spaces; normalise so such labels can be looked up.
    query = "_".join(word.lower().split())
    if not query:
        return set()

    return {
        lemma.name().lower().replace("_", " ")
        for syn in wn.synsets(query)
        for lemma in syn.lemmas()
    }

def match_objects(detected_objects, caption_objects):
    """Find the detected labels that correspond to a word from the caption.

    A detected label matches a caption word when either appears in the other's
    WordNet synonym set, or when the two strings are at least 70% similar
    according to ``difflib``. Comparisons ignore letter case.

    Args:
        detected_objects: Iterable of label strings produced by the detector;
            repeated labels are allowed.
        caption_objects: Iterable of object words extracted from the caption.

    Returns:
        The set of detected labels (spelled exactly as given) that matched at
        least one caption word.
    """
    matched_objects = set()

    # One WordNet lookup per distinct label, even when the detector reports
    # the same class many times.
    detected_synonyms = {
        obj: get_synonyms(obj) for obj in dict.fromkeys(detected_objects)
    }
    if not detected_synonyms:
        return matched_objects

    for caption_obj in caption_objects:
        caption_key = caption_obj.lower()
        caption_synonyms = get_synonyms(caption_obj)

        for detected_obj, synonyms in detected_synonyms.items():
            if detected_obj in matched_objects:
                continue  # already matched by an earlier caption word
            detected_key = detected_obj.lower()
            if (
                detected_key in caption_synonyms
                or caption_key in synonyms
                # Fall back to fuzzy matching for closely related spellings.
                or difflib.get_close_matches(caption_key, [detected_key], cutoff=0.7)
            ):
                matched_objects.add(detected_obj)

    return matched_objects

def detect_objects(image_path, threshold=0.25):
    """Detect objects in an image with the loaded DETR model.

    Args:
        image_path: Path of the image to analyse.
        threshold: Minimum class probability a query needs to be reported.

    Returns:
        A tuple ``(detected_objects, image)`` where ``detected_objects`` is a
        list of ``(label_name, (x, y, w, h))`` entries in pixel units (top-left
        corner plus size) and ``image`` is the BGR array read by OpenCV.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Error: Unable to load image at {image_path}. Check if the file exists.")

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt").to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # The last logit is DETR's "no object" class. Score and label must both be
    # taken over the real classes only; otherwise a query whose top class is
    # "no object" can pass the threshold and be reported under an unknown id.
    class_probs = outputs.logits.softmax(-1)[0, :, :-1]
    scores, labels = class_probs.max(-1)
    boxes = outputs.pred_boxes[0]

    height, width = image.shape[:2]
    detected_objects = []

    for score, label, (cx, cy, w, h) in zip(scores.tolist(), labels.tolist(), boxes.tolist()):
        if score <= threshold:
            continue
        # Boxes are normalised (centre x, centre y, width, height); convert to
        # a pixel-space top-left corner plus size.
        x = int((cx - w / 2) * width)
        y = int((cy - h / 2) * height)
        box_w = int(w * width)
        box_h = int(h * height)
        object_name = model.config.id2label.get(label, f"Unknown_{label}")
        detected_objects.append((object_name, (x, y, box_w, box_h)))

    return detected_objects, image

def draw_boxes(image, detected_objects, matched_objects, output_path="output.jpg"):
    """Draw boxes for matched detections, save the result, and display it.

    Args:
        image: BGR image array; it is annotated in place.
        detected_objects: List of ``(label_name, (x, y, w, h))`` entries.
        matched_objects: Collection of label names that should be drawn.
        output_path: Where the annotated image is written. Defaults to
            "output.jpg", the location the original implementation used.
    """
    for obj, (x, y, w, h) in detected_objects:
        if obj not in matched_objects:
            continue
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image.
        label_y = y - 5 if y >= 15 else y + 15
        cv2.putText(image, obj, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    cv2.imwrite(output_path, image)

    # OpenCV stores pixels as BGR, Matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(10, 6))
    plt.imshow(image)
    plt.axis("off")
    plt.show()

def main(
    image_path="/content/20250326_232148.jpg",
    caption="person , bottle , cloths , bed , laptop , pages , paper , bag ,chair , box , wires , charger , mat , window  ",
):
    """Detect objects in an image and highlight those named in a caption.

    Args:
        image_path: Image to analyse. Defaults to the sample photo.
        caption: Free text listing the objects to look for. Defaults to the
            sample caption.
    """
    text_objects = extract_objects_from_caption(caption)
    detected_objects, image = detect_objects(image_path)

    detected_names = [obj for obj, _ in detected_objects]
    matched_objects = match_objects(detected_names, text_objects)

    print(f"Detected Objects: {detected_names}")
    print(f"Matched Objects: {matched_objects}")

    draw_boxes(image, detected_objects, matched_objects)

if __name__ == "__main__":
    main()

In [None]:
# Same pipeline as the previous cell, using the smaller DETR ResNet-50 checkpoint
"""
# Object Detection & NLP-based Object Matching using DETR (DEtection TRansformer) and NLP
----------------------------------------------------------------------------------------
## Overview:
This script performs object detection in images using Meta AI's **DETR (ResNet-50 backbone)** model and
matches the detected objects with user-specified textual descriptions. It leverages **Natural Language Processing (NLP)**
for extracting object names from text and **WordNet** for finding synonyms. Fuzzy string matching is also used to improve matching accuracy.

## Key Technologies & Libraries Used:
1. **PyTorch & Hugging Face Transformers**:
   - Loads and runs the **DETR ResNet-50** object detection model.
   - Uses `AutoProcessor` for preprocessing images before feeding them into the model.
   - Uses `AutoModelForObjectDetection` for detecting objects in the image.

2. **OpenCV (cv2)**:
   - Reads and processes images.
   - Draws bounding boxes and labels on detected objects.

3. **spaCy (Natural Language Processing)**:
   - Extracts nouns from a user-provided text description of objects to be found in the image.

4. **WordNet (NLTK)**:
   - Generates synonyms for detected and user-provided object names to improve matching accuracy.

5. **difflib (Fuzzy Matching for Object Names)**:
   - Matches detected object names with caption words using `difflib.get_close_matches` with a similarity cutoff of 0.7.

## Process Flow:
1. **Load DETR Object Detection Model**:
   - Load the **DETR ResNet-50** model for object detection.
   - Load the **Hugging Face AutoProcessor** for image preprocessing.

2. **Extract Object Names from Text Input (Caption Matching)**:
   - Parse the text input using `spaCy` to extract **nouns** (e.g., "chair", "bottle").
   - Retrieve synonyms for each word using `WordNet`.

3. **Perform Object Detection on Image**:
   - Load and preprocess the image.
   - Pass the image through the **DETR model** to obtain detected objects and bounding boxes.

4. **Match Detected Objects with Caption Words**:
   - Compare detected object names with caption words using **synonym matching (WordNet)**.
   - Apply **fuzzy matching (difflib)** to detect close matches.

5. **Draw Bounding Boxes on Matched Objects**:
   - If an object from the image matches a user-provided object name, draw a bounding box on the image.
   - Save and display the processed image.

## Expected Output:
- An image with bounding boxes drawn around detected objects that match the user-provided caption.
- A printed list of detected objects and matched objects.
"""


import cv2
import torch
import numpy as np
import spacy
import difflib
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.corpus import wordnet as wn
from transformers import AutoProcessor, AutoModelForObjectDetection

# Load the spaCy English pipeline used to extract nouns from the caption
nlp = spacy.load("en_core_web_sm")

# Load the DETR detector and its image processor from a single checkpoint name
# (switch to "facebook/detr-resnet-101" for the larger ResNet-101 backbone).
MODEL_CHECKPOINT = "facebook/detr-resnet-50"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForObjectDetection.from_pretrained(MODEL_CHECKPOINT).to(device)

def extract_objects_from_caption(caption):
    """Return the lemma of every token spaCy tags as a noun, in caption order."""
    noun_lemmas = []
    for token in nlp(caption):
        if token.pos_ == "NOUN":
            noun_lemmas.append(token.lemma_)
    return noun_lemmas

def get_synonyms(word):
    """Collect WordNet synonyms for ``word``.

    Args:
        word: A single word or a space-separated phrase such as "cell phone".

    Returns:
        A set of lower-cased lemma names with underscores shown as spaces.
        The set is empty when the word is blank or unknown to WordNet.
    """
    # WordNet spells multi-word lemmas with underscores ("cell_phone"), while
    # detector labels use spaces; normalise so such labels can be looked up.
    query = "_".join(word.lower().split())
    if not query:
        return set()

    return {
        lemma.name().lower().replace("_", " ")
        for syn in wn.synsets(query)
        for lemma in syn.lemmas()
    }

def match_objects(detected_objects, caption_objects):
    """Find the detected labels that correspond to a word from the caption.

    A detected label matches a caption word when either appears in the other's
    WordNet synonym set, or when the two strings are at least 70% similar
    according to ``difflib``. Comparisons ignore letter case.

    Args:
        detected_objects: Iterable of label strings produced by the detector;
            repeated labels are allowed.
        caption_objects: Iterable of object words extracted from the caption.

    Returns:
        The set of detected labels (spelled exactly as given) that matched at
        least one caption word.
    """
    matched_objects = set()

    # One WordNet lookup per distinct label, even when the detector reports
    # the same class many times.
    detected_synonyms = {
        obj: get_synonyms(obj) for obj in dict.fromkeys(detected_objects)
    }
    if not detected_synonyms:
        return matched_objects

    for caption_obj in caption_objects:
        caption_key = caption_obj.lower()
        caption_synonyms = get_synonyms(caption_obj)

        for detected_obj, synonyms in detected_synonyms.items():
            if detected_obj in matched_objects:
                continue  # already matched by an earlier caption word
            detected_key = detected_obj.lower()
            if (
                detected_key in caption_synonyms
                or caption_key in synonyms
                # Fall back to fuzzy matching for closely related spellings.
                or difflib.get_close_matches(caption_key, [detected_key], cutoff=0.7)
            ):
                matched_objects.add(detected_obj)

    return matched_objects

def detect_objects(image_path, threshold=0.25):
    """Detect objects in an image with the loaded DETR model.

    Args:
        image_path: Path of the image to analyse.
        threshold: Minimum class probability a query needs to be reported.

    Returns:
        A tuple ``(detected_objects, image)`` where ``detected_objects`` is a
        list of ``(label_name, (x, y, w, h))`` entries in pixel units (top-left
        corner plus size) and ``image`` is the BGR array read by OpenCV.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Error: Unable to load image at {image_path}. Check if the file exists.")

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = processor(images=image_rgb, return_tensors="pt").to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # The last logit is DETR's "no object" class. Score and label must both be
    # taken over the real classes only; otherwise a query whose top class is
    # "no object" can pass the threshold and be reported under an unknown id.
    class_probs = outputs.logits.softmax(-1)[0, :, :-1]
    scores, labels = class_probs.max(-1)
    boxes = outputs.pred_boxes[0]

    height, width = image.shape[:2]
    detected_objects = []

    for score, label, (cx, cy, w, h) in zip(scores.tolist(), labels.tolist(), boxes.tolist()):
        if score <= threshold:
            continue
        # Boxes are normalised (centre x, centre y, width, height); convert to
        # a pixel-space top-left corner plus size.
        x = int((cx - w / 2) * width)
        y = int((cy - h / 2) * height)
        box_w = int(w * width)
        box_h = int(h * height)
        object_name = model.config.id2label.get(label, f"Unknown_{label}")
        detected_objects.append((object_name, (x, y, box_w, box_h)))

    return detected_objects, image

def draw_boxes(image, detected_objects, matched_objects, output_path="output.jpg"):
    """Draw boxes for matched detections, save the result, and display it.

    Args:
        image: BGR image array; it is annotated in place.
        detected_objects: List of ``(label_name, (x, y, w, h))`` entries.
        matched_objects: Collection of label names that should be drawn.
        output_path: Where the annotated image is written. Defaults to
            "output.jpg", the location the original implementation used.
    """
    for obj, (x, y, w, h) in detected_objects:
        if obj not in matched_objects:
            continue
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Put the label above the box, or just inside it when the box touches
        # the top edge and the text would otherwise fall outside the image.
        label_y = y - 5 if y >= 15 else y + 15
        cv2.putText(image, obj, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

    cv2.imwrite(output_path, image)

    # OpenCV stores pixels as BGR, Matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(10, 6))
    plt.imshow(image)
    plt.axis("off")
    plt.show()

def main(
    # The default points at /content, the directory the other cells of this
    # notebook read the same sample photo from; the previous "/<file>" default
    # looked for it in the filesystem root.
    image_path="/content/20250326_232148.jpg",
    caption="person , bottle , cloths , bed , laptop , pages , paper , bag ,chair , box , wires , charger , mat , window  ",
):
    """Detect objects in an image and highlight those named in a caption.

    Args:
        image_path: Image to analyse. Defaults to the sample photo.
        caption: Free text listing the objects to look for. Defaults to the
            sample caption.
    """
    text_objects = extract_objects_from_caption(caption)
    detected_objects, image = detect_objects(image_path)

    detected_names = [obj for obj, _ in detected_objects]
    matched_objects = match_objects(detected_names, text_objects)

    print(f"Detected Objects: {detected_names}")
    print(f"Matched Objects: {matched_objects}")

    draw_boxes(image, detected_objects, matched_objects)

if __name__ == "__main__":
    main()