<a href="https://colab.research.google.com/github/melowmelon/Grounding-Dino-Moondream2-Object-Extraction-from-Video/blob/main/dino_moon_clip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏠 Smart Room Scanner

This notebook detects and identifies objects in a room scan video using:
- **Grounding DINO** for object detection
- **Moondream2** for visual identification
- **CLIP** for duplicate detection


---

## 📦 Step 1: Install Dependencies

In [25]:
# @title 1. Install Dependencies and make sure we are connected to T4 GPU
# We need specific libraries for Moondream and Grounding DINO
!pip install -q transformers torch opencv-python-headless pillow accelerate einops timm
print("‚úÖ Dependencies installed.")


‚úÖ Dependencies installed.


## 🧹 Step 2: Clear GPU Memory (run this before every iteration to avoid out-of-memory errors)

In [22]:
import gc
import os

# Configure the CUDA caching allocator *before* torch is imported, so the
# setting is already present in the environment when torch starts using CUDA.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch

# Collect unreachable Python objects first, then release cached GPU blocks.
# NOTE: objects that are still referenced by a live variable (for example a
# scanner instance left over from a previous run) are not freed by this;
# delete that variable first if its GPU memory should be reclaimed.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("‚úÖ GPU memory cleared and optimized!")

‚úÖ GPU memory cleared and optimized!


## 📁 Step 3: Set Video Path

In [23]:
import os

# Colab's upload widget proved unreliable, so upload the video manually:
# open the Files panel in the left sidebar and place the video file inside
# the "sample_data" folder.

# Set your video filename here
video_filename = "/content/sample_data/room_scan3.mp4"

# Warn immediately when the file is missing, so the problem is visible before
# the models are loaded in the next cell.
if not os.path.isfile(video_filename):
    print(f"WARNING: video file not found: {video_filename}")


In [24]:
# @title 3. Run Smart Room Scanner
import cv2
import json
import torch
import numpy as np
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from google.colab import files  # wrapper for downloading results

# ================= CONFIGURATION =================
# Input video (chosen in the previous cell) and the JSON file results go to.
VIDEO_PATH = video_filename
OUTPUT_JSON = "detected_objects_smart.json"

# Hardware check: prefer CUDA, then Apple MPS, otherwise fall back to the CPU.
DEVICE = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"üöÄ Running on: {DEVICE.upper()}")

class SmartRoomScanner:
    def __init__(self):
        """Load the three models onto DEVICE and start with an empty registry.

        Models are loaded in this order, with a progress line printed before
        each: Grounding DINO (detection), Moondream2 (description) and CLIP
        (similarity). `detected_registry` collects the accepted objects and
        `next_id` is the id handed to the next one.
        """
        detector_id = "IDEA-Research/grounding-dino-tiny"
        clip_id = "openai/clip-vit-base-patch32"

        # 1. OBJECT DETECTION - Grounding DINO
        print("‚è≥ Loading Grounding DINO (Advanced detector)...")
        detector = AutoModelForZeroShotObjectDetection.from_pretrained(detector_id)
        self.detector = detector.to(DEVICE)
        self.detector_processor = AutoProcessor.from_pretrained(detector_id)

        # 2. VISUAL IDENTIFICATION - Moondream2
        print("‚è≥ Loading Moondream2...")
        self.vlm_model_id = "vikhyatk/moondream2"
        # float16 on CUDA (faster/lighter in Colab), float32 everywhere else
        if DEVICE == "cuda":
            vlm_dtype = torch.float16
        else:
            vlm_dtype = torch.float32
        # trust_remote_code=True: the model's own code from its repository is used
        vlm = AutoModelForCausalLM.from_pretrained(
            self.vlm_model_id,
            trust_remote_code=True,
            torch_dtype=vlm_dtype,
        )
        self.vlm_model = vlm.to(DEVICE)
        self.vlm_tokenizer = AutoTokenizer.from_pretrained(self.vlm_model_id)

        # 3. DUPLICATE DETECTION - CLIP
        print("‚è≥ Loading CLIP for similarity...")
        clip = CLIPModel.from_pretrained(clip_id)
        self.clip_model = clip.to(DEVICE)
        self.clip_processor = CLIPProcessor.from_pretrained(clip_id)

        self.detected_registry = []
        self.next_id = 1
        print("‚úÖ All models loaded!\n")

    def detect_objects_grounding_dino(self, frame):
        """
        Run Grounding DINO on a single frame with a broad, open-ended prompt.

        The frame is converted with cv2.COLOR_BGR2RGB before being handed to
        the processor. Returns a list of dicts, one per detection, with the
        keys "box" (numpy array), "score" (float) and "label".
        """
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        text_prompt = "furniture . decoration . objects . items . appliances . fixtures"

        model_inputs = self.detector_processor(
            images=pil_image, text=text_prompt, return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            raw_outputs = self.detector(**model_inputs)

        # PIL reports (width, height); target_sizes gets them in reversed order
        img_width, img_height = pil_image.size
        post_processed = self.detector_processor.post_process_grounded_object_detection(
            raw_outputs,
            model_inputs.input_ids,
            threshold=0.15,
            text_threshold=0.15,
            target_sizes=[(img_height, img_width)],
        )
        frame_result = post_processed[0]

        return [
            {"box": box.cpu().numpy(), "score": score.item(), "label": label}
            for score, label, box in zip(
                frame_result["scores"], frame_result["labels"], frame_result["boxes"]
            )
        ]

    def get_visual_embedding(self, cv2_image):
        """Return the CLIP image embedding of a crop as a flat numpy vector.

        The image is converted with cv2.COLOR_BGR2RGB, encoded by CLIP, and
        the features are divided by their norm along the last dimension
        before being moved to the CPU and flattened.
        """
        pil_crop = Image.fromarray(cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB))
        clip_inputs = self.clip_processor(images=pil_crop, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            features = self.clip_model.get_image_features(**clip_inputs)
            unit_features = features / features.norm(dim=-1, keepdim=True)

        return unit_features.cpu().numpy().flatten()

    def extract_object_type_from_description(self, description):
        """Derive a short object name from a free-text description.

        Looks (case-insensitively) for one of the openers "this is a/an" or
        "this appears to be a/an". The text after the first opener found is
        cut at the first '.', ',' or ' with', and at most its first four
        words are returned. Without an opener, the first three words of the
        description are returned.
        """
        lowered = description.lower()
        openers = (
            "this is a ",
            "this is an ",
            "this appears to be a ",
            "this appears to be an ",
        )

        for opener in openers:
            position = lowered.find(opener)
            if position == -1:
                continue

            # Slice the original text so the caller gets the original casing
            remainder = description[position + len(opener):]
            for delimiter in ('.', ',', ' with'):
                remainder = remainder.partition(delimiter)[0]
            return ' '.join(remainder.strip().split()[:4])

        return ' '.join(description.split()[:3])

    def is_duplicate(self, new_box, new_embedding, frame_width, frame_height):
        x1, y1, x2, y2 = new_box
        new_cx = (x1 + x2) / 2
        new_cy = (y1 + y2) / 2

        norm_cx = new_cx / frame_width
        norm_cy = new_cy / frame_height

        for obj in self.detected_registry:
            old_cx = obj['norm_center_x']
            old_cy = obj['norm_center_y']

            dist = np.sqrt((norm_cx - old_cx)**2 + (norm_cy - old_cy)**2)

            if dist < 0.25:
                old_embedding = obj['embedding']
                similarity = np.dot(new_embedding, old_embedding)
                if similarity > 0.82:
                    return True
        return False

    def identify_object_open(self, cv2_image):
        """Ask Moondream2 for an open-ended description of a cropped object.

        Args:
            cv2_image: image crop; converted with cv2.COLOR_BGR2RGB before
                being passed to the model.

        Returns:
            The model's answer, or the string "Analysis failed" when the
            model call raises. The failure reason is printed so that it is
            not lost.
        """
        image_rgb = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)

        prompt = "What is this object? Describe it in detail including brand, color, and distinguishing features."

        try:
            enc_image = self.vlm_model.encode_image(pil_image)
            return self.vlm_model.answer_question(enc_image, prompt, self.vlm_tokenizer)
        except Exception as e:
            # Best effort: keep the scan going, but report why this crop failed
            # instead of silently discarding the error.
            print(f"      WARNING: Moondream2 analysis failed: {type(e).__name__}: {e}")
            return "Analysis failed"

    def non_max_suppression(self, detections, iou_threshold=0.5):
        """Greedy non-maximum suppression over detection dicts.

        Detections are visited from highest to lowest "score"; each visited
        detection is kept, and every remaining detection whose IoU with it is
        greater than `iou_threshold` is discarded.

        Args:
            detections: list of dicts with a "box" (x1, y1, x2, y2) and a
                "score".
            iou_threshold: maximum IoU a detection may have with a kept one.

        Returns:
            The kept detection dicts, ordered by descending score.
        """
        if not len(detections):
            return []

        boxes = np.array([det['box'] for det in detections])
        scores = np.array([det['score'] for det in detections])

        left, top = boxes[:, 0], boxes[:, 1]
        right, bottom = boxes[:, 2], boxes[:, 3]
        areas = (right - left) * (bottom - top)

        # Candidate indices, best score first
        remaining = scores.argsort()[::-1]
        kept_indices = []

        while remaining.size > 0:
            best, others = remaining[0], remaining[1:]
            kept_indices.append(best)

            # Intersection rectangle between the best box and every other one
            overlap_w = np.maximum(
                0, np.minimum(right[best], right[others]) - np.maximum(left[best], left[others])
            )
            overlap_h = np.maximum(
                0, np.minimum(bottom[best], bottom[others]) - np.maximum(top[best], top[others])
            )
            intersection = overlap_w * overlap_h
            iou = intersection / (areas[best] + areas[others] - intersection)

            # Continue with the candidates that do not overlap too much
            remaining = others[iou <= iou_threshold]

        return [detections[idx] for idx in kept_indices]

    def process_video(self, video_path, stride=4):
        """Scan a video and register every new object that is found.

        Every `stride`-th frame is run through Grounding DINO and NMS. Each
        surviving box is padded, cropped, embedded with CLIP and checked
        against the registry; crops that are not duplicates are described by
        Moondream2 and appended to `self.detected_registry`.

        Args:
            video_path: path of the video file to open with OpenCV.
            stride: process one frame out of every `stride` frames
                (default 4, the previously hard-coded value). Values below 1
                are treated as 1.

        Returns:
            None. If the video cannot be opened, an error is printed and
            nothing is registered.
        """
        # A zero or negative stride would break the modulo test below
        stride = max(1, int(stride))

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"‚ùå Error: Could not open video file {video_path}")
                return

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            print(f"üé• Scanning {width}x{height} video ({total_frames} frames)...")

            frame_idx = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # Skip frames that are not on the stride
                if frame_idx % stride != 0:
                    frame_idx += 1
                    continue

                if frame_idx % 40 == 0:
                    print(f"   üìç Processing frame {frame_idx}/{total_frames}...")

                detections = self.detect_objects_grounding_dino(frame)
                detections = self.non_max_suppression(detections, iou_threshold=0.4)

                h, w = frame.shape[:2]

                for detection in detections:
                    x1, y1, x2, y2 = map(int, detection['box'])

                    # Pad the box a little and clamp it to the frame
                    pad = 15
                    x1, y1 = max(0, x1-pad), max(0, y1-pad)
                    x2, y2 = min(w, x2+pad), min(h, y2+pad)

                    obj_crop = frame[y1:y2, x1:x2]

                    # Ignore tiny crops (size counts every array element)
                    if obj_crop.size < 3000:
                        continue

                    embedding = self.get_visual_embedding(obj_crop)

                    # Use the same padded box that yields the centre stored in
                    # the registry, so both sides of the distance test are
                    # computed from the same coordinates.
                    if self.is_duplicate((x1, y1, x2, y2), embedding, w, h):
                        continue

                    print(f"   ‚ö° Found NEW object (DINO detected: {detection['label']}, conf: {detection['score']:.2f})")

                    description = self.identify_object_open(obj_crop)
                    object_type = self.extract_object_type_from_description(description)

                    cx = (x1 + x2) / 2
                    cy = (y1 + y2) / 2

                    self.detected_registry.append({
                        "id": self.next_id,
                        "object_type": object_type,
                        "description": description,
                        "dino_label": detection['label'],
                        "confidence": detection['score'],
                        "embedding": embedding,
                        "norm_center_x": cx / w,
                        "norm_center_y": cy / h
                    })

                    print(f"      ‚úÖ Saved as: {object_type}")
                    print(f"         {description[:70]}...")

                    self.next_id += 1

                frame_idx += 1
        finally:
            # Release the capture even when a model call raises mid-scan
            cap.release()

    def save_results(self):
        """Write the registry to OUTPUT_JSON and try to download the file.

        Only the fields "id", "object_type", "description" and "confidence"
        of each registry entry are written. If the Colab download call
        raises, a hint to download the file manually is printed instead.
        """
        exported_keys = ("id", "object_type", "description", "confidence")
        final_output = []
        for entry in self.detected_registry:
            final_output.append({key: entry[key] for key in exported_keys})

        with open(OUTPUT_JSON, 'w') as f:
            json.dump(final_output, f, indent=4)
        print(f"\n‚úÖ Scan Complete. {len(final_output)} unique objects detected.")

        # Auto-download in Colab
        try:
            files.download(OUTPUT_JSON)
        except Exception:
            print("Download manually from file explorer on the left.")

# Entry point: build the scanner (which loads all three models), scan the
# configured video, then write the JSON results and attempt the download.
if __name__ == "__main__":
    scanner = SmartRoomScanner()
    scanner.process_video(VIDEO_PATH)
    scanner.save_results()

üöÄ Running on: CUDA
‚è≥ Loading Grounding DINO (Advanced detector)...
‚è≥ Loading Moondream2...
‚è≥ Loading CLIP for similarity...
‚úÖ All models loaded!

üé• Scanning 480x854 video (451 frames)...
   üìç Processing frame 0/451...
   ‚ö° Found NEW object (DINO detected: furniture, conf: 0.49)
      ‚úÖ Saved as: wooden nightstand
         This is a wooden nightstand with a single drawer and a shelf underneat...
   ‚ö° Found NEW object (DINO detected: furniture, conf: 0.44)
      ‚úÖ Saved as: wooden bed
         This is a wooden bed with a rustic, natural finish. The bed frame is m...
   ‚ö° Found NEW object (DINO detected: decoration objects items appliances, conf: 0.35)
      ‚úÖ Saved as: framed abstract painting displayed
         This is a framed abstract painting displayed on a wall. The painting f...
   ‚ö° Found NEW object (DINO detected: decoration objects items, conf: 0.31)
      ‚úÖ Saved as: small potted plant
         This is a small potted plant with white pot and dar

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>