In [None]:
!pip install torch torchvision transformers pillow

In [None]:
!pip install ultralytics

In [None]:
#  yolo
"""
Object Detection with Text-Caption Matching using YOLOv8 and spaCy
===================================================================


Goal:
-----
This script combines Natural Language Processing (NLP) and Computer Vision (CV) to:
1. Understand the contents of an image.
2. Understand the contents of a caption (text description).
3. Match objects mentioned in the caption with objects actually detected in the image.
4. Highlight those matched objects in the image visually.

High-Level Workflow:
--------------------
1. **Text Object Extraction**:
   - The caption (e.g., "A person sitting on a bed with a laptop") is processed using spaCy.
   - We extract **nouns** from the caption, assuming nouns represent physical objects (like "person", "bed", "laptop").

2. **Image Object Detection**:
   - The image is processed using **YOLOv8**, a state-of-the-art object detection model.
   - It detects all objects in the image and gives us:
     - Object class names (e.g., "person", "bottle")
     - Bounding box coordinates
     - Confidence scores

3. **Object Matching**:
   - We compare the list of **nouns from the caption** with the list of **detected objects from YOLO**.
   - If there's a match (e.g., "person" is in both lists), we consider it a valid detection relevant to the caption.

4. **Visualization**:
   - For each matched object, we draw a **bounding box** on the image.
   - The box is labeled with the object name and detection confidence (e.g., "person (0.92)").

Use Cases:
----------
- Visual Grounding: Verifying if objects described in a caption actually exist in an image.
- Caption-Aware Filtering: Show only those detections that are relevant to a given description.
- Dataset Validation: Ensure that caption annotations are consistent with image content.

Dependencies:
-------------
- `cv2` (OpenCV): For image reading and drawing boxes.
- `spacy`: For natural language parsing and POS tagging.
- `torch`, `ultralytics`: For loading and running YOLOv8 model.
- `matplotlib.pyplot`: To display the final image with overlaid detections.

"""

import cv2
import torch
import spacy
from ultralytics import YOLO  # For object detection
import matplotlib.pyplot as plt

import spacy

# Load the spaCy English pipeline once at module level; extract_objects() calls
# it on every caption to obtain part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

def extract_objects(text):
    """Return the object words (nouns) mentioned in a caption.

    The caption is parsed with the module-level spaCy pipeline ``nlp``. Every
    token tagged ``NOUN`` contributes its lower-cased lemma rather than its raw
    surface text, so "Bottles" becomes "bottle". The result is later compared
    by exact string equality against detector class names (e.g. "person",
    "bottle"), and plural or capitalised caption words would otherwise never
    match.

    Args:
        text: Caption to analyse.

    Returns:
        list[str]: Lower-cased noun lemmas in order of first appearance,
        without duplicates. The list is also printed for inspection.
    """
    doc = nlp(text)
    # dict.fromkeys de-duplicates while keeping the caption order.
    objects = list(dict.fromkeys(
        token.lemma_.lower() for token in doc if token.pos_ == "NOUN"
    ))
    print(objects)
    return objects

# caption = "A child in a pink dress is climbing up a set of stairs in an entryway."
# print(extract_objects(caption))
# Load the YOLOv8 "x" checkpoint once; detect_objects() runs it on each image.
model = YOLO("yolov8x.pt")  # Use a larger model


def detect_objects(image_path):
    """Run the module-level YOLO ``model`` on an image.

    Args:
        image_path: Path of the image handed to the detector.

    Returns:
        Whatever ``model(...)`` returns for the image (also printed), using a
        confidence cut-off of 0.3.
    """
    detections = model(image_path, conf=0.3)
    print(detections)
    return detections

def match_objects(results, text_objects):
    """Keep the detections whose class name is mentioned in the caption.

    Args:
        results: Iterable of detection results. Each result exposes ``names``
            (class id -> class name) and ``boxes``; each box exposes ``cls``,
            ``conf`` and ``xyxy``.
        text_objects: Iterable of object words taken from the caption.

    Returns:
        list[tuple[str, list[float], float]]: One ``(class_name, [x1, y1, x2, y2],
        confidence)`` tuple per matching box, in detection order. The list is
        also printed for inspection.
    """
    # Normalise once: a set gives O(1) membership tests, and lower-casing lets a
    # capitalised caption word ("Laptop") match the detector's class name.
    wanted = {str(obj).lower() for obj in text_objects}

    matched = []
    for result in results:
        for box in result.boxes:
            cls = result.names[int(box.cls.item())]  # Object name
            if cls.lower() not in wanted:
                continue
            prob = box.conf.item()  # Detection confidence
            # xyxy holds one row per box; take that row as a plain list.
            matched.append((cls, box.xyxy.tolist()[0], prob))
    print(matched)
    return matched


def draw_boxes(image_path, matches):
    """Draw labelled green boxes for the matched detections and show the image.

    Args:
        image_path: Path of the image to annotate.
        matches: Iterable of ``(object_name, [x1, y1, x2, y2], confidence)``
            tuples, as produced by ``match_objects``.

    Returns:
        None. The annotated image is displayed with matplotlib. If the image
        cannot be read, an error message is printed and nothing is shown.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or unreadable; stop here rather than crash inside cvtColor.
        print("Error: Image not found.")
        return

    green = (0, 255, 0)
    for obj_name, bbox, prob in matches:
        x1, y1, x2, y2 = map(int, bbox)
        cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)  # Green box
        # Keep the label baseline inside the image for boxes touching the top edge.
        label_y = max(y1 - 10, 15)
        cv2.putText(image, f"{obj_name} ({prob:.2f})", (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    # OpenCV images are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()
# Example run: a hard-coded image path and the caption describing that image.
image_path = "/content/20250326_232148.jpg"
caption = "person sitting on bed with a laptop and window at background with few bottles placed there"

# Object words named in the caption.
text_objects = extract_objects(caption)
print(text_objects)
# Run the detector on the image.
results = detect_objects(image_path)
# print(results)
# Keep only detections whose class name appears in the caption, then display them.
matched_objects = match_objects(results, text_objects)
draw_boxes(image_path, matched_objects)


In [None]:
import nltk
# Fetch the corpus data that nltk.corpus.wordnet reads in the next cell.
nltk.download('wordnet')
nltk.download('omw-1.4')  # This is also required for wordnet to work properly


In [None]:
"""
# Object Detection and Matching using YOLOv8 and CLIP

## Overview:
This script detects objects in an image using **YOLOv8 (You Only Look Once)** and matches them with text descriptions using **CLIP (Contrastive Language-Image Pretraining)**. It improves object-text matching by expanding textual descriptions with **WordNet synonyms** and **spaCy NLP processing**.

## Steps:
1. **Load Dependencies & Models**
   - YOLOv8 for object detection (`yolov8x.pt`)
   - CLIP for computing text-image similarity
   - spaCy for text parsing and noun extraction
   - WordNet (NLTK) to expand object names with synonyms

2. **Download and Prepare Dataset**
   - Uses `kagglehub` to dynamically download images from the **Flickr8k dataset**

3. **Extract Objects from Caption**
   - Uses **spaCy** to extract nouns and proper nouns
   - Expands object names using **WordNet synonyms**

4. **Detect Objects in Image (YOLOv8)**
   - Runs **YOLOv8** on the input image to detect objects
   - Extracts bounding boxes and confidence scores

5. **Match Detected Objects to Caption Objects (CLIP)**
   - Computes text-image similarity between detected objects and expanded caption words
   - Uses CLIP embeddings and cosine similarity to find best matches

6. **Draw Results**
   - Draws bounding boxes around matched objects
   - Displays the final image with detected and matched objects

## Notes:
- **YOLOv8** provides fast, real-time object detection with bounding boxes.
- **CLIP** enhances object matching by comparing image embeddings with text descriptions.
- **WordNet** helps in expanding object names, improving text-object recognition.
- The pipeline ensures better understanding of images based on **both detection and semantic matching**.

"""



import os
import cv2
import torch
import spacy
import kagglehub
import numpy as np
import matplotlib.pyplot as plt
from ultralytics import YOLO
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from collections import Counter
from nltk.corpus import wordnet

# Load models (once, at module level; the functions below use these globals)
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used by extract_objects()
model = YOLO("yolov8x.pt")  # Large model for object detection
# CLIP model and its matching processor; both are used by clip_similarity().
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Download dataset dynamically
# path = kagglehub.dataset_download("adityajn105/flickr8k")
# dataset_path = os.path.join(path, "Images")

def extract_objects(text):
    """Extract object names from a caption and expand them with WordNet synonyms.

    The caption is parsed with the module-level spaCy pipeline ``nlp``. Two kinds
    of candidates are collected, all lower-cased:

    * single tokens tagged ``NOUN`` or ``PROPN`` that are not stop words, and
    * the text of every noun chunk (which may span several words).

    Each candidate is then looked up in WordNet as a noun, and the lemma names
    of all its synsets become its synonyms.

    Args:
        text: Caption to analyse.

    Returns:
        dict[str, set[str]]: Maps each candidate to a set holding the candidate
        itself plus its WordNet synonyms (underscores replaced by spaces).
        Candidates appear in order of first occurrence.
    """
    doc = nlp(text)
    words = [
        token.text.lower()
        for token in doc
        if token.pos_ in ("NOUN", "PROPN") and not token.is_stop
    ]

    # Get noun chunks (multi-word objects)
    noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks]

    expanded_objects = {}
    # Only the unique candidates matter; dict.fromkeys de-duplicates while
    # preserving first-seen order.
    for word in dict.fromkeys(words + noun_chunks):
        # WordNet keys multi-word lemmas with underscores ("hot_dog"), so a
        # lookup with spaces would silently return no synsets.
        lookup = word.replace(" ", "_")
        synonyms = {
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(lookup, pos=wordnet.NOUN)
            for lemma in synset.lemmas()
        }
        expanded_objects[word] = synonyms | {word}

    return expanded_objects

def detect_objects(image_path):
    """Run the module-level YOLOv8 ``model`` on one image and return its results.

    Detections are requested with a confidence cut-off of 0.3.
    """
    return model(image_path, conf=0.3)

def clip_similarity(image_path, detected_objects, text_objects):
    """Decide which detected labels to keep, based on CLIP image-text scores.

    The image is scored by CLIP against one list of text queries: the distinct
    detected class names followed by every caption word/synonym. The image-text
    logits are turned into probabilities with a softmax over all queries, and
    the best probability among the caption words is compared with 0.15.

    Args:
        image_path: Path of the image that was run through the detector.
        detected_objects: Detection results; each result exposes ``boxes`` whose
            items carry a ``cls`` class id resolvable through ``model.names``.
        text_objects: Mapping of caption word -> set of synonyms, as returned by
            ``extract_objects``.

    Returns:
        list[str]: The distinct detected labels (detection order) when the best
        caption-word probability exceeds 0.15, otherwise an empty list. Also
        empty when nothing was detected or the caption yielded no words.

    NOTE(review): the score comes from the whole image, so it does not depend on
    the individual label. All labels are therefore kept or dropped together.
    Per-label matching would need per-box crops or label-to-word text
    embeddings. Confirm the intended behaviour before relying on this filter.
    """
    # Distinct detected class names, in first-seen order.
    detected_labels = list(dict.fromkeys(
        model.names[int(box.cls.item())]
        for result in detected_objects
        for box in result.boxes
    ))
    if not detected_labels:
        print("No objects detected!")
        return []

    # Sorted so the query order (and thus the run) is reproducible.
    caption_words = sorted({syn for syns in text_objects.values() for syn in syns})
    if not caption_words:
        # Nothing to compare against; max() over an empty score slice would raise.
        return []

    # Load image; convert() returns an in-memory copy, so the file can be closed.
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # Prepare text queries: detected labels first, then expanded caption words.
    queries = detected_labels + caption_words

    # CLIP processing (inference only, so no gradients are needed).
    inputs = clip_processor(text=queries, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # One row for the single image; softmax across all text queries.
    scores = outputs.logits_per_image.softmax(dim=1)[0]

    # Caption-word scores start right after the detected labels. The slice
    # start must not move with the label index: that skipped caption words
    # and ran past the end once labels outnumbered caption words.
    highest_sim = scores[len(detected_labels):].max().item()
    if highest_sim > 0.15:  # Adjusted threshold
        return list(detected_labels)
    return []

def draw_boxes(image_path, results, matched_objects):
    """Draw labelled green boxes around detections whose class was matched.

    Args:
        image_path: Path of the image to annotate.
        results: Detection results; each result exposes ``boxes`` whose items
            carry ``cls``, ``conf`` and ``xyxy``.
        matched_objects: Iterable of class names to draw; all other detections
            are skipped.

    Returns:
        None. The annotated image is displayed with matplotlib. If the image
        cannot be read, an error message is printed and nothing is shown.
    """
    image = cv2.imread(image_path)
    if image is None:
        print("Error: Image not found.")
        return

    wanted = set(matched_objects)  # built once for O(1) membership tests
    green = (0, 255, 0)

    for result in results:
        for box in result.boxes:
            obj_name = model.names[int(box.cls.item())]
            if obj_name not in wanted:
                continue  # Ignore unmatched objects

            x1, y1, x2, y2 = map(int, box.xyxy.tolist()[0])
            prob = box.conf.item()

            # Draw bounding box
            cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)
            # Keep the label baseline inside the image for boxes touching the top edge.
            label_y = max(y1 - 10, 15)
            cv2.putText(image, f"{obj_name} ({prob:.2f})", (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    # OpenCV images are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()

# Test with an image and caption
# image_path = os.path.join(dataset_path, "109738916_236dc456ac.jpg")
image_path = "/content/20250326_232148.jpg"
# The "caption" here is a comma-separated list of object words rather than a sentence.
caption = "file , laptop , bottle , mouse , book , id , hearphone , document , page , table , chair , mat "

# Caption words expanded with WordNet synonyms.
text_objects = extract_objects(caption)
# YOLO detections for the image.
results = detect_objects(image_path)
# Labels kept after CLIP scoring, then drawn on the image.
matched_objects = clip_similarity(image_path, results, text_objects)
draw_boxes(image_path, results, matched_objects)


In [None]:
#object detection in video using yolo nano

"""
This script performs **real-time object detection and optical flow tracking** on a video using **YOLOv8** and **Lucas-Kanade Optical Flow**.
It processes video frames efficiently and overlays bounding boxes, labels, and motion tracking arrows.

### Overall Process:
1. **Initialize YOLOv8-Nano**: A lightweight object detection model that runs efficiently on CPU/GPU.
2. **Load and process video**: Reads a video file and extracts its properties (FPS, resolution).
3. **Frame skipping for efficiency**: Reduces FPS to `target_fps` (10 FPS) to speed up processing.
4. **Run YOLOv8 on each frame**: Detects objects and extracts bounding boxes.
5. **Apply Optical Flow Tracking**:
   - Tracks motion of detected objects across frames using Lucas-Kanade Optical Flow.
   - Draws motion arrows to indicate movement.
6. **Draw Bounding Boxes & Labels**:
   - Labels detected objects with their names and confidence scores.
   - Uses alternating colors for better visualization.
7. **Save and Display Processed Frames**:
   - Saves output video with annotated detections.
   - Displays each processed frame in **Google Colab**.
   - Downloads the final processed video.

### Libraries Used:
- `cv2 (OpenCV)`: Handles video processing, drawing, and optical flow tracking.
- `torch`: Enables hardware acceleration (CUDA support if available).
- `ultralytics.YOLO`: Loads and runs YOLOv8 for object detection.
- `numpy`: Handles array operations for bounding box and point tracking.
- `google.colab.patches`: Displays frames in Google Colab.
- `google.colab.files`: Downloads the processed video.

### Performance Optimizations:
- Uses **YOLOv8-Nano** (`yolov8n.pt`) for faster processing.
- Implements **frame skipping** to reduce computational load.
- Uses **Lucas-Kanade Optical Flow** for efficient object motion tracking.

"""
# NOTE: You may get an error at the end because Colab's display pipeline can break
# when cv2_imshow is called repeatedly to show the annotated frame for every frame.


import cv2
import torch
import numpy as np
from ultralytics import YOLO
from google.colab.patches import cv2_imshow  # For displaying images in Colab
from google.colab import files

# Load YOLOv8-Nano model (smallest version, fast on CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"  # informational; not passed to the model below
yolo_model = YOLO("yolov8n.pt")  # No need to explicitly move to device

# Load Video
video_path = "/content/1044-142621375_medium.mp4"  # Change to your video file
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail early with a clear message instead of silently writing an empty file.
    raise OSError(f"Could not open video: {video_path}")

# Get video properties
source_fps = cap.get(cv2.CAP_PROP_FPS)  # float as reported by the container
original_fps = int(source_fps)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Target FPS
target_fps = 10
frame_skip = max(1, original_fps // target_fps)  # Ensure at least 1 frame is processed
# Keeping every frame_skip-th frame yields source_fps / frame_skip frames per
# second, which equals target_fps only when the source rate is an exact
# multiple of it. Write at the real rate so the output plays at normal speed.
output_fps = source_fps / frame_skip if source_fps > 0 else float(target_fps)

# Define video writer
output_file = "output.mp4"
fourcc = cv2.VideoWriter_fourcc(*'avc1')  # MP4 format (H.264)
out = cv2.VideoWriter(output_file, fourcc, output_fps, (frame_width, frame_height))
if not out.isOpened():
    # Not every OpenCV build has an H.264 encoder; fall back to MPEG-4 Part 2.
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_file, fourcc, output_fps, (frame_width, frame_height))
if not out.isOpened():
    cap.release()
    raise OSError(f"Could not open video writer for: {output_file}")

# Optical Flow Parameters
lk_params = dict(winSize=(15, 15), maxLevel=2, criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
prev_gray, prev_points = None, None
frame_count = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process only every frame_skip-th frame; the others are read and dropped.
        keep_frame = frame_count % frame_skip == 0
        frame_count += 1
        if not keep_frame:
            continue

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        results = yolo_model(frame)
        # One row per detection: x1, y1, x2, y2, score, class id.
        detections = results[0].boxes.data.cpu().numpy()

        # Track the previous frame's box centres into the current frame.
        if prev_gray is not None and prev_points is not None and len(prev_points) > 0:
            new_points, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, gray, prev_points, None, **lk_params)
            if new_points is not None:
                # status is 1 only where the flow was found; lost points carry
                # meaningless coordinates and must not be drawn.
                tracked = status.ravel() == 1
                for new, old in zip(new_points[tracked], prev_points[tracked]):
                    a, b = new.ravel()
                    c, d = old.ravel()
                    cv2.arrowedLine(frame, (int(c), int(d)), (int(a), int(b)), (255, 0, 0), 2)

        prev_gray = gray.copy()
        # Box centres of the current detections become the points to track next.
        prev_points = ((detections[:, 0:2] + detections[:, 2:4]) / 2).astype(np.float32).reshape(-1, 2)

        for (x1, y1, x2, y2, score, cls) in detections:
            color = (0, 255, 255) if int(cls) % 2 == 0 else (255, 0, 255)  # Alternate colors
            label = f"{results[0].names[int(cls)]} ({score:.2f})"

            # Draw bounding box with thicker lines
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), color, 3)

            # Draw label outside the box with an arrow
            label_x, label_y = int(x1), max(int(y1) - 15, 10)
            cv2.putText(frame, label, (label_x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2, cv2.LINE_AA)
            cv2.arrowedLine(frame, (label_x + 30, label_y), (int(x1), int(y1)), color, 2)

        out.write(frame)
        cv2_imshow(frame)
finally:
    # Always release the capture and finalise the output file, even on error.
    cap.release()
    out.release()

cv2.destroyAllWindows()
files.download(output_file)