<a href="https://colab.research.google.com/github/paws4code/CLIP4Clip/blob/master/CLIP_for_video_text_retrieval%2C_small_sample_on_50_videos_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Install required dependencies
!pip install torch torchvision transformers
!pip install ftfy regex tqdm decord pandas numpy
!pip install git+https://github.com/openai/CLIP.git

[0mCollecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-igqb75s8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-igqb75s8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

In [2]:
# 2. Load pre-trained CLIP model and processor using transformers
from transformers import CLIPProcessor, CLIPModel
import torch

# Initialize model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = model.to("cuda")  # Move model to GPU

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# 3. Core functions for video-text retrieval
import torch
import cv2
import numpy as np
from PIL import Image
import os

# Extract frames from video file
def extract_frames(video_path, num_frames=8):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total_frames-1, num_frames, dtype=int)

    for i in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)
    cap.release()
    return frames

# Compute video embedding by averaging frame embeddings
def get_video_embedding(model, processor, video_path, num_frames=8):
    frames = extract_frames(video_path, num_frames)
    if not frames:
        return None

    with torch.no_grad():
        inputs = processor(images=frames, return_tensors="pt").to("cuda")
        outputs = model.get_image_features(**inputs)
        video_embedding = outputs.mean(dim=0)
        video_embedding = video_embedding / video_embedding.norm()

    return video_embedding

# Compute text embedding
def get_text_embedding(model, processor, text):
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt").to("cuda")
        outputs = model.get_text_features(**inputs)
        text_embedding = outputs[0]
        text_embedding = text_embedding / text_embedding.norm()

    return text_embedding

# Process multiple videos
def process_videos(model, processor, video_dir, video_files):
    video_embeddings = []
    for video_file in video_files:
        video_path = os.path.join(video_dir, video_file)
        embedding = get_video_embedding(model, processor, video_path)
        if embedding is not None:
            video_embeddings.append(embedding)
    return torch.stack(video_embeddings)

# Process multiple text queries
def process_texts(model, processor, texts):
    text_embeddings = []
    for text in texts:
        embedding = get_text_embedding(model, processor, text)
        text_embeddings.append(embedding)
    return torch.stack(text_embeddings)

# Calculate similarity and retrieve top-k matches
def retrieve_videos(video_embeddings, text_embeddings, video_files, k=5):
    similarity = torch.matmul(text_embeddings, video_embeddings.T)

    results = []
    for i in range(similarity.shape[0]):
        values, indices = torch.topk(similarity[i], k=min(k, len(video_files)))
        matches = [(video_files[idx], values[j].item()) for j, idx in enumerate(indices)]
        results.append(matches)

    return results

In [4]:
# 4. Example usage with sample queries
video_dir = "/content/MSRVTT/videos/all"
video_files = os.listdir(video_dir)[:20]  # Start with a small subset
texts = ["a person cooking", "a dog playing", "a car driving on a road"]

# Process videos and texts
video_embeddings = process_videos(model, processor, video_dir, video_files)
text_embeddings = process_texts(model, processor, texts)

# Retrieve top videos for each text
results = retrieve_videos(video_embeddings, text_embeddings, video_files)

# Print results
for i, text in enumerate(texts):
    print(f"Query: {text}")
    for video, score in results[i]:
        print(f"  {video}: {score:.4f}")
    print()

Query: a person cooking
  video4855.mp4: 0.3235
  video6340.mp4: 0.2804
  video4666.mp4: 0.2769
  video8729.mp4: 0.2711
  video3071.mp4: 0.2610

Query: a dog playing
  video5869.mp4: 0.2484
  video131.mp4: 0.2407
  video9965.mp4: 0.2280
  video1852.mp4: 0.2168
  video4855.mp4: 0.2139

Query: a car driving on a road
  video2635.mp4: 0.3043
  video131.mp4: 0.2491
  video5869.mp4: 0.2434
  video7488.mp4: 0.2370
  video3071.mp4: 0.2176



In [5]:
# 5. Evaluation on MSRVTT dataset
import json

# Load annotation file
with open('/content/MSRVTT/annotation/MSR_VTT.json', 'r') as f:
    msrvtt_data = json.load(f)

# Map video IDs to captions
video_to_captions = {}
for annotation in msrvtt_data['annotations']:
    video_id = annotation['image_id']
    caption = annotation['caption']
    if video_id not in video_to_captions:
        video_to_captions[video_id] = []
    video_to_captions[video_id].append(caption)

# Match to actual video files
video_dir = "/content/MSRVTT/videos/all"
all_video_files = set(os.listdir(video_dir))

# Select a subset for evaluation (50 videos)
eval_videos = []
eval_captions = []
for video_id, captions in list(video_to_captions.items())[:50]:
    numeric_id = video_id.replace("video", "")
    filename = f"video{numeric_id}.mp4"

    if filename in all_video_files:
        eval_videos.append(filename)
        eval_captions.append(captions[0])  # Just use first caption

# Get embeddings for videos and captions
video_embeddings = []
for video_file in eval_videos:
    video_path = os.path.join(video_dir, video_file)
    emb = get_video_embedding(model, processor, video_path)
    if emb is not None:
        video_embeddings.append(emb)

# Match successful videos with captions
successful_videos = eval_videos[:len(video_embeddings)]
successful_captions = eval_captions[:len(video_embeddings)]

# Calculate text embeddings and similarity matrix
text_embeddings = process_texts(model, processor, successful_captions)
similarity = torch.matmul(text_embeddings, torch.stack(video_embeddings).T)

# Calculate retrieval metrics (R@1, R@5, R@10)
r1, r5, r10 = 0, 0, 0
for i in range(len(successful_videos)):
    correct_scores = similarity[i, i].item()
    rank = (similarity[i] > correct_scores).sum().item() + 1
    if rank <= 1: r1 += 1
    if rank <= 5: r5 += 1
    if rank <= 10: r10 += 1

total = len(successful_videos)
print(f"Results on {total} videos:")
print(f"R@1: {r1/total:.4f}")
print(f"R@5: {r5/total:.4f}")
print(f"R@10: {r10/total:.4f}")

Results on 50 videos:
R@1: 0.6200
R@5: 0.8600
R@10: 0.9400
