In [1]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import cv2
import numpy as np
import argparse
from tqdm.notebook import tqdm
from PIL import Image
from pathlib import Path
import os
import time
import clip

In [2]:
# --- 1. SET YOUR LOCAL DIRECTORIES ---
# The notebook is expected to start in the .../visual_model/ folder, so the
# project root is one level up. The starting directory is remembered the first
# time this cell runs so that re-running the cell does NOT climb one more level
# each time (a bare os.chdir("..") is not idempotent).
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = Path.cwd()
os.chdir(_NOTEBOOK_DIR.parent)
print(f"Working directory set to: {os.getcwd()}")

# Path(".") is now your main project folder (e.g., 'context-aware-video-retrieval')
INPUT_DIR = Path(".") / "media"
# NOTE(review): the folder is called "video2048", but the embeddings written
# here come from the CLIP model named below — confirm the folder name still
# matches the embedding size downstream code expects.
OUTPUT_DIR = Path(".") / "embeddings_out" / "video2048"

# --- 2. SET YOUR MODEL PARAMETERS ---
FRAME_SAMPLE_RATE = 30  # keep one frame out of every FRAME_SAMPLE_RATE decoded frames
BATCH_SIZE = 32 # (You can try 64 for CLIP if you have enough VRAM)
CLIP_MODEL_NAME = "ViT-B/32"

# --- 3. DEFINE VIDEO EXTENSIONS TO FIND ---
VIDEO_EXTENSIONS = [".mp4", ".mov", ".avi", ".mkv", ".webm"]

Working directory set to: c:\Users\Shanette\Downloads\COLLEGE\CSST Y4-T1\THS-ST2\context-aware-video-retrieval


In [3]:
print("Setting up model and device...")

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the network together with its matching image preprocessing
# callable; both are reused by the extraction cells below.
model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)
model.eval()  # inference only: switch off training-time behaviour

print(f"Using device: {device}")

Setting up model and device...
Using device: cpu


In [4]:
def _encode_frame_batch(frames, model, preprocess, device):
    """Encode a list of PIL frames with CLIP and return L2-normalized features on the CPU."""
    image_inputs = torch.stack([preprocess(img) for img in frames]).to(device)
    image_features = model.encode_image(image_inputs)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    return image_features.cpu()


def extract_clip_embeddings(
    video_path: Path, 
    model, 
    preprocess, 
    device: str, 
    frame_sample_rate: int = 30, 
    batch_size: int = 32
) -> np.ndarray:
    """Extracts mean-pooled CLIP visual embeddings from a single video file.

    Every ``frame_sample_rate``-th frame (starting with frame 0) is converted
    from BGR to RGB, preprocessed, and encoded in batches of ``batch_size``.
    Each frame embedding is L2-normalized, and the embeddings are then averaged
    over frames. The averaged vector itself is not re-normalized.

    Args:
        video_path: Path to the video file to read with OpenCV.
        model: Object exposing ``encode_image(tensor)`` (e.g. a loaded CLIP model).
        preprocess: Callable mapping a PIL image to a tensor accepted by ``model``.
        device: Device string the input batches are moved to (e.g. "cpu", "cuda").
        frame_sample_rate: Keep one frame out of this many; must be >= 1.
        batch_size: Number of sampled frames encoded per forward pass.

    Returns:
        A 1-D NumPy array holding the mean of the per-frame embeddings.

    Raises:
        ValueError: If ``frame_sample_rate`` is < 1, or no frame could be sampled.
        FileNotFoundError: If ``video_path`` does not exist.
        IOError: If OpenCV cannot open the file.
    """
    # Fail early with a clear message instead of a ZeroDivisionError from the
    # modulo test on the first decoded frame.
    if frame_sample_rate < 1:
        raise ValueError(
            f"frame_sample_rate must be a positive integer, got {frame_sample_rate}"
        )
    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Cannot open video file: {video_path}")

    all_features = []
    frame_batch = []
    frame_idx = 0
    pbar = None

    # try/finally guarantees the capture handle and the progress bar are
    # released even when decoding, preprocessing or the model call raises.
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        pbar = tqdm(
            # A non-positive count means OpenCV could not report the length;
            # leave the total unknown rather than showing a bogus 0-length bar.
            total=frame_count if frame_count > 0 else None,
            desc=f"Frames for {video_path.name}",
            leave=False,
        )

        with torch.no_grad():
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                pbar.update(1)

                if frame_idx % frame_sample_rate == 0:
                    # OpenCV decodes to BGR; the PIL image handed to preprocess is RGB.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame_batch.append(Image.fromarray(frame_rgb))

                    if len(frame_batch) == batch_size:
                        all_features.append(
                            _encode_frame_batch(frame_batch, model, preprocess, device)
                        )
                        frame_batch = []
                frame_idx += 1

            # Flush the final, partially filled batch.
            if frame_batch:
                all_features.append(
                    _encode_frame_batch(frame_batch, model, preprocess, device)
                )
    finally:
        cap.release()
        if pbar is not None:
            pbar.close()

    if not all_features:
        raise ValueError(f"No frames sampled for {video_path.name}")

    embeddings = torch.cat(all_features)
    mean_embedding = embeddings.mean(dim=0).numpy()
    return mean_embedding

In [5]:
# Create the output directory if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading videos from: {INPUT_DIR.resolve()}")
print(f"Saving embeddings to: {OUTPUT_DIR.resolve()}")

# Find all video files.
# Suffixes are compared case-insensitively so that e.g. "CLIP.MP4" is picked up
# on case-sensitive filesystems too, and the list is sorted so the processing
# order is the same on every run and platform.
wanted_suffixes = {ext.lower() for ext in VIDEO_EXTENSIONS}
candidate_paths = INPUT_DIR.iterdir() if INPUT_DIR.is_dir() else []
video_files = sorted(
    path for path in candidate_paths
    if path.is_file() and path.suffix.lower() in wanted_suffixes
)
print(f"Found {len(video_files)} videos.")

# Get list of files ALREADY in the output folder to skip them
existing_embeddings = {f.name for f in OUTPUT_DIR.glob('*_clip.npy')}
print(f"Found {len(existing_embeddings)} existing CLIP embeddings.")

# (video name, error message) for every video that could not be processed,
# so failures are not lost among the progress output.
failed_videos = []

for video_path in tqdm(video_files, desc="Processing Videos (CLIP)"):
    output_filename = f"{video_path.stem}_clip.npy"

    # Skip if already processed
    if output_filename in existing_embeddings:
        continue

    output_path = OUTPUT_DIR / output_filename

    try:
        print(f"Processing {video_path.name}...")
        mean_embedding = extract_clip_embeddings(
            video_path=video_path,
            model=model,
            preprocess=preprocess,
            device=device,
            frame_sample_rate=FRAME_SAMPLE_RATE,
            batch_size=BATCH_SIZE
        )
        np.save(output_path, mean_embedding)

    except Exception as e:
        # Best effort: one bad video must not stop the batch, but remember it
        # so it can be reported once the loop is done.
        print(f"\n[ERROR] Failed to process {video_path.name}: {e}")
        failed_videos.append((video_path.name, str(e)))

print("\n--- Batch processing complete. ---")

# Only printed when something went wrong; re-running this cell retries the
# failed videos because no embedding file was written for them.
if failed_videos:
    print(f"{len(failed_videos)} video(s) failed:")
    for failed_name, failed_reason in failed_videos:
        print(f"  - {failed_name}: {failed_reason}")

Reading videos from: C:\Users\Shanette\Downloads\COLLEGE\CSST Y4-T1\THS-ST2\context-aware-video-retrieval\media
Saving embeddings to: C:\Users\Shanette\Downloads\COLLEGE\CSST Y4-T1\THS-ST2\context-aware-video-retrieval\embeddings_out\video2048
Found 100 videos.
Found 0 existing CLIP embeddings.


Processing Videos (CLIP):   0%|          | 0/100 [00:00<?, ?it/s]

Processing airball_1.mp4...


Frames for airball_1.mp4:   0%|          | 0/300 [00:00<?, ?it/s]

Processing airball_10.mp4...


Frames for airball_10.mp4:   0%|          | 0/492 [00:00<?, ?it/s]

Processing airball_11.mp4...


Frames for airball_11.mp4:   0%|          | 0/348 [00:00<?, ?it/s]

Processing airball_12.mp4...


Frames for airball_12.mp4:   0%|          | 0/303 [00:00<?, ?it/s]

Processing airball_13.mp4...


Frames for airball_13.mp4:   0%|          | 0/271 [00:00<?, ?it/s]

Processing airball_2.mp4...


Frames for airball_2.mp4:   0%|          | 0/443 [00:00<?, ?it/s]

Processing airball_3.mp4...


Frames for airball_3.mp4:   0%|          | 0/378 [00:00<?, ?it/s]

Processing airball_4.mp4...


Frames for airball_4.mp4:   0%|          | 0/508 [00:00<?, ?it/s]

Processing airball_5.mp4...


Frames for airball_5.mp4:   0%|          | 0/399 [00:00<?, ?it/s]

Processing airball_6.mp4...


Frames for airball_6.mp4:   0%|          | 0/417 [00:00<?, ?it/s]

Processing airball_7.mp4...


Frames for airball_7.mp4:   0%|          | 0/508 [00:00<?, ?it/s]

Processing airball_8.mp4...


Frames for airball_8.mp4:   0%|          | 0/620 [00:00<?, ?it/s]

Processing airball_9.mp4...


Frames for airball_9.mp4:   0%|          | 0/401 [00:00<?, ?it/s]

Processing box_1.mp4...


Frames for box_1.mp4:   0%|          | 0/344 [00:00<?, ?it/s]

Processing box_11.mp4...


Frames for box_11.mp4:   0%|          | 0/348 [00:00<?, ?it/s]

Processing box_2.mp4...


Frames for box_2.mp4:   0%|          | 0/337 [00:00<?, ?it/s]

Processing box_3.mp4...


Frames for box_3.mp4:   0%|          | 0/379 [00:00<?, ?it/s]

Processing box_4.mp4...


Frames for box_4.mp4:   0%|          | 0/305 [00:00<?, ?it/s]

Processing box_5.mp4...


Frames for box_5.mp4:   0%|          | 0/600 [00:00<?, ?it/s]

Processing box_6.mp4...


Frames for box_6.mp4:   0%|          | 0/212 [00:00<?, ?it/s]

Processing box_7.mp4...


Frames for box_7.mp4:   0%|          | 0/311 [00:00<?, ?it/s]

Processing box_8.mp4...


Frames for box_8.mp4:   0%|          | 0/203 [00:00<?, ?it/s]

Processing box_9.mp4...


Frames for box_9.mp4:   0%|          | 0/222 [00:00<?, ?it/s]

Processing exchange_1.mp4...


Frames for exchange_1.mp4:   0%|          | 0/1771 [00:00<?, ?it/s]

Processing exchange_10.mp4...


Frames for exchange_10.mp4:   0%|          | 0/2645 [00:00<?, ?it/s]

Processing exchange_2.mp4...


Frames for exchange_2.mp4:   0%|          | 0/1525 [00:00<?, ?it/s]

Processing exchange_3.mp4...


Frames for exchange_3.mp4:   0%|          | 0/2263 [00:00<?, ?it/s]

Processing exchange_4.mp4...


Frames for exchange_4.mp4:   0%|          | 0/1824 [00:00<?, ?it/s]

Processing exchange_5.mp4...


Frames for exchange_5.mp4:   0%|          | 0/1634 [00:00<?, ?it/s]

Processing exchange_6.mp4...


Frames for exchange_6.mp4:   0%|          | 0/1771 [00:00<?, ?it/s]

Processing exchange_7.mp4...


Frames for exchange_7.mp4:   0%|          | 0/4400 [00:00<?, ?it/s]

Processing exchange_8.mp4...


Frames for exchange_8.mp4:   0%|          | 0/1794 [00:00<?, ?it/s]

Processing exchange_9.mp4...


Frames for exchange_9.mp4:   0%|          | 0/3003 [00:00<?, ?it/s]

Processing fit_1.mp4...


Frames for fit_1.mp4:   0%|          | 0/272 [00:00<?, ?it/s]

Processing fit_10.mp4...


Frames for fit_10.mp4:   0%|          | 0/238 [00:00<?, ?it/s]

Processing fit_2.mp4...


Frames for fit_2.mp4:   0%|          | 0/341 [00:00<?, ?it/s]

Processing fit_3.mp4...


Frames for fit_3.mp4:   0%|          | 0/204 [00:00<?, ?it/s]

Processing fit_4.mp4...


Frames for fit_4.mp4:   0%|          | 0/319 [00:00<?, ?it/s]

Processing fit_5.mp4...


Frames for fit_5.mp4:   0%|          | 0/330 [00:00<?, ?it/s]

Processing fit_6.mp4...


Frames for fit_6.mp4:   0%|          | 0/329 [00:00<?, ?it/s]

Processing fit_7.mp4...


Frames for fit_7.mp4:   0%|          | 0/340 [00:00<?, ?it/s]

Processing fit_8.mp4...


Frames for fit_8.mp4:   0%|          | 0/347 [00:00<?, ?it/s]

Processing fit_9.mp4...


Frames for fit_9.mp4:   0%|          | 0/600 [00:00<?, ?it/s]

Processing freewill_1.mp4...


Frames for freewill_1.mp4:   0%|          | 0/331 [00:00<?, ?it/s]

Processing freewill_10.mp4...


Frames for freewill_10.mp4:   0%|          | 0/181 [00:00<?, ?it/s]

Processing freewill_11.mp4...


Frames for freewill_11.mp4:   0%|          | 0/404 [00:00<?, ?it/s]

Processing freewill_12.mp4...


Frames for freewill_12.mp4:   0%|          | 0/202 [00:00<?, ?it/s]

Processing freewill_13.mp4...


Frames for freewill_13.mp4:   0%|          | 0/321 [00:00<?, ?it/s]

Processing freewill_14.mp4...


Frames for freewill_14.mp4:   0%|          | 0/1373 [00:00<?, ?it/s]

Processing freewill_15.mp4...


Frames for freewill_15.mp4:   0%|          | 0/590 [00:00<?, ?it/s]

Processing freewill_16.mp4...


Frames for freewill_16.mp4:   0%|          | 0/224 [00:00<?, ?it/s]

Processing freewill_17.mp4...


Frames for freewill_17.mp4:   0%|          | 0/358 [00:00<?, ?it/s]

Processing freewill_2.mp4...


Frames for freewill_2.mp4:   0%|          | 0/201 [00:00<?, ?it/s]

Processing freewill_3.mp4...


Frames for freewill_3.mp4:   0%|          | 0/254 [00:00<?, ?it/s]

Processing freewill_4.mp4...


Frames for freewill_4.mp4:   0%|          | 0/177 [00:00<?, ?it/s]

Processing freewill_5.mp4...


Frames for freewill_5.mp4:   0%|          | 0/213 [00:00<?, ?it/s]

Processing freewill_6.mp4...


Frames for freewill_6.mp4:   0%|          | 0/364 [00:00<?, ?it/s]

Processing freewill_7.mp4...


Frames for freewill_7.mp4:   0%|          | 0/1831 [00:00<?, ?it/s]

Processing freewill_8.mp4...


Frames for freewill_8.mp4:   0%|          | 0/297 [00:00<?, ?it/s]

Processing freewill_9.mp4...


Frames for freewill_9.mp4:   0%|          | 0/665 [00:00<?, ?it/s]

Processing fun_1.mp4...


Frames for fun_1.mp4:   0%|          | 0/234 [00:00<?, ?it/s]

Processing fun_10.mp4...


Frames for fun_10.mp4:   0%|          | 0/318 [00:00<?, ?it/s]

Processing fun_2.mp4...


Frames for fun_2.mp4:   0%|          | 0/278 [00:00<?, ?it/s]

Processing fun_3.mp4...


Frames for fun_3.mp4:   0%|          | 0/280 [00:00<?, ?it/s]

Processing fun_4.mp4...


Frames for fun_4.mp4:   0%|          | 0/174 [00:00<?, ?it/s]

Processing fun_5.mp4...


Frames for fun_5.mp4:   0%|          | 0/320 [00:00<?, ?it/s]

Processing fun_6.mp4...


Frames for fun_6.mp4:   0%|          | 0/247 [00:00<?, ?it/s]

Processing fun_7.mp4...


Frames for fun_7.mp4:   0%|          | 0/484 [00:00<?, ?it/s]

Processing fun_8.mp4...


Frames for fun_8.mp4:   0%|          | 0/320 [00:00<?, ?it/s]

Processing fun_9.mp4...


Frames for fun_9.mp4:   0%|          | 0/214 [00:00<?, ?it/s]

Processing kidnap_1.mp4...


Frames for kidnap_1.mp4:   0%|          | 0/233 [00:00<?, ?it/s]

Processing kidnap_10.mp4...


Frames for kidnap_10.mp4:   0%|          | 0/451 [00:00<?, ?it/s]

Processing kidnap_2.mp4...


Frames for kidnap_2.mp4:   0%|          | 0/230 [00:00<?, ?it/s]

Processing kidnap_3.mp4...


Frames for kidnap_3.mp4:   0%|          | 0/173 [00:00<?, ?it/s]

Processing kidnap_4.mp4...


Frames for kidnap_4.mp4:   0%|          | 0/303 [00:00<?, ?it/s]

Processing kidnap_5.mp4...


Frames for kidnap_5.mp4:   0%|          | 0/232 [00:00<?, ?it/s]

Processing kidnap_6.mp4...


Frames for kidnap_6.mp4:   0%|          | 0/280 [00:00<?, ?it/s]

Processing kidnap_7.mp4...


Frames for kidnap_7.mp4:   0%|          | 0/255 [00:00<?, ?it/s]

Processing kidnap_8.mp4...


Frames for kidnap_8.mp4:   0%|          | 0/272 [00:00<?, ?it/s]

Processing kidnap_9.mp4...


Frames for kidnap_9.mp4:   0%|          | 0/272 [00:00<?, ?it/s]

Processing rps_1.mp4...


Frames for rps_1.mp4:   0%|          | 0/262 [00:00<?, ?it/s]

Processing rps_10.mp4...


Frames for rps_10.mp4:   0%|          | 0/269 [00:00<?, ?it/s]

Processing rps_11.mp4...


Frames for rps_11.mp4:   0%|          | 0/279 [00:00<?, ?it/s]

Processing rps_2.mp4...


Frames for rps_2.mp4:   0%|          | 0/279 [00:00<?, ?it/s]

Processing rps_3.mp4...


Frames for rps_3.mp4:   0%|          | 0/420 [00:00<?, ?it/s]

Processing rps_4.mp4...


Frames for rps_4.mp4:   0%|          | 0/215 [00:00<?, ?it/s]

Processing rps_5.mp4...


Frames for rps_5.mp4:   0%|          | 0/282 [00:00<?, ?it/s]

Processing rps_6.mp4...


Frames for rps_6.mp4:   0%|          | 0/280 [00:00<?, ?it/s]

Processing rps_7.mp4...


Frames for rps_7.mp4:   0%|          | 0/261 [00:00<?, ?it/s]

Processing rps_8.mp4...


Frames for rps_8.mp4:   0%|          | 0/284 [00:00<?, ?it/s]

Processing where_1.mp4...


Frames for where_1.mp4:   0%|          | 0/1113 [00:00<?, ?it/s]

Processing where_11.mp4...


Frames for where_11.mp4:   0%|          | 0/393 [00:00<?, ?it/s]

Processing where_12.mp4...


Frames for where_12.mp4:   0%|          | 0/561 [00:00<?, ?it/s]

Processing where_2.mp4...


Frames for where_2.mp4:   0%|          | 0/412 [00:00<?, ?it/s]

Processing where_3.mp4...


Frames for where_3.mp4:   0%|          | 0/414 [00:00<?, ?it/s]

Processing where_4.mp4...


Frames for where_4.mp4:   0%|          | 0/654 [00:00<?, ?it/s]

Processing where_5.mp4...


Frames for where_5.mp4:   0%|          | 0/1066 [00:00<?, ?it/s]

Processing where_6.mp4...


Frames for where_6.mp4:   0%|          | 0/339 [00:00<?, ?it/s]

Processing where_7.mp4...


Frames for where_7.mp4:   0%|          | 0/641 [00:00<?, ?it/s]

Processing where_8.mp4...


Frames for where_8.mp4:   0%|          | 0/487 [00:00<?, ?it/s]


--- Batch processing complete. ---
