In [1]:
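# Optional one-time setup: uncomment the two lines below to install the
# dependencies into the environment of the kernel that is running this notebook.
# import sys
# !{sys.executable} -m pip install torch torchvision opencv-python numpy tqdm Pillow PyDrive2 pandas scikit-learn openai-clip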

In [2]:
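# Alternative install commands, grouped by purpose. Uncomment only the lines
# for packages that are missing from your environment.
# !pip install torch torchvision
# !pip install opencv-python numpy tqdm Pillow PyDrive2
# !pip install pandas scikit-learn
# !pip install openai-clip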

In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import cv2
import numpy as np
import argparse
from tqdm.notebook import tqdm
from PIL import Image
from pathlib import Path
import os
import time



In [None]:
# --- 1. SET YOUR LOCAL DIRECTORIES ---
# The notebook lives in the .../visual_model/ folder, so the project root is one level up.
# The kernel's starting directory is remembered the first time this cell runs and the
# chdir is always made relative to it. A bare os.chdir("..") would climb one more level
# on every re-run of this cell and silently point INPUT_DIR / OUTPUT_DIR at the wrong place.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)
print(f"Working directory set to: {os.getcwd()}")

# Path(".") is now your main project folder (e.g., 'context-aware-video-retrieval')
INPUT_DIR = Path(".") / "media"                            # videos are read from here
OUTPUT_DIR = Path(".") / "embeddings_out" / "video2048"    # one .npy per video is written here

# --- 2. SET YOUR MODEL PARAMETERS ---
FRAME_SAMPLE_RATE = 30   # embed every Nth frame of each video
BATCH_SIZE = 32          # number of sampled frames per forward pass

# --- 3. DEFINE VIDEO EXTENSIONS TO FIND ---
VIDEO_EXTENSIONS = [".mp4", ".mov", ".avi", ".mkv", ".webm"]

In [6]:
def get_resnet_model(device: str):
    """Build a headless, pre-trained ResNet-50 feature extractor.

    Args:
        device: Torch device string (e.g. "cpu" or "cuda") the model is moved to.

    Returns:
        A ``(model, preprocess)`` tuple: the network with its last child module
        removed, in eval mode and on ``device``, and the preprocessing
        transform bundled with the default weights.
    """
    pretrained = models.ResNet50_Weights.DEFAULT
    backbone = models.resnet50(weights=pretrained)

    # Keep every child module except the last one, so the network emits
    # features instead of class scores (presumably 2048-d, matching the
    # "video2048" output folder name -- confirm against torchvision docs).
    layers = list(backbone.children())
    feature_extractor = torch.nn.Sequential(*layers[:-1])

    feature_extractor.eval()
    feature_extractor.to(device)

    return feature_extractor, pretrained.transforms()

# Announce setup, prefer the GPU when one is available, then build the extractor.
print("Setting up model and device...")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = get_resnet_model(device)
print(f"Using device: {device}")

Setting up model and device...
Using device: cpu


In [None]:
def _embed_frame_batch(frame_batch, model, preprocess, device: str) -> np.ndarray:
    """Run one batch of PIL frames through the model; returns a 2-D (n_frames, n_features) array."""
    image_inputs = torch.stack([preprocess(img) for img in frame_batch]).to(device)
    image_features = model(image_inputs)
    # flatten(1) collapses everything after the batch axis but always keeps the
    # batch axis itself; squeeze() would also drop it for a single-frame batch.
    return image_features.flatten(1).cpu().numpy()


def extract_resnet_embeddings(
    video_path: Path,
    model,
    preprocess,
    device: str,
    frame_sample_rate: int = 30,
    batch_size: int = 32
) -> np.ndarray:
    """Compute one mean feature vector for a video.

    Every ``frame_sample_rate``-th frame (starting with frame 0) is converted
    from BGR to RGB, preprocessed, and embedded by ``model`` in batches of
    ``batch_size``. The per-frame embeddings are then averaged.

    Args:
        video_path: Path to the video file.
        model: Callable mapping a stacked batch of preprocessed frames to features.
        preprocess: Callable mapping a PIL image to a tensor accepted by ``model``.
        device: Torch device string the input batches are moved to.
        frame_sample_rate: Keep one frame out of every this many (must be >= 1).
        batch_size: Number of sampled frames per forward pass (must be >= 1).

    Returns:
        1-D array holding the mean of all sampled-frame embeddings.

    Raises:
        FileNotFoundError: ``video_path`` does not exist.
        ValueError: ``frame_sample_rate`` or ``batch_size`` is < 1, or no frame
            could be read from the video.
        IOError: OpenCV cannot open the file.
    """
    if not video_path.exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")
    # Reject nonsensical settings up front: a rate of 0 would otherwise surface
    # as a ZeroDivisionError deep in the loop, and a batch size <= 0 would
    # silently buffer every sampled frame in memory.
    if frame_sample_rate < 1:
        raise ValueError(f"frame_sample_rate must be >= 1, got {frame_sample_rate}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    cap = cv2.VideoCapture(str(video_path))
    pbar = None
    all_features = []

    # try/finally guarantees the capture handle and progress bar are released
    # even when decoding or the forward pass raises.
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video file: {video_path}")

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        pbar = tqdm(
            # If the reported count is not positive, show an open-ended bar
            # rather than a bogus total.
            total=frame_count if frame_count > 0 else None,
            desc=f"Frames for {video_path.name}",
            leave=False,
        )

        frame_batch = []
        frame_idx = 0
        with torch.no_grad():
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                pbar.update(1)

                if frame_idx % frame_sample_rate == 0:
                    # OpenCV frames are converted BGR -> RGB before building the PIL image.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame_batch.append(Image.fromarray(frame_rgb))

                    if len(frame_batch) == batch_size:
                        all_features.append(
                            _embed_frame_batch(frame_batch, model, preprocess, device)
                        )
                        frame_batch = []
                frame_idx += 1

            # Embed the final, partially filled batch.
            if frame_batch:
                all_features.append(
                    _embed_frame_batch(frame_batch, model, preprocess, device)
                )
    finally:
        cap.release()
        if pbar is not None:
            pbar.close()

    if not all_features:
        raise ValueError(f"No frames sampled for {video_path.name}")

    embeddings = np.vstack(all_features)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

In [None]:
# Create the output directory if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading videos from: {INPUT_DIR.resolve()}")
print(f"Saving embeddings to: {OUTPUT_DIR.resolve()}")

# Find all video files. Suffixes are compared in lower case so that files such as
# "clip.MP4" are not silently skipped, and the list is sorted so the processing
# order is the same on every run.
video_suffixes = {ext.lower() for ext in VIDEO_EXTENSIONS}
video_files = sorted(
    candidate
    for candidate in INPUT_DIR.glob("*")
    if candidate.is_file() and candidate.suffix.lower() in video_suffixes
)
print(f"Found {len(video_files)} videos.")

# Get list of files ALREADY in the output folder to skip them
existing_embeddings = {f.name for f in OUTPUT_DIR.glob('*_resnet.npy')}
print(f"Found {len(existing_embeddings)} existing ResNet embeddings.")

failed_videos = []  # names of videos whose extraction raised, reported at the end

for video_path in tqdm(video_files, desc="Processing Videos (ResNet)"):
    # NOTE: the output name is derived from the stem only, so "a.mp4" and "a.mov"
    # map to the same embedding file.
    output_filename = f"{video_path.stem}_resnet.npy"

    # Skip if already processed
    if output_filename in existing_embeddings:
        continue

    output_path = OUTPUT_DIR / output_filename
    # Write to a temporary name first: an interrupted save must not leave a truncated
    # "*_resnet.npy" behind, because the skip check above would then treat it as done.
    # The ".tmp" name does not match the '*_resnet.npy' glob, so leftovers are ignored.
    tmp_path = output_path.with_name(output_path.name + ".tmp")

    try:
        print(f"Processing {video_path.name}...")
        mean_embedding = extract_resnet_embeddings(
            video_path=video_path,
            model=model,
            preprocess=preprocess,
            device=device,
            frame_sample_rate=FRAME_SAMPLE_RATE,
            batch_size=BATCH_SIZE
        )
        # Passing an open file object makes np.save write to exactly this path.
        with open(tmp_path, "wb") as tmp_file:
            np.save(tmp_file, mean_embedding)
        os.replace(tmp_path, output_path)

    except Exception as e:
        # Best effort: one bad video must not abort the whole batch.
        print(f"\n[ERROR] Failed to process {video_path.name}: {e}")
        failed_videos.append(video_path.name)

if failed_videos:
    print(f"\n{len(failed_videos)} video(s) failed: {', '.join(failed_videos)}")

print("\n--- Batch processing complete. ---")

Listing files from Google Drive...
Found 100 videos in Drive.
Found 100 existing embeddings in output.


Processing Videos:   0%|          | 0/100 [00:00<?, ?it/s]


--- Batch processing complete. ---
