In [2]:
import cv2
import torch
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
import clip
import torchvision.transforms as transforms
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def extract_all_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)
        frames.append(pil_image)
    cap.release()
    return frames

def get_clip_embeddings(frames):
    all_embeddings = []
    with torch.no_grad():
        for frame in tqdm(frames, desc="Extracting CLIP features"):
            image_input = preprocess(frame).unsqueeze(0).to(device)
            embedding = model.encode_image(image_input)
            all_embeddings.append(embedding.cpu().numpy())
    return np.vstack(all_embeddings)

def truncated_svd(Q, s):
    svd = TruncatedSVD(n_components=s)
    Qs = svd.fit_transform(Q)
    return Qs

def rectangular_maxvol(Qs, r, tol=1e-6):
    n, s = Qs.shape #total number of frames and features per frame
    selected_indices = [] #selected indices
    remaining = list(range(n)) #not yet selected indices
    for _ in range(r): #num of indices I want to select
        max_score = -np.inf
        best_idx = -1
        for i in remaining:
            candidate = selected_indices + [i] #adds a new row to the already selected rows and checks 
            sub = Qs[candidate, :]
            score = np.linalg.det(sub @ sub.T + tol * np.eye(len(candidate))) #checks if the determinant is larger and adds tolerance
            if score > max_score:
                max_score = score
                best_idx = i
        selected_indices.append(best_idx)
        remaining.remove(best_idx)
    return selected_indices

def maxinfo_frame_selection(video_path, svd_dim=32, num_frames=32):
    frames = extract_all_frames(video_path)
    Q = get_clip_embeddings(frames)
    Qs = truncated_svd(Q, s=svd_dim)
    indices = rectangular_maxvol(Qs, r=num_frames)
    selected_frames = [frames[i] for i in indices]
    return selected_frames, indices


In [4]:
video_path = "Videos\\fragrance_on_the_trail_of_coco_mademoiselle_mp4.mp4"

Extracting CLIP features: 100%|██████████| 375/375 [00:19<00:00, 19.43it/s]


In [24]:
from IPython.display import display

selected_frames, indices = maxinfo_frame_selection(video_path, svd_dim=32, num_frames=16)

'''for i, frame in enumerate(selected_frames):
    print(f"Frame index: {indices[i]}")
    display(frame)'''


Extracting CLIP features: 100%|██████████| 375/375 [00:19<00:00, 19.64it/s]


'for i, frame in enumerate(selected_frames):\n    print(f"Frame index: {indices[i]}")\n    display(frame)'

In [32]:
indices = np.sort(indices)
indices

array([  4,  41,  48,  58,  74, 101, 103, 116, 178, 182, 185, 207, 231,
       273, 290, 323])