In [None]:
import cv2
from skimage.metrics import structural_similarity as ssim
import numpy as np
from tqdm import tqdm


def ssim_sampling(video_path, ssim_threshold:float = 0.90) -> list:
    """Pick visually distinct frames from a video using SSIM.

    The first frame is always kept. After that, only frames whose index is a
    multiple of ``int(fps)`` (roughly one per second) are examined; such a
    frame is kept when its SSIM score against the most recently kept frame is
    below ``ssim_threshold``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        ssim_threshold: A candidate frame is kept when its SSIM score against
            the last kept frame is strictly below this value.

    Returns:
        The kept frames, in video order, exactly as returned by ``cap.read()``.
        The list is empty when the very first read fails.

    Side effects:
        Shows a tqdm progress bar and prints the number of sampled frames.
    """
    cap = cv2.VideoCapture(video_path)
    sampled_frames = []  # Frames kept so far, in video order
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Number of frames between two comparisons. A reported FPS below 1
        # (including 0 or NaN) would make int(fps) zero and crash the modulo
        # below, so fall back to comparing every frame in that case.
        step = int(fps) if fps >= 1 else 1

        prev_gray = None
        frame_idx = 0
        # A non-positive frame count is treated as "length unknown" and gives
        # an open-ended progress bar.
        with tqdm(total=total_frames if total_frames > 0 else None,
                  desc="Processing Frames") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if prev_gray is None:
                    # Always keep the first frame; it is the initial reference.
                    sampled_frames.append(frame)
                    prev_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                elif frame_idx % step == 0:
                    # Grayscale conversion is only done for frames that are
                    # actually compared, not for the skipped ones.
                    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    if ssim(prev_gray, gray) < ssim_threshold:
                        sampled_frames.append(frame)
                        prev_gray = gray  # The kept frame becomes the new reference

                # Every frame read counts as progress, including the first one.
                frame_idx += 1
                pbar.update(1)
    finally:
        # Release the capture even if decoding or SSIM raised.
        cap.release()

    print(f"Sampled {len(sampled_frames)} frames from the video.")
    return sampled_frames

In [3]:
# Run SSIM-based sampling on the source video (relative path) with the default threshold; keeps the selected frames in memory.
sampled_frames = ssim_sampling("../This Integral Breaks Math.mp4")

Processing Frames: 100%|█████████▉| 7792/7793 [03:26<00:00, 37.76it/s]

Sampled 227 frames from the video.





In [6]:
from scenedetect import open_video, SceneManager
import scenedetect
from scenedetect.detectors import ContentDetector
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
import torch
from tqdm import tqdm
import cv2
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP model and its processor (used in this notebook for image embeddings).
model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch32",
    use_safetensors=True,
)
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# BLIP captioning model and its processor (used in this notebook for frame captions).
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
blip_model = blip_model.to(device)

Loading weights: 100%|██████████| 398/398 [00:01<00:00, 275.88it/s, Materializing param=visual_projection.weight]                                
[1mCLIPModel LOAD REPORT[0m from: openai/clip-vit-base-patch32
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
text_model.embeddings.position_ids   | UNEXPECTED |  | 
vision_model.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
The image processor of type `CLIPImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 
The image processor of type `BlipImageProcessor` is now loaded as a fast processor by default, even if the model checkpo

In [8]:
import cv2
from skimage.metrics import structural_similarity as ssim
import numpy as np
from tqdm import tqdm

def frame_captioning(sampled_frames : list):
    """Compute a CLIP embedding and a BLIP caption for every frame.

    Relies on the module-level ``processor``/``model`` (CLIP),
    ``blip_processor``/``blip_model`` (BLIP) and ``device`` objects.

    Args:
        sampled_frames: Frames to process. Each one is converted with
            ``cv2.COLOR_BGR2RGB``, so BGR arrays (as read by OpenCV) are
            assumed.

    Returns:
        A tuple ``(embeddings, metadatas)`` with one entry per input frame:
        ``embeddings[i]`` is the L2-normalised image embedding as a list of
        floats, and ``metadatas[i]`` is ``{"caption": <generated caption>}``.
    """
    embeddings = []
    metadatas = []
    # TODO: per-frame ids, timestamps and source path are not recorded yet.
    for frame in tqdm(sampled_frames, desc="Captioning and Embedding"):
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = processor(images=image, return_tensors='pt').to(device)
        blip_input = blip_processor(images=image, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model.get_image_features(inputs.pixel_values)
            # Caption generation uses the model's default generation settings.
            blip_outputs = blip_model.generate(**blip_input)

        caption = blip_processor.decode(blip_outputs[0], skip_special_tokens=True)

        # Depending on the installed transformers release, get_image_features
        # returns either an output object carrying `pooler_output` or the
        # embedding tensor itself; accept both.
        image_embedding = getattr(outputs, "pooler_output", outputs)
        # Scale to unit length along the feature axis.
        image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)

        embeddings.append(image_embedding.squeeze(0).cpu().numpy().tolist())
        metadatas.append({"caption": caption})

    return embeddings, metadatas

In [9]:
# Embed and caption every sampled frame: `emb` receives the per-frame embedding lists, `met` the per-frame caption dicts.
emb, met = frame_captioning(sampled_frames= sampled_frames)

Captioning and Embedding: 100%|██████████| 227/227 [05:34<00:00,  1.48s/it]
