In [1]:
# This file is to generate embeddings from images

%matplotlib inline
import numpy as np
import open_clip
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence  # For padding sequences
import open_clip

# Load the CLIP image encoder and the preprocessing transform that goes with
# it. The middle return value of create_model_and_transforms is not used here.
model_Vit, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32-quickgelu', pretrained='openai')
device = "cuda" if torch.cuda.is_available() else "cpu"
model_Vit.to(device)
model_Vit.eval()  # evaluation mode: this notebook only runs inference

# Root directory; each sub-folder is expected to hold the frames of one video.
path_to_folders = 'video/Frames/'

# Keep only real sub-directories: a stray file (e.g. .DS_Store) would make the
# per-folder os.listdir() call fail, and hidden directories such as
# .ipynb_checkpoints do not contain frames. Sorting makes the processing order
# (and therefore the key order of the saved JSON) reproducible across runs.
folders = sorted(
    entry
    for entry in os.listdir(path_to_folders)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(path_to_folders, entry))
)

# Function to load frames from a folder and keep them in order
def load_frames_from_folder(folder):
    """Load every file in ``folder`` as a PIL image, ordered by filename.

    Args:
        folder: Path of a directory whose entries are all image files.

    Returns:
        A list of fully loaded PIL images, one per directory entry, in
        sorted-filename order.

    Raises:
        Whatever ``os.listdir`` / ``Image.open`` raise for a missing directory
        or an entry that is not a readable image.
    """
    frames = []
    # NOTE(review): this is a plain string sort, so it only matches temporal
    # order if frame numbers in the filenames are zero-padded - confirm.
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        # Image.open is lazy and keeps the file open. Copy the pixel data into
        # memory and close the file right away so that processing many folders
        # does not exhaust the process's file descriptors.
        with Image.open(img_path) as img:
            frames.append(img.copy())
    return frames

# Hyperparameters
batch_size = 16  # Frames per forward pass; GPU capacity dependent
window_size = 16  # Number of frames per temporal window

# Temporal extraction and embedding calculation.
# Maps each folder name to its list of per-window embeddings, stored as nested
# Python lists so the dict can be serialized to JSON.
result = {}

for frames_folder in tqdm(folders):
    frames = load_frames_from_folder(os.path.join(path_to_folders, frames_folder))

    # A folder without frames would make torch.stack() below fail on an empty
    # list; record it with no windows instead of aborting the whole run.
    if not frames:
        result[frames_folder] = []
        continue

    embeddings = []

    with torch.no_grad():
        # Walk the frames in consecutive, non-overlapping windows. The last
        # window is simply shorter when the frame count is not a multiple of
        # window_size (no padding is applied).
        for window_start in range(0, len(frames), window_size):
            window_frames = frames[window_start:window_start + window_size]

            # Preprocess on the CPU one window at a time so that device memory
            # use does not grow with the length of the video.
            window_tensor = torch.stack([preprocess(frame) for frame in window_frames])

            window_embeddings = []
            for batch_start in range(0, len(window_tensor), batch_size):
                # Only the batch currently being encoded lives on the device.
                batch_frames = window_tensor[batch_start:batch_start + batch_size].to(device)
                image_features = model_Vit.encode_image(batch_frames)
                window_embeddings.append(image_features.cpu())  # Save embeddings

            # Aggregate the frame embeddings of the window by mean pooling.
            window_embedding = torch.mean(torch.vstack(window_embeddings), dim=0)
            embeddings.append(window_embedding)

    # Stack all window embeddings for this folder: one row per window.
    embeddings_tensor = torch.stack(embeddings)

    # Save the result for the current folder
    result[frames_folder] = embeddings_tensor.cpu().numpy().tolist()


100%|██████████| 690/690 [11:57<00:00,  1.04s/it]


In [14]:
import json

# Persist the per-folder window embeddings as pretty-printed JSON.
serialized_embeddings = json.dumps(result, indent=4)
with open('video_embeddings_complete.json', 'w') as out_file:
    out_file.write(serialized_embeddings)

In [None]:
# Must point at the same frames root the embeddings were computed from
# ('video/Frames/'); the previous spelling 'video/Framse/' does not exist.
video_list_path = 'video/Frames/'
for key, value in result.items():
    video_list = os.listdir(os.path.join(video_list_path, key))