In [1]:
# This file is to generate embeddings from images

%matplotlib inline
import numpy as np
import open_clip
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import TimesformerModel, TimesformerConfig
from torchvision import transforms

# Initialize the TimeSformer model
model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Define video preprocessing
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                        std=[0.229, 0.224, 0.225])
])

def load_frames_from_folder(folder):
    frames = []
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        img = Image.open(img_path).convert('RGB')
        frames.append(preprocess(img))
    return frames

# Hyperparameters
batch_size = 4  # Reduced batch size due to model size
num_frames = 8  # TimeSformer typically uses 8 or 16 frames

path_to_folders = 'video/Frames/'
folders = os.listdir(path_to_folders)
result = {}

for frames_folder in tqdm(folders):
    folder_path = os.path.join(path_to_folders, frames_folder)
    frames = load_frames_from_folder(folder_path)
    
    # Process video in segments of num_frames
    segments = []
    for i in range(0, len(frames), num_frames):
        segment = frames[i:i + num_frames]
        
        # If segment is shorter than num_frames, pad with zeros
        if len(segment) < num_frames:
            padding = [torch.zeros_like(segment[0]) for _ in range(num_frames - len(segment))]
            segment.extend(padding)
        
        # Stack frames for TimeSformer input [batch, num_frames, channels, height, width]
        segment_tensor = torch.stack(segment).unsqueeze(0)
        segments.append(segment_tensor)
    
    # Process segments in batches
    all_features = []
    for i in range(0, len(segments), batch_size):
        batch_segments = segments[i:i + batch_size]
        if len(batch_segments) > 0:
            # Stack segments into a batch
            batch = torch.cat(batch_segments, dim=0).to(device)
            
            with torch.no_grad():
                # TimeSformer expects input shape: [batch_size, num_frames, channels, height, width]
                # Rearrange dimensions if needed
                batch = batch.permute(0, 1, 2, 3, 4)
                
                # Get features
                outputs = model(batch, output_hidden_states=True)
                # Use the final hidden state as features
                features = outputs.last_hidden_state.mean(dim=1)  # Average over sequence length
                all_features.append(features.cpu())
    
    # Concatenate all features for this video
    if all_features:
        video_features = torch.cat(all_features, dim=0)
        # Average all segment features to get video-level representation
        video_features = torch.mean(video_features, dim=0)
        result[frames_folder] = video_features.numpy().tolist()
    
    # Clear GPU memory
    torch.cuda.empty_cache()

# Save results
import json
with open("video_features_timesformer.json", "w") as f:
    json.dump(result, f)

100%|██████████| 690/690 [1:30:53<00:00,  7.90s/it]


In [None]:
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence  # For padding sequences
import torchvision.transforms as transforms
import torchvision.models as models

# Set up device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pre-trained ResNet-50 model and remove the final classification layer
resnet_model = models.resnet50(pretrained=True)
# Remove the last fully-connected layer so that we get features
resnet_model = torch.nn.Sequential(*(list(resnet_model.children())[:-1]))
resnet_model.to(device)
resnet_model.eval()

# Define preprocessing transform for ResNet
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet means
        std=[0.229, 0.224, 0.225]    # ImageNet stds
    ),
])

# Path to the folders containing frames (each folder represents one video)
path_to_folders = 'video/Frames/'
folders = os.listdir(path_to_folders)

# Function to load frames from a folder in order
def load_frames_from_folder(folder):
    frames = []
    for filename in sorted(os.listdir(folder)):  # Sorting ensures temporal order
        img_path = os.path.join(folder, filename)
        img = Image.open(img_path).convert('RGB')  # Ensure 3-channel image
        frames.append(img)
    return frames

# Hyperparameters
batch_size = 16   # Adjust based on your GPU capacity
window_size = 16  # Number of frames per temporal window

# Dictionary to store the result
result = {}

# Process each folder (video)
for frames_folder in tqdm(folders):
    folder_path = os.path.join(path_to_folders, frames_folder)
    frames = load_frames_from_folder(folder_path)
    
    # Preprocess frames using the defined transform and move them to device
    preprocessed_frames = [preprocess(frame).to(device) for frame in frames]
    
    # Split frames into windows of size `window_size`
    windows = [preprocessed_frames[i:i + window_size] for i in range(0, len(preprocessed_frames), window_size)]
    
    embeddings = []
    
    with torch.no_grad():
        for window in windows:
            # Stack the frames in the window; shape: (window_length, 3, H, W)
            window_tensor = torch.stack(window)
            window_dataset = TensorDataset(window_tensor)
            window_loader = DataLoader(window_dataset, batch_size=batch_size)
            
            window_embeddings = []
            for batch in window_loader:
                batch_frames = batch[0].to(device)  # Batch of frames
                # Forward pass through ResNet; output shape: (batch_size, 2048, 1, 1)
                features = resnet_model(batch_frames)
                # Flatten the output to shape: (batch_size, 2048)
                features = features.view(features.size(0), -1)
                window_embeddings.append(features.cpu())
            
            # Aggregate embeddings in the window (e.g., using mean pooling)
            window_embedding = torch.mean(torch.vstack(window_embeddings), dim=0)
            embeddings.append(window_embedding)
    
    # Stack all window embeddings for the current folder (video)
    embeddings_tensor = torch.stack(embeddings)  # Shape: (num_windows, 2048)
    
    
    # Save the result for the current folder
    result[frames_folder] = embeddings_tensor.cpu().numpy().tolist()

# Optionally, save 'result' to a file using pickle or json

with open('video_embeddings_resnet.json', 'w') as f:
    json.dump(result, f)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\yingq/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:17<00:00, 5.71MB/s]
100%|██████████| 690/690 [38:15<00:00,  3.33s/it] 


NameError: name 'json' is not defined