## Open the refined dataset

In [1]:
import pandas as pd

# Load the refined annotation table and preview its first rows.
DATASET_CSV = 'data.csv'
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,source_video,target_video,source_start,source_end,target_start,target_end,group,category,is_duplicate
0,2e246b45e7dfba0a7cfb2beb557c40d81dc02c99.flv,3ed4f5c0eb04c94353594e8be1a72bcc657e27c7.flv,24.0,25.0,15.0,22.0,210.0,maradona_hand_of_god,True
1,1e2598afd4d6a8728d6c0076354477db59702a5a.flv,,186.0,198.0,,,,the_last_samurai_last_battle,False
2,c6d6d37c73f364e3902407e1da07c8e354f66c13.flv,,677.0,692.0,,,,president_obama_takes_oath,False
3,458db5aa227ae49ceb8bc1bed5f9cf5b4bed63f2.flv,6d1466ebc4de7e5ddb229bde090b5c5acac15c0c.flv,8.0,10.0,4.0,6.0,0.0,baggio_penalty_1994,True
4,09b682c899b0727e9990d8e347cdce3df7c5550e.flv,d2015b438b70f022967713d6f977ebc67a16839e.flv,15.0,25.0,193.0,203.0,71.0,david_beckham_lights_the_olympic_torch,True


## Define the AlexNet feature extractor and the frame preprocessing

In [2]:
import torch
import torchvision.models as models
from torchvision.transforms import v2
from torch import nn
import cv2

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained AlexNet on the selected device, switched to evaluation mode.
alexnet = models.alexnet(weights="AlexNet_Weights.DEFAULT").to(device).eval()

# Flat Sequential over the layers of AlexNet's `features` stack; the layer
# objects are shared with `alexnet`, not copied.
conv_layers = nn.Sequential(*alexnet.features.children())

# Per-frame preprocessing: resize to 224x224, then normalise each channel
# with the mean / std listed below.
# NOTE(review): v2.ToTensor appears to be deprecated in recent torchvision
# releases - confirm before upgrading.
_preprocessing_steps = [
    v2.Resize(size=(224, 224), antialias=True),
    v2.ToTensor(),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transforms = v2.Compose(_preprocessing_steps)



## Functions to retrieve video embeddings

In [3]:
import cv2

def retrieve_frames(video_path, start_time, end_time, fps=1):
    """
    Samples frames from a video between a start and end time and returns them
    as a single stacked torch tensor.

    Each frame is read with OpenCV, converted from OpenCV's BGR channel order
    to RGB, moved to channel-first layout, scaled by 1/255 and passed through
    the module-level ``transforms`` pipeline.

    :param video_path: Path to the video file
    :param start_time: Start time in seconds
    :param end_time: End time in seconds (the frame at this time is included)
    :param fps: Rate at which to sample frames, in frames per second (default is 1 fps)
    :return: CPU tensor holding the processed frames stacked along a new
        leading dimension
    :raises IOError: If the video file cannot be opened
    :raises ValueError: If no frame could be read in the requested range
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video file {video_path}")
            raise IOError(f"Could not open video file {video_path}")

        video_fps = cap.get(cv2.CAP_PROP_FPS)

        # Convert start and end times to frame numbers
        start_frame = int(start_time * video_fps)
        end_frame = int(end_time * video_fps)

        # Number of source frames to skip between samples. Clamp to 1 so a
        # video whose native rate is below `fps` (or reported as 0) still
        # advances instead of re-reading the same frame forever.
        interval = max(1, int(video_fps / fps))

        frames = []
        frame_number = start_frame
        while frame_number <= end_frame:
            # Seek explicitly: consecutive samples are `interval` frames apart
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes to BGR; the normalisation statistics used in
            # `transforms` are per-channel in RGB order.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = torch.from_numpy(frame).permute(2, 0, 1) / 255
            frames.append(transforms(frame))

            frame_number += interval
    finally:
        # Always free the decoder, even if reading or preprocessing failed
        cap.release()

    if not frames:
        raise ValueError(
            f"No frames could be read from {video_path} "
            f"between {start_time}s and {end_time}s"
        )

    return torch.stack(frames).detach().cpu()

class MaxPoolConvOutputs(nn.Module):
    """Runs a stack of layers and collects a max-pooled output after every Conv2d layer."""

    def __init__(self, conv_layers):
        super().__init__()
        self.conv_layers = conv_layers
        # Target size (1, 1): a single maximum per channel
        self.pool = nn.AdaptiveMaxPool2d((1, 1))

    def forward(self, x):
        """
        Feeds ``x`` through the layers in order.

        :param x: Input passed to the first layer
        :return: List with one pooled tensor per Conv2d layer, in layer order
        """
        pooled_per_conv = []
        activation = x
        for layer in self.conv_layers:
            activation = layer(activation)
            if not isinstance(layer, nn.Conv2d):
                continue  # only convolution outputs are pooled
            # Index away the two size-1 spatial dimensions left by the pooling
            pooled_per_conv.append(self.pool(activation)[..., 0, 0])
        return pooled_per_conv

def get_frames_emb(video):
    """
    Computes per-frame embeddings by concatenating the pooled outputs of
    every convolution layer.

    :param video: Batch of frames; moved to the module-level ``device`` before the forward pass
    :return: Tensor with the per-layer pooled outputs joined along dimension 1
    """
    extractor = MaxPoolConvOutputs(conv_layers)

    # Inference only: no gradients are needed
    with torch.no_grad():
        per_layer_features = extractor(video.to(device))

    return torch.cat(per_layer_features, dim=1)


def normalize_frames(video, eps=1e-12):
    """
    Collapses per-frame embeddings into one normalised video embedding.

    :param video: Tensor of frame embeddings with frames along dimension 0
    :param eps: Lower bound applied to the L2 norm before dividing, so a
        constant (zero-variance) average maps to zeros instead of NaN
    :return: The frame average, shifted to zero mean and scaled to unit L2 norm
    """
    # Step 1: Average across frames (mean along the first dimension)
    avg_emb = torch.mean(video, dim=0)

    # Step 2: Zero-mean normalization (subtract the mean of the vector)
    zero_mean_emb = avg_emb - torch.mean(avg_emb)

    # Step 3: L2-normalization; the clamp guards against division by zero
    l2_norm = torch.norm(zero_mean_emb, p=2)
    return zero_mean_emb / torch.clamp(l2_norm, min=eps)

def create_video_embedding(category, name, start, end, dataset_root='core_dataset/core_dataset'):
    """
    Builds the normalised embedding of one clip of a dataset video.

    :param category: Sub-directory of ``dataset_root`` that contains the video
    :param name: File name of the video inside the category directory
    :param start: Clip start time in seconds
    :param end: Clip end time in seconds
    :param dataset_root: Directory holding the per-category video folders
    :return: Result of ``normalize_frames`` applied to the clip's frame embeddings
    """
    video_path = f'{dataset_root}/{category}/{name}'
    frames = retrieve_frames(video_path, start, end)
    return normalize_frames(get_frames_emb(frames))

## Create a dictionary to store the video embeddings

In [4]:
import pickle

# Set to True to continue from the checkpoint files written by a previous
# run instead of starting with empty containers.
RESUME_FROM_CHECKPOINT = False

if RESUME_FROM_CHECKPOINT:
    # NOTE: pickle.load can execute arbitrary code; only load checkpoint
    # files that were produced by this notebook.
    with open('embeddings.pkl', 'rb') as f:
        videos = pickle.load(f)

    with open('mistakes.pkl', 'rb') as f:
        mistakes = pickle.load(f)
else:
    videos = {}    # clip key -> embedding
    mistakes = []  # indices of rows whose processing failed

## Create the embeddings

In [8]:
from tqdm import tqdm

CHECKPOINT_EVERY = 100  # write the pickles whenever the row index is a multiple of this
LAST_ROW_INDEX = 4000   # stop once this row index has been processed

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=len(df)) as bar:
    for i, raw in df.iterrows():
        try:
            name = f"{raw['source_video']}_{raw['source_start']}_{raw['source_end']}"
            if name not in videos:
                videos[name] = create_video_embedding(raw['category'], raw['source_video'], raw['source_start'], raw['source_end'])
            # Only duplicate rows have a target clip to embed
            if raw['is_duplicate']:
                name = f"{raw['target_video']}_{raw['target_start']}_{raw['target_end']}"
                if name not in videos:
                    videos[name] = create_video_embedding(raw['category'], raw['target_video'], raw['target_start'], raw['target_end'])
        except Exception as err:
            # Best effort: record the failing row and continue. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop the run.
            print(i, repr(err))
            mistakes.append(i)
        finally:
            # Count every processed row so the bar stays in step with the loop
            bar.update(1)

        # Periodic checkpoint so a crash does not lose all finished work
        if i % CHECKPOINT_EVERY == 0:
            with open('embeddings.pkl', 'wb') as f:
                pickle.dump(videos, f)
            with open('mistakes.pkl', 'wb') as f:
                pickle.dump(mistakes, f)
        if i >= LAST_ROW_INDEX:
            break

 29%|██▉       | 2998/10396 [27:48<1:08:38,  1.80it/s]
 10%|█         | 1084/10396 [00:02<00:20, 444.35it/s]

1083


 13%|█▎        | 1401/10396 [00:03<00:22, 400.80it/s]

1463


 28%|██▊       | 2900/10396 [00:07<00:17, 419.53it/s]

2926


 32%|███▏      | 3370/10396 [03:27<44:03,  2.66it/s]  

3373


 34%|███▍      | 3583/10396 [04:57<34:04,  3.33it/s]  [NULL @ 0x5e87d8731880] Invalid NAL unit size (3295 > 3073).
[NULL @ 0x5e87d8731880] missing picture in access unit with size 3077
[h264 @ 0x5e87d77d0ac0] Invalid NAL unit size (3295 > 3073).
[h264 @ 0x5e87d77d0ac0] Error splitting the input into NAL units.
[NULL @ 0x5e87d8731880] Invalid NAL unit size (3295 > 3073).
[NULL @ 0x5e87d8731880] missing picture in access unit with size 3077
[NULL @ 0x5e87d8731880] Invalid NAL unit size (3295 > 3073).
[NULL @ 0x5e87d8731880] missing picture in access unit with size 3077
[h264 @ 0x5e87d6b2bf40] Invalid NAL unit size (3295 > 3073).
[h264 @ 0x5e87d6b2bf40] Error splitting the input into NAL units.
 36%|███▌      | 3765/10396 [06:34<13:52,  7.97it/s]  

3769


 38%|███▊      | 3996/10396 [07:52<31:35,  3.38it/s]  

## Save the embeddings

In [9]:
import pickle

# Persist the embeddings and the list of failed rows, one pickle file each.
for checkpoint_path, payload in (('embeddings.pkl', videos), ('mistakes.pkl', mistakes)):
    with open(checkpoint_path, 'wb') as f:
        pickle.dump(payload, f)