In [21]:
import pandas as pd
import torch
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random
import numpy as np

In [22]:
seed = 43

def set_seed(seed: int):
    """
    Makes process of training more deterministic
    and ables to get reproducible results
    Arguments:
        seed (int): Random seed to be used in fixing
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(seed)

# Open embeddings

In [23]:
# Get uuids of existing videos
existing_uuid = os.listdir('train_data_yappy/train_dataset/')
existing_uuid += os.listdir('train_data_yappy/downloaded/')

for i in range(len(existing_uuid)):
    existing_uuid[i] = existing_uuid[i].replace('.mp4', '')
existing_uuid = set(existing_uuid)

In [24]:
# Open dataset and remoce all videos that are not downloaded
df = pd.read_csv('train_data_yappy/train.csv')
df = df[df['uuid'].isin(existing_uuid)]
df = df.sample(frac=1)
df.reset_index(inplace=True)

In [25]:
video_uuids = df['uuid']
video_uuids = video_uuids.apply(lambda x: x + '.mp4').tolist()

root_dir = 'train_data_yappy'
dataset_dir = 'train_dataset'

# Load embeddings
videos = []

for name in tqdm(video_uuids):
    full_emb_path = os.path.join(root_dir, 'emb', name.replace('.mp4', '.pt'))
    video_emb = torch.load(str(full_emb_path))
    videos.append(video_emb)  # Append video_emb to the list

  video_emb = torch.load(str(full_emb_path))
100%|██████████| 2500/2500 [00:02<00:00, 1048.75it/s]


# Data preprocessing

In [26]:
# Normalization of embeddings
processed_videos = []

for video in videos:
    # Step 1: Average across frames (mean along the first dimension)
    avg_emb = torch.mean(video, dim=0)

    # Step 2: Zero-mean normalization (subtract the mean of the vector)
    mean_value = torch.mean(avg_emb)
    zero_mean_emb = avg_emb - mean_value

    # Step 3: ℓ2-normalization (normalize by the L2 norm)
    l2_norm = torch.norm(zero_mean_emb, p=2)
    l2_normalized_emb = zero_mean_emb / l2_norm

    # Store the processed vector
    processed_videos.append(l2_normalized_emb)


In [27]:
def uuid2idx(uuid: str) -> int:
    """
    Getting index of a video in the dataframe via its uuid
    Arguments:
        uuid (str): uuid of the video
    Returns:
        index (int) Index of the passed video from the dataframe
    """
    return df[df['uuid'] == uuid].index[0]

In [28]:
# Put indexes of original, duplicate and non-duplicate videos into lists
dup_orig = []
originals = []
non_duplicates = []

for i in range(len(processed_videos)):
    if df.is_duplicate[i]:
        # dup_orig[i][0] is a duplicate
        # dup_orig[i][1] is the origin
        dup_orig.append((i, uuid2idx(df.duplicate_for[i])))
        originals.append(uuid2idx(df.duplicate_for[i]))
    else:
        non_duplicates.append(i)
        
# remove originals from non_duplicates
non_duplicates = [i for i in non_duplicates if i not in originals]

In [29]:
# Split data on train and test parts
train_originals, test_originals = train_test_split(originals, test_size=0.2, random_state=42, shuffle=False)
train_dup_orig = [i for i in dup_orig if i[1] in train_originals]
test_dup_orig = [i for i in dup_orig if i[1] in test_originals]

train_non_duplicates, test_non_duplicates = train_test_split(non_duplicates, test_size=0.2, random_state=42, shuffle=True)

In [30]:
from typing import List

# Create triplets
def euclidean_distance(v1: torch.tensor, v2: torch.tensor) -> torch.tensor:
    """
    Calculation of euclidian distance between v1 and v2
    Arguments:
        v1 (torch.tensor): First vector
        v2 (torch.tensor): Second vector
    Returns:
        euclid_dist (torch.tensor) Calculated euclidian distance between two passed vectors
    """
    return torch.norm(v1 - v2)

# Actually we have to find such v-, that dist(origin, v-) < dist(origin, v+),
# Where v- is non duplicate and v+ is duplicate.
# However, our dataset is small, and we can concat all v- and original videos
def find_closest_videos(origin: int, non_duplicates) -> List[int]:
    """
    Arguments:
        origin (int): Index of a video to start search from
        non_duplicates (List[int], numpy.array, or any iterable list with ints): List of indeces of non-duplicates videos

    Returns:
        results (List[int]): Indices of found videos with relatively small cos-distance
    """
    results = []
    min_dist = float('inf')
    min_id = 0
    for i in non_duplicates:
        # Find the minimum distance
        if euclidean_distance(processed_videos[origin], processed_videos[i]) < min_dist:
            min_dist = euclidean_distance(processed_videos[origin], processed_videos[i])
            min_id = i
        # Find the indicies with distance less than dist with v and v+
        # if euclidean_distance(processed_videos[origin], processed_videos[i]) < euclidean_distance(processed_videos[origin], processed_videos[duplet]):
        #     results.append(i)
        results.append(i)
    # If there is no such triplet, add at least one
    if len(results) == 0:
        results.append(min_id)
        
    return results

def create_triplets(dup_orig, non_duplicates: List[int]):
    """
    Creates triplets from list of pairs like "duplicate-original" and list of non-duplicates videos
    Arguments:
        dup_orig (List[(int, int)]): List of duplicate-original pairs
        non_duplicates (List[int]): List of non-duplicate videos
    Returns:
        triplets (List[(int, int, int)]): List of generated triplets
    """
    triplets = []
    for duplet, origin in dup_orig:
        results = find_closest_videos(origin, non_duplicates)
        for result in results:
            triplets.append((origin, duplet, result))
    return triplets

In [31]:
# Train and test triplets 
train_triplets = create_triplets(train_dup_orig, train_non_duplicates)
test_triplets = create_triplets(test_dup_orig, test_non_duplicates)

# Model

In [35]:
import torch.utils
import torch.utils.data
from torchvision.transforms import v2

# 1. Define a Triplet Dataset with Late Fusion
class TripletDataset(Dataset):
    """
    Dateset for storing triplets
    """
    def __init__(self, data, processed_videos: List[torch.tensor], transform = None):
        """
        Creates dataset
        Arguments:
            data (List[(int, int, int)]): List of (anchor, positive, negative) triplets
            processed_videos (List[torch.tensor]): List of video data as sequences of frames
            transform (torchvision.transform): transforms to apply on the data
        """
        self.data = data
        self.processed_videos = processed_videos
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """ 
        Gets triplet (anchor, positive, negative) by index
        Arguments:
            idx (int): Index to get triple by
        Returns:
            triplet ((torch.temsor, torch.tensor, torch.tensor)): Retrieved tripet
        """
        # If late fusion is not used, use the raw triplet data
        anchor, positive, negative = self.processed_videos[self.data[idx][0]], self.processed_videos[self.data[idx][1]], self.processed_videos[self.data[idx][2]]
        if self.transform:
            anchor = self.transform(anchor)
            positive = self.transform(positive)
            negative = self.transform(negative)
        return anchor, positive, negative
        

# 2. Define a Simple Neural Network for Embedding Generation
class EmbeddingNet(nn.Module):
    def __init__(self):
        super(EmbeddingNet, self).__init__()
        self.fc1 = nn.Linear(5488, 2500)  # Assuming input images are 28x28
        self.fc2 = nn.Linear(2500, 1000)
        self.fc3 = nn.Linear(1000, 500)  # Output embedding of size 500

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # Output embedding
        x = F.normalize(x, p=2, dim=1)  # Normalize embeddings to have unit norm
        return x

# 3. Triplet Loss with Regularization (Custom)
class CustomTripletLoss(nn.Module):
    def __init__(self, margin=1.0, lambda_reg=1e-3):
        super(CustomTripletLoss, self).__init__()
        self.margin = margin
        self.lambda_reg = lambda_reg
        self.triplet_loss = nn.TripletMarginLoss(margin=self.margin)

    def forward(self, anchor, positive, negative, model_params):
        # Compute the triplet loss
        loss = self.triplet_loss(anchor, positive, negative)

        # L2 regularization on model parameters
        reg_loss = 0
        for param in model_params:
            reg_loss += torch.sum(param ** 2)

        reg_loss = self.lambda_reg * reg_loss
        total_loss = loss + reg_loss
        return total_loss

# 4. Model Forward Pass with Late Fusion (Average Pooling)
class LateFusionModel(nn.Module):
    def __init__(self, base_model):
        super(LateFusionModel, self).__init__()
        self.base_model = base_model

    def forward(self, frames):
        # Process each frame independently
        batch_size, num_frames, _, _ = frames.shape
        frame_embeddings = []
        for i in range(num_frames):
            frame = frames[:, i, :, :]  # Extract each frame
            embedding = self.base_model(frame)
            frame_embeddings.append(embedding)

        # Perform late fusion by averaging the embeddings of all frames
        #fused_embedding = torch.stack(frame_embeddings, dim=1).mean(dim=1)
        # averaging
        fused_embedding = torch.mean(frame_embeddings, dim=0)
        # zero mean
        mean_value = torch.mean(fused_embedding)
        zero_mean_emb = fused_embedding - mean_value
        # l2
        fused_embedding = torch.norm(fused_embedding, p=2)
        fused_embedding = zero_mean_emb / fused_embedding
        
        return fused_embedding

# 5. Training Loop
def train_triplet_model(train_loader: torch.utils.data.DataLoader, model: nn.Module, optimizer: torch.optim, criterion: nn.Module, device: str = "cpu", num_epochs = 10):
    """
    Trains model to encode videos to embeddings
    Arguments:
        train_loader (torch.utils.data.DataLoader): dataloader for training
        model (nn.Module): model to train
        optimizer (torch.optim) optimizer for training
        criterion (nn.Module): loss function for optimization
        num_epochs (int): number of epoch to train the model for
        device (str): 'cuda' or 'cpu' depending on the machine and/or choice
    """
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch_idx, (anchor, positive, negative) in enumerate(train_loader):
            # Move data to the correct device
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
            # Forward pass: Compute embeddings
            anchor_emb = model(anchor)
            positive_emb = model(positive)
            negative_emb = model(negative)

            # Compute the triplet loss with regularization
            loss = criterion(anchor_emb, positive_emb, negative_emb, model.parameters())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate batch loss
            epoch_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.6f}')


In [36]:
# Open dataset and create dataloader
train_dataset = TripletDataset(train_triplets, processed_videos)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

# Initialize base model and late fusion model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = EmbeddingNet().to(device)
model = LateFusionModel(base_model).to(device)

# Loss function and optimizer
criterion = CustomTripletLoss(margin=1.0, lambda_reg=1e-3)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Train the model
train_triplet_model(train_loader, base_model, optimizer, criterion, num_epochs=2, device=device)

Epoch [1/2], Loss: 1.027908
Epoch [2/2], Loss: 0.479768


# Model evaluation

In [37]:
test = test_originals + test_non_duplicates + [do[0] for do in test_dup_orig]

# shuffle test
random.shuffle(test)

In [38]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
from tqdm import tqdm

# Before oppening the connection, run the docker image of qdrand
client = QdrantClient(url="http://localhost:6333")

# Delete the collection if it exists
client.delete_collection('video')

# Some parameters
video_emb_dim = 500
distance = Distance.EUCLID

# Create collection (db)
client.create_collection(
    collection_name="video",
    vectors_config=VectorParams(size=video_emb_dim, distance=distance)
)


d = []
# For each test video, find the closest video in the db.
# If the similarity greater than threshold, then put the test video in the db
# Else, we have found a duplicate
for i in tqdm(test):
    
    v = base_model(processed_videos[i].unsqueeze(0).to(device)).detach().cpu().numpy()[0]
    # search for the closest vector
    search_result = client.query_points(
        collection_name="video",
        query=v,
        with_payload=True,
        limit=5
    ).points

    # if the db is not empty
    if len(search_result) > 0:
        id1 = i
        id2 = search_result[0].id
        
        if search_result[0].score < 0.59:
            d.append((id1, id2))
            continue

    # if the db is empty then insert the vector
    client.upsert(
        collection_name="video",
        points=[PointStruct(id=int(i), vector=v, payload={'number': int(i), 'uuid': df.uuid[i], 'link': df.link[i]})]
    )

len(d)

100%|██████████| 502/502 [00:03<00:00, 160.93it/s]


80

In [40]:
def metrics(d, df: pd.DataFrame):
    """
    Calculates and prints precision, recall, and f1-score
    Arguments:
        d (List[(int, int)]): List of pairs of indices for detected video duplicates
        df (pd.DataFrame): Dataframe with all the stored info about videos
    """
    tp = 0
    fp = 0
    fn = 0
    for i, j in d:
        if df.is_duplicate[i]:
            if df.uuid[j] == df.duplicate_for[i]:
                tp += 1
        elif df.is_duplicate[j]:
            if df.uuid[i] == df.duplicate_for[j]:
                tp += 1
                
    fn = len(test_dup_orig) - tp

    fp = len(d) - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print(tp)
    print(f'Precision: {precision}\nRecall: {recall}\nF1: {f1}')

metrics(d, df)

73
Precision: 0.9125
Recall: 0.9240506329113924
F1: 0.9182389937106918


In [None]:
# for i, j in d:
#     print(df.uuid[i], "  | ",  df.uuid[j])

In [41]:
# save model
torch.save(base_model.state_dict(), 'model94_59_base.pt')
torch.save(model.state_dict(), 'model94_59.pt')

# TEST data

In [42]:
# load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.load_state_dict(torch.load('model94_59_base.pt', map_location=device))
model.load_state_dict(torch.load('model94_59.pt', map_location=device))

  base_model.load_state_dict(torch.load('model94_59_base.pt', map_location=device))
  model.load_state_dict(torch.load('model94_59.pt', map_location=device))


<All keys matched successfully>

In [43]:
existing_uuid = os.listdir('test_data_yappy/test_dataset/')
for i in range(len(existing_uuid)):
    existing_uuid[i] = existing_uuid[i].replace('.mp4', '')

existing_uuid = set(existing_uuid)

df_test = pd.read_csv('test_data_yappy/test.csv')
df_test = df_test[df_test['uuid'].isin(existing_uuid)]

video_uuids = df_test['uuid']
video_uuids = video_uuids.apply(lambda x: x + '.mp4').tolist()

root_dir = 'test_data_yappy'
dataset_dir = 'test_dataset'

In [44]:
# Open embeddings 
videos = []

for name in tqdm(video_uuids):
    full_emb_path = os.path.join(root_dir, 'emb', name.replace('.mp4', '.pt'))
    video_emb = torch.load(str(full_emb_path))
    videos.append(video_emb)  # Append video_emb to the list

  video_emb = torch.load(str(full_emb_path))
100%|██████████| 1000/1000 [00:01<00:00, 929.12it/s]


In [45]:
# Normalization
processed_videos = []

for video in videos:
    # Step 1: Average across frames (mean along the first dimension)
    avg_emb = torch.mean(video, dim=0)

    # Step 2: Zero-mean normalization (subtract the mean of the vector)
    mean_value = torch.mean(avg_emb)
    zero_mean_emb = avg_emb - mean_value

    # Step 3: ℓ2-normalization (normalize by the L2 norm)
    l2_norm = torch.norm(zero_mean_emb, p=2)
    l2_normalized_emb = zero_mean_emb / l2_norm

    # Store the processed vector
    processed_videos.append(l2_normalized_emb)

In [46]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
from datetime import datetime
from tqdm import tqdm

client = QdrantClient(url="http://localhost:6333")

client.delete_collection('video')

video_emb_dim = 500
distance = Distance.EUCLID

client.create_collection(
    collection_name="video",
    vectors_config=VectorParams(size=video_emb_dim, distance=distance)
)

d = []

for i in tqdm(range(len(df_test))):
    v = base_model(processed_videos[i].unsqueeze(0).to(device)).detach().cpu().numpy()[0]
    # search for the closest vector
    search_result = client.query_points(
        collection_name="video",
        query=v,
        with_payload=True,
        limit=5
    ).points

    # if the db is not empty
    if len(search_result) > 0:
        id1 = i
        id2 = search_result[0].id
        if search_result[0].score < 0.59:
            
            # check which video was uploaded earlier
            time1 = datetime.strptime(df_test.created[id1], '%Y-%m-%d %H:%M:%S')
            time2 = datetime.strptime(df_test.created[id2], '%Y-%m-%d %H:%M:%S')
            # put ids in original-duplicate order
            if time1 < time2:
                d.append((id1, id2))
            else:
                d.append((id2, id1))
            continue

    # if the db is empty then insert the vector
    client.upsert(
        collection_name="video",
        points=[PointStruct(id=int(i), vector=v, payload={'number': int(i)})]
    )

len(d)

100%|██████████| 1000/1000 [00:05<00:00, 183.27it/s]


327

In [47]:
for i, j in d:
    print(df_test.uuid[i], "  | ",  df_test.uuid[j])

4b005e08-6890-4c4b-b0cf-95dc3e1aabeb   |  fd52cbe5-012f-4451-abad-1c86c8279e8c
22ee80a3-d9ef-48d4-83d7-9c97cc7030c2   |  6f6f5da8-f997-491d-8fe0-64d262b2ee4c
045265e5-0d4c-4372-b960-3087f685eb97   |  cbf3948e-e8a0-4e61-858d-72b7ff2d2d43
2cfd6af3-7df6-4afa-8c3b-c17236c83c03   |  89fca4ee-4678-482b-8d6d-907dbc057151
0d62849b-d3b9-47af-a285-2f638bc9ac13   |  51b077bc-fac3-4cdb-bda4-06cd60d53af7
0d21cfda-39e3-42b8-be97-aff5835036cc   |  7f3217c6-5a31-4b1c-a9f2-a8cb82e46ae3
1c529118-dd4f-4c2d-a01a-1c5bf363ee4b   |  1eac74d7-40ff-4fad-9af1-4900da3e7c78
7f90be99-fc5a-4503-9bef-4f6fbf70f47c   |  2ca7bcc4-eb59-4507-853a-5ff76179ecad
49577a11-51b9-490a-b1f0-df17335219de   |  da9783ba-ceac-47ed-9d8f-30b614e938dd
1c2de832-9df0-46fe-88bb-d125596fdae1   |  e846d78e-63e7-4136-b681-6452013e50dc
42fbf1a0-2f00-4194-9733-675e19947a9e   |  edbacae6-1694-47f1-9abf-e6352acba998
3a1d73a5-27a8-4f8d-8ec9-a129bb16cf32   |  eb129696-beb7-482e-926a-6152ac28d4bd
3687b604-9481-4def-8b3e-7162c5546363   |  9791705f-d

In [48]:
df_test['is_duplicate'] = False
df_test['duplicate_for'] = None
for i, j in d:
    df_test.loc[j, 'is_duplicate'] = True
    df_test.loc[j, 'duplicate_for'] = df_test.uuid[i]

In [None]:
df_test.to_csv('submission.csv', index=False)