In [10]:
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm as tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
def offline_triplet_selection(embeddings_df, minibatch=1800, margin=0.2, max_triplets=None):
    triplets = []

    # Reordena aleatoriamente e extrai os embeddings diretamente
    embeddings_df = embeddings_df.sample(frac=1).reset_index(drop=True)

    # Corrige a extração de embeddings para lidar com arrays aninhados
    embeddings = np.stack(embeddings_df['embedding'].apply(lambda x: np.array(x[0], dtype=np.float32)).values)
    embeddings = torch.tensor(embeddings, dtype=torch.float32)

    ids = embeddings_df['id'].to_numpy()
    
    for i in tqdm(range(0, len(embeddings_df), minibatch), desc="Minibatches"):
        if max_triplets is not None and len(triplets) >= max_triplets:
            break  # Interrompe o processo se o número máximo de triplets for atingido

        batch_indices = list(range(i, min(i + minibatch, len(embeddings_df))))
        batch_embeddings = embeddings[batch_indices]
        batch_ids = ids[batch_indices]

        distances = torch.cdist(batch_embeddings, batch_embeddings, p=2)
        
        for anchor_idx in range(len(batch_indices)):
            if max_triplets is not None and len(triplets) >= max_triplets:
                break  # Interrompe o loop interno se o número máximo de triplets for atingido

            anchor_id = batch_ids[anchor_idx]
            positive_mask = batch_ids == anchor_id
            
            # Evita a própria âncora como positivo
            positive_mask[anchor_idx] = False
            positive_mask = torch.tensor(positive_mask)
            
            for positive_idx in torch.where(positive_mask)[0]:
                d_ap = distances[anchor_idx, positive_idx]
                
                negative_mask = (distances[anchor_idx] > d_ap) & (distances[anchor_idx] < d_ap + margin) & (~positive_mask)
                
                for negative_idx in torch.where(negative_mask)[0]:
                    dist = distances[anchor_idx, negative_idx].item() - d_ap.item()
                    triplets.append((batch_indices[anchor_idx], batch_indices[positive_idx], batch_indices[negative_idx], dist))

                    if max_triplets is not None and len(triplets) >= max_triplets:
                        break  # Interrompe o loop mais interno se o número máximo de triplets for atingido
                    
    print(f"Total de triplets: {len(triplets):,}".replace(',', '.'))
    
    triplets_df = pd.DataFrame(triplets, columns=['anchor_idx', 'positive_idx', 'negative_idx', 'dist'])

    # Mapeia os índices de volta para os caminhos das imagens usando apply e loc
    triplets_df['anchor_path'] = triplets_df['anchor_idx'].apply(lambda x: embeddings_df['path'].loc[x])
    triplets_df['positive_path'] = triplets_df['positive_idx'].apply(lambda x: embeddings_df['path'].loc[x])
    triplets_df['negative_path'] = triplets_df['negative_idx'].apply(lambda x: embeddings_df['path'].loc[x])

    triplets_img_paths_df = triplets_df[['anchor_path', 'positive_path', 'negative_path', 'dist']]
    
    return triplets_img_paths_df

In [12]:
lfw_train_embeddings = pd.read_pickle('../data/lfw_train_embeddings.pkl')

In [13]:
triplets_df = offline_triplet_selection(lfw_train_embeddings, minibatch=1800, margin=0.2, max_triplets=500_000)

Minibatches:   0%|          | 0/8 [00:00<?, ?it/s]

Total de triplets: 500.027


In [14]:
triplets_df.head()

Unnamed: 0,anchor_path,positive_path,negative_path,dist
0,../data/lfw-faces/James_Blake_0004.jpg,../data/lfw-faces/James_Blake_0008.jpg,../data/lfw-faces/Andre_Agassi_0018.jpg,0.15212
1,../data/lfw-faces/James_Blake_0004.jpg,../data/lfw-faces/James_Blake_0008.jpg,../data/lfw-faces/Juan_Pablo_Montoya_0002.jpg,0.198927
2,../data/lfw-faces/James_Blake_0004.jpg,../data/lfw-faces/James_Blake_0008.jpg,../data/lfw-faces/Kurt_Warner_0004.jpg,0.109461
3,../data/lfw-faces/James_Blake_0004.jpg,../data/lfw-faces/James_Blake_0008.jpg,../data/lfw-faces/Roger_Federer_0001.jpg,0.152242
4,../data/lfw-faces/James_Blake_0004.jpg,../data/lfw-faces/James_Blake_0008.jpg,../data/lfw-faces/Alexandre_Daigle_0001.jpg,0.175565


In [15]:
triplets_df = triplets_df.replace(to_replace='../data/lfw-faces/', value='', regex=True)

In [16]:
triplets_df.head()

Unnamed: 0,anchor_path,positive_path,negative_path,dist
0,James_Blake_0004.jpg,James_Blake_0008.jpg,Andre_Agassi_0018.jpg,0.15212
1,James_Blake_0004.jpg,James_Blake_0008.jpg,Juan_Pablo_Montoya_0002.jpg,0.198927
2,James_Blake_0004.jpg,James_Blake_0008.jpg,Kurt_Warner_0004.jpg,0.109461
3,James_Blake_0004.jpg,James_Blake_0008.jpg,Roger_Federer_0001.jpg,0.152242
4,James_Blake_0004.jpg,James_Blake_0008.jpg,Alexandre_Daigle_0001.jpg,0.175565


In [18]:
# Escolher 200k amostras aleatórias
triplets_df = triplets_df.sample(n=200_000, random_state=42)

0.19999957084655762

In [20]:
triplets_df.to_csv('../data/triplets_df.csv', index=False)