# Data Ingestion

In [17]:
# Load libraries
import numpy as np                      
import json                             
from collections import defaultdict     
import tensorflow_datasets as tfds
import numpy as np
import json

In [18]:

# Reproducibility seed, split ratios (train / val / test) and output location
SEED = 47
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
OUTPUT_DIR = "artifacts"

# Fail fast on a misconfigured split: the three ratios are meant to partition the whole dataset
assert np.isclose(train_ratio + val_ratio + test_ratio, 1.0), "Split ratios must sum to 1"

# Seed NumPy's global RNG, which the np.random.shuffle calls in this notebook draw from
np.random.seed(SEED)

# Load the LFW train split as (label, image) tuples; with_info=True returns the
# DatasetInfo alongside the dataset, so no second builder has to be constructed
data, data_info = tfds.load("lfw", split="train", as_supervised=True, with_info=True)


In [None]:
# Walk the dataset once and record (person name, position in the dataset) for every sample
entries = []
for position, (person, _) in enumerate(tfds.as_numpy(data)):
    # Labels may arrive as raw bytes; normalise them to text
    name = person.decode("utf-8") if isinstance(person, bytes) else person
    entries.append((name, position))

In [None]:
# Each entry is a (label, index) tuple, so the natural tuple ordering already
# sorts by label first and by original index second
entries.sort()

In [None]:
# Person names in sorted order; position k here is position k in the sorted `entries` list
labels = np.array([name for name, _ in entries])

In [20]:
# Shuffle every position once, then derive how many samples go to train and val
num_samples = len(labels)
indices = np.arange(num_samples)
np.random.shuffle(indices)
train_count = int(num_samples * train_ratio)
val_count = int(num_samples * val_ratio)

In [21]:
# Cut the shuffled indices at the two boundaries: [train | val | test (remainder)]
train_indices, val_indices, test_indices = np.split(
    indices, [train_count, train_count + val_count]
)

In [22]:
# Map each split name to its array of sample indices
dataset_split = dict(train=train_indices, val=val_indices, test=test_indices)

In [23]:
# Inspect the split dictionary (NumPy abbreviates the long index arrays in the repr)
dataset_split

{'train': array([9462, 4349, 8314, ..., 1677, 3856, 6702], shape=(10586,)),
 'val': array([10564,   914, 11281, ...,  8812, 10514,  2813], shape=(1323,)),
 'test': array([ 4994,  9753,  2841, ..., 11528,  1926,  5255], shape=(1324,))}

In [24]:
# Summary of how the split was produced and how large each part is.
# The split description is derived from the ratio constants so it cannot drift
# from the values that were actually used.
manifest = {
    "seed": SEED,
    "split_criteria": f"{train_ratio:.0%} train,{val_ratio:.0%} val,{test_ratio:.0%} test",
    "total_images": len(labels),
    "num_identities": len(set(labels)),
    "train_size": len(dataset_split["train"]),
    "val_size": len(dataset_split["val"]),
    "test_size": len(dataset_split["test"])
}

print(manifest)

{'seed': 47, 'split_criteria': '80% train,10% val,10% test', 'total_images': 13233, 'num_identities': 5749, 'train_size': 10586, 'val_size': 1323, 'test_size': 1324}


# Pair Generation

In [25]:
# Group sample indices by person, visiting samples in shuffled order,
# e.g. label_with_indices = {"Juan": [0, 2, 4], "Jong": [1, 3, 5]}
label_with_indices = defaultdict(list)
for index, label in zip(indices, labels[indices]):
    label_with_indices[label].append(index)

In [26]:
# Inspect the label -> indices mapping; this shows one entry per identity, so the output is very long
label_with_indices.items()

dict_items([(np.str_('Juan_Roman_Carrasco'), [np.int64(9462)]), (np.str_('Kai-Uwe_Ricke'), [np.int64(4349)]), (np.str_('Jong_Wook_Lee'), [np.int64(8314), np.int64(6565), np.int64(8238), np.int64(12446)]), (np.str_('George_W_Bush'), [np.int64(1546), np.int64(2128), np.int64(2207), np.int64(11604), np.int64(4374), np.int64(7356), np.int64(7828), np.int64(9673), np.int64(1653), np.int64(5550), np.int64(11865), np.int64(3967), np.int64(29), np.int64(289), np.int64(10707), np.int64(2712), np.int64(9565), np.int64(10973), np.int64(10349), np.int64(3900), np.int64(4887), np.int64(5645), np.int64(2403), np.int64(209), np.int64(509), np.int64(7371), np.int64(8104), np.int64(8216), np.int64(12987), np.int64(12590), np.int64(9594), np.int64(5023), np.int64(5869), np.int64(4725), np.int64(4357), np.int64(10495), np.int64(620), np.int64(8311), np.int64(10661), np.int64(1187), np.int64(12723), np.int64(5530), np.int64(8631), np.int64(5415), np.int64(11278), np.int64(7819), np.int64(7820), np.int64(9

In [27]:
# Generate positive pairs: every unordered pair of indices that share a label
pair_indices = []          # (index_a, index_b) tuples
pair_label = []            # 1 = same person, 0 = different people

for same_person in label_with_indices.values():
    # A label with fewer than two indices yields no pair: the inner loop body never runs
    for position, first in enumerate(same_person):
        for second in same_person[position + 1:]:
            pair_indices.append((first, second))
            pair_label.append(1)

In [28]:
# Generate negative pairs: every index of one label paired with every index of each later label.
# NOTE: this cell appends to the pair_indices / pair_label lists created by the positive-pair
# cell, so re-running it on its own appends the negative pairs a second time.
# NOTE(review): this enumerates all cross-label pairs (the stored output of `combined` shows
# 87,549,528 rows in total) -- consider sampling negatives if memory or runtime is a concern.
label_names = list(label_with_indices)                    # Snapshot the keys once instead of rebuilding the list per outer iteration
for i, label1 in enumerate(label_names):                  # Loop over each label as the first label
    indices1 = label_with_indices[label1]
    for j in range(i + 1, len(label_names)):              # Only later labels, so each label pair is visited once
        label2 = label_names[j]
        indices2 = label_with_indices[label2]
        for index1 in indices1:                           # Loop over indices of the first label
            for index2 in indices2:                       # Loop over indices of the second label
                pair_indices.append((index1, index2))     # Add the pair of indices to the list
                pair_label.append(0)                      # 0 marks a different-person pair


In [29]:
"""
1st loop : i=0, label 1 = Juan_Roman_Carrasco
2nd loop : loop over all the other labels ('Kai-Uwe_Ricke', 'Jong_Wook_Lee', 'George_W_Bush', 'Joschka_Fischer'.......)
3rd loop : loop over each index of (Juan_Roman_Carrasco : ([0, 3]))
4th loop : loop over each index of (Kai-Uwe_Ricke : [1, 5])
in 3rd loop at index 0, 4th loop on index 1: add pairs(0, 1) then loop on the 2nd index in label 2 which is 5 and add (0, 5),all of that
in a list with key zero
then loop again and add 3,1 and 3,5
"""

"\n1st loop : i=0, label 1 = Juan_Roman_Carrasco\n2nd loop : loop over all the other labels ('Kai-Uwe_Ricke', 'Jong_Wook_Lee', 'George_W_Bush', 'Joschka_Fischer'.......)\n3rd loop : loop over each index of (Juan_Roman_Carrasco : ([0, 3]))\n4th loop : loop over each index of (Kai-Uwe_Ricke : [1, 5])\nin 3rd loop at index 0, 4th loop on index 1: add pairs(0, 1) then loop on the 2nd index in label 2 which is 5 and add (0, 5),all of that\nin a list with key zero\nthen loop again and add 3,1 and 3,5\n"

In [30]:
# Convert the accumulated Python lists to NumPy arrays
pair_indices, pair_label = np.array(pair_indices), np.array(pair_label)

# Glue each index pair to its label so the rows can later be shuffled together:
# columns are (first index, second index, pair label)
combined = np.column_stack([pair_indices, pair_label])
combined

array([[ 8314,  6565,     1],
       [ 8314,  8238,     1],
       [ 8314, 12446,     1],
       ...,
       [ 3095,  8883,     0],
       [ 3095,  6728,     0],
       [ 8883,  6728,     0]], shape=(87549528, 3))

In [31]:
# Permute the rows of `combined` in place; each pair stays attached to its label
np.random.shuffle(combined)

# Separate the columns again: everything but the last column is the index pair,
# the last column is the pair label
pair_indices = combined[:, :-1]
pair_label = combined[:, -1]

In [32]:
# Inspect the shuffled index pairs
pair_indices

array([[ 4834,  3180],
       [ 3329, 10813],
       [12392,  9348],
       ...,
       [11837,  4934],
       [12639,  4089],
       [ 7299, 13103]], shape=(87549528, 2))

# Similarity

In [None]:
import numpy as np
import time

# Python Loop Implementations

def cosine_similarity_loop(img1, img2):
    """Cosine similarity of two images, computed with an explicit Python loop.

    Both inputs are cast to float64 and flattened to 1-D before the dot product
    and the two squared norms are accumulated element by element.

    Args:
        img1: NumPy array (any shape).
        img2: NumPy array with the same number of elements as ``img1``.

    Returns:
        Tuple ``(elapsed_seconds, similarity)``. ``similarity`` is 0.0 when
        either input has zero norm. The elapsed time covers the cast/flatten
        step as well as the loop.

    Raises:
        ValueError: if the inputs do not have the same number of elements.
    """
    start = time.perf_counter()                     # Start timer to measure runtime

    # Cast to float64 (avoids integer overflow for e.g. uint8 pixels) and flatten to 1-D
    vec1 = img1.astype(np.float64).ravel()
    vec2 = img2.astype(np.float64).ravel()

    # Without this check a longer img2 would be silently truncated and a shorter
    # one would raise IndexError mid-loop; the vectorized version raises ValueError too
    if vec1.size != vec2.size:
        raise ValueError(
            f"Inputs must have the same number of elements, got {vec1.size} and {vec2.size}"
        )

    dot_product = 0.0
    norm1 = 0.0
    norm2 = 0.0

    for a, b in zip(vec1, vec2):
        dot_product += a * b                        # running dot product
        norm1 += a ** 2                             # running sum of squares of img1
        norm2 += b ** 2                             # running sum of squares of img2

    norm1 = np.sqrt(norm1)
    norm2 = np.sqrt(norm2)

    # Cosine similarity is undefined for a zero vector; report 0.0 instead of dividing by zero
    if norm1 == 0 or norm2 == 0:
        return time.perf_counter() - start, 0.0

    similarity = dot_product / (norm1 * norm2)

    return time.perf_counter() - start, similarity

In [None]:
def euclidean_distance_loop(img1, img2):
    """Euclidean distance between two images, computed with an explicit Python loop.

    Both inputs are cast to float64 and flattened to 1-D; the squared
    element-wise differences are summed and the square root is returned.

    Args:
        img1: NumPy array (any shape).
        img2: NumPy array with the same number of elements as ``img1``.

    Returns:
        Tuple ``(elapsed_seconds, distance)``. The elapsed time covers the
        cast/flatten step as well as the loop.

    Raises:
        ValueError: if the inputs do not have the same number of elements.
    """
    start = time.perf_counter()

    # Cast to float64 first so that differences of unsigned pixel values cannot wrap around
    vec1 = img1.astype(np.float64).ravel()
    vec2 = img2.astype(np.float64).ravel()

    # Without this check a longer img2 would be silently truncated and a shorter
    # one would raise IndexError mid-loop
    if vec1.size != vec2.size:
        raise ValueError(
            f"Inputs must have the same number of elements, got {vec1.size} and {vec2.size}"
        )

    dist = 0.0
    for a, b in zip(vec1, vec2):
        diff = a - b                            # element-wise difference
        dist += diff * diff                     # accumulate the squared difference

    distance = np.sqrt(dist)                    # square root of the sum of squared differences

    return time.perf_counter() - start, distance

In [35]:
# Vectorized NumPy Implementations

def cosine_similarity(img1, img2):
    """Vectorized cosine similarity of two images.

    The inputs are cast to float64 and flattened to 1-D vectors.

    Returns:
        Tuple ``(elapsed_seconds, similarity)``; ``similarity`` is 0.0 when
        either vector has zero norm.
    """
    t0 = time.perf_counter()

    flat_a = img1.astype(np.float64).ravel()
    flat_b = img2.astype(np.float64).ravel()

    inner = flat_a @ flat_b
    length_a, length_b = np.linalg.norm(flat_a), np.linalg.norm(flat_b)

    # A zero vector has no direction, so report 0.0 rather than dividing by zero
    if length_a == 0 or length_b == 0:
        result = 0.0
    else:
        result = inner / (length_a * length_b)

    return time.perf_counter() - t0, result

In [36]:
def euclidean_distance(img1, img2):
    """Vectorized Euclidean distance between two images.

    The inputs are cast to float64 and flattened to 1-D vectors before the
    norm of their difference is taken.

    Returns:
        Tuple ``(elapsed_seconds, distance)``.
    """
    t0 = time.perf_counter()

    # Cast before subtracting so unsigned pixel values cannot wrap around
    delta = img1.astype(np.float64).ravel() - img2.astype(np.float64).ravel()
    result = np.linalg.norm(delta)

    return time.perf_counter() - t0, result

# Benchmark

In [None]:
def _compare_implementations(title, name, loop_fn, numpy_fn, img1, img2):
    """Run one loop/NumPy implementation pair on the same inputs, print the timings and check the results agree."""
    loop_time, loop_result = loop_fn(img1, img2)
    numpy_time, numpy_result = numpy_fn(img1, img2)

    # A timer reading of exactly 0 would otherwise raise ZeroDivisionError
    speedup = loop_time / numpy_time if numpy_time > 0 else float("inf")

    print(f"\n{title}")
    print(f"Loop Time: {loop_time:.5f} sec")
    print(f"Numpy Time: {numpy_time:.5f} sec")
    print(f"Speedup: {speedup:.2f}x")
    assert np.isclose(loop_result, numpy_result), f"{name} results do not match!"
    print(f"{name} correctness verified")


def benchmark(idx1=10, idx2=100):
    """Compare the loop and vectorized similarity functions on two LFW images.

    Loads the LFW train split, picks the images at positions ``idx1`` and
    ``idx2`` (in dataset iteration order), then prints the timing, the speedup
    and a correctness check for the cosine and the Euclidean implementations.

    Args:
        idx1: Non-negative position of the first image (default 10).
        idx2: Non-negative position of the second image (default 100).

    Raises:
        IndexError: if a requested position is not reached while iterating the dataset.
        AssertionError: if a loop result and its vectorized counterpart disagree.
    """
    # Loading images
    data = tfds.load("lfw", split="train", as_supervised=True)

    # Keep only the two requested images and stop as soon as both are found,
    # instead of materializing every image of the dataset in memory
    wanted = {idx1, idx2}
    found = {}
    for position, (_, image) in enumerate(tfds.as_numpy(data)):
        if position in wanted:
            found[position] = image
            if len(found) == len(wanted):
                break

    missing = sorted(wanted - found.keys())
    if missing:
        raise IndexError(f"Image position(s) {missing} not found in the dataset")

    # 2 samples
    img1 = found[idx1]
    img2 = found[idx2]

    # Cosine benchmark
    _compare_implementations(
        "Cosine Similarity", "Cosine", cosine_similarity_loop, cosine_similarity, img1, img2
    )

    # Euclidean benchmark
    _compare_implementations(
        "Euclidean Distance", "Euclidean", euclidean_distance_loop, euclidean_distance, img1, img2
    )
    