# Face Embedding Analysis

This notebook analyzes face embeddings and compares different embedding models.

## Table of Contents
1. Understanding Embeddings
2. Extracting Embeddings
3. Embedding Visualization
4. Similarity Metrics
5. Model Comparison

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from faceverify import FaceVerifier
from faceverify.config import VerifierConfig
from faceverify.embedding import EmbeddingGenerator

## 1. Understanding Embeddings

Face embeddings are numerical representations of faces in a high-dimensional vector space.

Key concepts:
- Each face is converted to a vector (e.g., 512 dimensions for Facenet512)
- Similar faces have similar vectors (close in space)
- Different faces have different vectors (far apart in space)

In [None]:
# Model identifiers compared in this notebook:
#   facenet     - 128-dimensional embeddings
#   facenet512  - 512-dimensional embeddings (recommended)
#   arcface     - 512-dimensional, state-of-the-art
#   vggface     - VGGFace model
EMBEDDING_MODELS = ["facenet", "facenet512", "arcface", "vggface"]

# Title, separator rule, then one bullet line per model name.
print("Available Embedding Models", "=" * 40, sep="\n")
print(*(f"  - {model}" for model in EMBEDDING_MODELS), sep="\n")

## 2. Extracting Embeddings

Extract face embeddings from images:

In [None]:
# Sample photos keyed by label; per the PCA cell below, person1_a and
# person1_b are the same person, while person2 is someone else.
IMAGES = dict(
    person1_a="../test_images/person1_a.jpg",
    person1_b="../test_images/person1_b.jpg",
    person2="../test_images/person2.jpg",
)

In [None]:
# Verifier built with its default configuration
verifier = FaceVerifier()

# label -> embedding, filled only for images that were processed successfully
embeddings = {}

for label in IMAGES:
    image_path = IMAGES[label]
    try:
        vector = verifier.extract_embedding(image_path)
        embeddings[label] = vector
        print(f"{label}: embedding shape = {vector.shape}")
    except Exception as err:
        # Best effort: report the failure and carry on with the next image.
        print(f"{label}: Error - {err}")

In [None]:
# Print a short numeric summary for every extracted embedding
print("\nEmbedding Statistics")
print("=" * 50)

for label, vec in embeddings.items():
    summary_lines = (
        f"\n{label}:",
        f"  Dimensions: {len(vec)}",
        f"  Min value:  {vec.min():.4f}",
        f"  Max value:  {vec.max():.4f}",
        f"  Mean:       {vec.mean():.4f}",
        f"  Std:        {vec.std():.4f}",
        f"  L2 Norm:    {np.linalg.norm(vec):.4f}",
    )
    print(*summary_lines, sep="\n")

## 3. Embedding Visualization

Visualize embeddings using dimensionality reduction:

In [None]:
# Stack the embeddings into one array, one row per image (rows follow `names`)
if len(embeddings) < 2:
    print("Need at least 2 embeddings for visualization")
else:
    names = list(embeddings)
    emb_matrix = np.array(list(embeddings.values()))

    print(f"Embedding matrix shape: {emb_matrix.shape}")

In [None]:
# Histogram of the component values of each embedding, one panel per image
if len(embeddings) >= 1:
    panel_count = len(embeddings)
    # squeeze=False always yields a 2-D grid, so a single panel needs no
    # special-casing; the first (only) row holds every panel.
    fig, axes = plt.subplots(
        1, panel_count, figsize=(5 * panel_count, 4), squeeze=False
    )

    for ax, (label, vec) in zip(axes[0], embeddings.items()):
        ax.hist(vec, bins=50, alpha=0.7, color='steelblue')
        ax.set_title(f"{label}")
        ax.set_xlabel("Value")
        ax.set_ylabel("Frequency")

    plt.tight_layout()
    plt.savefig("embedding_distributions.png", dpi=150)
    plt.show()

In [None]:
# Project the embeddings onto their first two principal components.
# Relies on `names` and `emb_matrix` from the embedding-matrix cell above.
if len(embeddings) >= 3:
    pca = PCA(n_components=2)
    emb_2d = pca.fit_transform(emb_matrix)

    plt.figure(figsize=(8, 6))

    # One colour per image, by position: person1_a and person1_b share blue
    # because they are the same person.
    colors = ['blue', 'blue', 'red']

    for label, color, (pc1, pc2) in zip(names, colors, emb_2d):
        plt.scatter(pc1, pc2, c=color, s=100, label=label)
        plt.annotate(label, (pc1, pc2), fontsize=10)

    # Share of total variance captured by each plotted component
    pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

    plt.title("Face Embeddings (PCA Projection)")
    plt.xlabel(f"PC1 ({pc1_share:.1%} variance)")
    plt.ylabel(f"PC2 ({pc2_share:.1%} variance)")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig("embedding_pca.png", dpi=150)
    plt.show()

## 4. Similarity Metrics

Compare different similarity/distance metrics:

In [None]:
from scipy.spatial.distance import cosine, euclidean

def compute_similarities(emb1, emb2):
    """Return several similarity/distance scores for a pair of embeddings.

    The result is a dict with four entries, in this order:

    - ``cosine_similarity``: one minus SciPy's cosine distance.
    - ``euclidean_distance``: Euclidean distance between the raw vectors.
    - ``normalized_euclidean``: Euclidean distance after scaling each
      vector to unit L2 norm.
    - ``dot_product``: dot product of the unit-norm vectors.
    """
    # Scale both inputs to unit length for the two normalized metrics.
    unit1 = emb1 / np.linalg.norm(emb1)
    unit2 = emb2 / np.linalg.norm(emb2)

    return {
        'cosine_similarity': 1 - cosine(emb1, emb2),
        'euclidean_distance': euclidean(emb1, emb2),
        'normalized_euclidean': euclidean(unit1, unit2),
        'dot_product': np.dot(unit1, unit2),
    }

In [None]:
# Report every metric for a same-person pair and a different-people pair
if len(embeddings) >= 2:
    print("Similarity Analysis")
    print("=" * 60)

    pairs = [
        ("person1_a", "person1_b", "Same person"),
        ("person1_a", "person2", "Different people"),
    ]

    for left, right, relation in pairs:
        # Skip a pair when either embedding could not be extracted earlier.
        if left not in embeddings or right not in embeddings:
            continue

        sims = compute_similarities(embeddings[left], embeddings[right])

        print(f"\n{left} vs {right} ({relation})")
        print("-" * 40)
        for metric in sims:
            print(f"  {metric}: {sims[metric]:.4f}")

In [None]:
# Heat-map of the cosine similarity between every pair of embeddings
if len(embeddings) >= 2:
    names = list(embeddings)
    n = len(names)

    # Entry [i, j] compares names[i] with names[j].
    vectors = [embeddings[label] for label in names]
    sim_matrix = np.array(
        [[1 - cosine(row_vec, col_vec) for col_vec in vectors]
         for row_vec in vectors]
    )

    # Colour scale is fixed to the [0, 1] range.
    plt.figure(figsize=(8, 6))
    plt.imshow(sim_matrix, cmap='RdYlGn', vmin=0, vmax=1)
    plt.colorbar(label='Cosine Similarity')

    tick_positions = range(n)
    plt.xticks(tick_positions, names, rotation=45, ha='right')
    plt.yticks(tick_positions, names)

    # Write each value inside its cell (x is the column, y is the row).
    for (row, col), score in np.ndenumerate(sim_matrix):
        plt.text(col, row, f'{score:.2f}', ha='center', va='center', fontsize=12)

    plt.title('Face Similarity Matrix')
    plt.tight_layout()
    plt.savefig("similarity_matrix.png", dpi=150)
    plt.show()

## 5. Model Comparison

Compare embeddings from different models:

In [None]:
def test_model(model_name, image_path):
    """Try to extract an embedding from ``image_path`` using ``model_name``.

    Returns a dict with keys ``model``, ``dimensions`` and ``status``.
    On success ``dimensions`` is the embedding length and ``status`` is
    ``"OK"``. On any exception ``dimensions`` stays 0 and ``status`` holds
    ``"Error: "`` followed by the first 40 characters of the message.
    """
    outcome = {"model": model_name, "dimensions": 0, "status": "OK"}
    try:
        model_config = VerifierConfig(embedding_model=model_name)
        model_verifier = FaceVerifier(config=model_config)
        outcome["dimensions"] = len(model_verifier.extract_embedding(image_path))
    except Exception as exc:
        # Keep the table readable by truncating long error messages.
        outcome["status"] = f"Error: {str(exc)[:40]}"
    return outcome

In [None]:
# Run every model against the first sample image and tabulate the outcome
test_image = [*IMAGES.values()][0]

print("Embedding Model Comparison")
print("=" * 60)
print(f"{'Model':<15} {'Dimensions':<15} {'Status'}")
print("-" * 60)

for candidate in EMBEDDING_MODELS:
    row = test_model(candidate, test_image)
    print(f"{row['model']:<15} {row['dimensions']:<15} {row['status']}")

## Summary

Key takeaways:

1. Face embeddings are high-dimensional vectors representing faces
2. Cosine similarity is commonly used to compare face embeddings
3. Same person = high similarity (e.g., above 0.65); different people = low similarity. The exact cutoff depends on the model and metric.
4. Facenet512 provides 512-dimensional embeddings with good accuracy
5. The embedding distribution can help identify quality issues

Next, see `04_threshold_tuning.ipynb` to learn about optimal threshold selection.