# PCA

This notebook analyzes the PCA components and distances of FFHQ latent representations obtained with the Stable Diffusion autoencoder.

## Setup

In [None]:
import numpy as np
import torch

### Load SD latents

If these latents are not available, they can be generated by running `src/run/encode_ffhq_to_sd_latents.py`.

In [None]:
# Load the precomputed SD latents (`torch` is imported in the setup cell).
# `map_location="cpu"` lets the file load on machines that lack the device the
# tensor was saved from; the latents are moved to the CPU for PCA anyway.
# NOTE(review): `weights_only=False` unpickles arbitrary objects, so only load a
# file you generated yourself. If the file holds a plain tensor, switching to
# `weights_only=True` would be the safer choice — confirm before changing.
latents = torch.load("../data/ffhq/sd_latents.pt", map_location="cpu", weights_only=False)

# Shape of a single latent (every dimension after the first), stored for later
latent_shape = latents.shape[1:]

In [None]:
# Prepare latents for PCA: flatten every latent into one row of a 2-D array.
# `detach()` and `reshape()` keep this working for tensors that require grad or are
# non-contiguous, where `.numpy()` and `.view()` would raise.
latents_flat = latents.detach().reshape(latents.size(0), -1).cpu().numpy()

### Load smile scores

In [None]:
import json

# Load the smile scores (a mapping from file name to score); the context manager
# makes sure the file handle is closed again
with open("../data/ffhq/smile_scores.json", "r") as scores_file:
    smile_scores_by_file = json.load(scores_file)

# Order the scores by file name and convert them to an array
# NOTE(review): assumes this order matches the row order of the latents — confirm
smile_scores = np.array([smile_scores_by_file[name] for name in sorted(smile_scores_by_file)])

In [None]:
import matplotlib.pyplot as plt

# Distribution of the smile scores, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(smile_scores, bins=50, color='blue', alpha=0.7)
ax.set_title("Histogram of Smile Scores")
ax.set_xlabel("Smile Score")
ax.set_ylabel("Frequency")
ax.grid()
plt.show()

## Fit PCA

In [None]:
import pickle
from pathlib import Path

from sklearn.decomposition import PCA

# Fit PCA (default settings) on the flattened latents
pca = PCA()
pca.fit(latents_flat)

# Save the PCA model. The target directory is created first so that the fitted
# model is not lost to a FileNotFoundError when the folder does not exist yet.
pca_model_path = Path("../models/feature_selection/sd_latents_pca_model.pkl")
pca_model_path.parent.mkdir(parents=True, exist_ok=True)
with open(pca_model_path, "wb") as f:
    pickle.dump(pca, f)

In [None]:
import pickle

# Restore the fitted PCA model from disk.
# NOTE: unpickling can execute arbitrary code — only load a file you created yourself.
pca_model_file = "../models/feature_selection/sd_latents_pca_model.pkl"
with open(pca_model_file, "rb") as model_file:
    pca = pickle.load(model_file)

## Transform SD latents into PCA space

In [None]:
# Project the flattened latents with the fitted PCA model; the result is used
# below as one row per latent and one column per PCA component
pca_latents = pca.transform(latents_flat)

## Analysis and Visualization

In [None]:
import matplotlib.pyplot as plt

# Running total of the explained-variance ratio over the PCA components
cumulative_variance = pca.explained_variance_ratio_.cumsum()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(cumulative_variance)
ax.set(
    title="Cumulative Explained Variance by PCA Components",
    xlabel="PCA Component Index",
    ylabel="Cumulative Explained Variance Ratio",
)
ax.grid()
plt.show()

In [None]:
# Smile-score thresholds that define the two groups being compared
HIGH_SMILE_THRESHOLD = 3.0
LOW_SMILE_THRESHOLD = 1.0

# The boolean masks below are only meaningful with exactly one score per latent
assert len(smile_scores) == len(pca_latents), "smile scores and PCA latents differ in length"

high_smile_mask = smile_scores > HIGH_SMILE_THRESHOLD
low_smile_mask = smile_scores < LOW_SMILE_THRESHOLD

# An empty group would silently turn every per-component difference into NaN
assert high_smile_mask.any() and low_smile_mask.any(), "a smile-score group is empty"

# Subset PCA latents for high and low smile scores
pca_latents_high = pca_latents[high_smile_mask]
pca_latents_low = pca_latents[low_smile_mask]

# Absolute difference between the two group means, per PCA component
pca_diff = np.abs(pca_latents_high.mean(axis=0) - pca_latents_low.mean(axis=0))

In [None]:
# Summary statistics of the per-component differences, one output line per entry
stats_lines = [
    f"Mean: {pca_diff.mean()}, Std: {pca_diff.std()}",
    f"Min: {pca_diff.min()}, Max: {pca_diff.max()}",
    f"Median: {np.median(pca_diff)}, 25th Percentile: {np.percentile(pca_diff, 25)}, 75th Percentile: {np.percentile(pca_diff, 75)}",
]
print("\n".join(stats_lines))

In [None]:
def _unit_sum(values):
    """Scale `values` so that its entries add up to 1."""
    return values / np.sum(values)


# Baseline: rank the components by absolute difference (largest first) and normalize
sorted_indices = np.argsort(pca_diff)[::-1]
pca_diff_0 = _unit_sum(pca_diff[sorted_indices])

# Alternative 1: weight each difference by the component's explained-variance
# ratio first, then rank by the weighted value
weighted_diff = pca_diff * pca.explained_variance_ratio_
sorted_indices_weighted = np.argsort(weighted_diff)[::-1]
pca_diff_1 = _unit_sum(weighted_diff[sorted_indices_weighted])

# Alternative 2: keep the baseline ranking and apply the same weighting afterwards
pca_diff_2 = _unit_sum(pca_diff[sorted_indices] * pca.explained_variance_ratio_[sorted_indices])

# One cumulative curve per variant
curves = [
    (pca_diff_0, dict(color='green', label="Without eigenvalue weighting (diff, sorted)")),
    (pca_diff_1, dict(color='blue', label="With eigenvalue weighting (diff, *var, sorted)")),
    (pca_diff_2, dict(color='orange', linestyle='--', label="With eigenvalue weighting (diff, sorted, *var)")),
]

fig, ax = plt.subplots(figsize=(10, 5))
for values, line_style in curves:
    ax.plot(np.cumsum(values), **line_style)
ax.set_title("Cumulative Sum of Absolute Differences in PCA Components")
ax.set_xlabel("PCA Component Index (sorted by difference)")
ax.set_ylabel("Cumulative Absolute Difference")
ax.set_xlim(0, 2048)
ax.grid()
ax.legend()
plt.show()

In [None]:
# The 512 highest-ranked components under each ranking
top_512_indices, top_512_indices_weighted = sorted_indices[:512], sorted_indices_weighted[:512]

# How many of them are among the first 512 PCA components (0-based index < 512)
count_512 = (top_512_indices < 512).sum()
count_512_weighted = (top_512_indices_weighted < 512).sum()

# Components selected by both rankings
shared_indices = set(top_512_indices).intersection(top_512_indices_weighted)

print(f"Count of top 512 indices (unweighted): {count_512}")
print(f"Count of top 512 indices (weighted): {count_512_weighted}")
print(f"Overlap between two methods: {len(shared_indices)}")

In [None]:
# Overlay the cumulative explained variance and the cumulative normalized
# smile-score difference in one figure. Note that `pca_diff_0` is ordered by
# difference size, not by component index.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pca.explained_variance_ratio_.cumsum(), label="Total Explained Variance")
ax.plot(pca_diff_0.cumsum(), label="Smile Score Variance")

ax.set(
    title="Cumulative Explained Variance by PCA Components",
    xlabel="Number of PCA Components",
    ylabel="Cumulative Variance",
)
ax.grid()
ax.legend(loc='lower right')
# plt.savefig("vis/cum_pca_variance.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Count the 512 top-ranked (unweighted) components that lie among the first 512
# PCA components, and report that count as a share of 512
hits_in_first_512 = (sorted_indices[:512] < 512).sum()
print("Number of smile score indices in the top 512:", hits_in_first_512,
      "This corresponds to a percentage of", hits_in_first_512 / 512 * 100, "%")