# LSO Results Metrics

This notebook computes and compares metrics for latent and LoRA space optimization results.

The first part of this notebook implements the general metric functions; the second part uses them to create the visualizations for the individual experiments.

## Setup

### Results Directory

In [None]:
from pathlib import Path

# Root directory that holds one sub-directory per (version, seed) run.
BASE_DIR = Path("../results").expanduser().resolve()


def get_result_dir(version: str, seed: int) -> Path:
    """
    Return the result directory of the run identified by version and seed.

    Allowed directory names (directly under BASE_DIR):
      <version>_<seed>
      <version>_<seed>_<anything>

    An exact ``<version>_<seed>`` directory always wins. Otherwise all
    suffixed directories are sorted by path and the last one is returned.

    Args:
        version (str): Version identifier used as directory-name prefix.
        seed (int): Seed that follows the version in the directory name.
    Returns:
        Path: The matching result directory.
    Raises:
        FileNotFoundError: If no matching directory exists under BASE_DIR.
    """
    # exact match first (no job-id)
    exact = BASE_DIR / f"{version}_{seed}"
    if exact.is_dir():
        return exact

    # Wildcard for any trailing "_<suffix>" (e.g. a job-id). Directories are
    # filtered explicitly: a trailing "/" in a glob pattern only restricts
    # matches to directories on Python 3.11+, older versions also yield files.
    candidates = sorted(
        path for path in BASE_DIR.glob(f"{version}_{seed}_*") if path.is_dir()
    )
    if not candidates:
        raise FileNotFoundError(
            f"No result directory found for version {version!r} and seed {seed} under {BASE_DIR}"
        )

    # NOTE: the order is lexicographic, not chronological. It only coincides
    # with "most recent" when the suffixes sort that way (e.g. equal-length
    # job-ids).
    return candidates[-1]

### Image Dataset

In [None]:
from pathlib import Path
import re

from torch.utils.data import Dataset
from PIL import Image

# Matches the "iter_<digits>" component and captures the digits.
_iter_re = re.compile(r"iter_(\d+)")


def iter_num_from_path(p: Path) -> int:
    """
    Return the iteration number embedded in a path.

    The number is taken from the first ``iter_<digits>`` occurrence in
    ``str(p)``; -1 is returned when the path contains no such occurrence.
    """
    found = _iter_re.search(str(p))
    if found is None:
        return -1
    return int(found.group(1))

def natural_name_key(name: str):
    """
    Build a sort key that orders digit runs inside a name numerically.

    The name is split into alternating non-digit and digit chunks; digit
    chunks become ints so that e.g. "img2" sorts before "img10".
    """
    key = []
    for chunk in re.split(r"(\d+)", name):
        key.append(int(chunk) if chunk.isdigit() else chunk)
    return key

class ImgDataset(Dataset):
    """
    Dataset over the PNG images of one run.

    Images are collected from ``data/samples/iter_*/<subdir>/*.png`` below the
    result directory of the given version and seed, restricted to an
    iteration range and ordered by iteration number, then by file name in
    natural (numeric-aware) order.
    """

    def __init__(self, version, seed, subdir="img_opt", transform=None, iteration_min=0, iteration_max=500):
        """
        Args:
            version (str): Version identifier; together with seed it selects the result directory.
            seed (int): Seed of the run whose images are loaded.
            subdir (str): Subdirectory of each data/samples/iter_* directory to look for images.
            transform (Optional[Any]): Optional callable applied to the loaded PIL image.
            iteration_min (int): Smallest iteration number to include (inclusive).
            iteration_max (int): Largest iteration number to include (inclusive).
        """
        self.root = get_result_dir(version, seed=seed)
        self.transform = transform
        self.iteration_min = iteration_min
        self.iteration_max = iteration_max

        def in_range(path):
            # Both bounds are inclusive
            return self.iteration_min <= iter_num_from_path(path) <= self.iteration_max

        def order(path):
            # Primary: iteration number, secondary: natural order of the file name
            return iter_num_from_path(path), natural_name_key(path.name)

        selected = [
            path
            for path in self.root.glob(f"data/samples/iter_*/{subdir}/*.png")
            if in_range(path)
        ]
        selected.sort(key=order)
        self.files = selected

    def __len__(self):
        """Return the number of selected image files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load the idx-th image as RGB and apply the transform if one was given."""
        rgb = Image.open(self.files[idx]).convert("RGB")
        return rgb if self.transform is None else self.transform(rgb)

## Fréchet Inception Distance (FID)

In [None]:
import yaml

import numpy as np
import torch
from torchvision import transforms

from src.metrics.fid import FIDScore

def get_fid_score(version, seeds=[42, 43, 44], iteration_min=0, iteration_max=500):
    """
    Compute the Fréchet Inception Distance (FID) score for a given version.

    For every seed the optimized images (``img_opt``) of the selected
    iteration range are scored against pre-computed real-image statistics.

    Args:
        version (str): the version identifier for the model.
        seeds (list): List of random seeds to use for evaluation.
        iteration_min (int): Minimum number of iterations to consider.
        iteration_max (int): Maximum number of iterations to consider.
    Returns:
        float: The mean FID score across seeds.
        float: The standard deviation of the FID score across seeds.
    """
    # Image size depends only on the version prefix, so derive it (and the
    # matching preprocessing) once instead of once per seed.
    if version.startswith("ex4_sdxl"):
        img_size = 1024
    elif version.startswith("ex4"):
        img_size = 512
    else:
        img_size = 256

    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    scores = []
    for seed in seeds:
        # Get the result directory for the given version and seed
        result_dir = get_result_dir(version, seed)

        # Load hparams yaml (context manager so the file handle is closed)
        with open(result_dir / "hparams.yaml", 'r') as f:
            hparams = yaml.safe_load(f)

        # Derive min and max property range (that has not been seen during optimization)
        opt_min = int(hparams['max_property_value'])
        opt_max = 5

        # Load optimized images as dataset
        img_opt_dataset = ImgDataset(
            version=version,
            seed=seed,
            subdir="img_opt",
            transform=transform,
            iteration_min=iteration_min,
            iteration_max=iteration_max
        )

        # A fresh FID instance per seed, as before.
        # NOTE(review): could be shared across seeds if FIDScore keeps no
        # per-dataset state — confirm before hoisting.
        fid_instance = FIDScore(img_size=img_size, device=device, num_workers=0)

        # Load real statistics
        fid_instance.load_real_stats(f"../data/ffhq/inception_stats/size_{img_size}_smile_{opt_min}_{opt_max}.pt")

        # Compute FID score for the optimized images
        fid_score = fid_instance.compute_score_from_data(img_opt_dataset)

        scores.append(float(fid_score))

    return np.mean(scores), np.std(scores)

In [None]:
# Example: FID mean and standard deviation over seeds 42-44 for the "ex4_sdxl_gbo" runs
get_fid_score(version="ex4_sdxl_gbo", seeds=[42, 43, 44])

## Perceptual Similarity (LPIPS)

In [None]:
import yaml

import numpy as np
import torch
from torchvision import transforms

from taming.modules.losses.lpips import LPIPS

# Initialize LPIPS instance (shared by all calls, kept in eval mode)
lpips = LPIPS().eval()


def get_lpips_score(version, seeds=[42, 43, 44], iteration_min=0, iteration_max=500):
    """
    Compute the Learned Perceptual Image Patch Similarity (LPIPS) score for a given version.

    For every seed the optimized images (``img_opt``) are compared with the
    original images (``img_orig``) of the same iteration range; the per-seed
    score is the mean over all returned LPIPS values.

    Args:
        version (str): the version identifier for the model.
        seeds (list): List of random seeds to use for evaluation.
        iteration_min (int): Minimum number of iterations to consider.
        iteration_max (int): Maximum number of iterations to consider.
    Returns:
        float: LPIPS score (mean across seeds).
        float: Standard deviation of LPIPS score across seeds.
    """
    # Move LPIPS to the appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lpips.to(device)

    # Identical preprocessing for optimized and original images
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    scores = []
    for seed in seeds:
        # Load optimized images as dataset
        img_opt_dataset = ImgDataset(
            version=version,
            seed=seed,
            subdir="img_opt",
            transform=transform,
            iteration_min=iteration_min,
            iteration_max=iteration_max
        )

        # Load original images as dataset
        img_orig_dataset = ImgDataset(
            version=version,
            seed=seed,
            subdir="img_orig",
            transform=transform,
            iteration_min=iteration_min,
            iteration_max=iteration_max
        )

        # Convert datasets to tensors.
        # NOTE: all selected images are held on the device at once.
        img_opt = torch.stack(list(img_opt_dataset), dim=0).to(device)
        img_orig = torch.stack(list(img_orig_dataset), dim=0).to(device)

        # Pure evaluation: no autograd bookkeeping needed
        with torch.no_grad():
            lpips_score = lpips(img_opt, img_orig).mean().cpu().item()
        scores.append(float(lpips_score))

    return np.mean(scores), np.std(scores)

In [None]:
# Example: LPIPS mean and standard deviation over seeds 42-44 for the "ex4_sdxl_gbo" runs
get_lpips_score(version="ex4_sdxl_gbo", seeds=[42, 43, 44])

## TopK

In [None]:
import numpy as np

def get_top_k(k, version, seeds=[42, 43, 44], iteration_max=500):
    """
    Compute the Top-K smile score (mean ± std over seeds).

    For each seed the first ``iteration_max`` entries of
    ``opt_point_properties`` are sorted in descending order and the K-th best
    value is taken. If fewer than K entries are available, the smallest
    available value is used instead.

    Args:
        k (int) : K in “Top-K”; must be at least 1.
        version (str) : Model/version identifier.
        seeds (iterable[int]) : Random seeds to aggregate over.
        iteration_max (int) : Max evaluations to consider.
    Returns:
        tuple(float, float): (mean_topk, std_topk) across the given seeds.
    Raises:
        ValueError: If k is smaller than 1.
        IndexError: If a run has no scores within the first iteration_max entries.
    """
    # k <= 0 would silently index from the wrong end of the sorted scores
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    topk_vals = []

    for seed in seeds:
        # Load the result file; the context manager closes the archive again.
        # NOTE(review): allow_pickle=True can execute arbitrary code when
        # loading untrusted archives — only use on self-produced results.
        result_file = get_result_dir(version, seed) / "results.npz"
        with np.load(result_file, allow_pickle=True) as results:
            # Smile scores of the first iteration_max evaluations
            scores = results["opt_point_properties"][:iteration_max]

        if len(scores) == 0:
            raise IndexError(
                f"No scores available for version {version!r}, seed {seed} "
                f"with iteration_max={iteration_max}"
            )

        # Sort descending and pick the K-th best (account for short runs)
        k_idx = min(k, len(scores)) - 1
        topk = np.sort(scores)[::-1][k_idx]
        topk_vals.append(topk)

    mean_topk = float(np.mean(topk_vals))
    std_topk = float(np.std(topk_vals))

    return mean_topk, std_topk

In [None]:
# Example: Top-10 score (mean, std over seeds 42-44) within the first 500 evaluations of "ex4_sdxl_gbo"
get_top_k(k=10, version="ex4_sdxl_gbo", seeds=[42, 43, 44], iteration_max=500)

## Mean Smile Score

In [None]:
import numpy as np

def get_smile_score(version, seeds=[42, 43, 44], iteration_max=500):
    """
    Compute the smile score mean and std for a given version and seeds.

    For each seed the first ``iteration_max`` entries of
    ``opt_point_properties`` are averaged; mean and standard deviation are
    then taken over these per-seed averages.

    Args:
        version (str): the version identifier for the model.
        seeds (list): list of seeds to quantify variability.
        iteration_max (int): Maximum number of iterations to consider for the smile score.
    Returns:
        float: Mean smile score
        float: Std smile score
    """
    # Per-seed average smile score
    scores = []
    for seed in seeds:
        # Load results dictionary; the context manager closes the archive again.
        # NOTE(review): allow_pickle=True can execute arbitrary code when
        # loading untrusted archives — only use on self-produced results.
        result_file = get_result_dir(version, seed) / "results.npz"
        with np.load(result_file, allow_pickle=True) as results:
            opt_point_properties = results['opt_point_properties'][:iteration_max]

        scores.append(opt_point_properties.mean(axis=0))

    # Compute mean and std across seeds (plain floats, as in get_top_k)
    mean_score = float(np.mean(scores))
    std_score = float(np.std(scores))

    return mean_score, std_score

In [None]:
# Example: mean smile score and its standard deviation over seeds 42-44 for "ex4_sdxl_gbo"
get_smile_score(version="ex4_sdxl_gbo", seeds=[42, 43, 44])

## Runtime

In [None]:
import re
import numpy as np

def get_log_time(version, operation, seeds=[42, 43, 44]):
    """
    Compute per-run mean time, then the grand mean ± std across runs.

    Durations are parsed from the "<word> train done in <t>s" or
    "<word> opt done in <t>s" lines of each run's main.log.

    Args:
        version (str): Version identifier of the runs.
        operation (str): Either "train" or "opt".
        seeds (list): Seeds of the runs to aggregate over.
    Returns:
        tuple: (mean, std) of the per-run mean durations.
    Raises:
        ValueError: If operation is unknown or a log contains no matching line.
    """
    known_patterns = (
        ("train", r"\b\w+\s+train done in ([\d.]+)s"),
        ("opt", r"\b\w+\s+opt done in ([\d.]+)s"),
    )
    line_re = None
    for name, pattern in known_patterns:
        if operation == name:
            line_re = re.compile(pattern)
            break
    if line_re is None:
        raise ValueError(f"Unknown operation type: {operation}")

    per_run_means = []
    for seed in seeds:
        log_path = get_result_dir(version, seed) / "main.log"

        with open(log_path, 'r') as f:
            log_lines = list(f)

        # Keep the captured duration of every line that matches
        hits = (line_re.search(log_line) for log_line in log_lines)
        times = [float(hit.group(1)) for hit in hits if hit is not None]

        if len(times) == 0:
            raise ValueError(f"No time found in log for seed {seed}")

        per_run_means.append(np.mean(times))

    # Aggregate the per-run means across all seeds
    return np.mean(per_run_means), np.std(per_run_means)

In [None]:
# Example: mean and standard deviation of the logged "opt" durations over seeds 42-44 for "ex4_sdxl_gbo"
get_log_time(version="ex4_sdxl_gbo", operation="opt", seeds=[42, 43, 44])

# Experiments

## Experiment 1 (SD-VAE)

### Original vs. Initial smile score

In [None]:
# Imported here so the cell does not depend on an earlier cell having run
import numpy as np

# Version suffixes of all experiment-1 runs (prefixed with "ex1_sd35_" below);
# defined once instead of being rebuilt for every seed.
ex1_combinations = [
    "gp_train_lbfgsb", "gp_train_lbfgsb_pca", "gp_train_lbfgsb_fi",
    "dngo_train_lbfgsb", "dngo_train_lbfgsb_pca", "dngo_train_lbfgsb_fi",
    "gp_train_trustconstr", "gp_train_trustconstr_pca", "gp_train_trustconstr_fi",
    "dngo_train_trustconstr", "dngo_train_trustconstr_pca", "dngo_train_trustconstr_fi",
    "gp_train_trustconstrgmm_pca", "gp_train_trustconstrgmm_fi",
    "dngo_train_trustconstrgmm_pca", "dngo_train_trustconstrgmm_fi",
    "gbo_train", "gbo_train_pca", "gbo_train_fi",
]

original = []
initial = []

for seed in [42, 43, 44]:
    for comb in ex1_combinations:
        # Get the result directory for the current combination
        result_dir = get_result_dir(f"ex1_sd35_{comb}", seed=seed)

        # Load the results file; the context manager closes the archive again
        with np.load(result_dir / "results.npz") as scores_file:
            # First five original and initial smile scores
            # (presumably the batch of the first iteration — TODO confirm)
            original.extend(scores_file['orig_point_properties'][:5].tolist())
            initial.extend(scores_file['init_point_properties'][:5].tolist())

print(f"Number of smile scores: {len(original)}")
print(f"Original smile scores: {np.mean(original):.2f} ± {np.std(original):.2f}")
print(f"Initial smile scores: {np.mean(initial):.2f} ± {np.std(initial):.2f}")

## Experiment 3 (LatentVQVAE)

### Top10 evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Result version identifier -> legend label
version_dict = {
    "ex3_dngo_train_lbfgsb": "DNGO",
    "ex3_gp_train_lbfgsb": "SparseGP",
    "ex3_gbo_train": "GBO",
}

result_dict = {}
max_iterations = 100
top_k = 10
# Evaluation budgets at which the Top-K score is computed (steps of 5)
x_axis = list(range(5, max_iterations+5, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # One (mean, std) pair per budget, aggregated over the three seeds
    stats = [get_top_k(top_k, version, seeds=[42, 43, 44], iteration_max=it) for it in x_axis]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('Top10 score')
ax.set_xlim(0, 100)
ax.set_ylim(0, 5)
# Horizontal reference line at y=2
ax.axhline(y=2, color='gray', linestyle='--', label='Input Max')
ax.legend(loc="lower right")

plt.tight_layout()
# plt.savefig("vis/ex3_top10_evolution.pdf", bbox_inches="tight")
plt.show()

## Experiment 4 (LoRAdapter)

### SD1.5 & SDXL Top10 evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Result version identifier -> legend label.
# Entries alternate SD1.5 / SDXL for the same surrogate model.
version_dict = {
    "ex4_sd15_gp_style": "SD1.5 SparseGP",
    "ex4_sdxl_gp": "SDXL SparseGP",
    "ex4_sd15_dngo_style": "SD1.5 DNGO",
    "ex4_sdxl_dngo": "SDXL DNGO",
    "ex4_sd15_gbo_style": "SD1.5 GBO",
    "ex4_sdxl_gbo": "SDXL GBO",
}

result_dict = {}
max_iterations = 100
top_k = 10
# Evaluation budgets at which the Top-K score is computed (steps of 5)
x_axis = list(range(5, max_iterations+5, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # One (mean, std) pair per budget, aggregated over the three seeds
    stats = [get_top_k(top_k, version, seeds=[42, 43, 44], iteration_max=it) for it in x_axis]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    # Consecutive pairs share a colour; odd entries (SDXL) are drawn dashed,
    # even entries (SD1.5) keep the default line style.
    pair_colour = f"C{idx // 2}"
    line_kwargs = {"linestyle": "--"} if idx % 2 else {}
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=pair_colour, **line_kwargs)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=pair_colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('Top10 score')
ax.set_xlim(0, 100)
ax.set_ylim(0, 5)
# Horizontal reference line at y=2
ax.axhline(y=2, color='gray', linestyle='--', label='Input Max')
ax.legend(loc="lower right")

plt.tight_layout()
# plt.savefig("vis/ex4_top10_evolution.pdf", bbox_inches="tight")
plt.show()

### Retraining Top10 evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Result version identifier -> legend label
version_dict = {
    "ex4_sd15_dngo_long": "w/o Retraining",
    "ex4_sd15_dngo_long_retrain": "w/ Retraining",
}

result_dict = {}
max_iterations = 500
top_k = 50
# Evaluation budgets at which the Top-K score is computed (steps of 5)
x_axis = list(range(5, max_iterations+5, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # One (mean, std) pair per budget, aggregated over the three seeds
    stats = [get_top_k(top_k, version, seeds=[42, 43, 44], iteration_max=it) for it in x_axis]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('Top50 score')
ax.set_xlim(0, 500)
ax.set_ylim(0, 5)
# Horizontal reference line at y=2
ax.axhline(y=2, color='gray', linestyle='--', label='Input Max')
ax.legend(loc="lower right")

plt.tight_layout()
# plt.savefig("vis/ex4_retraining_evolution.pdf", bbox_inches="tight")
plt.show()

## Experiment 5 (Comparison)

### Top10 evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Result version identifier -> legend label
version_dict = {
    "ex5_sd35_dngo_long": "LSO-SD",
    "ex5_latentvqvae_dngo_long": "LSO-LatentVQVAE",
    "ex4_sd15_dngo_long": "LoRASO",
}

result_dict = {}
max_iterations = 500
top_k = 10
# Evaluation budgets at which the Top-K score is computed (steps of 5)
x_axis = list(range(5, max_iterations+5, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # One (mean, std) pair per budget, aggregated over the three seeds
    stats = [get_top_k(top_k, version, seeds=[42, 43, 44], iteration_max=it) for it in x_axis]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('Top10 score')
ax.set_xlim(0, 500)
ax.set_ylim(0, 5)
# Horizontal reference line at y=2
ax.axhline(y=2, color='gray', linestyle='--', label='Input Max')
ax.legend(loc="lower right")

plt.tight_layout()
# plt.savefig("vis/ex5_top10_evolution.pdf", bbox_inches="tight")
plt.show()

### Top50 evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Result version identifier -> legend label
version_dict = {
    "ex5_sd35_dngo_long": "LSO-SD",
    "ex5_latentvqvae_dngo_long": "LSO-LatentVQVAE",
    "ex4_sd15_dngo_long": "LoRASO",
}

result_dict = {}
max_iterations = 500
top_k = 50
# Evaluation budgets at which the Top-K score is computed (steps of 5)
x_axis = list(range(5, max_iterations+5, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # One (mean, std) pair per budget, aggregated over the three seeds
    stats = [get_top_k(top_k, version, seeds=[42, 43, 44], iteration_max=it) for it in x_axis]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('Top50 score')
ax.set_xlim(0, 500)
ax.set_ylim(0, 5)
# Horizontal reference line at y=2
ax.axhline(y=2, color='gray', linestyle='--', label='Input Max')
ax.legend(loc="lower right")

plt.tight_layout()
# plt.savefig("vis/ex5_top50_evolution.pdf", bbox_inches="tight")
plt.show()

### FID evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Result version identifier -> legend label
version_dict = {
    "ex5_sd35_dngo_long": "LSO-SD",
    "ex5_latentvqvae_dngo_long": "LSO-LatentVQVAE",
    "ex4_sd15_dngo_long": "LoRASO",
}

result_dict = {}
max_iterations = 500
# Plot positions 5, 55, ..., 455 (steps of 50)
x_axis = list(range(5, max_iterations+1, 50))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # Cumulative FID: each point covers iterations 0 .. it-5 (inclusive),
    # one (mean, std) pair per point, aggregated over the three seeds
    stats = [
        get_fid_score(version, seeds=[42, 43, 44], iteration_min=0, iteration_max=it-5)
        for it in x_axis
    ]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('FID score')
ax.set_xlim(0, 500)
ax.legend(loc="upper right")

plt.tight_layout()
# plt.savefig("vis/ex5_fid_evolution.pdf", bbox_inches="tight")
plt.show()

### LPIPS evolution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Result version identifier -> legend label
version_dict = {
    "ex5_sd35_dngo_long": "LSO-SD",
    "ex5_latentvqvae_dngo_long": "LSO-LatentVQVAE",
    "ex4_sd15_dngo_long": "LoRASO",
}

result_dict = {}
max_iterations = 500
# Plot positions 5, 10, ..., 500 (steps of 5)
x_axis = list(range(5, max_iterations+1, 5))

fig, ax = plt.subplots(figsize=(6, 4))

for idx, (version, version_name) in enumerate(version_dict.items()):
    # Per-iteration LPIPS: each point covers only iteration it-5,
    # one (mean, std) pair per point, aggregated over the three seeds
    stats = [
        get_lpips_score(version, seeds=[42, 43, 44], iteration_min=it-5, iteration_max=it-5)
        for it in x_axis
    ]
    mean_arr = np.array([mean for mean, _ in stats])
    std_arr = np.array([std for _, std in stats])

    colour = f"C{idx}"
    ax.plot(x_axis, mean_arr, label=f'{version_name}', color=colour)
    # Shaded band: mean ± one standard deviation across seeds
    ax.fill_between(x_axis, mean_arr-std_arr, mean_arr+std_arr, alpha=0.2, color=colour)

ax.set_xlabel('Number of smile classifier evaluations')
ax.set_ylabel('LPIPS score')
ax.set_xlim(0, 500)
ax.set_ylim(0, 1)
ax.legend(loc="upper right")

plt.tight_layout()
# plt.savefig("vis/ex5_lpips_evolution.pdf", bbox_inches="tight")
plt.show()