<a href="https://colab.research.google.com/github/lukiod/T2I-and-I2I-Report/blob/main/metrics_for_fid_and_clipscore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FID Score

In [16]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import os
from scipy.linalg import sqrtm
import warnings

# Suppress warnings so the cell output stays compact.
# NOTE(review): this is a blanket filter with no category/module restriction, so it
# also hides deprecation warnings raised by the libraries imported above — consider
# narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

def load_and_preprocess_image(image_path):
    """
    Load an image from disk and convert it into a batched InceptionV3 input tensor.

    The image is converted to RGB, resized to 299x299, turned into a tensor and
    normalized with the ImageNet per-channel mean/std.

    Args:
        image_path (str): Path to the image file on disk.

    Returns:
        torch.Tensor: Tensor of shape (1, 3, 299, 299) (batch dimension added).

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    # Fail early with a clear message before any image decoding is attempted
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    # Define transforms for InceptionV3 - resize to 299x299 and normalize
    preprocess = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Open inside a context manager so the underlying file handle is released
    # deterministically; convert() returns an independent in-memory image, so it
    # stays usable after the file is closed.
    with Image.open(image_path) as img:
        rgb_img = img.convert('RGB')

    # Apply preprocessing and add the leading batch dimension
    return preprocess(rgb_img).unsqueeze(0)

class InceptionFeatureExtractor(nn.Module):
    """
    Feature extractor built on torchvision's ImageNet-pretrained InceptionV3.

    The final fully connected layer is replaced by ``nn.Identity``, so the forward
    pass returns the activations that would normally be fed into the classifier
    instead of class logits. The wrapped network is put in evaluation mode and the
    forward pass never tracks gradients.
    """
    def __init__(self):
        super().__init__()
        # Prefer the `weights=` API: the `pretrained=True` flag is deprecated in
        # recent torchvision releases. IMAGENET1K_V1 is the checkpoint the legacy
        # flag maps to. Fall back to the legacy flag on older torchvision versions
        # that do not ship the weights enum.
        weights_enum = getattr(models, "Inception_V3_Weights", None)
        if weights_enum is not None:
            self.inception = models.inception_v3(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.inception = models.inception_v3(pretrained=True)
        # Remove the last fully connected layer
        self.inception.fc = nn.Identity()
        # Set to evaluation mode
        self.inception.eval()

    def forward(self, x):
        """
        Run the wrapped network on ``x`` without gradient tracking.

        Args:
            x (torch.Tensor): Batch of preprocessed images.

        Returns:
            torch.Tensor: Output of the network with its ``fc`` layer removed.
        """
        with torch.no_grad():
            # In training mode the wrapped model returns a pair (main output,
            # auxiliary output); only the main output is kept.
            if self.inception.training:
                output, _ = self.inception(x)
            else:
                output = self.inception(x)
        return output

def calculate_fid_for_two_images(image_path1, image_path2):
    """
    Calculate an FID-style distance between two individual images using PyTorch.

    FID compares two feature distributions through their means and covariances.
    With a single image per side there is no covariance to estimate; this
    adaptation assigns both sides the same covariance ``eps * I``. For identical
    covariances the FID covariance term
    ``trace(cov1 + cov2 - 2 * sqrt(cov1 @ cov2))`` is exactly zero, so the score
    reduces to the squared Euclidean distance between the two feature vectors.
    It is computed directly instead of building feature_dim x feature_dim
    matrices and running a dense matrix square root on them.

    Args:
        image_path1 (str): Path to the first image.
        image_path2 (str): Path to the second image.

    Returns:
        numpy.float64: The distance (lower means more similar) on success.
        str: ``"Error calculating FID: <message>"`` if anything fails; errors are
            reported through the return value rather than raised.
    """
    try:
        # Set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize feature extractor
        feature_extractor = InceptionFeatureExtractor().to(device)

        # Load and preprocess images
        img1 = load_and_preprocess_image(image_path1).to(device)
        img2 = load_and_preprocess_image(image_path2).to(device)

        # Extract features (one row per image; each batch holds a single image)
        feature1 = feature_extractor(img1).cpu().numpy()
        feature2 = feature_extractor(img2).cpu().numpy()

        # With one sample per side, the "mean" is the feature vector itself
        mean1, mean2 = feature1[0], feature2[0]

        # Sum of squared differences between the means. The covariance term of
        # the FID formula vanishes here (see docstring), so this is the score.
        ssdiff = np.sum((mean1 - mean2) ** 2.0)

        # Keep the float64 return type of the full FID expression
        return np.float64(ssdiff)

    except Exception as e:
        return f"Error calculating FID: {str(e)}"

def main():
    """
    Calculate and print the FID-style score between the images "1.png" and "2.png".

    The two paths are hard-coded and resolved relative to the current working
    directory. If the score cannot be computed, the error message is printed on
    its own instead of being presented as a score.
    """
    # Image paths
    image_path1 = "1.png"  # 512x512 RGB image
    image_path2 = "2.png"  # 512x512 RGB image

    print("Starting FID calculation between two images...")
    print(f"Image 1: {image_path1}")
    print(f"Image 2: {image_path2}")

    # Calculate FID
    fid_score = calculate_fid_for_two_images(image_path1, image_path2)

    # calculate_fid_for_two_images reports failures as a message string rather
    # than raising, so a str result is an error, not a score.
    if isinstance(fid_score, str):
        print(f"\n{fid_score}")
        return

    # Print result
    print(f"\nFID score between the two images: {fid_score}")

    # Additional information
    print("\nNote: Lower FID scores indicate more similar images.")
    print("FID typically measures distribution similarity between sets of images.")
    print("This adaptation works for individual images but should be interpreted accordingly.")

if __name__ == "__main__":
    main()

Starting FID calculation between two images...
Image 1: 1.png
Image 2: 2.png

FID score between the two images: 3.0817925930023193

Note: Lower FID scores indicate more similar images.
FID typically measures distribution similarity between sets of images.
This adaptation works for individual images but should be interpreted accordingly.


# CLIP Score

In [3]:
!pip install transformers torch Pillow requests

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [17]:
import torch
from PIL import Image
import requests # Keep for the function's URL capability
from io import BytesIO # Keep for the function's URL capability
from transformers import CLIPProcessor, CLIPModel
import os # To check if path exists

# --- Configuration ---
MODEL_NAME = "openai/clip-vit-base-patch32" # Or "openai/clip-vit-large-patch14"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# --- Load Model and Processor ---
# Load the CLIP model and processor just once; get_clip_score() below reads
# `model`, `processor` and `DEVICE` as module-level globals.
try:
    model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
    processor = CLIPProcessor.from_pretrained(MODEL_NAME)
    print(f"Successfully loaded model '{MODEL_NAME}' and processor.")
except Exception as e:
    print(f"Error loading model or processor: {e}")
    print("Please ensure you have installed the required libraries and have internet access.")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts down the
    # kernel and discards the traceback, while re-raising stops this cell with
    # the full error and keeps the session alive.
    raise

# --- Function to Calculate CLIP Score (same as before) ---
def get_clip_score(image_path_or_url, text_description, timeout=30):
    """
    Calculates the CLIP score (cosine similarity) between an image and a text description.

    Uses the module-level ``model``, ``processor`` and ``DEVICE``. All failures are
    reported by printing a message and returning None; no exception is propagated.

    Args:
        image_path_or_url (str): Path to the local image file OR URL of the image
            (anything starting with "http://" or "https://" is treated as a URL).
        text_description (str): The text description to compare with the image.
        timeout (float): Seconds to wait for the HTTP server when downloading an
            image from a URL. Ignored for local files.

    Returns:
        float: The cosine similarity score between the image and text embeddings (between -1 and 1).
               Returns None if image loading or processing fails.
    """
    image = None
    try:
        # Check if it's a URL
        if image_path_or_url.startswith(("http://", "https://")):
            # Always pass a timeout: without one, an unresponsive server makes the
            # call block indefinitely. The whole body is read via `.content`
            # anyway, so streaming mode is not needed.
            response = requests.get(image_path_or_url, timeout=timeout)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
        # Check if it's a local file path
        elif os.path.exists(image_path_or_url):
            # Context manager releases the file handle; convert() yields an
            # independent in-memory image that remains valid afterwards.
            with Image.open(image_path_or_url) as local_image:
                image = local_image.convert("RGB")
        else:
            print(f"Error: Image path or URL does not exist: {image_path_or_url}")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error downloading image from URL {image_path_or_url}: {e}")
        return None
    except FileNotFoundError:
        print(f"Error: Image file not found at: {image_path_or_url}")
        return None
    except Exception as e:
        print(f"Error opening or processing image {image_path_or_url}: {e}")
        return None

    if image is None:
        return None

    try:
        # --- Preprocess Data ---
        inputs = processor(
            text=[text_description], # Note: text is always a list
            images=image,
            return_tensors="pt",
            padding=True
        ).to(DEVICE)

        # --- Get Embeddings ---
        with torch.no_grad():
            outputs = model(**inputs)
            image_embeds = outputs.image_embeds
            text_embeds = outputs.text_embeds

        # --- Calculate Cosine Similarity ---
        # L2-normalize both embeddings so their dot product is the cosine similarity
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
        # One image and one text -> 1x1 similarity matrix; .item() extracts the scalar
        cosine_similarity = (image_embeds @ text_embeds.T).item()

        return cosine_similarity

    except Exception as e:
        print(f"Error during CLIP processing for {image_path_or_url}: {e}")
        return None


# --- Main Execution ---
if __name__ == "__main__":
    # Inputs for this run: one prompt scored against each listed image
    text_prompt = "turn it into a cyborg"
    image_files = ["1.png", "2.png"]

    print(f"\nCalculating CLIP scores for prompt: '{text_prompt}'")
    print("-" * 30)

    # Maps image path -> score, with None marking a skipped or failed image
    results = {}

    for image_path in image_files:
        print(f"Processing: {image_path}")

        if os.path.exists(image_path):
            # Score the image; get_clip_score returns None when it fails
            score = get_clip_score(image_path, text_prompt)
            results[image_path] = score

            if score is None:
                print("  Failed to calculate CLIP score.")
            else:
                print(f"  CLIP Score: {score:.4f}")
            print("-" * 10)
        else:
            # Missing file: record it as skipped and go on to the next image
            print(f"  WARNING: Image file not found at '{image_path}'. Skipping.")
            results[image_path] = None

    # Recap of every image in the order it was processed
    print("\n--- Summary ---")
    print(f"Prompt: '{text_prompt}'")
    for img, score in results.items():
        summary_line = (
            f"Image: {img} -> Score: Failed/Skipped"
            if score is None
            else f"Image: {img} -> Score: {score:.4f}"
        )
        print(summary_line)

Using device: cpu
Successfully loaded model 'openai/clip-vit-base-patch32' and processor.

Calculating CLIP scores for prompt: 'turn it into a cyborg'
------------------------------
Processing: 1.png
  CLIP Score: 0.2535
----------
Processing: 2.png
  CLIP Score: 0.2515
----------

--- Summary ---
Prompt: 'turn it into a cyborg'
Image: 1.png -> Score: 0.2535
Image: 2.png -> Score: 0.2515
