# Automated Metrics
Summary: https://pypi.org/project/piq/

In [1]:
!pip install opencv-python -q
!pip install torchvision -q
!pip install torch-fidelity -q
!pip install numpy -q
!pip install torch -q
!pip install 'transformers>=4.10.0' -q
#!pip install https://github.com/Lightning-AI/torchmetrics/archive/master.zip -q

In [2]:
# DreamSIM
!pip install dreamsim -q
#!mkdir models/
#!wget -O models/open_clip_vitb32_pretrain.pth.tar https://github.com/ssundaram21/dreamsim/releases/download/v0.1.0/open_clip_vitb32_pretrain.pth.tar -q
from dreamsim import dreamsim

In [3]:
!python -m pip install torchmetrics -q
!python3 -m pip install torchmetrics -q

In [4]:
from torchmetrics.multimodal.clip_score import CLIPScore
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchmetrics.image import StructuralSimilarityIndexMeasure, PeakSignalNoiseRatio
from PIL import Image
from torchvision import transforms
from torchmetrics.image.fid import FrechetInceptionDistance
from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError

In [22]:
import json
import pandas as pd
import re
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Load manually sampled examples:
- Good examples have a "human_rating_binary" of 1,
- Bad examples of 0

In [8]:
samples = pd.read_csv("./samples.csv", sep=";")

In [9]:
samples.head()

Unnamed: 0,id,turn,human_rating_binary
0,49,2,0
1,49,3,0
2,5434,3,0
3,43425,2,0
4,54492,2,0


Load input image names and text instruction 

In [10]:
with open('./edit_turns.json') as f:
    turns = json.load(f)

In [11]:
turns[:2]

[{'input': '142585-input.png',
  'mask': '142585-mask1.png',
  'output': '142585-output1.png',
  'instruction': 'Let the van turn black.'},
 {'input': '392687-input.png',
  'mask': '392687-mask1.png',
  'output': '392687-output1.png',
  'instruction': 'change the scooter into a skateboard'}]

## Scores

### Clip-Score
- [Link](https://lightning.ai/docs/torchmetrics/stable/multimodal/clip_score.html)
- Score is between 0 and 100, higher = better
- Input
    - img = single tensor with shape (N, C, H, W), or multiple tensors
    - text = str, or tensor
- Output
    - float scalar tensor
- Paras
    - model_name_or_path, e.g. "openai/clip-vit-base-patch16"

In [12]:
clip = CLIPScore(model_name_or_path="openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

## FID
[Link](https://lightning.ai/docs/torchmetrics/stable/image/frechet_inception_distance.html)
- To measure the similarity (distribution) between two datasets of images
- Requires multiple images for calculation!! Still incorporate it?

In [12]:
#fid = FrechetInceptionDistance(feature=64)
#fid.set_dtype(torch.float64) # for better results

## LPIPS
[Link](https://lightning.ai/docs/torchmetrics/stable/image/learned_perceptual_image_patch_similarity.html)
- Calculates perceptual similarity between two images, low=more similar
- Has been shown to match human perception

Input
- 2 tensors img, shape (N, 3, H, W). The minimum size of H, W depends on the chosen backbone

Parameters
- net_type (Literal['vgg', 'alex', 'squeeze'])
- reduction (Literal['sum', 'mean']) – str indicating how to reduce over the batch dimension. Choose between ‘sum’ or ‘mean’.
- normalize (bool) – by default this is False meaning that the input is expected to be in the [-1,1] range. If set to True will instead expect input to be in the [0,1] range.

In [13]:
lpips = LearnedPerceptualImagePatchSimilarity(
    net_type='vgg', 
    reduction='mean', 
    normalize=True
)

  self.load_state_dict(torch.load(model_path, map_location="cpu"), strict=False)


## SSIM

[Link](https://lightning.ai/docs/torchmetrics/stable/image/structural_similarity.html)
- Perceptual metric that quantifies image quality degradation
- Between -1 and 1, where 1 indicates perfect similarity, 0 indicates no similarity, and -1 indicates perfect anti-correlation

Output: 
- float scalar tensor with average SSIM value over sample else returns tensor of shape (N,) with SSIM values per sample

In [14]:
ssim = StructuralSimilarityIndexMeasure(
    gaussian_kernel=True,
    sigma=1.0,
    reduction='elementwise_mean',
    data_range=None
)

## PSNR

[Link](https://lightning.ai/docs/torchmetrics/stable/image/peak_signal_noise_ratio.html)
- peak signal-to-noise ratio (PSNR)
- Ratio between the maximum possible value (power) of a signal and the power of distorting noise that affects the quality of its representation

Output
- Range from 0 to infinity - Typically range from about 20 dB to 50 dB.
- Lower values indicate poorer quality, while higher values indicate better quality. 0 dB indicates that the images are identical.

Orientation
- High Quality: Above 30 dB.
- Medium Quality: Between 20 dB and 30 dB.
- Low Quality: Below 20 dB.

In [15]:
psnr = PeakSignalNoiseRatio(
    data_range=None,
    base=10,
    reduction='elementwise_mean',
    dim=None
)

## MAE und MSE
[Link to MAE](https://lightning.ai/docs/torchmetrics/stable/regression/mean_absolute_error.html)
- Mean Absolute Error

[Link to MSE](https://lightning.ai/docs/torchmetrics/stable/regression/mean_squared_error.html)
- Mean Square Error
- Compares the “true” pixel values of the original image to the degraded image
- The MSE represents the average of the squares of the "errors" between the actual image and the noisy image.

Output
- Between 0 and infinity
- A lower MAE value indicates that the edited image is closer to the original image, while a higher MAE indicates greater differences

In [16]:
mean_absolute_error = MeanAbsoluteError()
mean_squared_error = MeanSquaredError()

## DreamSim

- To measure the perceptual similarity between two images
- How? Concatenating CLIP, OpenCLIP, and DINO embeddings, and then finetuning on human perceptual judgements
- Higher score means more different, lower means more similar
- Link to [Notebook](https://colab.research.google.com/drive/1taEOMzFE9g81D9AwH27Uhy2U82tQGAVI?usp=sharing#scrollTo=zD2XN-UAvCZq)

In [17]:
#!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [18]:
dreamsim_model, dreamsim_preprocess = dreamsim(pretrained=True, device="cpu")

Using cache found in ./models/facebookresearch_dino_main


Using cached ./models


  WeightNorm.apply(module, name, dim)


## Calculate

In [32]:
pattern = r'(\d+)-output(\d+)'
target_size = (1024, 1024)
    
def get_path(human_rating):
    if human_rating==0:
        path = "../bad_samples"
    elif human_rating==1:
        path = "../good_samples"
    else: 
        print("Check Binary human rating, neither 0 nor 1!")
    return path
    
def get_mask_area(output_image, mask_image):
    mask_image = mask_image.convert('RGB')
    output_array = np.array(output_image)
    mask_array = np.array(mask_image)
    masked_area = cv2.absdiff(output_array, mask_array)
    return masked_area

def get_mask_area_image(output_image, mask_image):
    masked_area = get_mask_area(output_image, mask_image)
    return Image.fromarray(masked_area)

def get_tensor(image):   
    to_tensor = transforms.ToTensor()
    return to_tensor(image).unsqueeze(0) 

In [33]:
from tqdm import tqdm

In [37]:
def get_df():   
    results = []
    for index,row in tqdm(samples.iterrows(), desc="Progress"):
        id = row["id"] 
        turn = row["turn"]

        path = get_path(row["human_rating_binary"])

        for entry in turns:
            output = entry["output"]
            match = re.search(pattern, output)

            if match:
                found_id = match.group(1) # get id of sample
                found_turn = match.group(2) # get turn of sample

                if int(found_id) == id and int(found_turn) == turn: # check if turn is within samples
                    input = entry["input"]
                    mask = entry["mask"]
                    instruction = entry["instruction"]

                    output_image = Image.open(fr'{path}/{output}')
                    input_image = Image.open(fr'{path}/{input}')
                    mask_image = Image.open(fr'{path}/{mask}')
                    #print(input_image.size)

                    masked_area_image = get_mask_area_image(output_image, mask_image)
                    masked_area_tensor = get_tensor(masked_area_image)
                    
                    #input_image = input_image.resize((1024, 1024))
                    #initial_masked_area_image = get_mask_area_image(input_image, mask_image)
                    #initial_masked_area_tensor = get_tensor(initial_masked_area_image)
                    
                    outside_mask_tensor = get_tensor(mask_image.convert('RGB'))
                    output_image_tensor = get_tensor(output_image)
                    full_img_tensor = F.interpolate(get_tensor(input_image), 
                                                            size=target_size, 
                                                            mode='bilinear', 
                                                            align_corners=False)
                    
                    print(get_tensor(input_image).shape)
                    image = torch.cat([get_tensor(input_image), get_tensor(output_image)], axis=1)
                    image_tensor = F.interpolate(image_tensor,
                                                 size=target_size,
                                                 mode='bilinear',
                                                 align_corners=False)

                    # CLIP
                    clip_score_1 = clip(masked_area_tensor, instruction).detach().item()
                    clip_score_2 = clip(output_image_tensor, instruction).detach().item()
                    clip_score_3 = clip(image_tensor, instruction).detach().item()

                    """
                    Calculate two different scores: 
                    - edited area & unedited area
                    - edited area & original image
                    (*See examples below)
                    """

                    """
                    # LPIPS
                    lpips_score_1 = lpips(masked_area_tensor, outside_mask_tensor).item()
                    lpips_score_2 = lpips(masked_area_tensor, full_img_tensor).item()
                    lpips_score_3 = lpips(full_img_tensor, output_image_tensor).item()
                    lpips_score_4 = lpips(initial_masked_area_tensor, masked_area_tensor).item()

                    # SSIM
                    ssim_score_1 = ssim(masked_area_tensor, outside_mask_tensor).item()
                    ssim_score_2 = ssim(masked_area_tensor, full_img_tensor).item()
                    ssim_score_3 = ssim(full_img_tensor, output_image_tensor).item()
                    ssim_score_4 = ssim(initial_masked_area_tensor, masked_area_tensor).item()

                    # PSNR
                    psnr_score_1 = psnr(masked_area_tensor, outside_mask_tensor).item()
                    psnr_score_2 = psnr(masked_area_tensor, full_img_tensor).item()
                    psnr_score_3 = psnr(full_img_tensor, output_image_tensor).item()
                    psnr_score_4 = psnr(initial_masked_area_tensor, masked_area_tensor).item()

                    # MAE
                    mae_score_1 = mean_absolute_error(masked_area_tensor, outside_mask_tensor).item()
                    mae_score_2 =  mean_absolute_error(masked_area_tensor, full_img_tensor).item()
                    mae_score_3 =  mean_absolute_error(full_img_tensor, output_image_tensor).item()
                    mae_score_4 =  mean_absolute_error(initial_masked_area_tensor, masked_area_tensor).item()

                    # MSE
                    mse_score_1 =  mean_squared_error(masked_area_tensor, outside_mask_tensor).item()       
                    mse_score_2 =  mean_squared_error(masked_area_tensor, full_img_tensor).item()  
                    mse_score_3 =  mean_squared_error(full_img_tensor, output_image_tensor).item()
                    mse_score_4 =  mean_squared_error(initial_masked_area_tensor, masked_area_tensor).item()

                    # DreamSim
                    dream_sim_score_1 = dreamsim_model(
                        dreamsim_preprocess(masked_area_image).to("cpu"), 
                        dreamsim_preprocess(mask_image).to("cpu")
                    ).item()
                    
                    dream_sim_score_2 = dreamsim_model(
                        dreamsim_preprocess(masked_area_image).to("cpu"), 
                        dreamsim_preprocess(input_image).to("cpu")
                    ).item()
                    
                    dream_sim_score_3 = dreamsim_model(
                        dreamsim_preprocess(input_image).to("cpu"), 
                        dreamsim_preprocess(output_image).to("cpu")
                    ).item()
                    
                    dream_sim_score_4 = dreamsim_model(
                        dreamsim_preprocess(initial_masked_area_image).to("cpu"), 
                        dreamsim_preprocess(masked_area_image).to("cpu")
                    ).item()
                    """

                    result = {
                        "index": index,
                        "clip_score_1": clip_score_1,
                        "clip_score_2": clip_score_2,
                        "clip_score_3": clip_score_3,
                    }
                    results.append(result)

    for result in results:
        for key, value in result.items():
            if key != 'index':
                samples.at[result['index'], key] = value
    return samples

In [38]:
samples = get_df()

Progress: 0it [00:00, ?it/s]

torch.Size([1, 3, 1024, 1024])





UnboundLocalError: local variable 'image_tensor' referenced before assignment

In [None]:
samples

Unnamed: 0,id,turn,human_rating_binary,clip_score_1,clip_score_2,lpips_score_1,lpips_score_2,lpips_score_3,ssim_score_1,ssim_score_2,...,mse_score_3,dream_sim_score_1,dream_sim_score_2,dream_sim_score_3,lpips_score_4,ssim_score_4,psnr_score_4,mae_score_4,mse_score_4,dream_sim_score_4
0,49,2,0,22.089853,21.937899,0.743450,0.712001,0.090945,0.026027,0.101741,...,0.005973,0.684431,0.680819,0.049065,0.095240,0.925186,22.240963,0.020217,0.005969,0.404989
1,49,3,0,22.171751,22.083504,0.748147,0.721401,0.143029,0.030369,0.251738,...,0.008686,0.698212,0.792064,0.023627,0.140604,0.926705,20.617954,0.026669,0.008674,0.142823
2,5434,3,0,20.117992,20.334154,0.705543,0.722557,0.154690,0.047790,0.106797,...,0.016799,0.598214,0.768621,0.034291,0.181946,0.819952,17.881340,0.039803,0.016288,0.289991
3,43425,2,0,24.319836,24.311813,0.661717,0.663453,0.011401,0.060731,0.067517,...,0.000352,0.749138,0.780394,0.005919,0.011233,0.988803,34.401665,0.001965,0.000349,0.081140
4,54492,2,0,23.013315,23.010784,0.664067,0.682227,0.096621,0.038788,0.102482,...,0.011792,0.567202,0.652441,0.068155,0.088254,0.930151,19.289078,0.029084,0.011779,0.285989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,194956,2,1,24.225899,24.088795,0.773829,0.784145,0.038071,0.039838,0.066862,...,0.003676,0.780690,0.812421,0.005386,0.035032,0.976163,24.281298,0.010720,0.003673,0.286418
74,203920,1,1,23.103289,23.185959,0.662909,0.729869,0.304588,0.015874,0.220075,...,0.035316,0.581854,0.664295,0.286536,0.469181,0.675706,14.521969,0.083582,0.035302,0.524669
75,209923,3,1,21.210674,21.257814,0.730553,0.738669,0.046077,0.080936,0.100084,...,0.006593,0.736350,0.776160,0.009197,0.043388,0.957554,21.813662,0.014661,0.006586,0.288349
76,211860,1,1,22.933147,22.661072,0.731956,0.726820,0.063587,0.000903,0.022761,...,0.004571,0.707377,0.789326,0.108386,0.319897,0.832182,23.406586,0.013926,0.004564,0.306432


In [48]:
samples.to_csv("auto_scores_4.csv")

In [None]:
                        """
                        "lpips_score_1": lpips_score_1,
                        "lpips_score_2": lpips_score_2,
                        "lpips_score_3": lpips_score_3,
                        "lpips_score_4": lpips_score_4,
                        "ssim_score_1": ssim_score_1,
                        "ssim_score_2": ssim_score_2,
                        "ssim_score_3": ssim_score_3,
                        "ssim_score_4": ssim_score_4,
                        "psnr_score_1": psnr_score_1,
                        "psnr_score_2": psnr_score_2,
                        "psnr_score_3": psnr_score_3,
                        "psnr_score_4": psnr_score_4,
                        "mae_score_1": mae_score_1,
                        "mae_score_2": mae_score_2,
                        "mae_score_3": mae_score_3,
                        "mae_score_4": mae_score_4,
                        "mse_score_1": mse_score_1,
                        "mse_score_2": mse_score_2,
                        "mse_score_3": mse_score_3,
                        "mse_score_4": mse_score_4,
                        "dream_sim_score_1": dream_sim_score_1,
                        "dream_sim_score_2": dream_sim_score_2,
                        "dream_sim_score_3": dream_sim_score_3,
                        "dream_sim_score_4": dream_sim_score_4,
                        """

Other

In [None]:
# FID: did not work as it requires multiple real images and multiple generated images
"""
transform = transforms.Compose([
    transforms.Resize((299, 299)),  # InceptionV3 expects 299x299
    transforms.ToTensor()
    transforms.Normalize(mean=[0, 0, 0], std=[1/255.0, 1/255.0, 1/255.0]),
])

#masked_image = Image.fromarray(cv2.cvtColor(masked_area, cv2.COLOR_BGR2RGB))

input_transformed = transform(input_image)
masked_transformed = transform(masked_area_image)

input_transformed = (input_transformed * 255).byte()
masked_transformed = (masked_transformed * 255).byte()

input_transformed = input_transformed.unsqueeze(0)
masked_transformed = masked_transformed.unsqueeze(0)

fid.update(input_transformed, real=True)
fid.update(masked_transformed, real=False)
fid_value = fid.compute()
#samples.at[index, 'fid_value_1'] = fid_value
print(fid_value)
"""