# State-of-the-art Automated Metrics
List of metrics: https://pypi.org/project/piq/

In [1]:
!pip install opencv-python -q
!pip install torchvision -q
!pip install torch-fidelity -q
!pip install numpy -q
!pip install torch -q
!pip install 'transformers>=4.10.0' -q

In [2]:
#!python -m pip install torchmetrics -q
#!python3 -m pip install torchmetrics -q

In [3]:
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from torchmetrics.multimodal.clip_score import CLIPScore
from torchmetrics.image.fid import FrechetInceptionDistance
from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
from torchmetrics.image import StructuralSimilarityIndexMeasure, PeakSignalNoiseRatio

In [4]:
import json
import pandas as pd
import re
import cv2
import numpy as np
import torch
import torch.nn.functional as F

In [5]:
# Use the first CUDA GPU when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Load dev split of MagicBrush (incl. paths to images).

In [68]:
# Read the MagicBrush dev split: one row per edit turn, holding the instruction
# and the paths to the source, target, and mask images.
dev_split = pd.read_csv(
    filepath_or_buffer="/home/jovyan/BA/Github/MagicBrush/dev_data_with_mask.csv"
)

In [69]:
# Show the first two rows as a quick sanity check of the columns and paths.
dev_split.head(n=2)

Unnamed: 0,img_id,turn_index,source_img,target_img,instruction,mask_img
0,360871,1,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...,What if the vegetables are in a bowl?,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...
1,360871,2,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...,Let's add a drawing of a flower to the fridge.,/home/jovyan/BA/Github/MagicBrush/dev-00002-of...


## Initializing Scores

### Clip-Score
- [Link](https://lightning.ai/docs/torchmetrics/stable/multimodal/clip_score.html)
- Score is between 0 and 100, higher = better.
- **Input**
    - img = single tensor with shape (N, C, H, W), or multiple tensors
    - text = str, or tensor
- **Output**: float scalar tensor
- **Parameters**: `model_name_or_path`, e.g. "openai/clip-vit-base-patch16" (this notebook uses "openai/clip-vit-base-patch32")

In [14]:
# Checkpoint identifier of the CLIP model used to score image/text agreement.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip = CLIPScore(model_name_or_path=clip_checkpoint)

### Learned Perceptual Image Patch Similarity (LPIPS)
- [Link](https://lightning.ai/docs/torchmetrics/stable/image/learned_perceptual_image_patch_similarity.html)
- Calculates perceptual similarity between two images, low=more similar.
- **Input**: 2 tensors img, shape (N, 3, H, W). The minimum size of H, W depends on the chosen backbone.
- **Parameters**
    - net_type (Literal['vgg', 'alex', 'squeeze'])
    - reduction (Literal['sum', 'mean']) – str indicating how to reduce over the batch dimension. Choose between ‘sum’ or ‘mean’.
    - normalize (bool) – by default this is False meaning that the input is expected to be in the [-1,1] range. If set to True will instead expect input to be in the [0,1] range.

In [16]:
# LPIPS with a VGG backbone, averaged over the batch. `normalize=True` tells the
# metric that inputs are in [0, 1] instead of its default [-1, 1] range.
lpips_settings = {"net_type": "vgg", "reduction": "mean", "normalize": True}
lpips = LearnedPerceptualImagePatchSimilarity(**lpips_settings)

### Structural Similarity Index Measure (SSIM)

- [Link](https://lightning.ai/docs/torchmetrics/stable/image/structural_similarity.html)
- Perceptual metric that quantifies image quality degradation.
- Between -1 and 1, where 1 indicates perfect similarity, 0 indicates no similarity, and -1 indicates perfect anti-correlation.
- **Output**: if `reduction` is not `"none"`, a float scalar tensor with the average SSIM over the samples; otherwise a tensor of shape (N,) with one SSIM value per sample.

In [17]:
# SSIM with a Gaussian window (sigma=1.0), averaged over the batch.
# The image tensors in this notebook are produced by `get_tensor`
# (torchvision's ToTensor), i.e. floats in [0, 1], so the data range is pinned
# to 1.0. With `data_range=None` torchmetrics infers the range from the tensors
# of each individual call, so the stabilising constants would change from one
# image pair to the next and the scores would not be comparable across rows.
ssim = StructuralSimilarityIndexMeasure(
    gaussian_kernel=True, sigma=1.0, reduction="elementwise_mean", data_range=1.0
)

### Peak Signal-to-Noise Ratio (PSNR)

- [Link](https://lightning.ai/docs/torchmetrics/stable/image/peak_signal_noise_ratio.html)
- Ratio between the maximum possible value (power) of a signal and the power of distorting noise that affects the quality of its representation.
- **Output:**
    - Range from 0 to infinity - Typically range from about 20 dB to 50 dB.
    - Lower values indicate poorer quality, while higher values indicate better quality. Identical images have zero MSE, so their PSNR is infinite (not 0 dB).
- **Orientation:**
    - High Quality: Above 30 dB.
    - Medium Quality: Between 20 dB and 30 dB.
    - Low Quality: Below 20 dB.

In [18]:
# PSNR in dB (log base 10), reduced over the whole batch.
# The image tensors in this notebook are produced by `get_tensor`
# (torchvision's ToTensor), i.e. floats in [0, 1], so the peak signal value is
# fixed at 1.0. With `data_range=None` torchmetrics derives the peak from the
# target tensor of each call, which makes the score depend on how bright the
# reference image happens to be and breaks comparability across rows.
psnr = PeakSignalNoiseRatio(
    data_range=1.0, base=10, reduction="elementwise_mean", dim=None
)

### Mean Absolute Error (MAE) and Mean Squared Error (MSE)
- [Link to MAE](https://lightning.ai/docs/torchmetrics/stable/regression/mean_absolute_error.html)
- [Link to MSE](https://lightning.ai/docs/torchmetrics/stable/regression/mean_squared_error.html)
- Compare the “true” pixel values of the original image to the degraded image.
- MSE represents the average of the squares of the "errors" between the actual image and the noisy image.
- **Output**:
    - Between 0 and infinity.
    - A lower MAE value indicates that the edited image is closer to the original image, while a higher MAE indicates greater differences.

In [19]:
# Element-wise error metrics with their default settings: MAE averages the
# absolute differences, MSE the squared differences.
mean_absolute_error, mean_squared_error = MeanAbsoluteError(), MeanSquaredError()

## Computing Scores

In [79]:
# Common square resolution every image is resized to before scoring.
EDGE_PX = 512
target_size = (EDGE_PX, EDGE_PX)

In [80]:
def get_mask_area(output_image, mask_image):
    """Return the per-pixel absolute difference of two images as a NumPy array.

    Both arguments are converted with ``np.array`` and passed to
    ``cv2.absdiff``, so they must have the same size and channel layout.

    NOTE(review): no explicit masking happens here. Presumably the mask image
    matches the compared image outside the edited region, so the difference is
    close to zero there -- confirm against the dataset.
    """
    return cv2.absdiff(np.array(output_image), np.array(mask_image))


def get_mask_area_image(output_image, mask_image):
    """Return the absolute difference of the two images as a PIL image.

    Thin wrapper that wraps the array from ``get_mask_area`` with
    ``Image.fromarray``.
    """
    difference = get_mask_area(output_image, mask_image)
    difference_image = Image.fromarray(difference)
    return difference_image


def get_tensor(image):
    """Convert an image to a tensor and prepend a batch dimension of size 1.

    The conversion is done by torchvision's ``ToTensor`` transform; the result
    gains one extra leading axis so it can be passed to the batched metrics.
    """
    converted = transforms.ToTensor()(image)
    return converted[None]


def get_masked_area_tensor(output_image, mask_image):
    """Return the absolute difference of the two images as a batched tensor.

    Chains ``get_mask_area_image`` (difference as a PIL image) and
    ``get_tensor`` (tensor with a leading batch axis).
    """
    return get_tensor(get_mask_area_image(output_image, mask_image))

Calculate four different scores: 
- edited area & unedited area
- edited area & original image
- original image & output image
- unedited masked area & edited masked area

Exception: CLIP
- instruction & edited area
- instruction & output image
- instruction & input & output images averaged

In [64]:
def _load_rgb(path):
    """Open the image at `path`, resize it to `target_size`, and return it as RGB."""
    # The context manager closes the file handle deterministically; `resize`
    # returns a fully loaded copy, so the result stays valid afterwards.
    with Image.open(path) as img:
        return img.resize(target_size).convert("RGB")


def _to_clip_input(batch):
    """Convert a (1, C, H, W) float tensor in [0, 1] to (C, H, W) uint8 in [0, 255].

    CLIP's image processor rescales pixel values by 1/255 by default, so it must
    be fed 0-255 images (as in the torchmetrics CLIPScore example). Passing
    [0, 1] floats would scale the image twice and score an almost black image.
    """
    return (batch.squeeze(0).clamp(0, 1) * 255).round().to(torch.uint8)


# Metric name -> callable returning a Python float. Built once because it does
# not depend on the row. "clip" compares an image with text and is therefore
# applied to `clip_pairs` only; all other metrics compare two images.
metrics = {
    "clip": lambda x, y: clip(x, y).detach().item(),
    "lpips": lambda x, y: lpips(x, y).item(),
    "ssim": lambda x, y: ssim(x, y).item(),
    "psnr": lambda x, y: psnr(x, y).item(),
    "mae": lambda x, y: mean_absolute_error(x, y).item(),
    "mse": lambda x, y: mean_squared_error(x, y).item(),
}

# One dict per row: {"index": <row label>, "<metric>_score_<pair>": <float>, ...}
results = []

# `total` lets tqdm show a real progress bar and ETA instead of a bare counter.
for index, row in tqdm(dev_split.iterrows(), total=len(dev_split), desc="Progress"):
    instruction = row["instruction"]

    # Open images, resize to `target_size`, and convert to RGB.
    output_image = _load_rgb(row["target_img"])
    input_image = _load_rgb(row["source_img"])
    mask_image = _load_rgb(row["mask_img"])

    # Difference between the edited output and the mask image ...
    masked_area_tensor = get_masked_area_tensor(output_image, mask_image)
    # ... and the same difference for the image prior to the edit.
    initial_masked_area_tensor = get_masked_area_tensor(input_image, mask_image)

    # The images already have `target_size`, so no further interpolation is needed.
    outside_mask_tensor = get_tensor(mask_image)
    output_image_tensor = get_tensor(output_image)
    full_img_tensor = get_tensor(input_image)

    # Pixel-wise average of input and output image (third CLIP variant).
    image_tensor = (full_img_tensor + output_image_tensor) / 2

    # Image pairs for all metrics besides CLIP.
    pairs = {
        "1": (masked_area_tensor, outside_mask_tensor),
        "2": (masked_area_tensor, full_img_tensor),
        "3": (full_img_tensor, output_image_tensor),
        "4": (initial_masked_area_tensor, masked_area_tensor),
    }

    # Image/text pairs for CLIP.
    clip_pairs = {
        "1": (masked_area_tensor, instruction),
        "2": (output_image_tensor, instruction),
        "3": (image_tensor, instruction),
    }

    result = {"index": index}

    for i, (clip_image, text) in clip_pairs.items():
        result[f"clip_score_{i}"] = metrics["clip"](_to_clip_input(clip_image), text)

    for metric, func in metrics.items():
        if metric == "clip":
            continue
        for i, (x, y) in pairs.items():
            result[f"{metric}_score_{i}"] = func(x, y)

    results.append(result)

Progress: 528it [32:09,  3.65s/it]


In [65]:
# Copy every computed score into `dev_split`, using the row label stored under
# "index" to address the row and the score name as the column.
for result in results:
    row_label = result["index"]
    for key, value in result.items():
        if key == "index":
            continue
        dev_split.at[row_label, key] = value

In [67]:
# Persist the dev split together with all score columns (the DataFrame index is
# written as the first column, which is pandas' default).
dev_split.to_csv(path_or_buf="auto_scores.csv")