Metric: CLIPIQA #348

Merged on Jun 5, 2023 (49 commits)
Changes from all commits
Commits
49 commits
b85274a
initial implementation
snk4tr Feb 28, 2023
dd3c154
fix data range for results bench
snk4tr Mar 1, 2023
37a7ebc
add eval on koniq
snk4tr Apr 5, 2023
4ff87b0
fix implementation so it corresponds to the official one
snk4tr Apr 5, 2023
3515329
add tokenizer, remove clip dependency
snk4tr Apr 19, 2023
969dc40
fix model loading
snk4tr Apr 19, 2023
5bbfe7e
simplification of loading of the clip model
snk4tr Apr 19, 2023
c31bdbd
docs for the main clip-iqa file
snk4tr Apr 19, 2023
a5d9083
inference with bs > 1 + extra docs
snk4tr Apr 23, 2023
b50b389
Merge branch 'master' into feat/clip-iqa
snk4tr Apr 23, 2023
6328da1
handle images with channels first format
snk4tr May 1, 2023
f387bdc
add evaluation on the LIVE-itW dataset
snk4tr May 1, 2023
ac19f1b
simplify
snk4tr May 1, 2023
001afd1
fix wrong probs scaling
snk4tr May 1, 2023
cd79eb1
clip-iqa tests
snk4tr May 1, 2023
9088962
Merge branch 'master' into feat/clip-iqa
snk4tr May 1, 2023
3d77bb6
fix some flake8 errors
snk4tr May 1, 2023
30ea548
fix some flake8 errors
snk4tr May 1, 2023
cd0ce7f
fix some flake8 errors
snk4tr May 1, 2023
e3bbbb4
fix some flake8 errors
snk4tr May 1, 2023
4652e9a
fix some flake8 errors
snk4tr May 1, 2023
9ce6065
update workflow and refactor tokenizer
snk4tr May 1, 2023
bb8e2c8
fix some mypy errors
snk4tr May 1, 2023
f91465c
fix some errors
snk4tr May 1, 2023
5c5b841
fix errors
snk4tr May 1, 2023
86f02da
remove ftfy package
snk4tr May 1, 2023
a658434
replace tokenizer and all related logic with pre-computed tokens
snk4tr May 2, 2023
7f6e97e
update torchvision versions
snk4tr May 2, 2023
2ca08c2
benchmarks table
snk4tr May 2, 2023
1fbad82
add some changes from the review
snk4tr May 20, 2023
d35f5b8
more fixes and the test for the downloader
snk4tr May 20, 2023
28ecc1d
more fixes + _validate_input extension + more tests
snk4tr May 20, 2023
a7b82c9
Fix readthedocs pipeline
denproc May 23, 2023
88fc321
address some comments, add tests
snk4tr May 31, 2023
26c86aa
incorporate the last comment-related changes
snk4tr May 31, 2023
eee3b6b
flake8
snk4tr May 31, 2023
1e51e95
adjust downloading for the case when sha hash is not present
snk4tr May 31, 2023
4ff2ae6
update torchvision version so torch tensors support `min_all`
snk4tr May 31, 2023
825e449
downgrade torchvision
snk4tr May 31, 2023
65762a7
address review comments
snk4tr Jun 4, 2023
06853ec
address flake and code smells
snk4tr Jun 4, 2023
053a370
address flake
snk4tr Jun 4, 2023
fb94ab9
address flake
snk4tr Jun 4, 2023
53dbf17
freeze scikit-image version, the newest version changes default shape…
snk4tr Jun 4, 2023
9091499
+
snk4tr Jun 4, 2023
dcc1f76
+
snk4tr Jun 4, 2023
a036d99
undo fake smell change
snk4tr Jun 4, 2023
04f8e73
address comments
snk4tr Jun 5, 2023
2fd9b4d
+
snk4tr Jun 5, 2023
2 changes: 1 addition & 1 deletion .github/workflows/ci-mypy.yml
@@ -24,6 +24,6 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Check types with mypy
run: |
python3 -m pip install mypy
python3 -m pip install mypy types-setuptools
# stop the build if there are Python syntax errors or undefined names
python3 -m mypy piq/ --allow-redefinition
7 changes: 5 additions & 2 deletions .github/workflows/ci-testing.yml
@@ -18,7 +18,7 @@ jobs:
matrix:
include:
- python-version: "3.7"
torchvision-version: "0.6.1"
torchvision-version: "0.10.0"
- python-version: "3.7"
torchvision-version: "0.14.1"
- python-version: "3.10"
@@ -40,6 +40,9 @@
${{ runner.os }}-pip-
${{ runner.os }}-
- name: Install dependencies
# It is important to pin scikit-image to a version < 0.21.0 because newer versions change the default number of
# dimensions in loaded images, which breaks the image-loading tests.
# The pin might be lifted in the future, but the tests will need to be adjusted accordingly.
run: |
python -m pip install --upgrade pip setuptools wheel
pip install torchvision==${{ matrix.torchvision-version }}
@@ -49,7 +52,7 @@
tensorflow \
libsvm \
pybrisque \
scikit-image \
"scikit-image<=0.20.0" \
pandas \
tqdm
pip install --upgrade scipy
21 changes: 19 additions & 2 deletions README.rst
@@ -219,7 +219,7 @@ Benchmark
---------

As part of our library we provide `code to benchmark <tests/results_benchmark.py>`_ all metrics on a set of common Mean Opinion Score databases.
Currently we support `TID2013`_, `KADID10k`_ and `PIPAL`_.
Currently we support several Full-Reference (`TID2013`_, `KADID10k`_ and `PIPAL`_) and No-Reference (`KonIQ10k`_ and `LIVE-itW`_) datasets.
You need to download them separately and provide the path to the images as an argument to the script.

Here is an example of how to evaluate the SSIM and MS-SSIM metrics on the TID2013 dataset:
@@ -228,7 +228,7 @@ Here is an example how to evaluate SSIM and MS-SSIM metrics on TID2013 dataset:

python3 tests/results_benchmark.py --dataset tid2013 --metrics SSIM MS-SSIM --path ~/datasets/tid2013 --batch_size 16

Below we provide a comparison between `Spearman's Rank Correlation cCoefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ (SRCC) values obtained with PIQ and reported in surveys.
Below we provide a comparison between `Spearman's Rank Correlation Coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ (SRCC) values obtained with PIQ and reported in surveys.
Closer SRCC values indicate a higher degree of agreement between the results computed on the given datasets.
We do not report `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ (KRCC)
as it is highly correlated with SRCC and provides limited additional information.
@@ -237,6 +237,8 @@ as it's highly dependent on fitting method and is biased towards simple examples

For metrics that can take greyscale or colour images, ``c`` means chromatic version.

Full-Reference (FR) Datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
=========== =========================== =========================== ===========================
\ TID2013 KADID10k PIPAL
----------- --------------------------- --------------------------- ---------------------------
@@ -264,13 +266,25 @@ LPIPS-VGG 0.67 / 0.67 `DISTS`_ 0.72 / - 0.57 / 0.
PieAPP 0.84 / 0.88 `DISTS`_ 0.87 / - 0.70 / 0.71 `PIPAL`_
DISTS 0.81 / 0.83 `DISTS`_ 0.88 / - 0.62 / 0.66 `PIPAL`_
BRISQUE 0.37 / 0.84 `Eval2019`_ 0.33 / 0.53 `KADID10k`_ 0.21 / -
CLIP-IQA 0.50 / - 0.48 / - 0.26 / -
IS 0.26 / - 0.25 / - 0.09 / -
FID 0.67 / - 0.66 / - 0.18 / -
KID 0.42 / - 0.66 / - 0.12 / -
MSID 0.21 / - 0.32 / - 0.01 / -
GS 0.37 / - 0.37 / - 0.02 / -
=========== =========================== =========================== ===========================

No-Reference (NR) Datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^
=========== =========================== ===========================
\ KonIQ10k LIVE-itW
----------- --------------------------- ---------------------------
Source PIQ / Reference PIQ / Reference
=========== =========================== ===========================
BRISQUE 0.22 / - 0.31 / -
CLIP-IQA 0.68 / 0.68 `CLIP-IQA off`_ 0.64 / 0.64 `CLIP-IQA off`_
=========== =========================== ===========================
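
For reference, the metric behind the new NR rows can be computed directly with ``piq``. Below is a minimal usage sketch mirroring the example from the ``CLIPIQA`` class docstring added in this PR (the batch size and the 224x224 resolution are purely illustrative):

.. code-block:: python

    import torch
    from piq import CLIPIQA

    clipiqa = CLIPIQA(data_range=1.)  # expects inputs with values in [0, data_range]
    x = torch.rand(4, 3, 224, 224)    # NCHW, channels-first
    scores = clipiqa(x)               # per-image CLIP-IQA scores in [0, 1]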

.. _TID2013: http://www.ponomarenko.info/tid2013.htm
.. _KADID10k: http://database.mmsp-kn.de/kadid-10k-database.html
.. _Eval2019: https://ieeexplore.ieee.org/abstract/document/8847307/
@@ -280,6 +294,9 @@ GS 0.37 / - 0.37 / - 0.02 / -
.. _HaarPSI: https://arxiv.org/abs/1607.06140
.. _PIPAL: https://arxiv.org/pdf/2011.15002.pdf
.. _IW-SSIM: https://ieeexplore.ieee.org/document/7442122
.. _KonIQ10k: http://database.mmsp-kn.de/koniq-10k-database.html
.. _LIVE-itW: https://live.ece.utexas.edu/research/ChallengeDB/index.html
.. _CLIP-IQA off: https://github.com/IceClear/CLIP-IQA

Unlike FR and NR IQMs, designed to compute an image-wise distance, the DB metrics compare distributions of *sets* of images.
To address these problems, we adopt a different way of computing the DB IQMs proposed in `<https://arxiv.org/abs/2203.07809>`_.
1 change: 1 addition & 0 deletions piq/__init__.py
@@ -22,3 +22,4 @@
from .pieapp import PieAPP
from .dss import dss, DSSLoss
from .iw_ssim import information_weighted_ssim, InformationWeightedSSIMLoss
from .clip_iqa import CLIPIQA
128 changes: 128 additions & 0 deletions piq/clip_iqa.py
@@ -0,0 +1,128 @@
r"""This module implements CLIP-IQA metric in PyTorch.

The metric is proposed in:
"Exploring CLIP for Assessing the Look and Feel of Images"
by Jianyi Wang, Kelvin C.K. Chan and Chen Change Loy.
AAAI 2023.
https://arxiv.org/abs/2207.12396

This implementation is inspired by the official implementation but avoids using the MMCV and MMEDIT libraries.
Ref url: https://github.com/IceClear/CLIP-IQA
"""
import os
import torch

from torch.nn.modules.loss import _Loss
from typing import Union

from piq.feature_extractors import clip
from piq.utils.common import download_tensor, _validate_input


OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
TOKENS_URL = "https://github.com/photosynthesis-team/piq/releases/download/v0.7.1/clipiqa_tokens.pt"


class CLIPIQA(_Loss):
r"""Creates a criterion that measures image quality based on a general notion of text-to-image similarity
learned by the CLIP[1] model during its large-scale pre-training on a dataset of paired texts and images.

The method is based on the idea that two antonyms ("Good photo" and "Bad photo") can be used as anchors in the
text embedding space representing good and bad images in terms of their image quality.

After the anchors are defined, one can use them to determine the quality of a given image in the following way:
1. Compute the image embedding of the image of interest using the pre-trained CLIP model;
2. Compute the text embeddings of the selected anchor antonyms;
3. Compute the angle (cosine similarity) between the image embedding (1) and both text embeddings (2);
4. Compute the Softmax of cosine similarities (3) -> CLIP-IQA[2] score.

This method is proposed to eliminate the linguistic ambiguity of the naive approach
(using a single prompt, e.g., "Good photo").

This method has an extension called CLIP-IQA+[2] proposed in the same research paper.
It uses the same approach but also fine-tunes the CLIP weights using the CoOp[3] fine-tuning algorithm.

Note:
The initial computation of the metric is performed in `float32`; other dtypes (e.g. `float16`, `float64`)
are not supported. We preserve this behaviour for reproducibility purposes. Also, at the time of writing
conv2d is not supported for `float16` tensors on CPU.

Warning:
In order to avoid implicit dtype conversion and normalization of input tensors, they are copied.
Note that this may consume extra memory, which might be noticeable for large batch sizes.

Args:
data_range: Maximum value range of images (usually 1.0 or 255).

Examples:
>>> from piq import CLIPIQA
>>> clipiqa = CLIPIQA()
>>> x = torch.rand(1, 3, 224, 224)
>>> score = clipiqa(x)

References:
[1] Radford, Alec, et al. "Learning transferable visual models from natural language supervision."
International conference on machine learning. PMLR, 2021.
[2] Wang, Jianyi, Kelvin CK Chan, and Chen Change Loy. "Exploring CLIP for Assessing the Look
and Feel of Images." arXiv preprint arXiv:2207.12396 (2022).
[3] Zhou, Kaiyang, et al. "Learning to prompt for vision-language models." International
Journal of Computer Vision 130.9 (2022): 2337-2348.
"""
def __init__(self, data_range: Union[float, int] = 1.) -> None:
super().__init__()

self.feature_extractor = clip.load().eval()
for param in self.feature_extractor.parameters():
param.requires_grad = False

# Pre-computed tokens for prompt pairs: "Good photo.", "Bad photo.".
tokens = download_tensor(TOKENS_URL, os.path.expanduser("~/.cache/clip"))

anchors = self.feature_extractor.encode_text(tokens).float()
anchors = anchors / anchors.norm(dim=-1, keepdim=True)

self.data_range = float(data_range)
default_mean = torch.tensor(OPENAI_CLIP_MEAN).view(1, 3, 1, 1)
default_std = torch.tensor(OPENAI_CLIP_STD).view(1, 3, 1, 1)
self.logit_scale = self.feature_extractor.logit_scale.exp()

# Take advantage of Torch buffers. CLIPIQA.to(device) will move these to the device as well.
self.register_buffer("anchors", anchors)
self.register_buffer("default_mean", default_mean)
self.register_buffer("default_std", default_std)

def forward(self, x_input: torch.Tensor) -> torch.Tensor:
r"""Computation of CLIP-IQA metric for a given image :math:`x`.

Args:
x_input: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(C, H, W)`.
The metric is designed in such a way that it expects:
- 3D or 4D PyTorch tensors;
- These tensors have values in the range from 0 to 255;
- These tensors have channels-first format.

Returns:
The value of the CLIP-IQA score in the [0, 1] range.
"""
_validate_input([x_input], dim_range=(3, 4), data_range=(0., 255.), check_for_channels_first=True)

x = x_input.clone()
x = x.float() / self.data_range
x = (x - self.default_mean) / self.default_std

# The device of an nn.Module cannot be cached via a registered buffer, so the feature extractor is moved to the input's device here.
self.feature_extractor = self.feature_extractor.to(x)

with torch.no_grad():
image_features = self.feature_extractor.encode_image(x, pos_embedding=False).float()

# Normalized features.
image_features = image_features / image_features.norm(dim=-1, keepdim=True)

# Cosine similarity as logits.
logits_per_image = self.logit_scale * image_features @ self.anchors.t()

probs = logits_per_image.reshape(logits_per_image.shape[0], -1, 2).softmax(dim=-1)
result = probs[..., 0]
return result.detach()
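
To make the scoring logic above easier to follow, here is a stand-alone sketch of the anchor-and-softmax step described in the class docstring. It uses random placeholder tensors instead of real CLIP image features and text-anchor embeddings ("Good photo." / "Bad photo."), and an assumed logit scale, so it only illustrates the cosine-similarity-plus-softmax computation, not the actual feature extraction:

import torch

# Placeholder embeddings: N images and 2 text anchors, each of dimension D.
N, D = 4, 512
image_features = torch.randn(N, D)
anchors = torch.randn(2, D)        # row 0: "good" anchor, row 1: "bad" anchor
logit_scale = torch.tensor(100.0)  # assumed value; the real scale is learned by CLIP

# L2-normalize so that the dot product equals cosine similarity.
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
anchors = anchors / anchors.norm(dim=-1, keepdim=True)

# Scaled cosine similarities against both anchors, turned into probabilities.
logits_per_image = logit_scale * image_features @ anchors.t()
probs = logits_per_image.softmax(dim=-1)

# CLIP-IQA score: the probability assigned to the "good" anchor, one value per image.
scores = probs[..., 0]

In the module itself, the image features come from `self.feature_extractor.encode_image(x, pos_embedding=False)` and the anchors from `encode_text` on the pre-computed tokens, as shown in the diff above; with a single prompt pair, the `reshape(..., -1, 2)` in `forward` reduces to the plain softmax used here.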