In [1]:
from dataclasses import dataclass

from PIL import Image


import numpy as np


from sklearn.cluster import KMeans


import matplotlib.pyplot as plt


import pandas as pd


from skimage import color, io


from tqdm import tqdm


from yellowbrick.cluster import KElbowVisualizer

In [2]:
@dataclass
class Neighbor:
    """A named color that lies close to a cluster's average color.

    Attributes:
        index (int): The index of the nearest color in the named color list.
        name (str): The name of the nearest color.
        hex (str): The hex color of the nearest color.
        delta_E (float): The distance of the average cluster color to the neighbor.

    Equality only looks at ``index``: two neighbors referring to the same named
    color are equal even if their ``delta_E`` differs. Because ``__eq__`` is
    defined here, the dataclass machinery leaves instances unhashable.
    """

    index: int
    name: str
    hex: str
    delta_E: float

    def __eq__(self, other: object) -> bool:
        """Return True when both objects refer to the same named color index."""
        try:
            return self.index == other.index
        except AttributeError:
            # `other` is not neighbor-like (e.g. None or a number): let Python
            # try the reflected comparison and fall back to "not equal"
            # instead of crashing with AttributeError.
            return NotImplemented

    def __gt__(self, other: object) -> bool:
        """Return True when this neighbor is *closer* (smaller delta_E) than ``other``."""
        try:
            # NOTE(review): the comparison is intentionally kept as in the
            # original: "greater" means "smaller delta_E". Confirm this ranking
            # is desired; sorted() on Neighbors will order by descending delta_E.
            return self.delta_E < other.delta_E
        except AttributeError:
            # Unsupported operand: Python will raise TypeError for the caller.
            return NotImplemented

    def __repr__(self):
        return f"{self.index}. {self.name} ({self.hex}, deltaE={self.delta_E:.2f})"


@dataclass
class Cluster:
    """Dataclass for a cluster of colors.

    Attributes:
        hex (str): The hex color of the average color of the cluster.
        n (int): The number of pixels in the cluster.
        freq (float): The fraction of pixels in the cluster.
        lab_L (int): The L* value of the average color of the cluster.
        lab_a (int): The a* value of the average color of the cluster.
        lab_b (int): The b* value of the average color of the cluster.
        rgb_R (int): The R value of the average color of the cluster.
        rgb_G (int): The G value of the average color of the cluster.
        rgb_B (int): The B value of the average color of the cluster.
        hsv_H (int): The H value of the average color of the cluster.
        hsv_S (int): The S value of the average color of the cluster.
        hsv_V (int): The V value of the average color of the cluster.
        nearest_color (Neighbor): The nearest color to the average color of the cluster.
        nearest_colors (list[int]): The indices (in the named color list) of the
            named colors considered close to the average color of the cluster.
        tags (list[str]): The names of the nearest colors to the average color of the cluster.
    """

    hex: str
    n: int
    freq: float
    lab_L: int
    lab_a: int
    lab_b: int
    rgb_R: int
    rgb_G: int
    rgb_B: int
    hsv_H: int
    hsv_S: int
    hsv_V: int
    nearest_color: Neighbor
    nearest_colors: list[int]
    tags: list[str]

    def __repr__(self):
        return f"{self.hex} ({self.freq*100:.2f}%)"

    def set_nearest_colors(self, neighbors: list[Neighbor]):
        """Replace the nearest-color information of this cluster.

        Args:
            neighbors (list[Neighbor]): Non-empty list of neighbors. The first
                element becomes ``nearest_color`` (presumably the list is sorted
                by ascending delta_E, as NamedColors.closest_colors returns it).

        Raises:
            IndexError: If ``neighbors`` is empty. Nothing is modified in that case.
        """
        if not neighbors:
            # Fail before touching any field so the cluster is never left
            # half-updated.
            raise IndexError("neighbors must contain at least one Neighbor")
        self.nearest_color = neighbors[0]
        # Store indices/names (not Neighbor objects) so the fields keep their
        # declared types and serialize the same way as freshly built clusters.
        self.nearest_colors = [neighbor.index for neighbor in neighbors]
        self.tags = [neighbor.name for neighbor in neighbors]

In [None]:
class NamedColors:
    """Table of named colors with nearest-neighbor lookup in L*a*b* space.

    Attributes:
        colors (pd.DataFrame): The named color table. The lookups read the
            columns ``lab_L``, ``lab_a``, ``lab_b``, ``name`` and ``hex``.
    """

    def __init__(self, colors: "pd.DataFrame | None" = None):
        """Create the lookup table.

        Args:
            colors (pd.DataFrame | None): The named color table. When omitted,
                ``./data/colors.names.csv`` is read.
        """
        # Load lazily: a `pd.read_csv(...)` default argument would be evaluated
        # once, when the class is defined, making the definition itself fail if
        # the file is missing and sharing one frame between all instances.
        if colors is None:
            colors = pd.read_csv("./data/colors.names.csv")
        self.colors = colors

    def _distances(self, color: np.ndarray) -> np.ndarray:
        """Return the Euclidean L*a*b* distance of every query to every named color.

        Args:
            color (np.ndarray): Query colors, shape (B,1,3).

        Returns:
            np.ndarray: Distances of shape (B,N), N being the number of named colors.
        """
        named_lab = self.colors[["lab_L", "lab_a", "lab_b"]].values.reshape(1, -1, 3)
        # (1,N,3) - (B,1,3) broadcasts to (B,N,3); the norm collapses the channels.
        return np.linalg.norm(named_lab - color, axis=2)

    def closest_color(self, color: np.ndarray) -> pd.DataFrame:
        """Find the closest color in the list of named colors.

        Args:
            color (np.ndarray): Colors in the L*a*b* colorspace (they are compared
                against the ``lab_*`` columns). Shape: (B,1,3), where 3 is the
                L*a*b* channels and B is the batch size.

        Returns:
            pd.DataFrame: A dataframe with the closest color in the list of named colors, where a row corresponds to the closest color to the corresponding color in the batch.
        """
        closest_index = np.argmin(self._distances(color), axis=1)
        return self.colors.iloc[closest_index]

    def closest_colors(
        self, color: np.ndarray, threshold: float
    ) -> "list[list[Neighbor]]":
        """Find all named colors within a distance threshold of each query color.

        Args:
            color (np.ndarray): Colors in the L*a*b* colorspace (they are compared
                against the ``lab_*`` columns). Shape: (B,1,3), where 3 is the
                L*a*b* channels and B is the batch size.
            threshold (float): The largest distance (inclusive) at which a named color still counts as a neighbor.

        Returns:
            list[list[Neighbor]]: One list per color in the batch, holding the
            neighbors within the threshold sorted by ascending distance. If no
            color is within the threshold, the list contains only the closest color.
        """
        named = self.colors[["name", "hex"]]
        results = []

        for row in self._distances(color):
            # Attach the distances by position (not by index label) so a color
            # table with a non-default index cannot be misaligned.
            neighbors = (
                named.assign(delta_E=row)
                .sort_values("delta_E")
                .rename_axis("index")
                .reset_index()
            )
            neighbors_under_threshold = neighbors.loc[neighbors["delta_E"] <= threshold]
            if neighbors_under_threshold.shape[0] == 0:
                # There are no neighbors under the threshold
                results.append([Neighbor(**neighbors.iloc[0].to_dict())])
            else:
                results.append(
                    [
                        Neighbor(**neighbor)
                        for neighbor in neighbors_under_threshold.to_dict(
                            orient="records"
                        )
                    ]
                )
        return results

In [2]:
# Named color lookup table; called without arguments, NamedColors uses its
# default `colors` table.
named_colors = NamedColors()
# Per-volume metadata. The first CSV column becomes the index; later cells look
# rows up with `.loc[i]` for i = 1..len(volumes_df) and read the "name" and
# "release_date" columns.
volumes_df = pd.read_csv("data/volumes.csv", index_col=0)

In [3]:
# The commented-out code below is the computation that writes
# "data/elbows.csv": for every volume cover it runs a KElbowVisualizer over
# KMeans with k in (2, 10) on the L*a*b* pixels and records the elbow value.
# It is kept disabled so the notebook only reads the saved result; uncomment it
# to regenerate the file.

# elbows = []

# for i in tqdm(range(1, len(volumes_df) + 1)):
#     # Read image and convert to L*a*b* color space
#     elbow = {
#         "volume": i,
#         "name": volumes_df.loc[i, "name"],
#         "date": volumes_df.loc[i, "release_date"],
#     }
#     Lab = pd.DataFrame(
#         color.rgb2lab(io.imread(f"./data/small_covers/{i}.webp")).reshape(-1, 3),
#         columns=["L", "a", "b"],
#     )
#     elbow["elbow"] = (
#         KElbowVisualizer(KMeans(random_state=4, n_init="auto"), k=(2, 10))
#         .fit(Lab)
#         .elbow_value_
#     )
#     elbows.append(elbow)
# elbows_df = pd.DataFrame(elbows).set_index("volume")
# elbows_df.to_csv("data/elbows.csv")

# Load the saved elbow values (first CSV column, the volume number, is the index;
# the "elbow" column is used as the number of clusters per volume).
elbows_df = pd.read_csv("data/elbows.csv", index_col=0)

In [4]:
# Largest delta E (Euclidean L*a*b* distance, inclusive) at which a named color
# still counts as a match for a cluster center.
DELTA_E_THRESHOLD = 5

volumes = []

for i in tqdm(range(1, len(volumes_df) + 1)):
    volume = {
        "volume": i,
        "name": volumes_df.loc[i, "name"],
        "date": volumes_df.loc[i, "release_date"],
    }
    # Read image and convert to L*a*b* color space (one row per pixel)
    Lab = pd.DataFrame(
        color.rgb2lab(io.imread(f"./data/small_covers/{i}.webp")).reshape(-1, 3),
        columns=["L", "a", "b"],
    )
    # Perform k-means clustering with the precomputed elbow value as k
    kmeans = KMeans(
        n_clusters=elbows_df.loc[i, "elbow"], random_state=0, n_init="auto"
    ).fit(Lab)
    # Add cluster labels to dataframe
    Lab["cluster"] = kmeans.labels_

    # Create a dataframe with the cluster centers
    clusters = pd.DataFrame(
        np.floor(kmeans.cluster_centers_).astype(int),
        columns=["lab_L", "lab_a", "lab_b"],
    )
    # Convert the centers to RGB once, scaled to 0-255. The centers are passed
    # as a (1, k, 3) "image"; indexing with [0] removes only that leading axis,
    # whereas squeeze() would also drop the cluster axis when k == 1.
    rgb_255 = color.lab2rgb(kmeans.cluster_centers_.reshape(1, -1, 3)) * 255
    # Add RGB and hex values
    clusters[["rgb_R", "rgb_G", "rgb_B"]] = np.floor(rgb_255[0]).astype(int)
    clusters["hex"] = clusters[["rgb_R", "rgb_G", "rgb_B"]].apply(
        lambda x: "#{:02x}{:02x}{:02x}".format(x.rgb_R, x.rgb_G, x.rgb_B),
        axis=1,
    )
    # Add HSV values: H in degrees, S and V in percent. The input is on the
    # 0-255 scale, so V is divided by 255 before being expressed in percent.
    hsv = color.rgb2hsv(rgb_255)[0]
    hsv[:, 0] = hsv[:, 0] * 360
    hsv[:, 1] = hsv[:, 1] * 100
    hsv[:, 2] = hsv[:, 2] * 100 / 255
    clusters[["hsv_H", "hsv_S", "hsv_V"]] = np.floor(hsv).astype(int)

    # Add the number of pixels and fraction of pixels in the cluster
    clusters = clusters.join(Lab["cluster"].value_counts(normalize=True).rename("freq"))
    clusters = clusters.join(Lab["cluster"].value_counts().rename("n"))

    # Add the nearest colors to the cluster
    closest_colors = named_colors.closest_colors(
        kmeans.cluster_centers_.reshape(-1, 1, 3), DELTA_E_THRESHOLD
    )
    clusters["nearest_color"] = [neighbors[0] for neighbors in closest_colors]
    # closest_colors falls back to the single closest color when nothing is
    # within the threshold; filter again so such a fallback is kept as
    # `nearest_color` but not listed in `nearest_colors` / `tags`.
    clusters["nearest_colors"] = [
        [
            neighbor.index
            for neighbor in neighbors
            if neighbor.delta_E <= DELTA_E_THRESHOLD
        ]
        for neighbors in closest_colors
    ]
    clusters["tags"] = [
        [
            neighbor.name
            for neighbor in neighbors
            if neighbor.delta_E <= DELTA_E_THRESHOLD
        ]
        for neighbors in closest_colors
    ]

    # Sort the clusters by hue
    clusters = clusters.sort_values("hsv_H")
    clusters = [Cluster(**cluster) for cluster in clusters.to_dict(orient="records")]
    volume["clusters"] = clusters

    volumes.append(volume)

100%|██████████| 72/72 [00:37<00:00,  1.94it/s]


In [5]:
import dataclasses, json


class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes dataclass instances as plain dictionaries."""

    def default(self, o):
        """Convert objects the base encoder cannot handle.

        Args:
            o: The object that ``json`` could not serialize natively.

        Returns:
            dict: ``dataclasses.asdict(o)`` when ``o`` is a dataclass instance.

        Raises:
            TypeError: For any other object (raised by the base implementation).
        """
        # is_dataclass() is also true for dataclass *classes*, on which asdict()
        # fails; only instances are converted, everything else gets the
        # standard "not JSON serializable" error from the base class.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)
        return super().default(o)


# Write through a context manager so the file is flushed and closed even if
# serialization fails part-way; a bare open() handle was never closed.
with open("./data/clusters.json", "w", encoding="utf-8") as fp:
    json.dump(volumes, fp=fp, cls=EnhancedJSONEncoder)