# PCA 压缩图像

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Use the SimHei font for sans-serif text so the Chinese titles in the figures render,
# and turn off the Unicode minus glyph so negative tick labels stay displayable.
plt.rcParams["font.sans-serif"] = "SimHei"
plt.rcParams["axes.unicode_minus"] = False
# NOTE: plt.tight_layout() is intentionally not called here. No figure exists at
# this point, so the call would only create (and display) an empty figure without
# affecting the figures built later.

手动实现一个简易的 PCA，方法接口（`fit`、`transform`、`fit_transform`、`inverse_transform`）与 `scikit-learn` 的 PCA 基本一致，但属性名不带下划线后缀：

In [None]:
class PCA:
    """Minimal principal component analysis based on the covariance eigendecomposition.

    Rows of the input are treated as samples and columns as features.

    Attributes set by ``fit``:
        mean: per-feature mean of the training data.
        components: matrix whose columns are the kept eigenvectors, ordered by
            decreasing eigenvalue; shape ``(n_features, n_components)``.
        explained_variance_ratio: every eigenvalue divided by the eigenvalue sum,
            in decreasing order. It is NOT truncated to ``n_components``.
    """

    def __init__(self, n_components=None):
        # Number of leading components to keep; None keeps all of them.
        self.n_components = n_components
        self.mean = None
        self.components = None
        self.explained_variance_ratio = None

    def fit(self, data):
        """Estimate the mean and principal axes of ``data``.

        Args:
            data: 2-D array-like of shape ``(n_samples, n_features)``.

        Returns:
            The fitted instance, so calls can be chained.
        """
        self.mean = np.mean(data, axis=0)
        centered_data = data - self.mean

        # np.cov collapses a single-feature input to a 0-d array; atleast_2d keeps
        # the matrix shape that the eigendecomposition requires.
        cov = np.atleast_2d(np.cov(centered_data, rowvar=False))

        # The covariance matrix is symmetric, so use eigh: it always returns real
        # eigenvalues and orthonormal eigenvectors, whereas the general eig solver
        # may return complex or non-orthogonal results on rank-deficient input.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # eigh returns eigenvalues in ascending order; reverse to get the
        # largest-variance directions first.
        eigenvalues = eigenvalues[::-1]
        eigenvectors = eigenvectors[:, ::-1]

        self.components = eigenvectors[:, : self.n_components]
        self.explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
        return self

    def transform(self, data):
        """Project ``data`` onto the kept principal axes (requires a prior ``fit``)."""
        centered_data = data - self.mean
        return np.dot(centered_data, self.components)

    def fit_transform(self, data):
        """Fit on ``data`` and return its projection onto the kept principal axes."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, pca_data):
        """Map projected data back to the original feature space.

        The result is exact only when all components were kept; otherwise it is
        the reconstruction from the kept components plus the training mean.
        """
        return np.dot(pca_data, self.components.T) + self.mean

读取图像，分别提取出 RGB 三个通道的数据，并分析它们的主成分。

取 0.3、0.5、0.7、0.9、0.99 五个累计贡献率阈值，观察需要多少个主成分的共同贡献，才能达到这个精度。

In [None]:
# Load the picture and split it into its first three colour planes.
image = plt.imread("butterfly.bmp")
red_component, green_component, blue_component = (
    image[:, :, channel_index] for channel_index in range(3)
)

# Per-channel plotting settings: panel title, line colour, pixel plane and the
# index of the subplot the channel is drawn on.
configs = {
    name: {"title": title, "color": name, "data": plane, "axis": position}
    for position, (name, title, plane) in enumerate(
        (
            ("red", "红色通道", red_component),
            ("green", "绿色通道", green_component),
            ("blue", "蓝色通道", blue_component),
        )
    )
}

# Cumulative explained-variance thresholds that get marked on each curve.
precision_list = (0.3, 0.5, 0.7, 0.9, 0.99)

# One row of three panels, one per colour channel.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle("主成分数-累计贡献率")


def plot_accumulative_contribution(color):
    """Draw one channel's cumulative explained-variance curve with threshold markers.

    ``color`` is a key of the module-level ``configs`` mapping; the matching entry
    supplies the panel title, line colour, pixel data and subplot index. The curve
    is drawn on the corresponding entry of the module-level ``axes``.
    """
    settings = configs[color]
    line_color = settings["color"]
    panel = axes[settings["axis"]]

    # Fit with all components kept so the full spectrum is available.
    model = PCA()
    model.fit(settings["data"])
    cumulative = np.cumsum(model.explained_variance_ratio)

    panel.plot(cumulative, color=line_color)
    panel.set_title(settings["title"])

    for threshold in precision_list:
        # Number of leading components whose cumulative ratio is still strictly
        # below the threshold.
        # NOTE(review): reaching the threshold takes one more component than this
        # count — confirm which of the two the labels are meant to show.
        below_mask = cumulative < threshold
        component_count = len(cumulative[below_mask])

        panel.plot(component_count, threshold, color=line_color, marker="o")
        panel.text(
            component_count + 5,
            threshold - 0.04,
            f"({component_count},{threshold})",
        )


# Draw the red, green and blue panels, then render the figure.
for channel_name in ("red", "green", "blue"):
    plot_accumulative_contribution(channel_name)
plt.show()

从上图可以看到，主成分的累计贡献率一开始随着主成分数的增加而快速上升，红色和绿色通道在 50 多个主成分时达到 99%，蓝色则需要 94 个。

为了保证压缩质量，我们以蓝色通道为准，选择 2、5、11、27、94 这五个主成分数（依次对应 0.3、0.5、0.7、0.9、0.99 的累计贡献率），观察它们各自的压缩效果。

In [None]:
def _reconstruct_channel(channel, n_components):
    """Compress one colour plane to ``n_components`` components and rebuild it.

    Returns the reconstruction as a ``uint8`` array with the same shape as
    ``channel``. Assumes pixel intensities live in the 0-255 range, as the uint8
    cast implies — confirm if the image is loaded in another format.
    """
    pca = PCA(n_components)
    restored = pca.inverse_transform(pca.fit_transform(channel))
    # The reconstruction is floating point and can overshoot the 0-255 range.
    # astype(np.uint8) does not saturate such values (they typically wrap around),
    # which shows up as speckle noise, so round and clip before casting.
    # np.real is a no-op for real input and drops the imaginary part if the
    # decomposition produced a complex dtype.
    return np.clip(np.rint(np.real(restored)), 0, 255).astype(np.uint8)


# 2x3 grid: five reconstructions followed by the original image.
fig, axes = plt.subplots(2, 3, figsize=(15, 5))
fig.suptitle("压缩效果对比图")

# Component counts to compare; each is paired by position with the threshold
# at the same index of precision_list for the panel title.
n_components_list = (2, 5, 11, 27, 94)

for ax, n_components, precision in zip(axes.flat, n_components_list, precision_list):
    # Each channel is compressed independently, then the planes are restacked.
    compressed = np.dstack(
        [
            _reconstruct_channel(plane, n_components)
            for plane in (red_component, green_component, blue_component)
        ]
    )
    ax.imshow(compressed)
    ax.set_title(f"主成分数={n_components}，精度={precision}")
    ax.set_xticks([])
    ax.set_yticks([])

# The last panel shows the uncompressed image for reference.
axes[1][2].imshow(image)
axes[1][2].set_title("原图")
axes[1][2].set_xticks([])
axes[1][2].set_yticks([])
plt.show()