In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

# Seed the generator so the synthetic data (and every plot built from it)
# is identical on each Restart & Run All.  320 matches the random_state
# passed to make_blobs later in the notebook.
SEED = 320
rng = np.random.default_rng(SEED)

# 100 sample points in [0.1, 5) plus Gaussian noise for the factorization demo
x = rng.uniform(0.1, 5, 100)
noise = rng.normal(scale=0.3, size=x.size)

## Intuition: factorization
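Why is it useful to express something as a few parts multiplied together?
Because the factored form conveys more information at a glance: for example, the roots of a polynomial can be read directly off its factors.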

In [None]:
# At what points does y = 0?
# The expanded form hides the answer:
#   y = -x**3 + 7*x**2 - 14*x + 8
# The factored form of the same polynomial shows the roots directly (x = 4, 2, 1):
y = (4-x) * (2-x) * (1-x)

In [None]:
# Scatter the noisy polynomial values, then draw the y = 0 axis so the
# zero crossings (the roots) are easy to spot.
noisy_points = pd.DataFrame({"x": x, "y": y + noise})
noisy_points.plot.scatter(x="x", y="y")
plt.hlines(0, -1, 6, color="k")

## Some cool dimensionality reduction examples:
https://pair-code.github.io/understanding-umap/ \
https://distill.pub/2016/misread-tsne/ 

# Decomposition with Principal Component Analysis (PCA)
Q: Is it possible to use fewer columns to represent this dataframe?

In [None]:
# make_blobs returns (points, labels); only the 2-D points are needed here.
blob_points, _ = make_blobs(centers=2, random_state=320)

# A and B are the blob coordinates; C and D are derived from them,
# so they add columns without adding new information.
df = pd.DataFrame(blob_points, columns=["A", "B"]).assign(
    C=lambda frame: frame["A"] * 2,
    D=lambda frame: frame["A"] - frame["B"],
)
df.head()

A: Yes. C is two times A and D is A - B, so we only need A & B, plus their relationships to C & D, to represent the dataframe.

# PCA on two columns

In [None]:
# scatter plot of column A against column B
df.plot.scatter(x="A", y="B")

## sklearn.decomposition.PCA

In [None]:
# Fit PCA on columns A and B.
p = PCA()
# W: the data re-expressed as positions along the components
W = p.fit_transform(df[["A", "B"]])
# C: the component matrix, one row per component
C = p.components_

In [None]:
# PCA will first find the mean of each column (the data is centered on it);
# mean_ holds that point
mean_point = p.mean_
mean_point

In [None]:
# sanity check: the column means computed by pandas should match p.mean_
df[["A", "B"]].mean()

In [None]:
# scatter A vs. B and mark the mean point with a large red X
df.plot.scatter(x="A", y="B")
plt.plot(*mean_point, marker="X", markersize=20, color="red")

C is called the **component matrix** \
first row of C is the most important component \
second row of C is the second most important component \
and so on ...

Each row gives the direction of a component; for 2D data, the slope of the component's line is `row[1] / row[0]`.

In [None]:
# two components for 2d data: one row per component,
# one column per original feature (A, B)
C

For the first component, PCA fits a line that crosses the mean point and 
along which the points are spread out the most. \
The second component is perpendicular to the first, also crosses the mean point, 
and has the largest remaining spread in its direction. 

In [None]:
# plot first component
df.plot.scatter("A", "B")
plt.plot(mean_point[0], mean_point[1], marker="X", markersize=20, color="red")

# The first row of C is the direction of the first component; in 2D its
# slope is dy/dx.  NOTE: this assumes the component is not vertical
# (C[0][0] != 0).
slope = C[0][1] / C[0][0]
span = 6  # how far the line extends to each side of the mean along the A axis

# Dedicated names on purpose: calling these x / y would overwrite the arrays
# created for the factorization example and break those cells on re-run.
line_x = [mean_point[0] + span, mean_point[0] - span]
line_y = [mean_point[1] + slope * span, mean_point[1] - slope * span]
plt.plot(line_x, line_y, linestyle="-", color="red")

First column of W represents relative positions of points along the first component \
Second column of W represents relative positions of points along the second component \
and so on ...

In [None]:
# first 10 rows of W: each row is one point's position along the components
W[:10]

In [None]:
# W has one row per data point; C has one row per component
print(W.shape, C.shape)

In [None]:
# shape of the original A & B data, for comparison with W and C
print(df[["A", "B"]].shape)

In [None]:
# Rebuild the original A & B columns from W and C: W @ C maps the positions
# along the components back to the centered data, and adding the mean
# moves it back to the original location.
reconstructed_ab = W @ C + p.mean_
pd.DataFrame(reconstructed_ab).head()

In [None]:
# the original A & B columns, to compare with the reconstruction
df[["A", "B"]].head()

In [None]:
# Approximate A & B using only the first component:
# first column of W (positions along the first component)
# times the first row of C (the first component itself), plus the mean.
first_positions = W[:, :1]
first_component = C[:1, :]
approx_ab = first_positions @ first_component + p.mean_
pd.DataFrame(approx_ab).head()

## Explained Variance

In [None]:
# a: the values we would like to explain
a = np.array((1.1, 1.9, 3.2), dtype=float)
a

In [None]:
# b: a simple approximation of a (the integers 1, 2, 3)
b = np.arange(1, 4)
b

In [None]:
# the residual: what is left of a after approximating it with b
a - b

In [None]:
# total variance of a
a.var()

In [None]:
# variance of the residual, i.e. the part of a's variance that b does not explain
(a - b).var()

In [None]:
# fraction of a's variance explained by b:
# 1 - (residual variance / total variance)
1 - (a - b).var() / a.var()

In [None]:
# the amount of variance explained by each component:
# the first component has the largest explained variance,
# the second component has the second largest,
# and so on
explained_variance = p.explained_variance_
explained_variance

In [None]:
# divide by the total to get each component's share of the explained variance
explained_variance / explained_variance.sum()

In [None]:
# explained variance as a fraction of the total, as reported by PCA itself
p.explained_variance_ratio_

# PCA on two dependent columns

In [None]:
# Refit PCA, this time on A and C (C is exactly 2 * A).
# Note that p, W and C are rebound here: from now on C is the component
# matrix, while df["C"] is still the data column.
p = PCA()
W = p.fit_transform(df[["A", "C"]])
C = p.components_

In [None]:
# the mean point of columns A and C found by PCA
mean = p.mean_

In [None]:
# scatter A vs. C and mark the mean point with a large red X
mean_point = [mean[0], mean[1]]
df.plot.scatter(x="A", y="C")
plt.plot(mean_point[0], mean_point[1], marker="X", markersize=20, color="red")

In [None]:
# plot the first component
df.plot.scatter("A", "C")
mean_point = [mean[0], mean[1]]
plt.plot(mean_point[0], mean_point[1], marker="X", markersize=20, color="red")

# The first row of C (the component matrix) is the direction of the first
# component; in 2D its slope is dy/dx.  NOTE: this assumes the component is
# not vertical (C[0][0] != 0).
slope = C[0][1] / C[0][0]
span = 6  # how far the line extends to each side of the mean along the A axis

# Dedicated names on purpose: calling these x / y would overwrite the arrays
# created for the factorization example and break those cells on re-run.
line_x = [mean_point[0] + span, mean_point[0] - span]
line_y = [mean_point[1] + slope * span, mean_point[1] - slope * span]
plt.plot(line_x, line_y, linestyle="-", color="red")

In [None]:
# variance explained by each component of the A & C fit
p.explained_variance_

In [None]:
# note that the first component explains 100% of the variance:
# because column C is two times column A, every point lies on one line,
# and the first component captures that 2x relationship in its slope
p.explained_variance_ratio_

In [None]:
# A & C can be rebuilt from a single component:
# positions along the first component times that component, plus the mean
first_positions = W[:, :1]
first_component = C[:1, :]
reconstructed_ac = first_positions @ first_component + p.mean_
pd.DataFrame(reconstructed_ac).head()

In [None]:
# the original A & C columns, to compare with the reconstruction
df[["A", "C"]].head()

# PCA on all columns

In [None]:
# Refit PCA on all four columns (A, B, C, D); p, W and C are rebound again.
p = PCA()
W = p.fit_transform(df)
C = p.components_

In [None]:
# four components for 4d data: one row per component, one column per feature
C.shape

In [None]:
# variance explained by each of the four components
p.explained_variance_

In [None]:
# note that the first two components explain (essentially) 100% of the variance:
# columns C and D were built from A and B, so they add no new information
ev_ratio = p.explained_variance_ratio_
ev_ratio

In [None]:
# The whole dataframe can be rebuilt from just the first two components:
# first two columns of W times first two rows of C, plus the mean.
top_two_positions = W[:, :2]
top_two_components = C[:2, :]
reconstructed_all = top_two_positions @ top_two_components + p.mean_
pd.DataFrame(reconstructed_all).head()

In [None]:
# the original dataframe, to compare with the two-component reconstruction
df.head()

### Cumulative plot of explained variance ratio

In [None]:
# cumsum() computes the running total, so entry k is the share of variance
# explained by the first k components together
cumulative_ratio = p.explained_variance_ratio_.cumsum()

# Number the components 1..n from the data itself rather than hard-coding 4,
# so the cell keeps working when PCA is fit with a different number of components.
s = pd.Series(cumulative_ratio, index=range(1, len(cumulative_ratio) + 1))
ax = s.plot.line(ylim=0)
ax.set_ylabel("Explained Variance")
ax.set_xlabel("Component")

# Dimensionality Reduction on Feature Columns

In [None]:
pipe = Pipeline([
    # n_components parameter:
    # an int is the number of components to use;
    # a float is how much variance we want to explain (explained_variance_ratio_)
    ("pca", PCA(2)),
    # A fixed random_state and an explicit n_init keep the cluster labels
    # (and therefore the plot colors) the same on every run.
    ("km", KMeans(2, n_init=10, random_state=320)),
])

pipe.fit(df)  # fit PCA, transform using PCA, fit KMeans using output from PCA

groups = pipe.predict(df)  # transform using PCA, then assign each row to a KMeans cluster

In [None]:
# Plot the data in PCA space, colored by cluster label.
# vmin=-1 starts the color scale below the smallest label (0); per the
# original note, -1 maps to white, so both clusters get a visible color.
projected = pd.DataFrame(pipe["pca"].transform(df))
projected.plot.scatter(x=0, y=1, c=groups, vmin=-1)

# Lossy Compression

Use PCA to keep the most important information in the data and throw away the less important parts.

In [None]:
# read the image file into an array (the path is relative, so bug.jpeg
# must be in the notebook's working directory) and display it
img = plt.imread("bug.jpeg")
plt.imshow(img)

In [None]:
# the last axis (axis 2) is the color dimension
img.shape

In [None]:
# Average over the color dimension so the image becomes a single 2-D matrix,
# which is easier to handle.  The ndim guard keeps the cell safe to re-run
# (img is overwritten with the 2-D result, and averaging axis 2 again would
# raise) and lets an image that is already 2-D pass through unchanged.
if img.ndim == 3:
    img = img.mean(axis=2)
img.shape

In [None]:
# show the averaged (single-channel) image with a gray colormap
plt.imshow(img, cmap="gray")

In [None]:
# we want to explain 95% of the variance: a float n_components asks PCA for
# that share of explained variance instead of a fixed number of components
p = PCA(0.95)
# each image row is treated as one sample; W, C and m together are the
# compressed representation of the image
W = p.fit_transform(img)
C = p.components_
m = p.mean_

In [None]:
# number of values stored in the uncompressed image
original_size = img.size
original_size

In [None]:
# number of values needed for the compressed form: W, C and the mean together
compressed_size = sum(part.size for part in (W, C, m))
compressed_size

In [None]:
# compression ratio: how many times fewer values the compressed form stores
original_size / compressed_size

In [None]:
# rebuild the image from its compressed pieces (W @ C + m) and display it;
# this is an approximation, since only part of the variance was kept
plt.imshow(W @ C + m, cmap="gray")

In [None]:
# savez saves numpy arrays into .npz format
# open the file with "wb" to write in binary mode
# img1.npz holds the full (uncompressed) image matrix
with open("img1.npz", "wb") as f: 
    np.savez(f, img)

In [None]:
# img2.npz holds the compressed form: W, C and the mean, saved in that order
with open("img2.npz", "wb") as f: 
    np.savez(f, W, C, m)

In [None]:
# Arrays passed to savez positionally are stored under the keys
# arr_0, arr_1, arr_2, in the order they were given (W, C, m).
with np.load("img2.npz") as f:
    W, C, m = (f[key] for key in ("arr_0", "arr_1", "arr_2"))

In [None]:
# rebuild and display the image from the arrays loaded back from img2.npz
plt.imshow(W @ C + m, cmap="gray")

In [None]:
# in the author's run, the original image file (img1.npz) was about 33M vs. about 876K for the compressed one (img2.npz)
!ls -lh