# PCA

In [1]:
from sklearn.datasets import load_digits

digits, y = load_digits(return_X_y=True)
digits.shape

(1797, 64)

In [2]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2D = pca.fit_transform(digits)

In [3]:
pca.components_

array([[ 7.31744676e-18, -1.73094669e-02, -2.23428836e-01,
        -1.35913304e-01, -3.30323263e-02, -9.66341379e-02,
        -8.32947741e-03,  2.26899528e-03, -3.20516336e-04,
        -1.19308905e-01, -2.44451683e-01,  1.48512706e-01,
        -4.67319864e-02, -2.17740747e-01, -1.48136980e-02,
         4.47779306e-03, -4.94136179e-05, -7.95419362e-02,
         8.33951266e-02,  2.15915312e-01, -1.72126814e-01,
        -1.63712057e-01,  2.86444580e-02,  4.23251919e-03,
         9.85488669e-05,  6.42319253e-02,  2.54093320e-01,
        -3.56771044e-02, -2.09462568e-01, -4.31311395e-02,
         5.13118828e-02,  2.13422764e-04,  0.00000000e+00,
         1.59950896e-01,  3.68690791e-01,  1.64406851e-01,
         8.52008006e-02,  3.72982255e-02,  2.15866787e-02,
         0.00000000e+00,  1.28865582e-03,  1.06945288e-01,
         3.03067460e-01,  2.47813003e-01,  2.09637326e-01,
         1.22324935e-02, -3.69458823e-02,  1.61484953e-03,
         6.93023297e-04, -8.35144374e-03, -5.58599067e-0

## 因子寄与率

In [4]:
pca.explained_variance_ratio_

array([0.14890594, 0.13618771])

## 適切な次数の選択

In [5]:
import numpy as np

pca = PCA()
pca.fit(digits)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d

29

In [6]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(digits)
print(X_reduced.shape)
X_reduced

(1797, 29)


array([[ -1.25946645,  21.27488348,  -9.46305462, ...,   3.67072108,
         -0.9436689 ,  -1.13250195],
       [  7.9576113 , -20.76869896,   4.43950604, ...,   2.18261819,
         -0.51022719,   2.31354911],
       [  6.99192297,  -9.95598641,   2.95855808, ...,   4.22882114,
          2.1576573 ,   0.8379578 ],
       ...,
       [ 10.8012837 ,  -6.96025223,   5.59955453, ...,  -3.56866194,
          1.82444444,   3.53885886],
       [ -4.87210009,  12.42395362, -10.17086635, ...,   3.25330054,
          0.95484174,  -0.93895602],
       [ -0.34438963,   6.36554919,  10.77370849, ...,  -3.01636722,
          1.29752723,   2.58810313]])

## 圧縮のためのPCA

In [7]:
X_recovered = pca.inverse_transform(X_reduced)
print(X_recovered.shape)
X_recovered

(1797, 64)


array([[ 4.18592165e-16,  1.33767782e-01,  5.63152277e+00, ...,
         1.85964222e-01, -1.20577212e+00, -3.66920147e-01],
       [-1.06138208e-16, -1.22739900e-01,  1.38809014e-01, ...,
         8.02843002e+00,  1.26699313e+00,  9.88032910e-01],
       [-6.46339140e-17,  2.57737337e-01,  5.62138359e-01, ...,
         1.56032837e+01,  8.74160970e+00,  1.28370760e+00],
       ...,
       [-9.43354464e-17, -3.29326197e-01,  8.19751265e-01, ...,
         5.30083425e+00, -7.52796422e-02, -3.42569422e-01],
       [ 1.49406226e-16, -2.86793118e-01,  3.86722104e+00, ...,
         1.10058818e+01,  1.13105112e+00,  6.21574304e-01],
       [-1.20192377e-16,  4.85982909e-01,  8.39476464e+00, ...,
         1.10743005e+01,  2.51424465e+00, -1.43283183e-01]])

## 追加学習型PCA

In [8]:
from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=10)
for X_batch in np.array_split(digits, n_batches):
    inc_pca.partial_fit(X_batch)
    
X_reduced = inc_pca.transform(digits)
print(X_reduced.shape)
X_reduced

(1797, 10)


array([[ -1.09934412, -21.3298656 ,   9.51025817, ...,  -3.46225129,
          2.08098498,  -2.89229304],
       [  7.76837085,  20.77183848,  -4.47235979, ...,   4.04664807,
         -4.68367924,  -0.30021956],
       [  6.8595604 ,   9.99186342,  -2.97079741, ...,  16.24524062,
         -1.99558633,  -4.40854958],
       ...,
       [ 10.67276519,   6.97978733,  -5.65838529, ...,   6.98527037,
          2.27146201,  14.40875716],
       [ -4.75620571, -12.42200388,  10.16221789, ...,   3.72202565,
         -6.01439249,  13.54765337],
       [ -0.34426331,  -6.36423255, -10.77220158, ...,  -0.64782913,
          3.56338395,  14.20541496]])

## ランダム化PCA

In [9]:
rnd_pca = PCA(n_components=10, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(digits)
print(X_reduced.shape)
X_reduced

(1797, 10)


array([[ -1.25954027,  21.27483606,  -9.46308565, ...,   2.55625694,
         -0.56488014,  -3.59689194],
       [  7.95761743, -20.76870749,   4.43943625, ...,  -4.61968138,
          3.57055194,   1.07899353],
       [  6.99185934,  -9.95601082,   2.95881611, ..., -16.41601934,
          0.77223983,  -4.30651446],
       ...,
       [ 10.80131705,  -6.96026134,   5.59946881, ...,  -7.41505324,
         -3.98158082,  13.0726427 ],
       [ -4.87212741,  12.42394772, -10.17086783, ...,  -4.35900069,
          3.94507329,  13.15840645],
       [ -0.34442851,   6.36551379,  10.77383729, ...,   0.67039127,
         -4.08118934,  12.54389734]])

# カーネルPCA

In [10]:
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.4)
X_reduced = rbf_pca.fit_transform(digits)
X_reduced

array([[0.00078743, 0.00078816],
       [0.00078743, 0.00078817],
       [0.00078743, 0.00078816],
       ...,
       [0.00078743, 0.00078817],
       [0.00078743, 0.00078817],
       [0.00078743, 0.00078816]])

## カーネルの選択とハイパーパラメータのチューニング

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())
])

#print(clf.get_params().keys())

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(digits, y)

print(grid_search.best_params_)

  return np.dot(K, self.alphas_ / np.sqrt(self.lambdas_))


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [14]:
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433, fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(digits)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [15]:
from sklearn.metrics import mean_squared_error

mean_squared_error(digits, X_preimage)

18.76204117780906

# LLE

In [16]:
from sklearn.datasets import make_swiss_roll

X, t = make_swiss_roll()
print(X)

from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)
X_reduced

[[  0.62839068  17.85357793  -4.80157643]
 [ -9.47481609   9.78556089  -1.21135626]
 [ 12.59776919  19.96494821   1.45067008]
 [ -6.25312759  19.97581368  -8.24321789]
 [ 10.74375526  19.70558505  -5.54428098]
 [ -9.19638993   9.92369097  -3.29484238]
 [ -8.50620138  11.68665041   3.1486646 ]
 [ -2.72619193  17.87425323   7.72633213]
 [  2.47652052   0.34815657   7.09872609]
 [ 10.15926974  10.01649085  -6.39518504]
 [ -1.77804981   3.4760707    7.87780194]
 [ -0.49190243   6.44706811   7.90086276]
 [  1.79947607  18.42215503  -4.74511282]
 [  8.42598069   2.54765069  -8.24890737]
 [ -5.15150223   0.27440335   6.76692547]
 [  5.779544     7.97520064  -9.9665505 ]
 [  4.73867799   1.53428044  -3.15911944]
 [  6.35686529   6.1515876    1.21784827]
 [  3.7347992    2.02664865  -3.98944417]
 [  4.93775266   3.78233532 -10.32148927]
 [ -7.86275315  15.46091533   4.23496086]
 [  0.49141576   1.22115409  -4.78948979]
 [  2.86395746   9.17619648 -10.88236212]
 [ -6.88241865   4.73729912   5.41

array([[ 0.02086027,  0.10419142],
       [ 0.00789076,  0.08908699],
       [ 0.09757728, -0.03097202],
       [ 0.00848594,  0.22299379],
       [ 0.01683858,  0.02162441],
       [-0.01465016,  0.10093649],
       [ 0.07079182,  0.0759465 ],
       [ 0.16796525,  0.06516458],
       [ 0.04588947, -0.15645406],
       [-0.08739933, -0.06993821],
       [ 0.07657438, -0.09209523],
       [ 0.09384775, -0.07448427],
       [ 0.02639367,  0.09848378],
       [-0.18184731, -0.12166119],
       [ 0.0443592 , -0.08859262],
       [-0.14044756, -0.02975697],
       [-0.10932941, -0.11635815],
       [-0.01168792, -0.10536484],
       [-0.11375938, -0.09713141],
       [-0.18611776, -0.06461649],
       [ 0.10998899,  0.10637846],
       [-0.1233327 , -0.06830231],
       [-0.13169793,  0.01810998],
       [ 0.05351669, -0.0223249 ],
       [ 0.01134951,  0.13613416],
       [ 0.14279475, -0.03921338],
       [ 0.02658593, -0.06295197],
       [-0.13739229,  0.04249344],
       [ 0.07987074,