In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.set_printoptions(precision=5)

import pandas as pd
pd.set_option('display.precision', 5)

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42)

In [3]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
pca = PCA(n_components=2)
pca.fit(X_train_scaled)

In [5]:
X_train_pca=pca.transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)

print("Original shape: {}".format(str(X_train_scaled.shape)))
print("Reduced shape: {}".format(str(X_train_pca.shape)))

Original shape: (426, 30)
Reduced shape: (426, 2)


In [6]:
pd.DataFrame(X_test_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.4681,-0.14171,-0.44468,-0.48598,0.29337,0.06406,-0.0945,-0.25211,0.46573,0.1556,...,-0.26002,-0.144,-0.32803,-0.34684,0.49138,-0.06654,-0.00301,-0.17331,0.22117,0.23656
1,1.36446,0.49959,1.30644,1.33441,-0.39172,0.00765,0.26146,0.84,-0.81474,-1.10777,...,1.81075,0.17826,1.78633,1.75755,-0.51309,-0.09469,0.00504,1.02943,-0.53162,-0.99406
2,0.37879,0.06653,0.40431,0.26397,0.97775,0.38502,0.75306,0.87596,0.48813,-0.64371,...,0.63822,0.08192,0.54526,0.49998,1.00871,-0.05582,0.56055,0.6021,-0.06661,-0.17972
3,-0.48793,-0.35942,-0.42903,-0.52558,0.70543,0.56593,-0.12813,-0.52237,0.04015,1.16546,...,-0.69763,-0.43138,-0.52297,-0.63396,0.59485,0.10167,-0.13786,-0.6053,-0.52349,0.58336
4,-0.73151,-1.12615,-0.70996,-0.70788,0.30699,0.18467,-0.25599,-0.57658,0.06629,0.72217,...,-0.82744,-0.96794,-0.85079,-0.73669,0.14219,-0.24078,-0.44278,-0.67554,-0.89257,-0.11423


In [7]:
pd.DataFrame(pca.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.21606,0.10257,0.22511,0.21883,0.14804,0.23929,0.25919,0.26246,0.1507,0.06038,...,0.22575,0.10635,0.2356,0.22353,0.13033,0.20797,0.23177,0.25148,0.125,0.12505
1,-0.23826,-0.05282,-0.22045,-0.23449,0.1737,0.15546,0.0588,-0.03834,0.17583,0.36351,...,-0.22336,-0.03836,-0.2024,-0.22168,0.16714,0.1527,0.09835,-0.00333,0.11991,0.28722


In [8]:
pd.DataFrame({'Z1': X_test_pca[:,0],
              'Z2': X_test_pca[:,0]}
            ).head()

Unnamed: 0,Z1,Z2
0,-0.68489,-0.68489
1,2.72594,2.72594
2,1.53861,1.53861
3,-0.80155,-0.80155
4,-1.55432,-1.55432


In [9]:
pca = PCA(n_components=0.95)
pca.fit(X_train_scaled)

In [10]:
X_train_pca=pca.transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)

print("Original shape: {}".format(str(X_train_scaled.shape)))
print("Reduced shape: {}".format(str(X_train_pca.shape)))

Original shape: (426, 30)
Reduced shape: (426, 10)


In [11]:
pca.explained_variance_ratio_

array([0.43736, 0.19531, 0.09618, 0.06483, 0.05181, 0.04118, 0.02252,
       0.01698, 0.01371, 0.01197])

In [12]:
sum(pca.explained_variance_ratio_)

0.9518619710973647

In [13]:
pca = PCA()
pca.fit(X_train_scaled)

In [14]:
X_train_pca=pca.transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)

print("Original shape: {}".format(str(X_train_scaled.shape)))
print("Reduced shape: {}".format(str(X_train_pca.shape)))

Original shape: (426, 30)
Reduced shape: (426, 30)


In [15]:
pca.explained_variance_ratio_

array([4.37365e-01, 1.95314e-01, 9.61800e-02, 6.48280e-02, 5.18071e-02,
       4.11845e-02, 2.25213e-02, 1.69848e-02, 1.37072e-02, 1.19706e-02,
       1.01161e-02, 9.01401e-03, 7.94309e-03, 5.20908e-03, 2.80842e-03,
       2.30760e-03, 1.96259e-03, 1.78970e-03, 1.61782e-03, 1.04904e-03,
       9.79526e-04, 8.89178e-04, 8.27362e-04, 5.64751e-04, 4.91393e-04,
       2.65341e-04, 2.26001e-04, 4.81653e-05, 2.48752e-05, 3.94240e-06])

In [16]:
sum(pca.explained_variance_ratio_)

1.0

In [17]:
sum(pca.explained_variance_ratio_[:10])

0.9518619710973647

In [18]:
pca.components_

array([[ 2.16062e-01,  1.02568e-01,  2.25108e-01,  2.18835e-01,
         1.48042e-01,  2.39289e-01,  2.59190e-01,  2.62463e-01,
         1.50702e-01,  6.03829e-02,  2.04403e-01,  3.11721e-02,
         2.09465e-01,  1.99112e-01,  2.67462e-02,  1.66810e-01,
         1.57091e-01,  1.84836e-01,  5.07846e-02,  1.00954e-01,
         2.25746e-01,  1.06346e-01,  2.35604e-01,  2.23532e-01,
         1.30331e-01,  2.07974e-01,  2.31769e-01,  2.51481e-01,
         1.25004e-01,  1.25050e-01],
       [-2.38263e-01, -5.28217e-02, -2.20454e-01, -2.34486e-01,
         1.73699e-01,  1.55455e-01,  5.88006e-02, -3.83365e-02,
         1.75833e-01,  3.63505e-01, -1.14842e-01,  9.28182e-02,
        -9.42456e-02, -1.56540e-01,  1.97244e-01,  2.36287e-01,
         2.00515e-01,  1.35343e-01,  1.54953e-01,  2.87742e-01,
        -2.23358e-01, -3.83620e-02, -2.02404e-01, -2.21678e-01,
         1.67145e-01,  1.52700e-01,  9.83546e-02, -3.33393e-03,
         1.19911e-01,  2.87219e-01],
       [-9.08387e-03,  5.17470

In [19]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42)

In [20]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_scaled, y_train)

In [22]:
y_train_hat = clf.predict(X_train_scaled)
print('train accuracy: %.5f'%accuracy_score(y_train, y_train_hat))

y_test_hat = clf.predict(X_test_scaled)
print('test accuracy: %.5f'%accuracy_score(y_test, y_test_hat))

train accuracy: 0.98357
test accuracy: 0.95804


In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X_train_scaled)

X_train_pca=pca.transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)
X_train_pca=pca.transform(X_train_scaled)
X_test_pca=pca.transform(X_test_scaled)

In [24]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_pca, y_train)

In [25]:
y_train_hat = clf.predict(X_train_pca)
print('train accuracy: %.5f'%accuracy_score(y_train, y_train_hat))
y_test_hat = clf.predict(X_test_pca)
print('test accuracy: %.5f'%accuracy_score(y_test, y_test_hat))

train accuracy: 0.95775
test accuracy: 0.96503


In [26]:
pca = PCA()
pca.fit(X_train_scaled)

In [27]:
X_test_pca = pca.transform(X_test_scaled)
X_test_rec = pca.inverse_transform(X_test_pca)

In [28]:
pd.DataFrame(X_test_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.4681,-0.14171,-0.44468,-0.48598,0.29337,0.06406,-0.0945,-0.25211,0.46573,0.1556,...,-0.26002,-0.144,-0.32803,-0.34684,0.49138,-0.06654,-0.00301,-0.17331,0.22117,0.23656
1,1.36446,0.49959,1.30644,1.33441,-0.39172,0.00765,0.26146,0.84,-0.81474,-1.10777,...,1.81075,0.17826,1.78633,1.75755,-0.51309,-0.09469,0.00504,1.02943,-0.53162,-0.99406
2,0.37879,0.06653,0.40431,0.26397,0.97775,0.38502,0.75306,0.87596,0.48813,-0.64371,...,0.63822,0.08192,0.54526,0.49998,1.00871,-0.05582,0.56055,0.6021,-0.06661,-0.17972
3,-0.48793,-0.35942,-0.42903,-0.52558,0.70543,0.56593,-0.12813,-0.52237,0.04015,1.16546,...,-0.69763,-0.43138,-0.52297,-0.63396,0.59485,0.10167,-0.13786,-0.6053,-0.52349,0.58336
4,-0.73151,-1.12615,-0.70996,-0.70788,0.30699,0.18467,-0.25599,-0.57658,0.06629,0.72217,...,-0.82744,-0.96794,-0.85079,-0.73669,0.14219,-0.24078,-0.44278,-0.67554,-0.89257,-0.11423


In [29]:
pd.DataFrame(X_test_rec).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.4681,-0.14171,-0.44468,-0.48598,0.29337,0.06406,-0.0945,-0.25211,0.46573,0.1556,...,-0.26002,-0.144,-0.32803,-0.34684,0.49138,-0.06654,-0.00301,-0.17331,0.22117,0.23656
1,1.36446,0.49959,1.30644,1.33441,-0.39172,0.00765,0.26146,0.84,-0.81474,-1.10777,...,1.81075,0.17826,1.78633,1.75755,-0.51309,-0.09469,0.00504,1.02943,-0.53162,-0.99406
2,0.37879,0.06653,0.40431,0.26397,0.97775,0.38502,0.75306,0.87596,0.48813,-0.64371,...,0.63822,0.08192,0.54526,0.49998,1.00871,-0.05582,0.56055,0.6021,-0.06661,-0.17972
3,-0.48793,-0.35942,-0.42903,-0.52558,0.70543,0.56593,-0.12813,-0.52237,0.04015,1.16546,...,-0.69763,-0.43138,-0.52297,-0.63396,0.59485,0.10167,-0.13786,-0.6053,-0.52349,0.58336
4,-0.73151,-1.12615,-0.70996,-0.70788,0.30699,0.18467,-0.25599,-0.57658,0.06629,0.72217,...,-0.82744,-0.96794,-0.85079,-0.73669,0.14219,-0.24078,-0.44278,-0.67554,-0.89257,-0.11423


In [30]:
pca = PCA(n_components=15)
pca.fit(X_train_scaled)

In [31]:
X_test_pca = pca.transform(X_test_scaled)
X_test_rec = pca.inverse_transform(X_test_pca)

In [32]:
pd.DataFrame(X_test_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.4681,-0.14171,-0.44468,-0.48598,0.29337,0.06406,-0.0945,-0.25211,0.46573,0.1556,...,-0.26002,-0.144,-0.32803,-0.34684,0.49138,-0.06654,-0.00301,-0.17331,0.22117,0.23656
1,1.36446,0.49959,1.30644,1.33441,-0.39172,0.00765,0.26146,0.84,-0.81474,-1.10777,...,1.81075,0.17826,1.78633,1.75755,-0.51309,-0.09469,0.00504,1.02943,-0.53162,-0.99406
2,0.37879,0.06653,0.40431,0.26397,0.97775,0.38502,0.75306,0.87596,0.48813,-0.64371,...,0.63822,0.08192,0.54526,0.49998,1.00871,-0.05582,0.56055,0.6021,-0.06661,-0.17972
3,-0.48793,-0.35942,-0.42903,-0.52558,0.70543,0.56593,-0.12813,-0.52237,0.04015,1.16546,...,-0.69763,-0.43138,-0.52297,-0.63396,0.59485,0.10167,-0.13786,-0.6053,-0.52349,0.58336
4,-0.73151,-1.12615,-0.70996,-0.70788,0.30699,0.18467,-0.25599,-0.57658,0.06629,0.72217,...,-0.82744,-0.96794,-0.85079,-0.73669,0.14219,-0.24078,-0.44278,-0.67554,-0.89257,-0.11423


In [33]:
pd.DataFrame(X_test_rec).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.44043,-0.16636,-0.43065,-0.4192,0.3391,-0.09056,-0.14773,-0.20787,0.47651,0.24501,...,-0.35049,-0.10202,-0.34731,-0.33085,0.43354,-0.02323,-0.02201,-0.09845,0.21848,0.22992
1,1.50463,0.40364,1.45823,1.51384,-0.46949,0.07078,0.29091,0.83497,-0.84738,-1.07968,...,1.64288,0.30354,1.58081,1.68036,-0.42491,-0.16382,-0.01147,0.81188,-0.44343,-0.88953
2,0.5468,0.09063,0.54826,0.50727,1.03817,0.22914,0.62185,0.7077,0.47771,-0.40944,...,0.47322,0.06218,0.45585,0.40626,0.91313,0.0829,0.57493,0.63553,0.00885,-0.16753
3,-0.59842,-0.39602,-0.55741,-0.555,0.78447,0.37577,-0.00591,-0.23389,-0.02099,1.11289,...,-0.66108,-0.38113,-0.63755,-0.59781,0.47486,0.06309,-0.16353,-0.50791,-0.43804,0.52971
4,-0.79276,-1.17839,-0.75623,-0.73185,0.28641,0.11142,-0.19221,-0.4634,0.08143,0.69596,...,-0.83627,-0.89944,-0.79441,-0.71507,0.20497,-0.26847,-0.41281,-0.70748,-0.92676,-0.19273


In [34]:
pca = PCA(n_components=3)
pca.fit(X_train_scaled)

In [35]:
X_test_pca = pca.transform(X_test_scaled)
X_test_rec = pca.inverse_transform(X_test_pca)

In [36]:
pd.DataFrame(X_test_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.4681,-0.14171,-0.44468,-0.48598,0.29337,0.06406,-0.0945,-0.25211,0.46573,0.1556,...,-0.26002,-0.144,-0.32803,-0.34684,0.49138,-0.06654,-0.00301,-0.17331,0.22117,0.23656
1,1.36446,0.49959,1.30644,1.33441,-0.39172,0.00765,0.26146,0.84,-0.81474,-1.10777,...,1.81075,0.17826,1.78633,1.75755,-0.51309,-0.09469,0.00504,1.02943,-0.53162,-0.99406
2,0.37879,0.06653,0.40431,0.26397,0.97775,0.38502,0.75306,0.87596,0.48813,-0.64371,...,0.63822,0.08192,0.54526,0.49998,1.00871,-0.05582,0.56055,0.6021,-0.06661,-0.17972
3,-0.48793,-0.35942,-0.42903,-0.52558,0.70543,0.56593,-0.12813,-0.52237,0.04015,1.16546,...,-0.69763,-0.43138,-0.52297,-0.63396,0.59485,0.10167,-0.13786,-0.6053,-0.52349,0.58336
4,-0.73151,-1.12615,-0.70996,-0.70788,0.30699,0.18467,-0.25599,-0.57658,0.06629,0.72217,...,-0.82744,-0.96794,-0.85079,-0.73669,0.14219,-0.24078,-0.44278,-0.67554,-0.89257,-0.11423


In [37]:
pd.DataFrame(X_test_rec).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.30953,-0.14359,-0.30272,-0.33479,0.10948,-0.00481,-0.14094,-0.18303,0.0537,0.2288,...,-0.27531,-0.05717,-0.26697,-0.2976,0.22328,0.12411,0.02853,-0.04962,0.19335,0.2716
1,1.63067,0.52049,1.57722,1.62859,-0.38056,-0.04161,0.45033,0.87704,-0.36794,-1.43095,...,1.58375,0.44656,1.51878,1.57699,-0.42832,-0.14392,0.1698,0.66698,-0.23592,-0.95795
2,0.58494,0.13934,0.58184,0.53378,0.2284,0.3099,0.32994,0.48953,0.1203,-0.24534,...,0.64505,0.28697,0.63977,0.59037,0.4186,0.48157,0.49159,0.63822,0.4574,0.21039
3,-0.78036,-0.19852,-0.74268,-0.76032,0.27858,0.17818,-0.0563,-0.31915,0.30891,0.86638,...,-0.76514,-0.20333,-0.72007,-0.74691,0.22432,0.14293,0.00606,-0.27097,0.10898,0.55245
4,-0.93178,-0.21116,-0.9033,-0.87112,0.00495,-0.09805,-0.24874,-0.55251,0.12484,0.76462,...,-0.97541,-0.35051,-0.94022,-0.91433,-0.21117,-0.29047,-0.3725,-0.6667,-0.31773,0.17599
