<a href="https://colab.research.google.com/github/pawel0508/MachineLearning_UcznieNienadzorowane/blob/main/01_pca_math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Global NumPy print settings for the whole notebook: 8 decimal places,
# no scientific notation for small values, wide lines, and 5 items kept at
# each edge when an array is summarized.
np.set_printoptions(
    linewidth=200,
    edgeitems=5,
    suppress=True,
    precision=8,
)


Wczytanie danych (zbiór Iris)

In [5]:
from sklearn.datasets import load_iris

# Load the Iris dataset bundled with scikit-learn and pull out the pieces used
# below: the measurement matrix, the numeric class labels and the column names.
raw_data = load_iris()
data, target = raw_data.data, raw_data.target
feature_names = [*raw_data.feature_names]



In [9]:
# Turn the raw labels into identifier-style column names: drop the ' (cm)'
# unit suffix and replace the remaining spaces with underscores.
# The names are rebuilt from raw_data on every run and the suffix is removed
# explicitly, so re-executing this cell cannot keep trimming characters the way
# a positional [:-5] slice applied to the already-cleaned list would.
feature_names = [
    name.replace(' (cm)', '').replace(' ', '_')
    for name in raw_data['feature_names']
]

In [14]:
# Assemble the working table: one column per measurement plus a 'class' column
# holding the species name.
# The species names are looked up in the dataset's own target_names instead of
# gluing the labels onto the float matrix with np.c_ (which turned the integer
# codes into floats) and translating them through a hand-written
# {0.0: ..., 1.0: ..., 2.0: ...} map, where any unmatched code would silently
# become NaN.
df = pd.DataFrame(data=data, columns=feature_names, copy=True)
df['class'] = raw_data['target_names'][target]
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Wizualizacja danych

In [15]:
# Interactive 3-D scatter of three of the measurements; each species gets its
# own colour and marker symbol.
px.scatter_3d(
    df,
    x='sepal_length',
    y='petal_length',
    z='petal_width',
    color='class',
    symbol='class',
    opacity=0.5,
    title='Iris data - wizualizacja 3D (sepal_length, petal_length, petal_width)',
    template='simple_white',
    width=950,
    height=700,
)

In [19]:
from sklearn.preprocessing import StandardScaler
# Select the three measurements by column label rather than by position
# ([0, 2, 3] / -1), so the selection is self-describing and does not silently
# pick other columns if the layout of df changes. These are the same three
# features drawn in the 3-D scatter plot.
x = df[['sepal_length', 'petal_length', 'petal_width']]
y = df['class']

# Standardize the selected features with StandardScaler before running PCA,
# then preview the first ten standardized rows.
scaler = StandardScaler()
x_std = scaler.fit_transform(x)
x_std[:10]


array([[-0.90068117, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.34022653, -1.3154443 ],
       [-1.38535265, -1.39706395, -1.3154443 ],
       [-1.50652052, -1.2833891 , -1.3154443 ],
       [-1.02184904, -1.34022653, -1.3154443 ],
       [-0.53717756, -1.16971425, -1.05217993],
       [-1.50652052, -1.34022653, -1.18381211],
       [-1.02184904, -1.2833891 , -1.3154443 ],
       [-1.74885626, -1.34022653, -1.3154443 ],
       [-1.14301691, -1.2833891 , -1.44707648]])

Implementacja PCA

In [21]:
# Covariance matrix of the standardized features. Transposing puts one feature
# per row, which is the layout np.cov expects by default (rowvar=True).
cov_mat = np.cov(x_std.T)
cov_mat

array([[1.00671141, 0.87760447, 0.82343066],
       [0.87760447, 1.00671141, 0.96932762],
       [0.82343066, 0.96932762, 1.00671141]])

Wyznaczanie wartości i wektorów własnych

In [28]:
# Eigen-decomposition of the covariance matrix; column eig_vecs[:, i] is the
# eigenvector that belongs to eig_vals[i].
# NOTE(review): cov_mat is symmetric, so np.linalg.eigh would also apply here;
# it returns eigenvalues in ascending order and may pick different vector
# signs, so confirm the downstream cells before switching.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

# Print both results as plain text (labels are in Polish).
print(f'Wartości własne macierzy kowariancji:\n{eig_vals}\n')
print(f'Wektory własne macierzy kowariancji:\n{eig_vecs}')


Wartości własne macierzy kowariancji:
[2.78833033 0.20075012 0.03105378]

Wektory własne macierzy kowariancji:
[[ 0.55964149  0.81270446  0.16221241]
 [ 0.59148855 -0.2546058  -0.76506024]
 [ 0.58046765 -0.52410624  0.62319335]]


Sortowanie wektorów według wartości własnych

In [30]:
# Pair every eigenvalue (absolute value) with its eigenvector (column i of
# eig_vecs) and order the pairs from the largest eigenvalue to the smallest.
# Only the eigenvalue is used as the sort key: comparing the bare tuples would
# fall through to comparing the NumPy vectors whenever two eigenvalues are
# equal, which raises "The truth value of an array ... is ambiguous".
eig_pairs = sorted(
    ((np.abs(eig_val), eig_vecs[:, i]) for i, eig_val in enumerate(eig_vals)),
    key=lambda pair: pair[0],
    reverse=True,
)
eig_pairs

[(2.7883303296752913, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343807, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449190244, array([ 0.16221241, -0.76506024,  0.62319335]))]

Obliczanie wartości procentowej wyjaśnianej wariancji

In [31]:
# Share of the total variance carried by each component, largest first:
# every eigenvalue divided by the sum of all eigenvalues.
total = sum(eig_vals)
explained_variance_ratio = [
    eig_val / total
    for eig_val in sorted(eig_vals, reverse=True)
]
explained_variance_ratio

[0.9232471536035964, 0.06647059464767172, 0.010282251748731881]

In [32]:
# Running total of the explained-variance shares, in the same (descending)
# order as explained_variance_ratio.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

In [33]:

# Summary table: per component, its share of the variance, the running total
# of those shares, and a 1-based component number.
results = pd.DataFrame({'explained_variance_ratio': explained_variance_ratio})
results['cumulative'] = results.explained_variance_ratio.cumsum()
results['component'] = range(1, len(results) + 1)
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.923247,0.923247,1
1,0.066471,0.989718,2
2,0.010282,1.0,3


In [34]:
# Bar trace: variance share of each component.
ratio_bars = go.Bar(
    x=results['component'],
    y=results['explained_variance_ratio'],
    name='explained variance ratio',
)
# Line trace: cumulative share across components.
cumulative_line = go.Scatter(
    x=results['component'],
    y=results['cumulative'],
    name='cumulative explained variance',
)

fig = go.Figure(
    data=[ratio_bars, cumulative_line],
    layout=go.Layout(title='PCA - 3 components', width=950, template='simple_white'),
)
fig.show()

In [35]:
# Show the sorted (eigenvalue, eigenvector) pairs again before the projection
# matrix is assembled from them.
eig_pairs

[(2.7883303296752913, array([0.55964149, 0.59148855, 0.58046765])),
 (0.20075011806343807, array([ 0.81270446, -0.2546058 , -0.52410624])),
 (0.031053780449190244, array([ 0.16221241, -0.76506024,  0.62319335]))]

In [37]:
# Projection matrix W: the eigenvectors belonging to the two largest
# eigenvalues, placed side by side as columns.
# Stacking the 1-D vectors directly removes the hard-coded reshape(3, 1), so
# the cell no longer assumes exactly three input features.
W = np.column_stack([eig_vec for _, eig_vec in eig_pairs[:2]])
W

array([[ 0.55964149,  0.81270446],
       [ 0.59148855, -0.2546058 ],
       [ 0.58046765, -0.52410624]])

In [38]:
# Project the standardized samples onto the two selected eigenvectors.
X_pca = x_std @ W

In [40]:
# Preview the first ten projected samples.
X_pca[:10]

array([[-2.06036006,  0.2986744 ],
       [-2.1959812 ,  0.10172707],
       [-2.36522102, -0.08074913],
       [-2.36579421, -0.20816508],
       [-2.12817063,  0.20020073],
       [-1.60325585,  0.4127035 ],
       [-2.32300467, -0.26268319],
       [-2.09455194,  0.1857296 ],
       [-2.53503403, -0.39064128],
       [-2.23877073,  0.15624518]])

In [44]:
# Table of the hand-computed projection together with the species label.
# The second component is stored with its sign flipped.
# NOTE(review): presumably this flip is there so the orientation matches the
# scikit-learn result computed further down - confirm.
pca_df = pd.DataFrame({
    'pca_1': X_pca[:, 0],
    'pca_2': -X_pca[:, 1],
    'class': df['class'],
})
pca_df

Unnamed: 0,pca_1,pca_2,class
0,-2.060360,-0.298674,setosa
1,-2.195981,-0.101727,setosa
2,-2.365221,0.080749,setosa
3,-2.365794,0.208165,setosa
4,-2.128171,-0.200201,setosa
...,...,...,...
145,1.906692,0.124424,virginica
146,1.262579,0.213420,virginica
147,1.541846,0.114404,virginica
148,1.634876,0.645735,virginica


In [45]:
# 2-D scatter of the two principal components, coloured by species.
px.scatter(
    data_frame=pca_df,
    x='pca_1',
    y='pca_2',
    color='class',
    template='simple_white',
    width=950,
)

PCA z użyciem scikit-learn

In [46]:
from sklearn.decomposition import PCA

# The same two-component reduction done with scikit-learn's PCA on the
# standardized data. Note that pca_df is reassigned here and replaces the
# hand-computed table of the same name.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_std)
pca_df = pd.DataFrame({
    'pca_1': x_pca[:, 0],
    'pca_2': x_pca[:, 1],
    'class': df['class'],
})
pca_df


Unnamed: 0,pca_1,pca_2,class
0,-2.060360,-0.298674,setosa
1,-2.195981,-0.101727,setosa
2,-2.365221,0.080749,setosa
3,-2.365794,0.208165,setosa
4,-2.128171,-0.200201,setosa
...,...,...,...
145,1.906692,0.124424,virginica
146,1.262579,0.213420,virginica
147,1.541846,0.114404,virginica
148,1.634876,0.645735,virginica
