In [1]:
import pandas as pd
import numpy as np

<h4 class="text-center"> PCA (Linear Dimensionality Reduction)</h4>

In [3]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

In [4]:
##Dataset
data = datasets.load_digits()

In [9]:
len(data.feature_names) #we got 64 features

64

In [11]:
raw_data = data.data

In [13]:
raw_data.shape #rows and features

(1797, 64)

In [14]:
scaler = StandardScaler()

In [16]:
scale_feature = scaler.fit_transform(raw_data)

<span class="badge"> PCA </span>

In [18]:
from sklearn.decomposition import PCA

In [23]:
#retain 99% variance
pca = PCA(n_components=0.99, whiten=True)

In [24]:
pca

PCA(n_components=0.99, whiten=True)

In [25]:
#conduct PCA

In [26]:
featured_pca = pca.fit_transform(scale_feature) 

In [29]:
featured_pca.shape, scale_feature.shape

((1797, 54), (1797, 64))

In [30]:
# Number of features dropped by PCA, derived from the actual array shapes
# so the value stays correct if the variance target or dataset changes.
scale_feature.shape[1] - featured_pca.shape[1]

10

<h4 class="text-center"> Kernel PCA (Non-Linear Dimensionality Reduction for Linearly Inseparable Data)</h4>

In [31]:
from sklearn.decomposition import KernelPCA

In [32]:
circle = datasets.make_circles

In [33]:
#non-linear dataset
circle_feature, a = circle(n_samples=1000, random_state=42, noise=0.1, factor=0.1)

<span class="badge"> Radial Basis Function (RBF) </span>

In [34]:
rbf_pca = KernelPCA(kernel='rbf', gamma=15, n_components=1)

In [35]:
featured_rbf = rbf_pca.fit_transform(circle_feature)

In [37]:
featured_rbf.shape

(1000, 1)

In [38]:
circle_feature.shape

(1000, 2)

<h4 class="text-center"> LDA (Linear Discriminant Analysis) </h4>

In [39]:
# Dimensionality reduction for classification (LDA is supervised: it uses the class labels)

In [40]:
iris = datasets.load_iris()

In [41]:
iris_features = iris.data
iris_target = iris.target

<span class="badge"> LDA </span>

In [42]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [61]:
##with 1 component

In [43]:
lda = LinearDiscriminantAnalysis(n_components=1)

In [44]:
lda.fit(iris_features,iris_target)

LinearDiscriminantAnalysis(n_components=1)

In [47]:
lda.transform(iris_features).shape

(150, 1)

In [48]:
iris_features.shape

(150, 4)

In [50]:
lda.explained_variance_ratio_ #variance explained by each component # we have one component

array([0.9912126])

In [62]:
# with 2 components

In [56]:
lda_3 = LinearDiscriminantAnalysis(n_components=2)

In [57]:
lda_3.fit(iris_features,iris_target)

LinearDiscriminantAnalysis(n_components=2)

In [59]:
lda_3.transform(iris_features).shape

(150, 2)

In [60]:
lda_3.explained_variance_ratio_

array([0.9912126, 0.0087874])

In [63]:
## n_components = None

In [64]:
lda_none = LinearDiscriminantAnalysis(n_components=None)

In [65]:
lda_none.fit(iris_features,iris_target)

LinearDiscriminantAnalysis()

In [67]:
lda_none.transform(iris_features).shape

(150, 2)

In [68]:
lda_none.explained_variance_ratio_

array([0.9912126, 0.0087874])

<h4 class="text-center"> Non-Negative Matrix Factorization (NMF) </h4>

In [69]:
# Unsupervised Linear dimensionality Reduction

In [70]:
digits = datasets.load_digits()

In [71]:
digit_feature = digits.data

<span class="badge"> NMF </span>

In [72]:
from sklearn.decomposition import NMF

In [73]:
nmf = NMF(n_components=10,random_state=42)

In [75]:
nmf_feature = nmf.fit_transform(digit_feature)



In [76]:
nmf_feature.shape

(1797, 10)

In [77]:
digit_feature.shape

(1797, 64)

<h4 class="text-center"> Truncated Singular Value Decomposition </h4>

In [83]:
digit_feature.shape

(1797, 64)

In [84]:
digit_scaler = scaler.fit_transform(digit_feature)

In [85]:
digit_scaler.shape

(1797, 64)

In [86]:
from scipy.sparse import csr_matrix

In [87]:
feature_sparse = csr_matrix(digit_scaler)

In [88]:
feature_sparse

<1797x64 sparse matrix of type '<class 'numpy.float64'>'
	with 109617 stored elements in Compressed Sparse Row format>

In [89]:
from sklearn.decomposition import TruncatedSVD

In [96]:
svd = TruncatedSVD(n_components=40)

In [97]:
svd.fit(feature_sparse)

TruncatedSVD(n_components=40)

In [98]:
svd.transform(feature_sparse).shape

(1797, 40)

In [99]:
digit_scaler.shape

(1797, 64)

In [100]:
svd.explained_variance_ratio_

array([0.12033916, 0.09561054, 0.08444415, 0.06498408, 0.04860155,
       0.0421412 , 0.03942083, 0.03389381, 0.02998221, 0.02932003,
       0.02781805, 0.02577055, 0.02275303, 0.0222718 , 0.02165229,
       0.01914167, 0.01775547, 0.01638069, 0.0159646 , 0.01489191,
       0.0134797 , 0.01271931, 0.01165837, 0.01057647, 0.00975316,
       0.00944559, 0.00863014, 0.00836643, 0.00797692, 0.00746471,
       0.00725581, 0.00691911, 0.00653907, 0.00640789, 0.00591381,
       0.00571157, 0.00523631, 0.00481781, 0.00453666, 0.00423136])

In [103]:
svd.explained_variance_ratio_[0:40].sum()


0.9507778385931096

In [104]:
# We can automate the choice of n_components: keep adding components until the cumulative explained variance reaches 0.95

In [107]:
# Fit a TruncatedSVD with one fewer component than there are features so the
# explained-variance ratio of every candidate component is available.
# random_state pins the randomized solver so re-runs give the same ratios.
tsvd = TruncatedSVD(n_components=feature_sparse.shape[1]-1, random_state=42)
# Use fit_transform (fit alone returns the estimator itself) so that
# features_tsvd really holds the reduced feature matrix; the sparse matrix is
# used for consistency with the n_components computation above.
features_tsvd = tsvd.fit_transform(feature_sparse)
tsvd_var_ratios = tsvd.explained_variance_ratio_

In [109]:
def select_n_components(var_ratio, goal_var):
    """Return how many leading components are needed to reach a variance goal.

    Walks through ``var_ratio`` in order, accumulating the explained-variance
    ratios, and stops at the first component whose cumulative total is at
    least ``goal_var``.

    Parameters
    ----------
    var_ratio : iterable of float
        Explained-variance ratio of each component, in component order.
    goal_var : float
        Cumulative explained-variance threshold to reach.

    Returns
    -------
    int
        Number of components consumed when the threshold was first met.
        If the threshold is never met, this is the total number of entries
        in ``var_ratio`` (0 for an empty input).
    """
    cumulative = 0.0
    count = 0  # stays 0 when var_ratio is empty
    for count, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            return count
    # Goal never reached: every component was consumed.
    return count

In [110]:
select_n_components(tsvd_var_ratios,0.95)

40

In [111]:
# We can use this automated method to find n_components for other feature-extraction methods as well