In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA,KernelPCA,NMF,TruncatedSVD
from sklearn import datasets
from scipy.sparse import csr_matrix
from sklearn.datasets import make_circles
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

9.1 : Reducing Features Using Principal Components

In [2]:
# Load the digits data and standardize every feature column before PCA.
digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)

# A fractional n_components asks PCA for as many components as are needed to
# retain that share of the variance (99% here); whiten=True rescales the
# projected components to unit variance.
pca = PCA(whiten=True, n_components=0.99)
features_pca = pca.fit_transform(features)

# Compare the column count before and after the projection.
print("Original number of features:", features.shape[-1])
print("Reduced number of features:", features_pca.shape[-1])

Original number of features: 64
Reduced number of features: 54


9.2 : Reducing Features When Data Is Linearly Inseparable

In [3]:
# Generate a noisy two-circle data set; only the feature matrix is kept.
features, _ = make_circles(
    n_samples=1000,
    noise=0.1,
    factor=0.1,
    random_state=1,
)

# Project onto a single component using an RBF kernel.
kpca = KernelPCA(n_components=1, kernel="rbf", gamma=15)
features_kpca = kpca.fit_transform(features)

print("Original number of features:", features.shape[-1])
print("Reduced number of features:", features_kpca.shape[-1])

Original number of features: 2
Reduced number of features: 1


9.3 : Reducing Features by Maximizing Class Separability

In [4]:
# Iris measurements and their class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# LDA is supervised: it is fitted with the labels, then the same features are
# projected onto the single requested discriminant component.
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit(features, target)
features_lda = lda.transform(features)

print("Original number of features:", features.shape[-1])
print("Reduced number of features:", features_lda.shape[-1])

Original number of features: 4
Reduced number of features: 1


In [5]:
# Display the explained-variance ratio of the component(s) kept by the fitted LDA.
lda.explained_variance_ratio_

array([0.9912126])

Discussion

In [6]:
# Refit without restricting the number of components so the explained-variance
# ratios can be inspected before choosing how many to keep.
lda = LinearDiscriminantAnalysis(n_components=None)

# NOTE: fit() returns the estimator itself, so `features_lda` is bound to the
# fitted model here rather than to a transformed feature matrix.
features_lda = lda.fit(features, target)
lda_var_rations = features_lda.explained_variance_ratio_

def select_n_components(var_ratio, goal_var: float) -> int:
    """Count the leading components needed to reach a cumulative variance goal.

    Args:
        var_ratio: Iterable of per-component explained-variance ratios, in the
            order they should be accumulated.
        goal_var: Cumulative ratio at which accumulation stops.

    Returns:
        The number of ratios consumed up to and including the one whose
        running total first reaches ``goal_var``. If the goal is never
        reached, the total number of ratios; ``0`` for an empty input.
    """
    consumed = 0
    running_total = 0.0
    for consumed, ratio in enumerate(var_ratio, start=1):
        running_total += ratio
        if running_total >= goal_var:
            break
    return consumed

# Number of LDA components needed to explain at least 95% of the variance.
select_n_components(var_ratio=lda_var_rations, goal_var=0.95)

1

9.4 : Reducing Features Using Matrix Factorization

In [7]:
# NMF is run on the raw digits features (no standardization step here).
digits = datasets.load_digits()
features = digits.data

# Factorize into 10 components with a fixed seed.
nmf = NMF(random_state=4, n_components=10)
features_nmf = nmf.fit_transform(features)

print("Original number of features:", features.shape[-1])
print("Reduced number of features:", features_nmf.shape[-1])

Original number of features: 64
Reduced number of features: 10


9.5 : Reducing Features on Sparse Data

In [8]:
# Standardize the digits features and store them in SciPy's CSR sparse format.
digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)
features_sparse = csr_matrix(features)

# TruncatedSVD's default solver is randomized; pinning random_state makes the
# decomposition (and the variance ratios read from `tsvd` in later cells)
# reproducible after Restart & Run All.
tsvd = TruncatedSVD(n_components=10, random_state=0)

# fit_transform decomposes once, instead of fit() followed by a second
# projection pass in transform(); it also matches the other recipes.
features_sparse_tsvd = tsvd.fit_transform(features_sparse)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_sparse_tsvd.shape[1])

Original number of features: 64
Reduced number of features: 10


Discussion

In [9]:
# Combined explained-variance ratio of the first three components.
tsvd.explained_variance_ratio_[:3].sum()

np.float64(0.30039385388038775)

In [10]:
# Refit with one component fewer than the number of features so nearly the
# whole variance spectrum is available. random_state pins the randomized
# solver so the ratios are reproducible across runs.
tsvd = TruncatedSVD(n_components=features_sparse.shape[1] - 1, random_state=0)

# Fit on the sparse matrix that sized n_components (this recipe is about
# sparse input). NOTE: fit() returns the estimator itself, so `features_tsvd`
# refers to the fitted model, not to transformed features.
features_tsvd = tsvd.fit(features_sparse)
tsvd_var_ratios = tsvd.explained_variance_ratio_

def select_n_components(var_ratio, goal_var):
    """Return how many leading ratios are needed to reach ``goal_var``.

    Walks ``var_ratio`` in order, keeping a running sum, and stops at the
    first position where the sum is at least ``goal_var``. When the goal is
    never met the count of all ratios is returned (``0`` if there are none).
    """
    cumulative = 0.0
    taken = 0
    for ratio in var_ratio:
        cumulative, taken = cumulative + ratio, taken + 1
        if cumulative >= goal_var:
            return taken
    # Goal not reached: every ratio was consumed.
    return taken

# Number of TSVD components needed to explain at least 95% of the variance.
select_n_components(var_ratio=tsvd_var_ratios, goal_var=0.95)


40