# Dimensionality reduction

## Generate data

In [1]:
import numpy as np
from numpy.random import normal
from collections import namedtuple

# Container pairing a feature matrix ``X`` with its class-label vector ``y``.
Data = namedtuple('Data', 'X y')

# Seed NumPy's global generator so the sampled data is reproducible.
np.random.seed(37)


def get_data(N=1000):
    """Sample a synthetic two-class data set with six features per row.

    For each class, ``x1`` is drawn around a linear function of ``x0``,
    ``x2`` around a linear function of ``x1``, and ``x4`` around a linear
    function of ``x3``; ``x5`` is drawn independently of the other features.
    The two classes differ in the means, scales and coefficients used.

    Parameters
    ----------
    N : int, default 1000
        Number of rows to draw for *each* class, so the result has ``2 * N``
        rows.

    Returns
    -------
    Data
        ``X`` has shape ``(2 * N, 6)`` with the class-0 rows first, and ``y``
        has shape ``(2 * N,)`` holding ``0`` for the first ``N`` rows and
        ``1`` for the remaining ``N``.
    """
    # NOTE: every call passes the scale explicitly. ``normal(loc, N)`` would
    # treat N as the standard deviation (the second positional argument is
    # ``scale``, not ``size``), drowning the linear signal in noise.

    # Class 0.
    x0 = normal(1, 1, N)
    x1 = normal(3 + 2 * x0, 1, N)
    x2 = normal(2.5 + 0.8 * x1, 1, N)
    x3 = normal(5, 2, N)
    x4 = normal(0.8 + 0.8 * x3, 1, N)
    x5 = normal(10, 5, N)
    X0 = np.column_stack([x0, x1, x2, x3, x4, x5])

    # Class 1.
    x0 = normal(1.5, 1, N)
    x1 = normal(7.3 + 2 * x0, 1, N)
    x2 = normal(3.6 + 0.9 * x1, 1, N)
    x3 = normal(3.3, 2.5, N)
    x4 = normal(0.9 - 0.4 * x3, 1, N)
    x5 = normal(10, 5, N)
    X1 = np.column_stack([x0, x1, x2, x3, x4, x5])

    X = np.vstack([X0, X1])
    # One integer label per row: 0 for the class-0 rows, 1 for the class-1 rows.
    y = np.repeat([0, 1], [X0.shape[0], X1.shape[0]])

    return Data(X, y)


# training data
T = get_data()

## Types of dimensionality reduction

### Principal Component Analysis (PCA)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Two-stage pipeline: standardize the features, then project them onto
# three principal components.
scaler = StandardScaler()
pca = PCA(random_state=37, n_components=3)

pipeline = Pipeline(steps=[('scaler', scaler), ('pca', pca)])
pipeline.fit(T.X)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=3,
                     random_state=37, svd_solver='auto', tol=0.0,
                     whiten=False))],
         verbose=False)

### Kernel PCA

In [3]:
from sklearn.decomposition import KernelPCA

# Kernel PCA with a linear kernel, keeping three components; it is fitted
# directly on the unscaled feature matrix.
kpca = KernelPCA(kernel='linear', n_components=3, random_state=37)
kpca.fit(T.X)

KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
          fit_inverse_transform=False, gamma=None, kernel='linear',
          kernel_params=None, max_iter=None, n_components=3, n_jobs=None,
          random_state=37, remove_zero_eig=False, tol=0)

### Singular Value Decomposition (SVD)

In [4]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD keeping three components, fitted on the unscaled feature matrix.
svd = TruncatedSVD(random_state=37, n_components=3)
svd.fit(T.X)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=5, random_state=37,
             tol=0.0)

### Factor analysis

In [5]:
from sklearn.decomposition import FactorAnalysis

# Factor analysis with three latent factors, fitted on the unscaled feature matrix.
fa = FactorAnalysis(random_state=37, n_components=3)
fa.fit(T.X)

FactorAnalysis(copy=True, iterated_power=3, max_iter=1000, n_components=3,
               noise_variance_init=None, random_state=37,
               svd_method='randomized', tol=0.01)

### Non-Negative Matrix Factorization (NMF)

In [6]:
from sklearn.decomposition import NMF

# NMF only accepts non-negative input, so the model is fitted on the
# element-wise absolute values of the features.
nmf = NMF(random_state=37, n_components=3)
nmf.fit(np.abs(T.X))

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=37, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)