# 5.6.2 Nystroem

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC

import warnings
warnings.filterwarnings("ignore")

Video: https://www.youtube.com/watch?v=Ys5B-t5KiGo

Se sabe que un kernel es una función que computa:

$$ k(x, x_i) = \langle \phi(x), \phi(x_i) \rangle $$

donde $\langle \cdot , \cdot \rangle$ representa el producto interno entre vectores en el espacio de características inducido por $\phi$.

Para datasets de gran tamaño es preferible reemplazar el cálculo exacto de un kernel por un método de aproximación.

Nystroem aproxima un kernel mediante el muestreo de un subconjunto de los datos, con el fin de no generar la matriz de kernel completa de $n \times n$, donde $n$ es la cantidad de ejemplos.

Los métodos de aproximación permiten la transformación no lineal de las variables de entrada; las transformaciones obtenidas pueden servir de base para modelos lineales y otros algoritmos, por ejemplo, la combinación con un estimador SGDClassifier.


In [None]:
# Load the digits dataset, keeping only 9 of the 10 digit classes, directly as
# a (features, labels) pair.
X, y = load_digits(
    n_class=9,
    return_X_y=True,
)

# Rescale the features. load_digits documents pixel values as integers in
# 0..16, so dividing by 16.0 maps them to [0, 1].
data = X / 16.0

# Linear classifier trained on top of the approximated kernel features.
# random_state is fixed so that the score shown below is reproducible on a
# fresh kernel run (it is used when the dual solver shuffles the data and is
# harmlessly ignored otherwise). The seed matches the one used by Nystroem.
linearSVC = LinearSVC(random_state=1)

nystroem = Nystroem(
    # -------------------------------------------------------------------------
    # Kernel map to be approximated.
    kernel="rbf",
    # -------------------------------------------------------------------------
    # Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and
    # sigmoid kernels.
    gamma=0.2,
    # -------------------------------------------------------------------------
    # Zero coefficient for polynomial and sigmoid kernels. Ignored by other
    # kernels
    coef0=None,
    # -------------------------------------------------------------------------
    # Degree of the polynomial kernel. Ignored by other kernels.
    degree=None,
    # -------------------------------------------------------------------------
    # Number of features to construct. How many data points will be used to
    # construct the mapping.
    n_components=300,
    # -------------------------------------------------------------------------
    # Pseudo-random number generator to control the uniform sampling without
    # replacement of n_components of the training data to construct the basis
    # kernel.
    random_state=1,
)

# Fit the approximate feature map on the scaled data and project the data into
# the new feature space in a single step.
data_transformed = nystroem.fit_transform(data)

# Train the linear model on the transformed features.
linearSVC.fit(data_transformed, y)

# Displayed cell output: accuracy on the same data used for fitting. This is a
# training score, not an estimate of generalization performance.
linearSVC.score(data_transformed, y)

In [None]:
#
# Dimensions of the scaled dataset (before the Nystroem transformation)
#
data.shape

In [None]:
#
# Dimensions of the dataset after the Nystroem transformation
#
data_transformed.shape

In [None]:
#
# Fitted attribute: subset of training points used to construct the feature
# map.
#
nystroem.components_

In [None]:
#
# Shape of the array holding the indices of the sampled training points.
#
nystroem.component_indices_.shape

In [None]:
#
# Sampled patterns in the original data space: rows of `data` selected by the
# fitted component indices.
#
data[nystroem.component_indices_, :]

In [None]:
#
# Shape of the array of sampled indices.
# NOTE(review): this cell is identical to an earlier one; possibly it was meant
# to show the shape of the rows selected from `data` instead -- confirm.
#
nystroem.component_indices_.shape

In [None]:
#
# Fitted attribute: normalization matrix needed for the embedding.
#
nystroem.normalization_

In [None]:
#
# Shape of the normalization matrix.
#
nystroem.normalization_.shape

In [None]:
# Prints a fixed marker string; presumably used to signal that the notebook
# executed through to the last cell.
print('ok_')

ok_
