In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "unsupervised_learning"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# Build a 3D set
np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

In [3]:
X_centered = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X_centered)
c1=V.T[:,0]
c2=V.T[:,1]

In [4]:
# Projecting into the first hiperplane decribed by the first component
# to reduce to the d dimension, we need the first d columns of V.T
W2= V.T[:,:2]
X2D = X_centered.dot(W2) 
W2

array([[ 0.93636116, -0.34027485],
       [ 0.29854881,  0.90119108],
       [ 0.18465208,  0.2684542 ]])

In [5]:
V.T

array([[ 0.93636116, -0.34027485, -0.08626012],
       [ 0.29854881,  0.90119108, -0.31420255],
       [ 0.18465208,  0.2684542 ,  0.94542898]])

In [6]:
# Easier implementation with Scikit-learn

from sklearn.decomposition import PCA

pca= PCA(n_components=2) # 2D probably
X2D = pca.fit_transform(X)

In [7]:
print(pca.explained_variance_ratio_) 
# this is the proportion of the datasets variance that lies along the axis of each principal component
# the 84% of the data lies along the 1st axis and the 14.6 along the second ( ther eis 1.2% "losts")

[0.84248607 0.14631839]


In [8]:
# The minimum number of dimentsions to preserve 95% pg the variance

pca= PCA()
pca.fit(X)
cumsum= np.cumsum(pca.explained_variance_ratio_)
d= np.argmax(cumsum > 0.95) +1
print(d)

2


In [9]:
pca=PCA(n_components=d)
X_reduced= pca.fit_transform(X)

In [10]:
from six.moves import urllib
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(np.int64)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

In [11]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# For my case, minibaches of things can be in the memory for training (otherwise it would be imposible for me to implement this)

from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train,n_batches):
    inc_pca.partial_fit(X_batch) ## REMARK: the attribute is called partial_fit , not fit_transfor

X_mnist_reduced = inc_pca.transform(X_train) # we called fit, now trasnform

alternative to manipulate a large array stored in a binary file on disk as if were entirely in memory

```python
X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m, n))

batch_size = m // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)

```

In [13]:
# randomized PCA
rnd_pca= PCA(n_components=154, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_train)

In [14]:
from sklearn.decomposition import KernelPCA # Kernel was the feature enhencing method, like to elevate the features to squre and all
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)


MemoryError: 

In [None]:
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
y = t > 6.9

In [None]:
# using grid to select the kernel hiperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf= Pipeline(
    [# Reducing the dimensionality to two dimensions
    ("kpca", KernelPCA(n_components=2)),
# applying Logistic Regression for classificatio
    ("log_reg", LogisticRegression())    
    ])
# find best kernel and gamma value for kPCA
param_grid = [
    {
        "kcpa__gamma": np.linspace(0.03,0.05,10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }
]

grid_search= GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X,y)

# get best classification accuracy