In [1]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
import numpy as np

In [4]:
# Fetch MNIST as plain NumPy arrays (as_frame=False) and split it by position:
# the first 60,000 rows form the training set, the remainder the test set.
mnist = fetch_openml('mnist_784', as_frame=False)
X_train, X_test = mnist.data[:60_000], mnist.data[60_000:]
y_train, y_test = mnist.target[:60_000], mnist.target[60_000:]

In [3]:
# Fit a full PCA (all components kept) so the explained-variance ratio of every
# component is available.
pca = PCA()
pca.fit(X_train)
# Running total of explained variance as components are added one by one.
cumsum = np.cumsum(pca.explained_variance_ratio_)
# np.argmax on a boolean array returns the zero-based index of the first True,
# i.e. the first component at which 95% of the variance is reached. Add 1 to
# turn that index into the number of dimensions to keep.
d = np.argmax(cumsum >= 0.95) + 1

In [4]:
# A float n_components between 0 and 1 tells PCA to keep as many components as
# are needed to preserve that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
# Fit on the training set and project it in a single step.
X_reduced = pca.fit_transform(X_train)

In [5]:
# Number of principal components the fitted PCA actually retained.
pca.n_components_

np.int64(154)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

In [7]:
# Chain dimensionality reduction and classification so both can be tuned as one
# estimator. make_pipeline names the steps after the lowercased class names
# ("pca", "randomforestclassifier"), which the search parameters rely on.
clf = make_pipeline(
    PCA(random_state=42),
    RandomForestClassifier(random_state=42),
)

In [8]:
# Search space for the pipeline: keys follow the "<step>__<parameter>" naming
# convention, values are the candidate integers to sample from.
param_dist = dict(
    pca__n_components=np.arange(10, 80),
    randomforestclassifier__n_estimators=np.arange(50, 500),
)

In [9]:
# Sample 10 random parameter combinations, each scored with 3-fold
# cross-validation; the seed fixes which combinations are drawn.
rnd_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
)
# Only the first 10,000 training instances are used for the search.
rnd_search.fit(X_train[:10_000], y_train[:10_000])

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'pca__n_components': array([10, 11... 78, 79]), 'randomforestclassifier__n_estimators': array([ 50, ...97, 498, 499])}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_components,np.int64(62)
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,n_estimators,np.int64(304)
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
# Best hyperparameter combination found by the randomized search.
rnd_search.best_params_

{'randomforestclassifier__n_estimators': np.int64(304),
 'pca__n_components': np.int64(62)}

In [11]:
# Project the reduced representation back into the original feature space.
X_recovered = pca.inverse_transform(X_reduced)

In [12]:
# Same target dimensionality (154), but computed with the randomized SVD
# solver; the seed makes the stochastic solver reproducible.
rnd_pca = PCA(
    n_components=154,
    svd_solver="randomized",
    random_state=42,
)
X_reduced = rnd_pca.fit_transform(X_train)

In [3]:
from sklearn.decomposition import IncrementalPCA

In [None]:
# Train PCA incrementally: cut the training set into n_batches chunks and feed
# them to the model one at a time via partial_fit.
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
batches = np.array_split(X_train, n_batches)
for X_batch in batches:
    inc_pca.partial_fit(X_batch)

In [None]:
# Project the training set onto the components learned incrementally.
X_reduced = inc_pca.transform(X_train)

In [None]:
# Persist the training set to a memory-mapped binary file so it can later be
# read back piece by piece instead of being held in RAM.
filename = "my_mnist.mmap"
X_mmap = np.memmap(
    filename,
    dtype="float32",
    mode='write',
    shape=X_train.shape,
)
# One bulk assignment here; the data could equally be written chunk by chunk.
X_mmap[:] = X_train
# Push any buffered changes out to the file on disk.
X_mmap.flush()

In [None]:
# Re-open the file read-only as a memory-mapped array. The raw file carries no
# shape information, so restore the 784-feature row layout with reshape.
X_mmap = np.memmap(filename, dtype="float32", mode="readonly").reshape(-1, 784)
# Pick the batch size so the data is consumed in n_batches chunks.
batch_size = X_mmap.shape[0] // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
# Fit on the memory-mapped array (not the in-RAM X_train): the data is then
# read from disk in batch_size chunks rather than held in RAM all at once.
inc_pca.fit(X_mmap)

<h1>Problem 9</h1>

In [11]:
from time import time

In [12]:
# Rebuild the train/test split from the raw MNIST arrays:
# first 60,000 rows for training, the rest for testing.
X_train, y_train = mnist.data[:60_000], mnist.target[:60_000]
X_test, y_test = mnist.data[60_000:], mnist.target[60_000:]

In [13]:
# Random forest with default hyperparameters; the seed fixes its randomness.
rf = RandomForestClassifier(random_state=42)

In [16]:
# Measure the wall-clock time taken to train the forest on the full-dimensional
# training set.
start = time()
rf.fit(X_train, y_train)
end = time()
# Print the unit with the value so this timing reads the same way as the
# timing of the PCA-reduced run.
print(f"Total training time: {round(end-start, 2)} seconds")

Total training time: 21.632941246032715


In [17]:
# Mean accuracy of the forest on the held-out test set.
rf.score(X_test, y_test)

0.9705

In [21]:
# Keep as many components as needed to preserve 95% of the variance.
pca = PCA(n_components=0.95)

# Learn the projection from the training set only, then apply that same
# projection to the test set.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [26]:
# Number of principal components the fitted PCA actually retained.
pca.n_components_

np.int64(154)

In [22]:
# Fresh, unfitted forest with the same seed, to be trained on the reduced data.
rf = RandomForestClassifier(random_state=42)

In [24]:
# Measure the wall-clock time taken to train the forest on the PCA-reduced
# training set.
start = time()
rf.fit(X_train_reduced, y_train)
end = time()
elapsed = round(end - start, 2)
print(f"Total training time: {elapsed} seconds")

Total training time: 70.75 seconds


In [27]:
# Mean accuracy of the forest on the PCA-reduced test set.
rf.score(X_test_reduced, y_test)

0.9488

In [28]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent; only the seed is
# set, every other hyperparameter keeps its default.
sgd = SGDClassifier(random_state=42)

In [32]:
# %time reports the CPU and wall-clock time of this single fit on the reduced data.
%time sgd.fit(X_train_reduced, y_train)

CPU times: user 16.4 s, sys: 1.06 ms, total: 16.4 s
Wall time: 16.5 s


0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [30]:
# Mean accuracy of the SGD classifier on the PCA-reduced test set.
sgd.score(X_test_reduced, y_test)

0.8959

<h1>Problem 10</h1>

In [33]:
from sklearn.manifold import TSNE

In [34]:
# Work on the first 5,000 MNIST instances (features and labels) only.
X_sample = mnist.data[:5_000]
y_sample = mnist.target[:5_000]