# Problem 1
Load mnist digits data set. Estimate base line prediction accuracy with SDGClassifier (20 iteractions), RandomForest(max_depth=3) and RandomForest(max_depth=15). Train model on training data and predict accuracy using testing data. Record the amount of time needed to estimate each. 

In [2]:
import numpy as np
import os
# to make this notebook's output stable across runs
np.random.seed(42)
from six.moves import urllib
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
from sklearn.model_selection import train_test_split
X = mnist["data"]
y = mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import time

t0 = time.time()

sgd_clf = SGDClassifier(max_iter=20,random_state=42)
sgd_clf.fit(X_train,y_train)
y_pred_0= sgd_clf.predict(X_test)

t1 = time.time()
print("sdg_clf takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_0)

sdg_clf takes 12.00s


0.8492571428571428

In [8]:
from sklearn.ensemble import RandomForestClassifier

t0 = time.time()

rnd_clf = RandomForestClassifier(max_depth=3, random_state=42)
rnd_clf.fit(X_train,y_train)
y_pred_1 = rnd_clf.predict(X_test)

t1 = time.time()
print("rnd_clf takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_1)

rnd_clf takes 0.98s


0.6704

In [9]:
t0 = time.time()

rnd_clf_1 = RandomForestClassifier(max_depth=15, random_state=42)
rnd_clf_1.fit(X_train,y_train)
y_pred_2 = rnd_clf_1.predict(X_test)

t1 = time.time()
print("rnd_clf_1 takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_2)

rnd_clf_1 takes 2.81s


0.9437142857142857

# Problem 2
Apply PCA to extract principle components responsible for 90% of variance. Apply the algorithms above to the components. Report new accuracy score. Make sure to apply PCA to the data before the split into training and testing. Record time of the PCA procedure and record separately time and accuracy of each estimation and report changes relative to Problem 1. 

In [10]:
from sklearn.decomposition import PCA
 
pca = PCA(n_components=0.9)
X_reduced = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y)

In [11]:
t0 = time.time()

sgd_clf = SGDClassifier(max_iter=20,random_state=42)
sgd_clf.fit(X_train,y_train)
y_pred_0= sgd_clf.predict(X_test)

t1 = time.time()
print("sdg_clf takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_0)

sdg_clf takes 2.80s


0.7621714285714286

In [12]:
rnd_clf = RandomForestClassifier(max_depth=3, random_state=42)
rnd_clf.fit(X_train,y_train)
y_pred_1 = rnd_clf.predict(X_test)

t1 = time.time()
print("rnd_clf takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_1)

rnd_clf takes 8.79s


0.6161142857142857

In [13]:
rnd_clf_1 = RandomForestClassifier(max_depth=15, random_state=42)
rnd_clf_1.fit(X_train,y_train)
y_pred_2 = rnd_clf_1.predict(X_test)

t1 = time.time()
print("rnd_clf_1 takes {:.2f}s".format(t1 - t0))

accuracy_score(y_test, y_pred_2)

rnd_clf_1 takes 16.06s


0.9058857142857143

# Problem 3

Load the same data. Extract 1000 observations, use the code below. Try five different PCA alogrithms that would extract 100 principle components. Use the following PCA algorithms: PCA,  Kernel PCA(Linear), Kernel PCA(Sigmoid(gamma=0.001), LLE (10 neighbors), Isomap. Then estimate logistic regression on the training data and test the accuracy using testing data. 
* What are the accuracy score on testing data you find with each PCA algorithm? 
* Which PCA algorithm has the highest prediction accuracy?
* What is the accuracy of the logistic regression applied to the 1000 obs without applying PCA? 

In [14]:
np.random.seed(42)
smp = np.random.randint(50000, size=1000)
X_s = mnist["data"][smp,:]
y_s = mnist["target"][smp]

In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)

X_reduced = pca.fit_transform(X_s)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_reduced, y_s)

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.816

In [16]:
from sklearn.decomposition import KernelPCA

lin_pca = KernelPCA(n_components = 100, kernel="linear", fit_inverse_transform=True)
X_reduced = lin_pca.fit_transform(X_s)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_reduced, y_s)

log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.816

In [17]:
sig_pca = KernelPCA(n_components = 100, kernel="sigmoid", gamma=0.001, coef0=1, fit_inverse_transform=True)
X_reduced = sig_pca.fit_transform(X_s)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_reduced, y_s)

log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.148

In [18]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=100, n_neighbors=10, random_state=42)
X_reduced = lle.fit_transform(X_s)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_reduced, y_s)

log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.828

In [19]:
from sklearn.manifold import Isomap

isomap = Isomap(n_components=100)
X_reduced = isomap.fit_transform(X_s)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_reduced, y_s)

log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.888

In [28]:
# Randomly sample 1000 obs, otherwise it will get really slow.
np.random.seed(42)
smp = np.random.randint(50000, size=1000)
X_s = mnist["data"][smp,:]
y_s = mnist["target"][smp]
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_s, y_s)

In [29]:
pca = PCA(n_components=100)
X_reduced_train = pca.fit_transform(X_train_s)
X_reduced_test = pca.fit_transform(X_test_s)

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_reduced_train, y_train_s)
log_reg.score(X_reduced_test,y_test_s)

0.208

In [30]:
from sklearn.decomposition import KernelPCA

lin_pca = KernelPCA(n_components = 100, kernel="linear", fit_inverse_transform=True)
X_reduced_train = lin_pca.fit_transform(X_train_s)
X_reduced_test = lin_pca.fit_transform(X_test_s)

log_reg.fit(X_reduced_train, y_train_s)
log_reg.score(X_reduced_test,y_test_s)

0.088

In [32]:
sig_pca = KernelPCA(n_components = 100, kernel="sigmoid", gamma=0.001, coef0=1, fit_inverse_transform=True)
X_reduced_train = sig_pca.fit_transform(X_train_s)
X_reduced_test = sig_pca.fit_transform(X_test_s)

log_reg.fit(X_reduced_train, y_train_s)
log_reg.score(X_reduced_test,y_test_s)

0.132

In [33]:
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=100, n_neighbors=10, random_state=42)
X_reduced_train = lle.fit_transform(X_train_s)
X_reduced_test = lle.fit_transform(X_test_s)

log_reg.fit(X_reduced_train, y_train_s)
log_reg.score(X_reduced_test,y_test_s)

0.384

In [34]:
from sklearn.manifold import Isomap

isomap = Isomap(n_components=100)
X_reduced_train = isomap.fit_transform(X_train_s)
X_reduced_test = isomap.fit_transform(X_test_s)

log_reg.fit(X_reduced_train, y_train_s)
log_reg.score(X_reduced_test,y_test_s)

0.252

In [35]:
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_s, y_train_s)
log_reg.score(X_test_s,y_test_s)

0.872