In [1]:
import numpy as np
import time

from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [2]:
# Load MNIST and split it into the conventional 60,000 / 10,000 train / test sets.
#
# `fetch_mldata` was deprecated in scikit-learn 0.20 and removed in 0.22 (the
# mldata.org service it relied on is offline), so prefer the OpenML loader and
# only fall back to the legacy loader on installations that predate it.
# NOTE(review): on scikit-learn >= 0.22 the top-of-notebook import of
# `fetch_mldata` fails as well and has to be dropped for the notebook to run.
try:
    from sklearn.datasets import fetch_openml
except ImportError:
    mnist = fetch_mldata( "MNIST original" )
else:
    mnist = fetch_openml( "mnist_784", version = 1 )

# Depending on the scikit-learn version the loader may hand back pandas objects,
# and OpenML stores the digit labels as strings; coerce both to plain ndarrays
# and cast the labels to float, which is the dtype the legacy loader produced.
mnistX = np.asarray( mnist.data )
mnistY = np.asarray( mnist.target ).astype( float )

# Number of leading samples that form the training set; the rest is the test set.
TRAIN_SIZE = 60000

trainX, testX = mnistX[:TRAIN_SIZE], mnistX[TRAIN_SIZE:]
trainY, testY = mnistY[:TRAIN_SIZE], mnistY[TRAIN_SIZE:]

In [3]:
# Baseline: fit a random forest on the unreduced training features and time it.
# A fixed random_state makes the fitted forest (and the accuracy reported later)
# reproducible across Restart & Run All.
forest = RandomForestClassifier( random_state = 42 )

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-fit.
t1 = time.perf_counter()
forest.fit( trainX, trainY )
t2 = time.perf_counter()

# Training time of the baseline forest, in seconds.
original = t2 - t1

print(f"Random forest trained in {original : 4.1f} s.")

Random forest trained in  3.5 s.


In [4]:
# Fit PCA on the training set and project it. A float n_components in (0, 1)
# asks PCA to keep as many components as needed to explain that fraction of
# the variance (here 95%).
pca = PCA( n_components = 0.95 )

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
t3 = time.perf_counter()
trainRedX = pca.fit_transform( trainX )
t4 = time.perf_counter()

# Time spent fitting PCA and transforming the training set, in seconds.
pcaTime = t4 - t3

print(f"PCA completed in {pcaTime : 4.1f} s.")

PCA completed in  16.8 s.


In [5]:
# Fit a second random forest on the PCA-reduced training features and time it,
# for comparison against the forest trained on the unreduced features.
# A fixed random_state makes the fitted forest (and the accuracy reported later)
# reproducible across Restart & Run All.
forestPCA = RandomForestClassifier( random_state = 42 )

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-fit.
t5 = time.perf_counter()
forestPCA.fit( trainRedX, trainY )
t6 = time.perf_counter()

# Training time of the PCA-reduced forest, in seconds.
reduced = t6 - t5

print(f"PCA reduced random forest trained in {reduced : 4.1f} s.")

PCA reduced random forest trained in  8.2 s.


In [6]:
# Score both forests on the held-out test set. The test features must go
# through the PCA that was fitted on the training data (transform only, no
# refit) before the reduced forest can consume them.
testRedX = pca.transform( testX )

pred = forest.predict( testX )
predPCA = forestPCA.predict( testRedX )

# Fraction of test digits each model labels correctly.
acc, accPCA = (
    accuracy_score( testY, guesses )
    for guesses in ( pred, predPCA )
)

print(f"Original accuracy: {acc}\nPCA reduced accuracy: {accPCA}")

Original accuracy: 0.9481
PCA reduced accuracy: 0.8843
