# Principal Component Analysis Case Study

## Prepare MNIST datasets

In [1]:
from sklearn.datasets import fetch_openml
# Fetch the 'mnist_784' dataset (version 1) from OpenML and pull out the
# feature matrix and the label vector.
mnist = fetch_openml('mnist_784', version=1)
X = mnist["data"]
y = mnist["target"]

# Positional split: the first 60,000 rows are used for training and
# everything after them is held out for testing.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

## Model 1: Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
import time

In [54]:
# Fit the same random forest several times on the full-dimensional training
# set and report the mean fit time, so one noisy run does not dominate.
n_runs = 5  # single source of truth for both the loop and the average
total_elapsed = 0.0
for _ in range(n_runs):
    # perf_counter() is the high-resolution clock intended for measuring
    # durations; time.time() follows the system clock, which can be adjusted
    # while the benchmark is running.
    start = time.perf_counter()
    rfc = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
    rfc.fit(X_train, y_train)
    end = time.perf_counter()
    total_elapsed += end - start
# `rfc` is left bound to the model from the final iteration.
print(f"Time taken to train the model: {total_elapsed/n_runs:.3f}s")

Time taken 9.331s


In [37]:
from sklearn.metrics import accuracy_score
# Predict on the held-out test set with the current `rfc` and display the
# fraction of correctly classified samples as the cell output.
y_pred_rfc = rfc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_rfc)

0.8295

### Apply PCA to reduce the dimensionality of X_train (to 154)

In [38]:
from sklearn.decomposition import PCA

In [39]:
# Repeat "PCA + random forest fit" several times and report the mean duration.
# Note: the timed region covers both the PCA fit/transform and the forest fit,
# so the printed figure is the combined cost, not the classifier fit alone.
n_runs = 5  # single source of truth for both the loop and the average
total_elapsed = 0.0
for _ in range(n_runs):
    # perf_counter() is the high-resolution clock intended for measuring
    # durations; time.time() follows the system clock, which can be adjusted
    # while the benchmark is running.
    start = time.perf_counter()
    pca = PCA(n_components=0.95) # explained variance = 95%
    X_reduced = pca.fit_transform(X_train)
    rfc = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
    rfc.fit(X_reduced, y_train)
    end = time.perf_counter()
    total_elapsed += end - start
# `pca` and `rfc` are left bound to the objects from the final iteration.
print(f"Time taken to train the model: {total_elapsed/n_runs:.3f}s")


Time taken 27.689531135559083


In [40]:
# Project the test set with the already-fitted `pca` (transform only, no
# refit), predict with the forest trained on the reduced features, and display
# the resulting accuracy as the cell output.
X_test_reduced = pca.transform(X_test)
y_pred_rfc_reduced = rfc.predict(X_test_reduced)
accuracy_score(y_true=y_test, y_pred=y_pred_rfc_reduced)

0.8163

## Model 2: Softmax Regression

In [56]:
from sklearn.linear_model import LogisticRegression
from warnings import simplefilter
# Install a global "ignore" filter so no warnings are shown from this point on.
# NOTE(review): this suppresses every warning category, which would also hide
# any solver convergence or deprecation warnings raised by the fits below —
# confirm that this is intended.
simplefilter(action='ignore')

In [57]:
# Fit multinomial (softmax) logistic regression several times on the
# full-dimensional training set and report the mean fit time.
n_runs = 5  # single source of truth for both the loop and the average
total_elapsed = 0.0
for _ in range(n_runs):
    # perf_counter() is the high-resolution clock intended for measuring
    # durations; time.time() follows the system clock, which can be adjusted
    # while the benchmark is running.
    start = time.perf_counter()
    # NOTE(review): `multi_class` appears to be deprecated in recent
    # scikit-learn releases, where multinomial is the default behaviour for
    # lbfgs on multiclass targets — verify against the installed version
    # before removing it.
    softmax_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)
    softmax_clf.fit(X_train, y_train)
    end = time.perf_counter()
    total_elapsed += end - start
# `softmax_clf` is left bound to the model from the final iteration.
print(f"Time taken to train the model: {total_elapsed/n_runs:.3f}s")

Time taken 15.733s


In [58]:
# Predict on the held-out test set with the current `softmax_clf` and display
# the fraction of correctly classified samples as the cell output.
y_pred = softmax_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9255

### Apply PCA to reduce the dimensionality of X_train (to 154)

In [59]:
# Repeat "PCA + softmax regression fit" several times and report the mean
# duration. Note: the timed region covers both the PCA fit/transform and the
# classifier fit, so the printed figure is the combined cost.
n_runs = 5  # single source of truth for both the loop and the average
total_elapsed = 0.0
for _ in range(n_runs):
    # perf_counter() is the high-resolution clock intended for measuring
    # durations; time.time() follows the system clock, which can be adjusted
    # while the benchmark is running.
    start = time.perf_counter()
    # Build the PCA here instead of reusing the `pca` object left over from the
    # random forest section, so this cell does not depend on that cell having
    # been executed first. Same configuration as there.
    pca = PCA(n_components=0.95) # explained variance = 95%
    X_reduced = pca.fit_transform(X_train)
    softmax_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", random_state=42)
    softmax_clf.fit(X_reduced, y_train)
    end = time.perf_counter()
    total_elapsed += end - start
# `pca` and `softmax_clf` are left bound to the objects from the final iteration.
print(f"Time taken to train the model: {total_elapsed/n_runs:.3f}s")

Time taken to train the model:11.642s


In [60]:
# Project the test set with the already-fitted `pca` (transform only, no
# refit), predict with the softmax model trained on the reduced features, and
# display the resulting accuracy as the cell output.
X_test_reduced = pca.transform(X_test)
y_pred = softmax_clf.predict(X_test_reduced)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9201

## Conclusion

- Training the random forest classifier took longer after applying PCA to X_train, whereas training softmax regression on the reduced X_train was faster. (Both PCA timings include the PCA step itself.)

- Accuracy dropped slightly for both models after applying PCA (0.8295 → 0.8163 for the random forest, 0.9255 → 0.9201 for softmax regression).

- Performing PCA doesn't necessarily reduce the time taken to train a model. It depends on the dataset and on the model being trained.