In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
%%time
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

Wall time: 26.4 s


In [3]:
# List the fields exposed by the fetched dataset object before unpacking it.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
# Pull the feature matrix and the label vector out of the fetched dataset,
# then display the feature matrix dimensions as the cell output.
X = mnist["data"]
y = mnist["target"]
X.shape

(70000, 784)

In [5]:
# Split the data into training and testing groups with a fixed seed.
# No feature scaling is applied here: the models below are fitted on the
# values exactly as loaded.
# test_size=0.25 spells out the scikit-learn default this cell relied on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Base Random Forest Classifier Modeling

In [6]:
%%time
# Fit the data into Random Forest Classifier model
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train, y_train)

Wall time: 38.4 s


RandomForestClassifier(random_state=42)

In [7]:
# Accuracy of the baseline forest on each split, expressed as a percentage
# and rounded to three decimals.
base_train_accuracy = round(100 * rnd_clf.score(X=X_train, y=y_train), 3)
base_test_accuracy = round(100 * rnd_clf.score(X=X_test, y=y_test), 3)
print(f"Training Data Score: {base_train_accuracy}")
print(f"Testing Data Score: {base_test_accuracy}")

Training Data Score: 100.0
Testing Data Score: 96.737


# Dimension reduction using PCA and Random Forest Classifier modeling

In [8]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction (here 95%) of the variance.
# PCA is imported in the notebook's top import cell.
pca = PCA(n_components=0.95)

# Fit the projection on the training split only, then apply the same
# projection to the test split so the test data does not influence it.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

In [9]:
%%time
# Fit the data into Random Forest Classifier model
rnd_clf2 = RandomForestClassifier(random_state=42)
rnd_clf2.fit(X_train_reduced, y_train)

Wall time: 1min 32s


RandomForestClassifier(random_state=42)

In [10]:
# Score the PCA-based forest under its own names so the baseline metrics
# (base_train_accuracy / base_test_accuracy) are not overwritten and both
# models remain comparable later in the notebook.
# Values are percentages rounded to three decimals.
pca_train_accuracy = round(rnd_clf2.score(X_train_reduced, y_train)*100,3)
pca_test_accuracy = round(rnd_clf2.score(X_test_reduced, y_test)*100,3)
print(f"Training Data Score: {pca_train_accuracy}")
print(f"Testing Data Score: {pca_test_accuracy}")

Training Data Score: 100.0
Testing Data Score: 94.714


In [11]:
# Predict the test split with the PCA-based forest and print the per-class
# precision / recall / F1 summary.
y_pred_rf = rnd_clf2.predict(X=X_test_reduced)
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1722
           1       0.98      0.98      0.98      2007
           2       0.94      0.94      0.94      1799
           3       0.92      0.92      0.92      1788
           4       0.95      0.95      0.95      1690
           5       0.94      0.93      0.94      1622
           6       0.96      0.98      0.97      1669
           7       0.95      0.96      0.96      1798
           8       0.93      0.91      0.92      1708
           9       0.93      0.92      0.93      1697

    accuracy                           0.95     17500
   macro avg       0.95      0.95      0.95     17500
weighted avg       0.95      0.95      0.95     17500



In [12]:
# Spot-check: show the first five true test labels next to the first five
# predictions from the PCA-based forest.
# The true labels are wrapped in list(); the predictions are printed as the
# array slice itself, so the two lines render with different separators.
print(f"Actual Labels: {list(y_test[:5])}")
print(f"Predicted Labels: {y_pred_rf[:5]}")

Actual Labels: ['6', '2', '7', '5', '7']
Predicted Labels: ['6' '2' '7' '5' '7']


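So training the random forest took more than twice as long with fewer dimensions (1min 32s on the PCA-reduced data vs. 38.4 s on the original data), and the test accuracy went down (94.714 vs. 96.737).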