In [1]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
%%time
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

Wall time: 24.7 s


In [3]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [5]:
# Split data into training and testing groups and scale data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
%%time
# Fit the data into Random Forest Classifier model
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train, y_train)

Wall time: 33.7 s


RandomForestClassifier(random_state=42)

In [7]:
y_pred_rf = rnd_clf.predict(X_test)

In [8]:
base_train_accuracy = round(rnd_clf.score(X_train, y_train)*100,3)
base_test_accuracy = round(rnd_clf.score(X_test, y_test)*100,3)
print(f"Training Data Score: {base_train_accuracy}")
print(f"Testing Data Score: {base_test_accuracy}")

Training Data Score: 100.0
Testing Data Score: 96.737


In [9]:
sorted(zip(rnd_clf.feature_importances_, mnist["feature_names"]), reverse=True)

[(0.008544691445561848, 'pixel434'),
 (0.008509174378376233, 'pixel379'),
 (0.008456366350991115, 'pixel351'),
 (0.008038884943546027, 'pixel407'),
 (0.007464755741269806, 'pixel490'),
 (0.0073969790625365815, 'pixel430'),
 (0.007288557948650137, 'pixel212'),
 (0.007254755178415, 'pixel156'),
 (0.007026599344449003, 'pixel376'),
 (0.006857188619130447, 'pixel489'),
 (0.0068098637032576155, 'pixel461'),
 (0.006769629678781196, 'pixel438'),
 (0.006718753788940182, 'pixel406'),
 (0.0067046553567919965, 'pixel410'),
 (0.006679792887030543, 'pixel515'),
 (0.006591332041841385, 'pixel347'),
 (0.006496196928027661, 'pixel543'),
 (0.006458869155473496, 'pixel462'),
 (0.006133869615630262, 'pixel211'),
 (0.006035880003695437, 'pixel378'),
 (0.00599464397866041, 'pixel463'),
 (0.005868419548010975, 'pixel382'),
 (0.005828748940253234, 'pixel291'),
 (0.005712000303168889, 'pixel597'),
 (0.005677744513981774, 'pixel319'),
 (0.005578604503599027, 'pixel405'),
 (0.00549654021875566, 'pixel544'),
 (0

In [10]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1722
           1       0.99      0.98      0.98      2007
           2       0.97      0.97      0.97      1799
           3       0.97      0.94      0.95      1788
           4       0.97      0.97      0.97      1690
           5       0.97      0.97      0.97      1622
           6       0.97      0.98      0.98      1669
           7       0.97      0.97      0.97      1798
           8       0.95      0.95      0.95      1708
           9       0.94      0.95      0.95      1697

    accuracy                           0.97     17500
   macro avg       0.97      0.97      0.97     17500
weighted avg       0.97      0.97      0.97     17500



In [11]:
print(f"Actual Labels: {list(y_test[:5])}")
print(f"Predicted Labels: {y_pred_rf[:5]}")

Actual Labels: ['6', '2', '7', '5', '7']
Predicted Labels: ['6' '2' '7' '5' '7']


# Bonus - Hyperparameter Tuning

In [12]:
# Get randomforest params
rnd_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [13]:
# Create the GridSearchCV model
param_grid = {
    'n_estimators': [200, 600, 1000]
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'criterion': ['gini', 'entropy']
}
grid = GridSearchCV(rnd_clf, param_grid, cv=3, verbose=3)

In [14]:
%%time
grid.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3] END ...............................n_estimators=200; total time=  40.6s
[CV 2/3] END ...............................n_estimators=200; total time=  41.7s
[CV 3/3] END ...............................n_estimators=200; total time=  41.7s
[CV 1/3] END ...............................n_estimators=600; total time= 2.1min
[CV 2/3] END ...............................n_estimators=600; total time= 2.0min
[CV 3/3] END ...............................n_estimators=600; total time= 2.0min
[CV 1/3] END ..............................n_estimators=1000; total time= 3.4min
[CV 2/3] END ..............................n_estimators=1000; total time= 3.8min
[CV 3/3] END ..............................n_estimators=1000; total time= 4.7min
Wall time: 24min 1s


GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
             param_grid={'n_estimators': [200, 600, 1000]}, verbose=3)

In [15]:
grid.best_params_

{'n_estimators': 600}

In [16]:
grid_predictions = grid.predict(X_test)
tuned_train_accuracy = round(grid.score(X_train, y_train)*100,3)
tuned_test_accuracy = round(grid.score(X_test, y_test)*100,3)
print(f"Training Data Score: {tuned_train_accuracy}")
print(f"Testing Data Score: {tuned_test_accuracy}")

Training Data Score: 100.0
Testing Data Score: 97.011


In [17]:
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1722
           1       0.99      0.98      0.98      2007
           2       0.97      0.98      0.97      1799
           3       0.97      0.95      0.96      1788
           4       0.97      0.97      0.97      1690
           5       0.98      0.97      0.97      1622
           6       0.98      0.98      0.98      1669
           7       0.97      0.97      0.97      1798
           8       0.95      0.96      0.95      1708
           9       0.94      0.96      0.95      1697

    accuracy                           0.97     17500
   macro avg       0.97      0.97      0.97     17500
weighted avg       0.97      0.97      0.97     17500

