In [31]:
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [32]:
# Load the MNIST training CSV (path is relative to the working directory).
train_df = pd.read_csv(filepath_or_buffer='./mnist_train.csv')

In [33]:
# Load the held-out MNIST test CSV (path is relative to the working directory).
test_df = pd.read_csv(filepath_or_buffer='./mnist_test.csv')

In [34]:
# Peek at the first five training rows to check the column layout.
train_df.head(n=5)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Peek at the first five test rows to confirm they share the training layout.
test_df.head(n=5)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Distinct class labels present in the training data, in ascending order.
sorted(pd.unique(train_df['label']))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [37]:
# Distinct class labels present in the test data, in ascending order.
sorted(pd.unique(test_df['label']))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [38]:
# Build the training features by dropping non-pixel columns BY NAME rather than
# slicing positionally with iloc[:, 1:]. The positional slice is only correct
# when 'label' is the very first column. The head() preview lists an
# 'Unnamed: 0' column ahead of 'label'; if that is a real CSV column (and not
# just the rendered index), iloc[:, 1:] would keep 'label' as a feature and
# leak the target into the model.
# errors='ignore' makes the line work whether or not 'Unnamed: 0' exists; a
# missing 'label' column still fails loudly on the y_train lookup below.
X_train = train_df.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y_train = train_df['label']

In [39]:
# Build the test features by dropping non-pixel columns BY NAME rather than
# slicing positionally with iloc[:, 1:]. The positional slice is only correct
# when 'label' is the very first column. The head() preview lists an
# 'Unnamed: 0' column ahead of 'label'; if that is a real CSV column (and not
# just the rendered index), iloc[:, 1:] would keep 'label' as a feature.
# errors='ignore' makes the line work whether or not 'Unnamed: 0' exists; a
# missing 'label' column still fails loudly on the y_test lookup below.
X_test = test_df.drop(columns=['label', 'Unnamed: 0'], errors='ignore')
y_test = test_df['label']

In [40]:
# 5-fold splitter reused by the cross-validation cells below.
# shuffle=False is the default: folds are contiguous row ranges, so every
# model is scored on exactly the same splits.
kfold = KFold(shuffle=False, n_splits=5)

## Random Forest One

This random forest uses scikit-learn's default values: 100 trees (estimators), $\sqrt{n}$ candidate features considered at each split, and no limit on the depth of a tree.

In [11]:
# Baseline forest with library defaults. Bootstrap sampling and per-split
# feature sub-sampling are random, so pin the seed to make the reported
# accuracy reproducible on a fresh Restart & Run All.
rfc1 = RandomForestClassifier(random_state=42)

In [12]:
# 5-fold cross-validation of the baseline forest; yields one score per fold
# (the classifier's default score is accuracy).
accuracyRF1 = cross_val_score(estimator=rfc1, X=X_train, y=y_train, cv=kfold)

In [13]:
# Report the mean of the per-fold accuracies, rounded to three decimals.
print("Random Forest One Accuracy:", round(np.mean(accuracyRF1), 3))

Random Forest One Accuracy: 0.966


## Random Forest Two

For this forest we doubled the maximum number of features to around $2\sqrt{n}$ instead of $\sqrt{n}$.

In [14]:
# Forest two: consider about 2*sqrt(n_features) candidate features per split
# instead of the default sqrt(n_features). The count is derived from the
# feature matrix so it follows the data rather than a hard-coded number.
# random_state pins the bootstrap / feature sub-sampling for reproducibility.
rfc2 = RandomForestClassifier(max_features=int(2*np.sqrt(len(X_train.columns))),
                              random_state=42)

In [15]:
# 5-fold cross-validation of forest two on the same splits as forest one.
accuracyRF2 = cross_val_score(estimator=rfc2, X=X_train, y=y_train, cv=kfold)

In [16]:
# Report forest two's mean CV accuracy. The label previously read
# "Random Forest One" (copy-paste slip), which mislabelled this result.
print("Random Forest Two Accuracy:", round(accuracyRF2.mean(), 3))

Random Forest One Accuracy: 0.967


## Random Forest Three

For this forest we doubled the default number of trees (to 200) and halved the maximum number of features considered at each split, from the default $\sqrt{n}$ to $\sqrt{n}/2$ (`max_features=14`).

In [17]:
# Forest three: twice the default number of trees and half the default
# sqrt(n_features) candidate features per split. The value is computed from the
# feature matrix (it evaluates to 14 for 784 pixel columns) instead of being
# hard-coded, mirroring how forest two derives its setting; max(1, ...) keeps
# the argument valid for very narrow inputs.
# random_state pins the bootstrap / feature sub-sampling for reproducibility.
rfc3 = RandomForestClassifier(n_estimators=200,
                              max_features=max(1, int(np.sqrt(X_train.shape[1]) / 2)),
                              random_state=42)

In [18]:
# 5-fold cross-validation of forest three on the same splits as the others.
accuracyRF3 = cross_val_score(estimator=rfc3, X=X_train, y=y_train, cv=kfold)

In [19]:
# Report forest three's mean CV accuracy. The label previously read
# "Random Forest One" (copy-paste slip), which mislabelled this result.
print("Random Forest Three Accuracy:", round(accuracyRF3.mean(), 3))

Random Forest One Accuracy: 0.967


## Best Random Forest on Test Set

In [20]:
# Fit forest three on the full training set. cross_val_score only fits clones
# of the estimator, so rfc3 itself is still unfitted at this point.
rfc3.fit(X=X_train, y=y_train)

RandomForestClassifier(max_features=14, n_estimators=200)

In [21]:
# Predict digit labels for the held-out test set with the fitted forest.
predictions = rfc3.predict(X=X_test)

In [24]:
# Per-class precision / recall / F1 of the forest's test-set predictions.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.99      0.99      0.99      1135
           2       0.96      0.97      0.97      1032
           3       0.96      0.96      0.96      1010
           4       0.97      0.98      0.98       982
           5       0.97      0.97      0.97       892
           6       0.98      0.98      0.98       958
           7       0.97      0.96      0.97      1028
           8       0.96      0.95      0.96       974
           9       0.96      0.95      0.96      1009

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000



# Boosted Decision Trees

**Gradient Boosted One**

This model uses sklearn's GradientBoostingClassifier with its default values of 100 trees, a learning rate of 0.1, and a maximum depth of 3; the only non-default setting is `max_features='sqrt'`.

In [26]:
# Gradient Boosted One: 100 trees, learning rate 0.1, default max_depth (3).
# max_features='sqrt' is NOT the library default (None = all features); each
# split considers only sqrt(n_features) candidate features.
# GradientBoostingClassifier is imported with the other imports at the top of
# the notebook, and the 5-fold splitter `kfold` defined earlier is reused here
# instead of being rebuilt, so every model is scored on the same splits.
# random_state pins the feature sub-sampling so the score is reproducible.
GBclf = GradientBoostingClassifier(n_estimators=100,
                                   learning_rate=0.1,
                                   max_features='sqrt',
                                   random_state=42)

# Mean accuracy across the five folds.
results = cross_val_score(GBclf, X_train, y_train, cv=kfold)
print(results.mean())

0.9359166666666667


**Gradient Boosted Two**

This model uses sklearn's GradientBoostingClassifier with its default value of 100 trees and a learning rate of 0.1, but the maximum depth was increased to 5; as before, `max_features='sqrt'` is set.

In [27]:
# Gradient Boosted Two: same as model one but with deeper trees (max_depth=5).
# random_state pins the per-split feature sub-sampling (max_features='sqrt')
# so the cross-validation score is reproducible.
GBclf2 = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=0.1,
                                    max_depth=5,
                                    max_features='sqrt',
                                    random_state=42)
# Mean accuracy across the five folds.
results = cross_val_score(GBclf2, X_train, y_train, cv=kfold)
print(results.mean())

0.9616666666666667


**Gradient Boosted Three**

For this model we doubled the number of trees to 200 but decreased the maximum depth to 1, again with `max_features='sqrt'`.

In [28]:
# Gradient Boosted Three: twice as many trees (200), each limited to depth 1.
# random_state pins the per-split feature sub-sampling (max_features='sqrt')
# so the cross-validation score is reproducible.
GBclf3 = GradientBoostingClassifier(n_estimators=200,
                                    learning_rate=0.1,
                                    max_depth=1,
                                    max_features='sqrt',
                                    random_state=42)
# Mean accuracy across the five folds.
results = cross_val_score(GBclf3, X_train, y_train, cv=kfold)
print(results.mean())

0.8810500000000001


## Testing the Best Gradient Boosted Classifier

In [44]:
# Fit the depth-5 boosted model on the full training set. cross_val_score only
# fits clones of the estimator, so GBclf2 itself is still unfitted here.
GBclf2.fit(X=X_train, y=y_train)

GradientBoostingClassifier(max_depth=5, max_features='sqrt')

In [46]:
# Predict digit labels for the held-out test set with the boosted model.
# Note: this rebinds `predictions`, replacing the random-forest predictions.
predictions = GBclf2.predict(X=X_test)

In [47]:
# Per-class precision / recall / F1 of the boosted model's test-set predictions.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.99      0.99      0.99      1135
           2       0.96      0.96      0.96      1032
           3       0.95      0.96      0.96      1010
           4       0.98      0.96      0.97       982
           5       0.97      0.96      0.96       892
           6       0.97      0.97      0.97       958
           7       0.97      0.95      0.96      1028
           8       0.94      0.96      0.95       974
           9       0.95      0.95      0.95      1009

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000

