In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
%matplotlib inline

In [47]:
# Read the training data from a CSV file in the current working directory.
train_df = pd.read_csv(filepath_or_buffer='./train.csv')

In [48]:
# Preview the first five rows to check the column layout (label followed by pixel columns).
train_df.head(n=5)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# List the distinct values of the target column in ascending order.
sorted(train_df['label'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [60]:
# Separate features from the target by column name rather than by position, so
# a reordered CSV or an extra leading column cannot leak `label` into X.
X = train_df.drop(columns='label')
y = train_df['label']

In [61]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Random Forest One

This random forest uses scikit-learn's default settings: 100 trees (estimators), $\sqrt{n}$ features considered at each split (where $n$ is the number of features), and no limit on the depth of a tree.

In [62]:
# Baseline forest with default hyperparameters. The seed is fixed (same value as
# the train/test split) so the metrics below are reproducible on a fresh run;
# without it every re-run grows different trees and reports different numbers.
rfc = RandomForestClassifier(random_state=42)

In [63]:
# Train the forest on the training split only.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier()

In [64]:
# Predict a class label for every held-out row.
predictions = rfc.predict(X=X_test)

In [65]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1333
           1       0.98      0.99      0.98      1520
           2       0.96      0.95      0.96      1414
           3       0.95      0.94      0.95      1471
           4       0.96      0.97      0.96      1358
           5       0.96      0.95      0.95      1205
           6       0.97      0.98      0.97      1397
           7       0.97      0.95      0.96      1480
           8       0.95      0.95      0.95      1334
           9       0.93      0.94      0.94      1348

    accuracy                           0.96     13860
   macro avg       0.96      0.96      0.96     13860
weighted avg       0.96      0.96      0.96     13860



In [66]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[1316    0    1    2    1    1    5    0    6    1]
 [   0 1504    4    1    1    3    4    1    1    1]
 [   5    6 1345    7   13    2    7   11   14    4]
 [   4    3   15 1379    2   22    3   16   17   10]
 [   3    0    1    0 1315    0    8    2    2   27]
 [   6    2    1   22    2 1144   10    1    9    8]
 [  13    3    1    0    3    8 1364    0    5    0]
 [   2    6   17    1   11    0    0 1411    4   28]
 [   3    6    5   12    7   10    6    3 1267   15]
 [   7    4    6   20   18    3    2   10    9 1269]]


## Random Forest Two

For this forest we increased the number of trees to 400 and capped the maximum depth at 390, roughly half the number of features (784).

In [93]:
# Second forest: 400 trees with tree depth capped at 390. The seed is fixed
# (same value as the train/test split) so the metrics below are reproducible.
# NOTE: rebinding `rfc` replaces the first forest; re-run its cells to get it back.
rfc = RandomForestClassifier(n_estimators=400, max_depth=390, random_state=42)

In [94]:
# Train the 400-tree forest on the same training split as the first model.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=390, n_estimators=400)

In [95]:
# Predict on the held-out rows; this overwrites the first forest's predictions.
predictions = rfc.predict(X=X_test)

In [96]:
# Per-class precision, recall and F1 for the current `predictions`.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1333
           1       0.98      0.99      0.98      1520
           2       0.96      0.96      0.96      1414
           3       0.96      0.94      0.95      1471
           4       0.97      0.97      0.97      1358
           5       0.97      0.95      0.96      1205
           6       0.96      0.98      0.97      1397
           7       0.97      0.95      0.96      1480
           8       0.96      0.95      0.96      1334
           9       0.93      0.95      0.94      1348

    accuracy                           0.96     13860
   macro avg       0.96      0.96      0.96     13860
weighted avg       0.96      0.96      0.96     13860



In [97]:
# Confusion matrix for the current `predictions`: rows are true classes, columns predicted.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[1316    0    2    0    1    1    6    0    6    1]
 [   0 1502    4    3    1    3    4    1    1    1]
 [   5    6 1360    7   13    0    6    9    8    0]
 [   5    2   15 1378    1   22    2   16   17   13]
 [   3    0    1    0 1312    0    9    2    2   29]
 [   5    3    0   18    0 1147   16    2    3   11]
 [  11    3    1    0    3    5 1368    0    6    0]
 [   1    6   19    2    9    1    0 1409    2   31]
 [   3    4    5   12    6    8    7    3 1270   16]
 [   6    4    4   22   11    1    3   12    9 1276]]
