# Exercise 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
import seaborn as sns
sns.set_theme(style='darkgrid')

In [5]:
from sklearn.datasets import fetch_openml
dataset = fetch_openml('mnist_784')

In [10]:
X = dataset['data']
y = dataset['target']

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=10_000)

## Decision Tree on MNIST

In [17]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, clf.predict(X_test))

0.8714

## Random Forest implementation

In [110]:
class MyRandomForestClassifier():
    from collections import Counter
    def __init__(self, n_estimators = 50):
        self._params = {
            'n_estimators': n_estimators
        }
        self.trees = []
    
    def fit(self, X, y):
        # for each tree sample its dataset and train it
        for _ in range(self._params['n_estimators']):
            sample_ixs = np.random.randint(X.shape[0], size=X.shape[1])
            X_sampled = X[sample_ixs, :]
            y_sampled = y[sample_ixs]
            tree = DecisionTreeClassifier(max_features='sqrt')
            tree.fit(X_sampled, y_sampled)
            self.trees.append(tree)
        
    def predict(self, X):
        predictions = []
        for x in X:
            votes = Counter()
            for tree in self.trees:
                votes[tree.predict([x])[0]] += 1
            predictions.append(votes.most_common(1)[0][0])
        return predictions

## Random Forest on MNIST

In [111]:
myclf = MyRandomForestClassifier()
myclf.fit(X_train, y_train)

In [112]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, myclf.predict(X_test))

0.9028

In [116]:
for n_estimators in np.arange(10, 101, 10):
    myclf = MyRandomForestClassifier(n_estimators)
    myclf.fit(X_train, y_train)
    print(n_estimators, accuracy_score(y_test, myclf.predict(X_test)))

10 0.8415
20 0.8699
30 0.891
40 0.8993
50 0.8967
60 0.9077
70 0.9049


KeyboardInterrupt: 

## Scikit-learn's Random Forest comparison

In [118]:
from sklearn.ensemble import RandomForestClassifier
for n_estimators in np.arange(10, 101, 10):
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    print(n_estimators, accuracy_score(y_test, clf.predict(X_test)))

10 0.9462
20 0.9557
30 0.9606
40 0.9633
50 0.9634
60 0.964
70 0.9671
80 0.9662
90 0.9669
100 0.9658
