In [1]:
# 0) importing useful libraries
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from os.path import exists
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

In [2]:
# 1) loading MNIST dataset
dataset = None
if exists("../datasets/mnist") == True:
    with open("../datasets/mnist", "rb") as f:
        dataset = pickle.load(f)
else:
    dataset = fetch_openml("mnist_784")
    with open("./mnist", "rb") as f:
        pickle.dump(dataset, f)

if dataset is None:
    print("WARNING: DATASET NOT LOADED")
else:
    X = np.array(dataset["data"], dtype=int)
    y = np.array(dataset["target"], dtype=int)

In [3]:
# 2) train single decision tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1/7))
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print("Accuracy score: ", acc_score*100, "%")

Accuracy score:  87.32 %


In [3]:
def majority_voting(a):
    """Return the most frequent class in a.

    - param a: np.array of shape (n, )
    - return: the most recurrent class in a.
    """
    a = np.array(a, dtype=int)
    counts = np.bincount(a)
    return np.argmax(counts)

def build_tree(X, y, N, max_features):
    index = np.random.choice(X.shape[0], N, replace=True)
    X_train = X[index]
    y_train = y[index]
    model = DecisionTreeClassifier(max_features=max_features)
    model.fit(X_train, y_train)
    return model

def build_random_forest(X, y, n_trees, max_features):
    print("Training with ", n_trees, " trees")
    model = MyRandomForestClassifier(n_trees, max_features=max_features)
    model.fit(X, y)
    return model

class MyRandomForestClassifier():
    def __init__(self, n_estimators, max_features=None, N=None, criteria="majority"):
        """Create a random forest model.

        - param n_estimators: number of trees in the random forest
        - param max_features: number of max features that a single tree can consider
        - param N: number of samples extracted with replacement from the dataset. If None N = number of samples in training set.
        - return: a model ready to be trained
        """
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.N = N
        self.trees = list()
        if criteria == "majority":
            self.criteria = majority_voting
        else:
            raise ValueError("criteria parameter is not valid")

    def fit(self, X, y):
        """train the trees of this random forest using subsets of X (and y)"""
        N = self.N if self.N is not None else len(y)
        futures_list = []
        with ThreadPoolExecutor(max_workers=8) as exe:
            for _ in range(self.n_estimators):
                futures_list.append(exe.submit(build_tree, X, y, N, self.max_features))
        wait(futures_list, timeout=None, return_when=ALL_COMPLETED)
        self.trees = [i.result() for i in futures_list]
        
    def predict(self, X):
        """predict the label for each point in X"""
        y_preds = np.zeros((len(X), len(self.trees),))
        for i in range(len(self.trees)):
            y_preds[:, i] = np.array(self.trees[i].predict(X))
        y_pred = np.zeros(len(X))
        for i in range(len(y_pred)):
            y_pred[i] = self.criteria(y_preds[i, :])
        return y_pred

In [4]:
# 4) forests testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1/7))
params = {
    "max_depth": [None, 2, 4, 8],
    "max_features": [int(np.sqrt(X_train.shape[1]))],
    "n_estimators": np.arange(10, 101, 10)
}
for config in ParameterGrid(params):
    print("Parameter configuration: ", config)
    model = MyRandomForestClassifier(config["n_estimators"], config["max_depth"], config["max_features"])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("SCORE: ", score*100, "%")

Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 10}
SCORE:  44.78 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 20}
SCORE:  55.089999999999996 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 30}
SCORE:  59.96 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 40}
SCORE:  62.519999999999996 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 50}
SCORE:  67.81 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 60}
SCORE:  67.97 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 70}
SCORE:  69.05 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 80}
SCORE:  70.65 %
Parameter configuration:  {'max_depth': None, 'max_features': 28, 'n_estimators': 90}
SCORE:  69.15 %
Parameter configuration:  {'max_depth': None, 'max_featu

In [14]:
with open("./random_forest", "wb") as f:
    pickle.dump(model, f)

In [4]:
with open("./random_forest", "rb") as f:
    model = pickle.load(f)

In [None]:
# 5) 
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

0.9686
