In [19]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from decision_tree import Decision_Tree
from collections import Counter



In [20]:
class RandomForest:
    """Bagging ensemble of ``Decision_Tree`` models combined by majority vote.

    Parameters
    ----------
    n_trees : int, default 10
        Number of trees trained by ``fit``.
    max_tree_depth : int, default 10
        Passed to every ``Decision_Tree`` as its second positional argument.
    min_sample_split : int, default 2
        Passed to every ``Decision_Tree`` as its first positional argument.
    n_features : int or None, default None
        Passed to every ``Decision_Tree`` as its third positional argument.
        NOTE(review): presumably the number of features considered per
        split -- confirm against ``decision_tree.Decision_Tree``.
    """

    def __init__(self, n_trees=10, max_tree_depth=10, min_sample_split=2, n_features=None):
        self.n_trees = n_trees
        self.max_tree_depth = max_tree_depth
        self.min_sample_split = min_sample_split
        self.n_features = n_features
        # Fitted trees; stays empty until fit() has been called.
        self.trees = []

    def fit(self, X, Y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``(X, Y)``.

        Trees left over from an earlier call are discarded first, so fitting
        the same instance again (for example when a notebook cell is re-run)
        replaces the ensemble instead of silently growing it.

        Parameters
        ----------
        X : array-like
            Training samples, indexed by sample along the first axis.
        Y : array-like
            Training labels, indexed the same way as ``X``.
        """
        # Plain arrays guarantee the positional fancy indexing used in
        # _bootstrapping (a pandas Series would index by label instead).
        X = np.asarray(X)
        Y = np.asarray(Y)

        # Start from a clean slate: without this a second fit() would append
        # to the previous ensemble and keep stale trees voting.
        self.trees = []
        for _ in range(self.n_trees):
            tree = Decision_Tree(self.min_sample_split, self.max_tree_depth, self.n_features)

            X_sample, Y_sample = self._bootstrapping(X, Y)
            tree.fit(X_sample, Y_sample)

            self.trees.append(tree)

    def _bootstrapping(self, X, Y):
        """Return a resample of ``(X, Y)`` drawn with replacement.

        The resample has as many rows as ``X``; the same row indices are
        applied to both arrays so samples and labels stay aligned. Indices
        come from NumPy's global random state.
        """
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)

        return X[idxs], Y[idxs]

    def _most_common(self, Y):
        """Return the value that occurs most often in the iterable ``Y``."""
        counter = Counter(Y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Predict one label per sample by majority vote over all fitted trees.

        Raises
        ------
        RuntimeError
            If called before ``fit``; an empty ensemble has no votes to count.
        """
        if not self.trees:
            raise RuntimeError("RandomForest is not fitted yet; call fit() before predict().")

        # One row of predictions per tree, transposed so that each row holds
        # the votes cast for a single sample (assumes every tree returns one
        # label per sample -- TODO confirm against Decision_Tree.predict).
        predictions_matrix = np.array([tree.predict(X) for tree in self.trees]).T
        return np.array([self._most_common(votes) for votes in predictions_matrix])
    

In [21]:
# Breast-cancer dataset shipped with scikit-learn: feature matrix and labels.
datas = datasets.load_breast_cancer()
X = datas.data
Y = datas.target

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3119,
)



In [22]:
# RandomForest draws its bootstrap samples from NumPy's global random state,
# so seed it here: re-running this cell (or the whole notebook) then trains
# on the same resamples and reports the same accuracy.
# NOTE(review): this also covers Decision_Tree only if it uses np.random too -- confirm.
np.random.seed(3119)

clf = RandomForest()
clf.fit(X_train, Y_train)

# Majority-vote predictions for the held-out samples.
predictions = clf.predict(X_test)



In [23]:
# Accuracy is the share of test samples whose predicted label matches the true one.
n_correct = np.sum(predictions == Y_test)
acc = n_correct / len(Y_test)

print(f"Accuracy: {acc}")

Accuracy: 0.956140350877193
