<a href="https://colab.research.google.com/github/masao1112/MLFromScratch/blob/main/%5BCompleted%5DDecision_Trees%26Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sklearn
from sklearn.datasets import load_iris
import numpy as np
import matplotlib

In [None]:
# Load the iris dataset: X is the feature matrix, y the class labels
data = load_iris()
X = data.data
y = data.target

In [None]:
# Peek at the first five samples and their labels
X[:5], y[:5]

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0]))

In [None]:
# Toy 1-D dataset (one feature, three classes) used to work through a split by hand
age = np.array([18, 30, 8, 17, 25, 54, 69, 70, 45])
labels = np.array([0, 1, 0, 0, 1, 2, 2, 2, 1]) # 0: Child, 1: Adult, 2: Elderly

In [None]:
# Ascending copy of the ages; `age` itself is left untouched
sorted_age = np.sort(age)
sorted_age

array([ 8, 17, 18, 25, 30, 45, 54, 69, 70])

In [None]:
def calculate_thresholds(X, n_components=2):
  """Return candidate split thresholds for a 1-D feature.

  The values are sorted in ascending order and every window of
  `n_components` consecutive sorted values is averaged; with the default of
  2 these are the midpoints between neighbouring values.

  Args:
    X: 1-D array-like of numeric feature values.
    n_components: number of consecutive sorted values averaged per threshold.

  Returns:
    np.ndarray with len(X) - n_components + 1 thresholds in ascending order
    (empty when X holds fewer than n_components values).
  """
  sorted_X = np.sort(np.asarray(X))
  # "+ 1" so the window ending on the largest value is included; without it
  # the last candidate threshold is silently dropped.
  n_windows = len(sorted_X) - n_components + 1
  result = [sum(sorted_X[i:i+n_components]) / n_components for i in range(n_windows)]
  return np.array(result)

In [None]:
# Candidate split points computed from the sorted ages
thresholds = calculate_thresholds(sorted_age)
thresholds # thresholds for predicting teenagers or adults

array([12.5, 17.5, 21.5, 27.5, 37.5, 49.5, 61.5])

In [None]:
def split(array, labels, threshold):
  """Partition a 1-D feature and its labels around a threshold.

  Args:
    array: 1-D array-like of feature values.
    labels: array-like of labels aligned with `array`.
    threshold: values <= threshold go to the left node, the rest to the right.

  Returns:
    Tuple (left_values, right_values, left_labels, right_labels) of
    np.ndarrays; the original order is preserved within each side and empty
    sides keep the dtype of their input.
  """
  # asarray lets plain Python lists be passed as well
  array = np.asarray(array)
  labels = np.asarray(labels)
  # a single boolean mask replaces the per-element Python loop
  goes_left = array <= threshold
  return array[goes_left], array[~goes_left], labels[goes_left], labels[~goes_left]

In [None]:
def binary_split(X, y, feature, threshold):
  """Partition the rows of X (and their labels) on one feature.

  Args:
    X: 2-D array-like, rows are samples and columns are features.
    y: array-like of labels aligned with the rows of X.
    feature: column index to test.
    threshold: rows with X[:, feature] <= threshold go left, the rest right.

  Returns:
    Tuple (left_rows, right_rows, left_labels, right_labels) of np.ndarrays.
    An empty side keeps the 2-D shape (0, n_features).
  """
  X = np.asarray(X)
  y = np.asarray(y)
  # boolean mask instead of a Python loop over rows; masking also keeps the
  # column dimension when one side receives no rows
  goes_left = X[:, feature] <= threshold
  return X[goes_left], X[~goes_left], y[goes_left], y[~goes_left]

In [None]:
def calculate_entropy(node_labels):
  """Shannon entropy (in bits) of the class labels in a node.

  Args:
    node_labels: 1-D array-like of class labels (plain lists are accepted).

  Returns:
    0 for an empty or single-class (homogeneous) node, rising to log2(k)
    when k classes are equally represented.
  """
  node_labels = np.asarray(node_labels)
  n_samples = node_labels.size  # number of samples in the node (not of classes)
  if n_samples == 0:
    return 0.0
  # one vectorised pass gives the per-class counts
  _, counts = np.unique(node_labels, return_counts=True)
  probs = counts / n_samples
  # "+ 0.0" turns the -0.0 a pure node would produce into a plain 0.0
  return -np.sum(probs * np.log2(probs)) + 0.0

In [None]:
# calculate the entropy for the root node (all labels, before any split)
root_entropy = calculate_entropy(labels)
root_entropy

np.float64(1.584962500721156)

In [None]:
# Evaluate every candidate threshold on the toy data and print, per threshold:
# left labels, right labels, left entropy, right entropy, information gain
n = len(age)
for thresh in thresholds:
  # Split the root
  left_node, right_node, left_node_classes, right_node_classes = split(age, labels, thresh)
  # calculate the entropy of child nodes
  ln_entropy = calculate_entropy(left_node_classes)
  rn_entropy = calculate_entropy(right_node_classes)
  # information gain = root entropy minus the size-weighted entropy of the children
  ig = root_entropy - (ln_entropy * len(left_node)/n + rn_entropy * len(right_node)/n)
  print(left_node_classes, right_node_classes, ln_entropy, rn_entropy, ig)

[0] [0 1 0 1 2 2 2 1] 0.0 1.561278124459133 0.1971597234241491
[0 0] [0 1 1 2 2 2 1] 0.0 1.4488156357251847 0.4581058951571235
[0 0 0] [1 1 2 2 2 1] 0.0 1.0 0.9182958340544894
[0 0 0 1] [1 2 2 2 1] 0.8112781244591328 0.9709505944546686 0.6849774484867255
[0 1 0 0 1] [2 2 2 1] 0.9709505944546686 0.8112781244591328 0.6849774484867255
[0 1 0 0 1 1] [2 2 2] 1.0 0.0 0.9182958340544894
[0 1 0 0 1 2 1] [2 2] 1.4488156357251847 0.0 0.4581058951571235


In [None]:
# Assemble everything and return the optimal threshold
def find_optimal_split(X, y):
  """Search every feature and threshold for the split with the highest information gain.

  Args:
    X: 2-D np.ndarray of shape (m, n), rows are samples.
    y: labels aligned with the rows of X.

  Returns:
    Tuple (feature, threshold, information_gain, left_entropy, right_entropy)
    for the best split. Only a strictly larger gain replaces the current
    best, so the first split reaching the maximum wins; the initial zeros are
    returned when no split has positive gain.
  """
  # initialised up front so a search without any positive-gain split still
  # returns instead of raising UnboundLocalError
  optim_feature = 0
  optim_threshold = 0
  optim_ig = 0
  optim_left_entropy = 0
  optim_right_entropy = 0
  m, n = X.shape
  # the parent entropy does not depend on the candidate split: compute it once
  root_entropy = calculate_entropy(y)
  # Compute thresholds for each feature
  for i in range(n):
    column = X[:, i]
    # Traverse over thresholds
    for thresh in calculate_thresholds(column):
      # Splitting
      left_node, right_node, left_node_classes, right_node_classes = split(column, y, thresh)

      # Evaluate the split: parent entropy minus size-weighted child entropies
      ln_entropy = calculate_entropy(left_node_classes)
      rn_entropy = calculate_entropy(right_node_classes)
      ig = root_entropy - (ln_entropy * len(left_node)/m + rn_entropy * len(right_node)/m)
      # Assign optimal params
      if ig > optim_ig:
        optim_feature = i
        optim_ig = ig
        optim_threshold = thresh
        optim_left_entropy = ln_entropy
        optim_right_entropy = rn_entropy
  return optim_feature, optim_threshold, optim_ig, optim_left_entropy, optim_right_entropy

In [None]:
from sklearn.datasets import load_iris
data = load_iris()
# feature matrix and class labels
X = data.data
y = data.target
# shuffle the dataset: features and labels are stacked side by side so each
# row keeps its label while the rows are shuffled (seeded for repeatability)
np.random.seed(42)
dataset = np.c_[X, y]
np.random.shuffle(dataset)
# last column holds the labels, everything before it the features
shuffled_X = dataset[:, :-1]
shuffled_y = dataset[:, -1]
shuffled_y = shuffled_y.astype(int)

In [None]:
# Best first split on the shuffled iris data:
# (feature, threshold, information gain, left entropy, right entropy)
find_optimal_split(shuffled_X, shuffled_y)

(2,
 np.float64(1.9),
 np.float64(0.9182958340544894),
 np.float64(0.0),
 np.float64(1.0))

In [None]:
# testing: apply a split on feature 2 at threshold 1.9
X_ = shuffled_X[:, 2]
y_pred = np.where(X_ <= 1.9, 0, 1)
# show the true label for samples routed left (y_pred == 0) and 1 for all others
np.where(y_pred == 0, shuffled_y, 1)

array([1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1])

In [None]:
# Split the samples on feature index 3 at threshold 0.79
binary_split(X, y, 3, 0.79)

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [None]:
# Assemble everything and return the optimal threshold
def best_split(root_values, root_labels, thresholds):
  """Pick the threshold with the highest information gain for a 1-D feature.

  Args:
    root_values: 1-D feature values of the node being split.
    root_labels: labels aligned with `root_values`.
    thresholds: iterable of candidate thresholds to evaluate.

  Returns:
    Tuple (optimal_threshold, optimal_ig). Only a strictly larger gain
    replaces the current best, so the first threshold reaching the maximum
    wins; (0, 0) when no threshold has positive gain.
  """
  optimal_threshold = 0
  optimal_ig = 0
  n = len(root_labels)
  # the parent entropy is the same for every threshold: compute it once
  root_entropy = calculate_entropy(root_labels)
  for thresh in thresholds:
    # Traverse over thresholds and split
    left_node, right_node, left_node_classes, right_node_classes = split(root_values, root_labels, thresh)

    # Evaluate each split: parent entropy minus size-weighted child entropies
    ln_entropy = calculate_entropy(left_node_classes)
    rn_entropy = calculate_entropy(right_node_classes)
    ig = root_entropy - (ln_entropy * len(left_node)/n + rn_entropy * len(right_node)/n)
    if ig > optimal_ig:
      optimal_ig = ig
      optimal_threshold = thresh
  return optimal_threshold, optimal_ig

In [None]:
# Best threshold and its information gain on the toy age data
best_split(age, labels, thresholds)

(np.float64(21.5), np.float64(0.9182958340544894))

In [None]:
# Distinct labels in y and how many samples carry each
np.unique(y, return_counts=True)

(array([0, 1, 2]), array([50, 50, 50]))

In [None]:
# ----------------------- Refactored versions of the helpers above

In [None]:
def calculate_thresholds(X, n_components=2):
  """Average every run of `n_components` consecutive values of sorted X.

  Args:
    X: 1-D iterable of numeric feature values.
    n_components: window length (2 gives midpoints between neighbours).

  Returns:
    np.ndarray of the window means, in ascending order.
  """
  ordered = np.array(sorted(X))
  # one window per start position, the last one ending on the largest value
  window_starts = range(len(ordered) - n_components + 1)
  means = [
    sum(ordered[start:start + n_components]) / n_components
    for start in window_starts
  ]
  return np.array(means)

In [None]:
def calculate_entropy(node_labels):
  """Shannon entropy (in bits) of the class labels in a node.

  Args:
    node_labels: 1-D array-like of class labels (plain lists are accepted).

  Returns:
    0 for an empty or single-class (homogeneous) node, rising to log2(k)
    when k classes are equally represented.
  """
  node_labels = np.asarray(node_labels)
  n_samples = node_labels.size  # number of samples in the node (not of classes)
  if n_samples == 0:
    return 0.0
  # one vectorised pass gives the per-class counts
  _, counts = np.unique(node_labels, return_counts=True)
  probs = counts / n_samples
  # "+ 0.0" turns the -0.0 a pure node would produce into a plain 0.0
  return -np.sum(probs * np.log2(probs)) + 0.0

In [None]:
def binary_split(X, y, feature, threshold):
  """Partition the rows of X (and optionally their labels) on one feature.

  Args:
    X: 2-D array-like, rows are samples and columns are features.
    y: array-like of labels aligned with the rows of X, or None for
      unlabeled data.
    feature: column index to test.
    threshold: rows with X[:, feature] <= threshold go left, the rest right.

  Returns:
    Tuple (left_rows, right_rows, left_labels, right_labels) of np.ndarrays.
    An empty side keeps the 2-D shape (0, n_features); both label arrays are
    empty when y is None.
  """
  X = np.asarray(X)
  # boolean mask instead of a Python loop over rows; masking also keeps the
  # column dimension when one side receives no rows
  goes_left = X[:, feature] <= threshold
  left_node, right_node = X[goes_left], X[~goes_left]
  if y is None:
    # unlabeled data: keep the 4-tuple shape with empty label arrays
    return left_node, right_node, np.array([]), np.array([])
  y = np.asarray(y)
  return left_node, right_node, y[goes_left], y[~goes_left]

In [None]:
# Assemble everything and return the optimal threshold
def find_optimal_split(X, y, feature_constraint=False):
  """Search features and thresholds for the split with the highest information gain.

  Args:
    X: 2-D np.ndarray of shape (m, n), rows are samples.
    y: labels aligned with the rows of X.
    feature_constraint: when True only int(sqrt(n)) randomly chosen features
      are searched; otherwise all n features are searched, in random order.

  Returns:
    Tuple (feature, threshold, information_gain, left_entropy, right_entropy)
    for the best split. A gain equal to the current best also replaces it, so
    the split evaluated last wins ties. The initial zeros are returned when
    no threshold could be evaluated.
  """
  optim_feature = 0
  optim_threshold = 0
  optim_ig = 0
  optim_left_entropy = 0
  optim_right_entropy = 0
  m, n = X.shape

  # check for feature constraint
  n_sampled_features = int(np.sqrt(n)) if feature_constraint else n
  # randomly sample feature choices for the split
  sampled_features = np.random.choice(n, n_sampled_features, replace=False)

  # the parent entropy does not depend on the candidate split: compute it once
  root_entropy = calculate_entropy(y)

  for feature in sampled_features:
    # Repeated feature values yield repeated thresholds; evaluating each
    # distinct value once gives the same winner with far fewer splits.
    thresholds = np.unique(calculate_thresholds(X[:, feature]))

    # Traverse over thresholds
    for thresh in thresholds:
      # Splitting
      left_node, right_node, left_node_classes, right_node_classes = binary_split(X, y, feature, thresh)

      # Evaluate the split: parent entropy minus size-weighted child entropies
      ln_entropy = calculate_entropy(left_node_classes)
      rn_entropy = calculate_entropy(right_node_classes)
      ig = root_entropy - (ln_entropy * len(left_node)/m + rn_entropy * len(right_node)/m)

      # Assign optimal params
      if ig >= optim_ig:
        optim_feature = feature
        optim_ig = ig
        optim_threshold = thresh
        optim_left_entropy = ln_entropy
        optim_right_entropy = rn_entropy
  return optim_feature, optim_threshold, optim_ig, optim_left_entropy, optim_right_entropy

In [None]:
def compute_leaf_node(y, n_classes):
  """Summarise the labels that reached a leaf.

  Args:
    y: integer class labels of the samples in the leaf.
    n_classes: length of the per-class count vector.

  Returns:
    Tuple (entropy, n_samples, class_counts, majority_class) where
    class_counts[c] is the number of samples labelled c.
  """
  class_counts = np.zeros(n_classes, dtype="int")
  present, freq = np.unique(y, return_counts=True)
  # labels double as positions in the count vector
  class_counts[present] = freq
  majority_class = present[np.argmax(freq)]
  return (calculate_entropy(y), len(y), class_counts, majority_class)

In [None]:
class Node:
  """One node of a decision tree.

  Stores the statistics of the samples that reached the node (entropy,
  n_samples, values), an optional predicted label, an optional split
  (feature, threshold) and optional left/right children.
  """

  def __init__(self, entropy, n_samples, values, label=None, feature=None, threshold=None, left_node=None, right_node=None):
    # statistics of the samples at this node
    self.entropy, self.n_samples, self.values = entropy, n_samples, values
    # prediction (None unless given) and split definition (None unless given)
    self.label = label
    self.feature, self.threshold = feature, threshold
    # children (None unless given)
    self.left_node, self.right_node = left_node, right_node

  def print_tree(self):
    """Print this node, then its left subtree, then its right subtree."""
    print(self)
    for child in (self.left_node, self.right_node):
      if child:
        child.print_tree()

  def __repr__(self):
    return f"Node(feature={self.feature}, threshold={self.threshold}, entropy={self.entropy}, n_samples={self.n_samples}, values={self.values}, class={self.label})"

In [None]:
def build_tree(X, y, max_depth, max_leaf, depth=0, leaf=0, n_classes=3, feature_constraint=False):
  """Recursively grow a decision tree and return its root Node.

  Args:
    X: 2-D np.ndarray, rows are samples and columns are features.
    y: integer class labels in [0, n_classes) aligned with the rows of X.
    max_depth: a node at this depth becomes a leaf.
    max_leaf: a node becomes a leaf once `leaf` has reached this value.
    depth: depth of the current node (0 for the root).
    leaf: count of zero-entropy children produced by the splits above this
      node; the same count is handed to both children of a split.
    n_classes: length of the per-class count vector stored on every node.
    feature_constraint: forwarded to find_optimal_split.

  Returns:
    Node: a leaf carrying the majority class, or an internal node carrying
    the chosen feature/threshold and its two subtrees.
  """
  def make_leaf():
    entropy, n_samples, target_values, target_class = compute_leaf_node(y, n_classes)
    return Node(entropy, n_samples, target_values, label=target_class)

  # Stopping criteria
  if depth >= max_depth or leaf >= max_leaf:
    return make_leaf()

  # Compute the root entropy
  root_entropy = calculate_entropy(y)
  if root_entropy == 0: # The class is homogeneous
    return make_leaf()

  # Find the optimal feature and threshold
  optimal_feature, optimal_threshold, optimal_ig, left_node_entropy, right_node_entropy = find_optimal_split(X, y, feature_constraint)
  left_node, right_node, left_node_classes, right_node_classes = binary_split(X, y, optimal_feature, optimal_threshold)

  # A split that sends every sample to one side separates nothing (this can
  # happen when no threshold has positive gain); recursing into the empty
  # side would fail, so stop here and predict the majority class.
  if len(left_node_classes) == 0 or len(right_node_classes) == 0:
    return make_leaf()

  # get target values counts for this internal node
  m = X.shape[0]
  target_values = np.zeros(n_classes, dtype="int")
  unique_labels, counts = np.unique(y, return_counts=True)
  target_values[unique_labels] = counts

  # Check for leaf node
  if left_node_entropy == 0:
    leaf += 1

  if right_node_entropy == 0:
    leaf += 1

  # Recursively branching; n_classes is forwarded so subtrees do not silently
  # fall back to the default of 3
  left_node = build_tree(left_node, left_node_classes, max_depth, max_leaf, depth+1, leaf, n_classes=n_classes, feature_constraint=feature_constraint)
  right_node = build_tree(right_node, right_node_classes, max_depth, max_leaf, depth+1, leaf, n_classes=n_classes, feature_constraint=feature_constraint)
  return Node(entropy=root_entropy, n_samples=m, values=target_values, feature=optimal_feature, threshold=optimal_threshold, left_node=left_node, right_node=right_node)

In [None]:
from sklearn.datasets import load_iris
data = load_iris()
# feature matrix and class labels
X = data.data
y = data.target
# shuffle the dataset: features and labels are stacked side by side so each
# row keeps its label while the rows are shuffled (seeded for repeatability)
np.random.seed(42)
dataset = np.c_[X, y]
np.random.shuffle(dataset)
# last column holds the labels, everything before it the features
shuffled_X = dataset[:, :-1]
shuffled_y = dataset[:, -1]
shuffled_y = shuffled_y.astype(int)

In [None]:
from sklearn.model_selection import train_test_split
# split the dataset: 20% of the shuffled samples are held out for testing
X_train, X_test, y_train, y_test = train_test_split(shuffled_X, shuffled_y, test_size=0.2, random_state=42)

In [None]:
# Grow a tree on the full shuffled dataset with max_depth=4 and max_leaf=4
tree = build_tree(shuffled_X, shuffled_y, 4, 4)

In [None]:
# Print every node: the node itself first, then its left and right subtrees
tree.print_tree()

Node(feature=2, threshold=2.45, entropy=1.584962500721156, n_samples=150, values=[50 50 50], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=50, values=[50  0  0], class=0)
Node(feature=3, threshold=1.75, entropy=1.0, n_samples=100, values=[ 0 50 50], class=None)
Node(feature=2, threshold=4.95, entropy=0.44506485705083865, n_samples=54, values=[ 0 49  5], class=None)
Node(feature=3, threshold=1.65, entropy=0.1460942501201363, n_samples=48, values=[ 0 47  1], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=47, values=[ 0 47  0], class=1)
Node(feature=None, threshold=None, entropy=0.0, n_samples=1, values=[0 0 1], class=2)
Node(feature=3, threshold=1.55, entropy=0.9182958340544896, n_samples=6, values=[0 2 4], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=3, values=[0 0 3], class=2)
Node(feature=None, threshold=None, entropy=0.9182958340544896, n_samples=3, values=[0 2 1], class=1)
Node(feature=2, threshold=4.85, entropy

### Assembling everything into a class (OOP)

In [None]:
def traverse(X, node):
  """Route a single sample down the tree and return the predicted label.

  Args:
    X: one sample, indexable by feature index.
    node: node to start from; a node whose `feature` is None is treated as a
      leaf and its `label` is returned.

  Returns:
    The `label` of the leaf the sample ends up in.
  """
  # Iterative descent: no recursion limit to hit on deep trees, and the leaf
  # test uses identity (`is None`) rather than `== None`.
  while node.feature is not None:
    if X[node.feature] <= node.threshold:
      # go to the left node
      node = node.left_node
    else:
      # go to the right node
      node = node.right_node
  return node.label

In [None]:
class DecisionTreesClassifier:
  """Entropy-based decision tree classifier built on build_tree/traverse.

  Args:
    max_depth: forwarded to build_tree as max_depth.
    max_leaf: forwarded to build_tree as max_leaf.
    min_samples_leaf: stored on the instance; not used by fit.
    min_samples_split: stored on the instance; not used by fit.
  """

  def __init__(self, max_depth, max_leaf, min_samples_leaf=None, min_samples_split=None):
    self.max_depth = max_depth
    self.max_leaf = max_leaf
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split

  def fit(self, X, y):
    """Grow the tree on (X, y) and store its root in `self.tree`.

    y must hold non-negative integer class ids, because they are used as
    positions in the per-class count vectors.
    """
    # The count vectors are indexed by label, so they need max(y) + 1 slots;
    # counting distinct labels under-sizes them when a class id is absent.
    n_classes = int(np.max(y)) + 1
    self.tree = build_tree(X, y, max_depth=self.max_depth, max_leaf=self.max_leaf, n_classes=n_classes)

  def predict(self, X, **kwargs):
    """Return an array with the predicted label for every row of X."""
    m = X.shape[0]
    # route each example through the fitted tree
    return np.array([traverse(X[i], self.tree) for i in range(m)])

  def score(self, X, y, **kwargs):
    """Return the accuracy of the predictions for X against the labels y."""
    y_pred = self.predict(X)
    return (y_pred == y).sum() / len(y)

In [None]:
# max_depth=4, max_leaf=3
classifier = DecisionTreesClassifier(4, 3)

In [None]:
# Fit on the training split only
classifier.fit(X_train, y_train)

In [None]:
# Inspect the fitted tree node by node
classifier.tree.print_tree()

Node(feature=3, threshold=0.8, entropy=1.5840680553754911, n_samples=120, values=[39 39 42], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=39, values=[39  0  0], class=0)
Node(feature=3, threshold=1.75, entropy=0.9990102708804813, n_samples=81, values=[ 0 39 42], class=None)
Node(feature=2, threshold=4.95, entropy=0.4537163391869448, n_samples=42, values=[ 0 38  4], class=None)
Node(feature=3, threshold=1.65, entropy=0.1792560669283215, n_samples=37, values=[ 0 36  1], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=36, values=[ 0 36  0], class=1)
Node(feature=None, threshold=None, entropy=0.0, n_samples=1, values=[0 0 1], class=2)
Node(feature=3, threshold=1.55, entropy=0.9709505944546686, n_samples=5, values=[0 2 3], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=3, values=[0 0 3], class=2)
Node(feature=None, threshold=None, entropy=0.0, n_samples=2, values=[0 2 0], class=1)
Node(feature=2, threshold=4.85, entropy=0

In [None]:
# Predicted class for every held-out sample
y_pred = classifier.predict(X_test)
y_pred

array([2, 1, 1, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 1, 0, 0, 2,
       2, 0, 1, 1, 0, 2, 1, 0])

In [None]:
# evaluate the model: accuracy on the held-out test split
classifier.score(X_test, y_test)

np.float64(0.9666666666666667)

### Improvements

<li>The code could definitely be optimized further.
<li>The predict method could be vectorized/parallelized, i.e. compute predictions for all test data points in one go.

### Random Forests

In [None]:
# sampling with replacement
def replacement_sampling(X, S, replace=True):
  """
  Draw S rows from X at random (bootstrap sampling when replace=True)
  Args:
    X: the dataset to sample from
    S: number of rows to draw
    replace: whether a row may be drawn more than once

  Return:
    The drawn rows, i.e. X indexed by the S sampled row positions
  """
  n_rows = len(X)
  picked = np.random.choice(n_rows, size=S, replace=replace)
  return X[picked]

In [None]:
# sample B batches from dataset with length = n_samples
def batches_sampling(X, S, B):
  """
  Draw B bootstrap batches from the provided dataset
  Args:
    X: dataset to sample from (np.ndarray, rows are samples)
    S: number of rows per batch
    B: number of batches

  Return:
    np.ndarray of sampled batches, shape (B, S, X.shape[1]) for 2-D X
  """
  result = []
  # range(B), not range(B-1): the latter returns one batch fewer than requested
  for _ in range(B):
    # same draw as replacement_sampling(X, S): S row positions, with replacement
    idx = np.random.choice(len(X), S, replace=True)
    result.append(X[idx])
  return np.array(result)

In [None]:
# Sample batches of len(dataset) rows each from `dataset` (features with the label as last column)
batches = batches_sampling(dataset, len(dataset), 5)

In [None]:
# (number of batches, rows per batch, columns per row)
batches.shape

(4, 150, 5)

In [None]:
# train a decision tree for each sampled batch
for batch in batches:
  # Batch-local names keep the notebook-level X, y and tree from being
  # overwritten with bootstrap data.
  X_batch = batch[:, :-1]
  y_batch = batch[:, -1].astype(int) # last column holds the label; convert to int
  batch_tree = build_tree(X_batch, y_batch, 4, 4)
  batch_tree.print_tree()
  print('--------------------')

Node(feature=2, threshold=2.5999999999999996, entropy=1.5728297468290475, n_samples=150, values=[41 55 54], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=41, values=[41  0  0], class=0)
Node(feature=3, threshold=1.65, entropy=0.999939284770655, n_samples=109, values=[ 0 55 54], class=None)
Node(feature=1, threshold=2.3, entropy=0.42368057157091055, n_samples=58, values=[ 0 53  5], class=None)
Node(feature=2, threshold=4.75, entropy=0.954434002924965, n_samples=8, values=[0 3 5], class=None)
Node(feature=None, threshold=None, entropy=0.0, n_samples=3, values=[0 3 0], class=1)
Node(feature=None, threshold=None, entropy=0.0, n_samples=5, values=[0 0 5], class=2)
Node(feature=None, threshold=None, entropy=0.0, n_samples=50, values=[ 0 50  0], class=1)
Node(feature=2, threshold=4.85, entropy=0.23868451135100135, n_samples=51, values=[ 0  2 49], class=None)
Node(feature=1, threshold=3.0, entropy=0.7642045065086203, n_samples=9, values=[0 2 7], class=None)
Node(feature

In [None]:
class BasicRandomForest():
  """Bagged ensemble of decision trees with random feature selection per split.

  Args:
    n_trees: number of bootstrap batches requested from batches_sampling; one
      tree is trained per returned batch.
    B: number of rows drawn (with replacement) for each batch.
    max_depth: forwarded to build_tree as max_depth.
    max_leaf: forwarded to build_tree as max_leaf.
    min_samples_leaf: stored on the instance; not used by fit.
    min_samples_split: stored on the instance; not used by fit.
  """

  def __init__(self, n_trees, B, max_depth, max_leaf, min_samples_leaf=None, min_samples_split=None): # only support max_depth and max_leaf for now
    self.n_trees = n_trees
    self.B = B
    self.max_depth = max_depth
    self.max_leaf = max_leaf
    self.min_samples_leaf = min_samples_leaf
    self.min_samples_split = min_samples_split

  def fit(self, X, y):
    """Train one tree per bootstrap batch and store them in `self.trees`.

    y must hold non-negative integer class ids, because they are used as
    positions in the per-class count vectors.
    """
    # Size the count vectors from the full label set: a bootstrap batch may
    # miss a class, and build_tree would otherwise assume its default of 3.
    n_classes = int(np.max(y)) + 1
    # concat X and y so rows and labels are sampled together
    dataset = np.c_[X, y]
    # sample n_trees batches of B rows each from the dataset
    batches = batches_sampling(dataset, self.B, self.n_trees)
    self.trees = []
    for batch in batches:
      # batch-local names: the fit arguments are not rebound inside the loop
      X_batch = batch[:, :-1]
      y_batch = batch[:, -1].astype(int)
      tree = build_tree(X_batch, y_batch, max_depth=self.max_depth, max_leaf=self.max_leaf, n_classes=n_classes, feature_constraint=True)
      self.trees.append(tree)

  def predict(self, X, **kwargs):
    """Return the majority-vote label over all trees for every row of X."""
    m = X.shape[0]
    # create a place holder for predictions
    y_pred = []
    # go through each example
    for i in range(m):
      # predict on each tree
      preds = [traverse(X[i], tree) for tree in self.trees]
      # majority vote; np.unique sorts the labels, so ties go to the smallest
      labels, counts = np.unique(np.array(preds), return_counts=True)
      y_pred.append(labels[np.argmax(counts)])
    return np.array(y_pred)

  def score(self, X, y, **kwargs):
    """Return the accuracy of the predictions for X against y as a Python float."""
    y_pred = self.predict(X)
    accuracy = (y_pred == y).sum() / len(y)
    return accuracy.item()


In [None]:
# n_trees=10, B=10, max_depth=4, max_leaf=4
rfc = BasicRandomForest(10, 10, 4, 4)

In [None]:
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# element-wise comparison: True where the prediction matches the true label
y_test == y_pred

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True])

In [None]:
# Accuracy of the forest on the held-out test split
rfc.score(X_test, y_test)

0.9666666666666667