# Decision Tree Class

In [13]:
import numpy as np
from collections import Counter



def entropy(y):
    """Return the Shannon entropy, in bits, of the label array *y*.

    The class frequencies are obtained with ``np.bincount``, so *y* must
    hold non-negative integers. Classes that never occur contribute
    nothing to the sum.
    """
    counts = np.bincount(y)
    probs = counts / len(y)
    # Drop zero probabilities up front so log2 is never evaluated at 0.
    present = probs[probs > 0]
    return -np.sum(present * np.log2(present))


class Node:
    """One node of a binary decision tree.

    An internal node carries the split description (``feature``,
    ``threshold``) and its two children (``left``, ``right``).  A leaf
    carries only ``value``, which must be passed by keyword.
    """

    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        # Split rule used by internal nodes.
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Stored prediction; stays None on internal nodes.
        self.value = value

    def is_leaf_node(self):
        """Return True when the node stores a value (i.e. ``value`` is not None)."""
        if self.value is None:
            return False
        return True


class DecisionTree:
    """Decision-tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature] <= threshold``; every leaf stores
    the most common training label that reached it.  Impurity is measured
    with the module-level ``entropy`` function, which relies on
    ``np.bincount``, so labels must be non-negative integers.

    Parameters
    ----------
    min_samples_split : int, default 2
        A node holding fewer samples than this becomes a leaf.
    max_depth : int, default 100
        A node at this depth (the root is depth 0) becomes a leaf.
    n_feats : int or None, default None
        Number of features drawn at random, without replacement, as split
        candidates at each node.  A falsy value means "use every feature".
        ``fit`` overwrites this attribute with the resolved count, capped at
        the number of columns in the training data.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        """Grow the tree from samples ``X`` (2-D) and integer labels ``y``."""
        # Accept lists and other array-likes; ndarrays pass through unchanged.
        X = np.asarray(X)
        y = np.asarray(y)
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in np.asarray(X)])

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``X``/``y``."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        ):
            return Node(value=self._most_common_label(y))

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)
        if best_feat is None:
            # No candidate threshold exists (e.g. every sampled feature is
            # constant), so the node cannot be divided any further.
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            # A split that leaves one side empty would recurse into an empty
            # child, where _most_common_label fails on an empty Counter.
            return Node(value=self._most_common_label(y))

        # grow the children that result from the split
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when no candidate threshold is available.
        On ties the first candidate encountered wins.
        """
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            # np.unique returns sorted values.  The largest one is skipped
            # because no sample is strictly greater than it, so it could only
            # produce an empty right child.
            thresholds = np.unique(X_column)[:-1]
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        """Return the entropy reduction obtained by splitting at ``split_thresh``.

        A split that leaves either side empty scores 0.
        """
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)
        n_l, n_r = len(left_idxs), len(right_idxs)
        if n_l == 0 or n_r == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        return parent_entropy - child_entropy

    def _split(self, X_column, split_thresh):
        """Return the index arrays of entries ``<=`` and ``>`` ``split_thresh``."""
        left_idxs = np.flatnonzero(X_column <= split_thresh)
        right_idxs = np.flatnonzero(X_column > split_thresh)
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (``y`` must be non-empty)."""
        return Counter(y).most_common(1)[0][0]



#Random Forest Regressor


In [53]:
import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeRegressor

class RandomForestRegressor:
    """Bagging ensemble of scikit-learn ``DecisionTreeRegressor`` trees.

    ``fit`` trains ``n_trees`` trees, each on its own bootstrap sample, and
    ``predict`` returns the mean of the individual tree predictions.
    ``n_feats`` is forwarded to the trees as ``max_features``.
    """

    def __init__(self, n_trees=10, max_depth=100, min_samples_split=2, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        """Discard any previous trees and train ``n_trees`` new ones."""
        self.trees = []
        tree_kwargs = {
            "min_samples_split": self.min_samples_split,
            "max_depth": self.max_depth,
            "max_features": self.n_feats,
        }
        for _ in range(self.n_trees):
            estimator = DecisionTreeRegressor(**tree_kwargs)
            # Every tree gets its own resample of the training data.
            X_boot, y_boot = bootstrap_sample(X, y)
            estimator.fit(X_boot, y_boot)
            self.trees.append(estimator)

    def predict(self, X):
        """Return the per-sample mean of all tree predictions."""
        # Stacked tree outputs: shape (n_trees, n_samples).
        stacked = np.array([estimator.predict(X) for estimator in self.trees])
        # Put samples on the rows: shape (n_samples, n_trees).
        per_sample = stacked.swapaxes(0, 1)
        # Average over the trees for each sample.
        return per_sample.mean(axis=1)

def bootstrap_sample(X, y):
    """Return ``(X, y)`` resampled row-wise with replacement.

    The resample has as many rows as ``X``; the same random row indices
    are applied to both arrays so samples stay aligned with their targets.
    """
    n_rows = X.shape[0]
    picked = np.random.choice(n_rows, size=n_rows, replace=True)
    X_boot = X[picked]
    y_boot = y[picked]
    return X_boot, y_boot

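# NOTE: the custom DecisionTree class defined above is a classifier (entropy-based
# splits, majority-vote leaves), so this regressor uses scikit-learn's
# DecisionTreeRegressor as its base learner. To use a custom tree here instead,
# it would first have to be implemented for regression.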


In [56]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import numpy as np

# Build a synthetic regression problem (1000 samples, 5 features, noise=0.1).
X, y = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)

# Hold out 30% of the samples for testing; the remaining 70% is for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train the RandomForestRegressor defined above on the training split.
rf_regressor = RandomForestRegressor(n_trees=10, max_depth=10, min_samples_split=2)
rf_regressor.fit(X_train, y_train)

# Forest predictions for the held-out samples.
y_pred = rf_regressor.predict(X_test)

# Mean Squared Error (MSE) between the predictions and the true targets.
mse = np.mean(np.square(y_pred - y_test))
print(f"Mean Squared Error: {mse:.2f}")


Mean Squared Error: 443.04


## Why Does the Mean Squared Error of a Random Forest Differ from the MSE of Individual Trees?
Individual tree predictions can vary considerably because each tree is trained on a different bootstrap sample of the training data (rows drawn at random with replacement). A single tree's predictions can therefore be far from the true target values and from the predictions of the other trees.

However, when the predictions of all the trees in the random forest are averaged, the result tends to be closer to the true value than a single tree's prediction. Because the trees' errors are only partly correlated, they partially cancel each other out in the average, which reduces the variance of the forest's prediction and leads to a lower overall MSE.

In [57]:
from sklearn.metrics import mean_squared_error

# Score every tree of the fitted forest on the test split by itself.
for i, tree in enumerate(rf_regressor.trees):
    # Predictions of this single tree only.
    tree_pred = tree.predict(X_test)
    # MSE of this tree against the true test targets.
    mse = mean_squared_error(y_true=y_test, y_pred=tree_pred)
    print(f"Tree {i+1} MSE: {mse:.2f}")


Tree 1 MSE: 1053.47
Tree 2 MSE: 1162.34
Tree 3 MSE: 1247.05
Tree 4 MSE: 1136.89
Tree 5 MSE: 1158.63
Tree 6 MSE: 1474.86
Tree 7 MSE: 1160.98
Tree 8 MSE: 1183.25
Tree 9 MSE: 1208.67
Tree 10 MSE: 1148.73


#Random Forest Classifier


In [59]:
from collections import Counter
import numpy as np




class RandomForestClassifier:
    """Bagging ensemble of the custom ``DecisionTree`` classifier.

    ``fit`` trains ``n_trees`` trees, each on its own bootstrap sample, and
    ``predict`` returns, for every sample, the label predicted by the most
    trees.
    """

    def __init__(self, n_trees=10, max_depth=100, min_samples_split=2, n_feats=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.trees = []

    def fit(self, X, y):
        """Discard any previous trees and train ``n_trees`` new ones."""
        self.trees = []
        tree_kwargs = {
            "min_samples_split": self.min_samples_split,
            "max_depth": self.max_depth,
            "n_feats": self.n_feats,
        }
        for _ in range(self.n_trees):
            member = DecisionTree(**tree_kwargs)
            # Every tree gets its own resample of the training data.
            X_boot, y_boot = bootstrap_sample(X, y)
            member.fit(X_boot, y_boot)
            self.trees.append(member)

    def predict(self, X):
        """Return the majority-vote label for every sample in ``X``."""
        # Stacked tree outputs: one row per tree, one column per sample.
        stacked = np.array([member.predict(X) for member in self.trees])
        # Swap the axes so that each row holds all the votes for one sample.
        votes_per_sample = stacked.swapaxes(0, 1)
        winners = [most_common_label(votes) for votes in votes_per_sample]
        return np.array(winners)

def bootstrap_sample(X, y):
    """Resample the rows of ``X`` and ``y`` together, with replacement.

    As many row indices as ``X`` has rows are drawn at random, and the
    same indices select from both arrays.
    """
    size = X.shape[0]
    rows = np.random.choice(size, size=size, replace=True)
    return X[rows], y[rows]


def most_common_label(y):
    """Return the label that occurs most often in ``y``.

    Ties are resolved by ``Counter.most_common``.  ``y`` must be non-empty.
    """
    ranked = Counter(y).most_common(1)
    label, _count = ranked[0]
    return label


In [60]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np


# Load the Iris dataset: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing; the remaining 70% is for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)

# Train the RandomForestClassifier defined above on the training split.
rf_clf = RandomForestClassifier(n_trees=10, max_depth=5, min_samples_split=4)
rf_clf.fit(X_train, y_train)

# Forest predictions for the held-out samples.
y_pred = rf_clf.predict(X_test)

# Accuracy: the fraction of test samples whose predicted label is correct.
accuracy = (y_pred == y_test).mean()
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%


In [61]:
# Score every tree of the fitted forest on the test split by itself.
for i, tree in enumerate(rf_clf.trees):
    # Predictions of this single tree only.
    tree_pred = tree.predict(X_test)
    # Fraction of test samples this tree labels correctly.
    accuracy = (tree_pred == y_test).mean()
    print(f"Tree {i+1} Accuracy: {accuracy * 100:.2f}%")


Tree 1 Accuracy: 97.78%
Tree 2 Accuracy: 100.00%
Tree 3 Accuracy: 100.00%
Tree 4 Accuracy: 100.00%
Tree 5 Accuracy: 100.00%
Tree 6 Accuracy: 93.33%
Tree 7 Accuracy: 95.56%
Tree 8 Accuracy: 95.56%
Tree 9 Accuracy: 97.78%
Tree 10 Accuracy: 95.56%
