In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

class RandomForest:
    """
    Bagged ensemble of decision trees, each grown on a random feature subset.

    Every tree is trained on its own bootstrap sample of the training rows
    and on one randomly drawn subset of the columns. The subset is chosen
    once per tree (not re-drawn at every split). Predictions average the
    per-tree class probabilities and pick the most probable class.
    """

    def __init__(self, num_trees=100, max_depth=None, max_features='sqrt', random_seed=None):
        """
        Initializes a Random Forest classifier.

        Parameters:
        - num_trees: Number of decision trees in the forest (must be >= 1)
        - max_depth: Maximum depth allowed for each decision tree
        - max_features: How many features each tree is trained on:
                       'sqrt' - square root of total features (at least 1)
                       int - exact number of features
                       None - all features
        - random_seed: Seed for random number generation (for reproducibility)
        """
        self.num_trees = num_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_seed = random_seed
        self.decision_trees = []  # List to store all decision trees
        self.selected_features = []  # Features used for each corresponding tree
        self.classes_ = None  # Sorted unique training labels, set by fit()

    @staticmethod
    def _as_2d(features):
        """
        Returns `features` as a 2-D matrix, raising ValueError otherwise.

        Objects that already expose `.shape` are passed through untouched;
        anything else (e.g. nested lists) is converted with np.asarray.
        """
        if not hasattr(features, "shape"):
            features = np.asarray(features)
        if len(features.shape) != 2:
            raise ValueError(
                "features must be 2-dimensional (num_samples x num_features); "
                f"got shape {tuple(features.shape)}"
            )
        return features

    def _features_per_tree(self, num_features):
        """
        Resolves `max_features` into a concrete column count for one tree.

        Raises:
        - ValueError: if `max_features` is not 'sqrt', an integer or None,
                      or if the resolved count is outside [1, num_features]
        """
        setting = self.max_features
        if setting is None:
            count = num_features
        elif isinstance(setting, str):
            if setting != 'sqrt':
                raise ValueError(
                    f"Unsupported max_features {setting!r}; "
                    "expected 'sqrt', an int or None"
                )
            # max(1, ...) keeps at least one column for very narrow inputs
            count = max(1, int(np.sqrt(num_features)))
        elif isinstance(setting, (int, np.integer)):
            # np.integer is accepted so NumPy ints are not silently ignored
            count = int(setting)
        else:
            raise ValueError(
                f"Unsupported max_features {setting!r}; "
                "expected 'sqrt', an int or None"
            )

        if not 1 <= count <= num_features:
            raise ValueError(
                f"max_features resolves to {count}, but it must lie between "
                f"1 and the number of features ({num_features})"
            )
        return count

    def fit(self, features, targets):
        """
        Trains the Random Forest model on the provided dataset.

        Parameters:
        - features: Input feature matrix (num_samples × num_features)
        - targets: Target values (num_samples,)

        Returns:
        - self: The fitted forest (allows call chaining)

        Raises:
        - ValueError: on malformed inputs or invalid hyper-parameters
        """
        features = self._as_2d(features)
        num_samples, num_features = features.shape

        targets = np.asarray(targets)
        if targets.ndim == 2 and targets.shape[1] == 1:
            targets = targets.ravel()  # accept a single-column target matrix
        if targets.ndim != 1 or targets.shape[0] != num_samples:
            raise ValueError(
                "targets must be 1-dimensional with one entry per sample; "
                f"got shape {targets.shape} for {num_samples} samples"
            )
        if num_samples == 0 or num_features == 0:
            raise ValueError("Cannot fit a forest on an empty dataset")
        if self.num_trees < 1:
            raise ValueError(f"num_trees must be at least 1; got {self.num_trees}")

        features_per_tree = self._features_per_tree(num_features)

        # One private generator drives every random draw. Passing the raw
        # seed to each call instead would hand every tree the *same*
        # bootstrap sample, and seeding np.random would clobber global state.
        rng = np.random.RandomState(self.random_seed)
        max_tree_seed = np.iinfo(np.int32).max

        trees = []
        feature_subsets = []
        for _ in range(self.num_trees):
            # Create bootstrap sample (with replacement); the shared
            # generator advances, so each tree sees a different sample
            bootstrapped_features, bootstrapped_targets = resample(
                features, targets, random_state=rng
            )

            # Randomly select the subset of features for this tree
            feature_subset = rng.choice(
                num_features, features_per_tree, replace=False
            )

            # Train a decision tree on the bootstrap sample
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                random_state=int(rng.randint(max_tree_seed))
            )
            tree.fit(bootstrapped_features[:, feature_subset], bootstrapped_targets)

            trees.append(tree)
            feature_subsets.append(feature_subset)

        # Publish the new model only once every tree trained successfully
        self.classes_ = np.unique(targets)
        self.decision_trees = trees
        self.selected_features = feature_subsets
        return self

    def predict_proba(self, features):
        """
        Predicts class probabilities by averaging predictions from all trees.

        Parameters:
        - features: Input feature matrix (num_samples × num_features)

        Returns:
        - probabilities: Array (num_samples × num_classes) with class
                         probabilities; columns follow `self.classes_`

        Raises:
        - ValueError: if the forest has not been fitted or `features`
                      is not 2-dimensional
        """
        if not self.decision_trees:
            raise ValueError(
                "RandomForest is not fitted yet; call fit() before predicting"
            )
        features = self._as_2d(features)

        num_samples = features.shape[0]
        summed_probabilities = np.zeros((num_samples, len(self.classes_)))

        for tree, features_used in zip(self.decision_trees, self.selected_features):
            # Use only the features this particular tree was trained on
            tree_probabilities = tree.predict_proba(features[:, features_used])

            # A bootstrap sample can miss a class entirely, so a tree may
            # report fewer columns; place them by label, not by position
            columns = np.searchsorted(self.classes_, tree.classes_)
            summed_probabilities[:, columns] += tree_probabilities

        return summed_probabilities / len(self.decision_trees)

    def predict(self, features):
        """
        Predicts class labels by picking the class with the highest
        probability averaged over all trees (soft voting).

        Parameters:
        - features: Input feature matrix (num_samples × num_features)

        Returns:
        - predictions: Array (num_samples,) with predicted class labels
        """
        probabilities = self.predict_proba(features)
        # Map column indices back to the original labels so non 0..k-1
        # label sets (e.g. {2, 5} or strings) are returned correctly
        return self.classes_[np.argmax(probabilities, axis=1)]

# Example run: fit the forest on iris and report its hold-out accuracy
if __name__ == "__main__":
    # Fetch the iris measurements together with their species labels
    iris_data = load_iris()
    X = iris_data.data
    y = iris_data.target

    # Keep 20% of the rows aside for evaluation (fixed seed, repeatable split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Build a forest of 100 depth-3 trees and fit it on the training rows
    forest = RandomForest(random_seed=42, max_depth=3, num_trees=100)
    forest.fit(X_train, y_train)

    # Label the held-out rows and compare against the true species
    predictions = forest.predict(X_test)
    model_accuracy = accuracy_score(y_test, predictions)

    # Report the score with four decimal places
    print(f"Random Forest Accuracy: {model_accuracy:.4f}")

Random Forest Accuracy: 0.9667
