In [1]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

## Class Definition of Random Forest Classifier

1. max_features: The maximum number of features to be considered at each split in the decision tree.
2. n_estimators: Number of Decision Tree estimators to bag.
3. sample_size: Number of samples to bootstrap

In [2]:
class RandomForestClassifier:
    def __init__(self, max_features, n_estimators, sample_size):
        self.max_features = max_features
        self.n_estimators = n_estimators
        self.sample_size = sample_size
        
    def fit(self, X, y):
        
        # Placeholder for all decision tree estimators
        self.decision_tree_estimators = [None for i in range(self.n_estimators)]
        
        n_rows = X.shape[0]
        
        # Train different independent decision tree classifiers
        for i in range(self.n_estimators):
            indices_sampled = np.random.choice(range(n_rows), self.sample_size)
            X_sampled, y_sampled = X[indices_sampled, :], y[indices_sampled]
            
            decision_tree_estimator = DecisionTreeClassifier(max_features=self.max_features)
            decision_tree_estimator.fit(X_sampled, y_sampled)
            
            self.decision_tree_estimators[i] = decision_tree_estimator
    
    def predict(self, X):
        
        preds = np.zeros((X.shape[0], self.n_estimators))
        
        for i in range(self.n_estimators):
            preds[:, i] = self.decision_tree_estimators[i].predict(X)
        
        # Return the most frequently predicted class across all estimators (mode)
        mode_object = stats.mode(preds, axis=1)
        return mode_object[0]

In [3]:
model = RandomForestClassifier(max_features=15, n_estimators=100, sample_size=700)

Now let's make a classification dataset with 5 classes, with 20 features (15 informative, 5 redundant) and 1000 samples.

In [4]:
X, y = make_classification(n_classes=5, n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)

Let us take a split of 33 percent for the test set.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
model.fit(X_train, y_train)

In [7]:
y_pred = model.predict(X_test)

The Random Forest classifier achieves an accuracy of 66 percent.

In [8]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6666666666666666

Now, let's train a single Decision Tree Classifier and compare the accuracy with that of Random Forest.

In [9]:
model = DecisionTreeClassifier()

In [10]:
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
y_pred = model.predict(X_test)

The accuracy score 42 percent (lower than Random Forest).

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.42424242424242425