In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
                            f1_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.base import ClassifierMixin, BaseEstimator
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# uncluttered, but it also hides warnings that might flag a real problem.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def generate_data(n_samples=10000, n_features=1):
    """Generate a synthetic two-class classification dataset.

    Thin wrapper around ``sklearn.datasets.make_classification`` with a fixed
    seed (``random_state=42``), two informative features, no redundant
    features and one cluster per class.

    Parameters
    ----------
    n_samples : int, default=10000
        Number of rows to generate.
    n_features : int, default=1
        Requested total number of feature columns. Two informative features
        are always generated, so any value below 2 is raised to 2. Columns
        beyond the first two informative ones are filled by
        ``make_classification`` according to its own rules.

    Returns
    -------
    X : ndarray
        Feature matrix returned by ``make_classification``.
    y : ndarray
        Integer class labels returned by ``make_classification``.
    """
    n_informative = 2
    X, y = make_classification(
        # Previously both arguments were ignored and sklearn's default of
        # 100 samples was used regardless of what the caller asked for.
        n_samples=n_samples,
        # make_classification rejects n_features < n_informative, so clamp.
        n_features=max(n_features, n_informative),
        n_redundant=0,
        n_informative=n_informative,
        n_clusters_per_class=1,
        n_classes=2,
        random_state=42,
    )
    return X, y

In [None]:
def plot_dataset(X, y):
    """Show a scatter plot of the first two columns of ``X`` coloured by ``y``.

    Parameters
    ----------
    X : array supporting ``X[:, 0]`` / ``X[:, 1]`` indexing
        Points to draw; column 0 is the horizontal axis, column 1 the vertical.
    y : array-like
        Per-point values mapped to colours through the 'Spectral' colormap.
    """
    plt.figure(figsize=(10, 6))
    horizontal, vertical = X[:, 0], X[:, 1]
    plt.scatter(horizontal, vertical, c=y, cmap='Spectral')
    plt.show()

In [None]:
# Build the synthetic dataset with the helper's default arguments and take a
# first look at how the two classes are laid out.
X, y = generate_data()
plot_dataset(X, y)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Visual check of the training split.
plot_dataset(X_train, y_train)

In [None]:
# Visual check of the held-out test split.
plot_dataset(X_test, y_test)

In [None]:
#from sklearn.inspection import DecisionBoundaryDisplay
def get_grid(data, step=0.01, margin=1):
    """Build a dense 2-D coordinate grid covering the first two columns of ``data``.

    The grid spans ``[min - margin, max + margin)`` along each of the two
    columns, sampled every ``step`` units.

    Parameters
    ----------
    data : array-like with at least two columns
        Points whose extent defines the grid; column 0 is treated as the
        x axis and column 1 as the y axis. Converted with ``np.asarray`` so
        lists and DataFrames are accepted as well as ndarrays.
    step : float, default=0.01
        Spacing between neighbouring grid nodes. Must be positive. The number
        of nodes grows with ``1 / step**2``, so coarsen it for wide data.
    margin : float, default=1
        Padding added on every side of the data's bounding box.

    Returns
    -------
    The result of ``np.meshgrid``: two arrays ``(xx, yy)`` of identical shape
    ``(n_y, n_x)`` holding the x and y coordinate of every grid node.

    Raises
    ------
    ValueError
        If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")

    points = np.asarray(data)
    x_min, x_max = points[:, 0].min() - margin, points[:, 0].max() + margin
    y_min, y_max = points[:, 1].min() - margin, points[:, 1].max() + margin
    return np.meshgrid(np.arange(x_min, x_max, step),
                       np.arange(y_min, y_max, step))

In [None]:
def plot_prediction(X, y, model):
    """Draw the model's predicted regions with the points of ``X`` on top.

    A grid built by ``get_grid(X)`` is classified with ``model.predict`` and
    drawn as a coloured mesh; the points of ``X`` are then scattered over it,
    coloured by ``y`` and outlined in black.

    Parameters
    ----------
    X : array supporting ``X[:, 0]`` / ``X[:, 1]`` indexing
        Points to overlay; also defines the extent of the grid.
    y : array-like
        Per-point values used to colour the scatter.
    model : object with a ``predict`` method
        Called once on a two-column array of all grid coordinates.
    """
    grid_x, grid_y = get_grid(X)

    # Flatten both coordinate arrays into one (n_points, 2) matrix, predict,
    # then fold the labels back into the grid's shape for pcolormesh.
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    grid_labels = model.predict(grid_points).reshape(grid_x.shape)

    plt.figure(figsize=(10, 6))
    plt.pcolormesh(grid_x, grid_y, grid_labels, cmap='Spectral')
    plt.scatter(X[:, 0], X[:, 1], c=y, s=100, cmap='Spectral', edgecolors='k')
    plt.show()

In [None]:
def print_metrics(true, predict):
    """Print a summary of classification quality for one set of predictions.

    Prints accuracy, precision, recall and F1 (three decimals each), followed
    by scikit-learn's classification report and the confusion matrix.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predict : array-like
        Predicted labels, aligned with ``true``.

    Notes
    -----
    Every metric is computed against ``true``. Earlier versions read the
    notebook-level ``y_test`` for most metrics, which gave wrong numbers (or
    an error) whenever the function was called with any other label set.
    Precision, recall and F1 are called with scikit-learn's default arguments.
    """
    acc = accuracy_score(true, predict)
    prec = precision_score(true, predict)
    recall = recall_score(true, predict)
    f1 = f1_score(true, predict)
    print(f'Results:\naccuracy:   {acc:.3f}\nprecision: {prec:.3f}\nrecall:  {recall:.3f}\nf1:  {f1:.3f}')
    print(classification_report(true, predict))
    print(confusion_matrix(true, predict))

In [None]:
# Mixins go to the left of BaseEstimator, as the scikit-learn developer guide
# recommends, so the mixin's behaviour takes precedence in the MRO.
class MyRFClassifier(ClassifierMixin, BaseEstimator):
    """Bagged ensemble of decision trees with majority-vote prediction.

    Every tree is a ``DecisionTreeClassifier`` trained on a bootstrap sample
    (rows drawn with replacement, same size as the training set). Trees are
    created with only ``min_samples_split``, ``max_depth`` and a seed, so all
    other tree options keep their scikit-learn defaults.

    Parameters
    ----------
    num_trees : int, default=25
        Number of trees to train.
    min_samples_split : int, default=2
        Forwarded to each ``DecisionTreeClassifier``.
    max_depth : int, default=5
        Forwarded to each ``DecisionTreeClassifier``.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed (or generator) for the bootstrap draws and the per-tree seeds.
        ``None`` uses NumPy's global random state, exactly as before this
        parameter existed, so ``np.random.seed`` still controls the result.

    Attributes
    ----------
    decision_trees : list
        Fitted trees; empty until ``fit`` is called.
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    """

    def __init__(self, num_trees=25, min_samples_split=2, max_depth=5,
                 random_state=None):
        self.num_trees = num_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.random_state = random_state
        # Present before fit so existing code can inspect the attribute.
        self.decision_trees = []

    def _make_rng(self):
        """Return the random source implied by ``random_state``."""
        seed = self.random_state
        if seed is None:
            return np.random  # global state: preserves the historical behaviour
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def fit(self, X, y):
        """Train ``num_trees`` trees, each on its own bootstrap sample.

        Any trees from a previous ``fit`` call are discarded.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels of any sortable dtype.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        # Encode labels as 0..n_classes-1 so voting does not depend on the
        # labels being small non-negative integers.
        self.classes_, y_encoded = np.unique(np.asarray(y).ravel(),
                                             return_inverse=True)

        rng = self._make_rng()
        seeded = self.random_state is not None
        trees = []
        for _ in range(self.num_trees):
            # Only hand trees an explicit seed when the caller asked for one;
            # otherwise they keep drawing from the global state as before.
            tree_seed = rng.randint(np.iinfo(np.int32).max) if seeded else None
            clf = DecisionTreeClassifier(
                min_samples_split=self.min_samples_split,
                max_depth=self.max_depth,
                random_state=tree_seed,
            )
            X_b, y_b = self.boostrap(X, y_encoded, rng)
            clf.fit(X_b, y_b)
            trees.append(clf)

        # Replace rather than extend: refitting must not accumulate old trees.
        self.decision_trees = trees
        return self

    @staticmethod
    def boostrap(X, y, rng=None):
        """Draw a bootstrap sample: ``len(X)`` rows chosen with replacement.

        Parameters
        ----------
        X, y : ndarray
            Arrays indexed along their first axis with the same row indices.
        rng : object with a ``choice`` method, optional
            Random source; defaults to NumPy's global random state.

        Returns
        -------
        (X_sample, y_sample) : tuple of ndarray
        """
        sampler = np.random if rng is None else rng
        n_rows = X.shape[0]
        samples = sampler.choice(a=n_rows, size=n_rows, replace=True)
        return X[samples], y[samples]

    def predict(self, X):
        """Predict labels by majority vote over all trees.

        Ties go to the class that sorts first in ``classes_``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``classes_``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if not self.decision_trees:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This MyRFClassifier instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        X = np.asarray(X)
        # votes[t, i] is tree t's encoded label for sample i.
        votes = np.stack([tree.predict(X) for tree in self.decision_trees])
        votes = votes.astype(np.intp)
        # counts[k, i] is how many trees voted class k for sample i.
        counts = np.stack([(votes == k).sum(axis=0)
                           for k in range(len(self.classes_))])
        return self.classes_[counts.argmax(axis=0)]

In [None]:
# MyRFClassifier draws its bootstrap samples from NumPy's global random state,
# so seed it here; otherwise the metrics below change on every run.
np.random.seed(42)

# Train the hand-written forest and score it on the held-out split.
model = MyRFClassifier()
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Prediction regions of the model currently bound to `model`, with the test
# points drawn on top.
plot_prediction(X_test, y_test, model)

In [None]:
# Reference model: scikit-learn's random forest on the same split.
# A fixed random_state makes the forest, and so the metrics, reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
predict = model.predict(X_test)
print_metrics(y_test, predict)

In [None]:
# Prediction regions of the model currently bound to `model`, with the test
# points drawn on top.
plot_prediction(X_test, y_test, model)