In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_gaussian_quantiles
from sklearn.model_selection import train_test_split

In [2]:
MAX_DEPTH = 1

In [3]:
X1, y1 = make_gaussian_quantiles(cov=2.,
                                 n_samples=200, n_features=2,
                                 n_classes=2, random_state=1)
X2, y2 = make_gaussian_quantiles(mean=(3, 3), cov=1.5,
                                 n_samples=300, n_features=2,
                                 n_classes=2, random_state=1)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, - y2 + 1))
y[y == 0] = -1

X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
def sample(M, p):
    
    cum_p = np.cumsum(p)
    cum_p = np.insert(cum_p, 0, 0)
    
    out = []
    for j in range(M):
        number = np.random.uniform()
        for i in range(len(p)):
            if cum_p[i] <= number < cum_p[i+1]:
                out.append(i)
                
    return out

In [5]:
class MyAdaBoostClassifier():
    
    def __init__(self, n_estimators):
        
        self.n_estimators = n_estimators
        self.base_estimator = DecisionTreeClassifier(random_state=0, 
                                                     max_depth=1)

    def fit(self, X, y):
        
        self.X_tr = X
        self.y_tr = y
        
        N = len(y)
        
        self.h     = np.empty([self.n_estimators, N])
        self.D     = np.empty([self.n_estimators + 1, N])
        self.alpha = np.empty(self.n_estimators)
        epsilon    = np.empty(self.n_estimators)
        Z          = np.empty(self.n_estimators)
        
        self.D[0] = (1 / N) * np.ones(N)

        for t in range(self.n_estimators):

            ### step 1 ###
            idx = sample(N, self.D[t])
            self.base_estimator.fit(X[idx], y[idx])
            self.h[t] = self.base_estimator.predict(X)

            ### step 2 ###
            idx = np.where(self.h[t] != y)
            epsilon[t] = np.sum(self.D[t][idx])

            ### step 3 ###
            self.alpha[t] = 0.5 * np.log( (1 - epsilon[t]) / epsilon[t] )

            ### step 4 ###
            Z[t] = 2 * np.sqrt( (1 - epsilon[t]) * epsilon[t] )
            self.D[t + 1] = ((self.D[t] / Z[t]) *
                             np.exp(-self.alpha[t] * np.multiply(self.h[t], y)))
            
    def predict(self, X):

        N = len(X)

        F = np.empty(X.shape[0])
        for t in range(self.n_estimators):

            idx = sample(N, self.D[t])
            self.base_estimator.fit(self.X_tr[idx], self.y_tr[idx])

            F += np.multiply(self.alpha[t], self.base_estimator.predict(X))

        return np.sign(F)

In [6]:
title = f'Testing accuracies'
print(title)
print('=' * len(title))
print()

clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X_tr, y_tr)

predicts = clf.predict(X_test)

unique, counts = np.unique(predicts == y_test, return_counts=True)
print(f'Accuracy of sklearn.ensemble.AdaBoostClassifier is '\
      f'{counts[1] / np.sum(counts):.2f}')

clf = DecisionTreeClassifier(max_depth=MAX_DEPTH)
clf.fit(X_tr, y_tr)

predicts = clf.predict(X_test)

unique, counts = np.unique(predicts == y_test, return_counts=True)
print(f'Accuracy of sklearn.tree.DecisionTreeClassifier is '\
      f'{counts[1] / np.sum(counts):.2f}')

clf = MyAdaBoostClassifier(n_estimators=100)
clf.fit(X_tr, y_tr)

predicts = clf.predict(X_test)

unique, counts = np.unique(predicts == y_test, return_counts=True)
print(f'Accuracy of MyAdaBoostClassifier is {counts[1] / np.sum(counts):.2f}')

Testing accuracies

Accuracy of sklearn.ensemble.AdaBoostClassifier is 0.81
Accuracy of sklearn.tree.DecisionTreeClassifier is 0.55
Accuracy of MyAdaBoostClassifier is 0.64
