In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2
from sklearn.tree import DecisionTreeClassifier

In [23]:
class AdaBoost:
    """Binary AdaBoost (AdaBoost.M1 formulation) over depth-1 decision trees.

    Labels must be encoded as -1 / +1, because ``predict`` takes the sign of
    a weighted sum of the weak learners' raw predictions.

    Attributes
    ----------
    list_of_weak_learners : list
        Fitted ``DecisionTreeClassifier(max_depth=1)`` stumps, in training order.
    learners_weights : list of float
        ``beta_t = epsilon_t / (1 - epsilon_t)`` for each stump, where
        ``epsilon_t`` is its weighted training error. ``predict`` gives stump
        ``t`` a vote of ``log(1 / beta_t)``.
    """

    # Floor applied to the weighted error so a perfect stump gets a large but
    # finite vote instead of log(1 / 0).
    _MIN_ERROR = 1e-10

    def __init__(self) -> None:
        self.list_of_weak_learners = []
        self.learners_weights = []

    def fit(self, X, y, number_of_weak_learners):
        """Train up to ``number_of_weak_learners`` boosted decision stumps.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training inputs.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Labels in {-1, +1}. Column vectors are flattened.
        number_of_weak_learners : int
            Maximum number of boosting rounds.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.

        Notes
        -----
        Training stops early when a stump's weighted error reaches 0.5 (it is
        discarded: it carries no information) or when a stump is perfect (it
        is kept: nothing is left to re-weight). Any previously fitted
        learners are discarded at the start of each call.
        """
        # Flatten so that `pred != y` compares element-wise; an (n, 1) label
        # column would otherwise broadcast into an (n, n) matrix.
        y = np.asarray(y).ravel()
        n = len(X)
        if len(y) != n:
            raise ValueError(
                f"X and y have inconsistent lengths: {n} != {len(y)}"
            )

        # Start from a clean ensemble so repeated fit() calls do not accumulate.
        self.list_of_weak_learners = []
        self.learners_weights = []

        w = np.ones(n) / n
        for _ in range(number_of_weak_learners):
            weak_learner = DecisionTreeClassifier(max_depth=1)
            weak_learner.fit(X, y, sample_weight=w)
            misclassified = np.asarray(weak_learner.predict(X)).ravel() != y
            epsilon = np.sum(w * misclassified)

            # A learner no better than chance would get a non-positive vote.
            if epsilon >= 0.5:
                break

            is_perfect = epsilon < self._MIN_ERROR
            epsilon = max(epsilon, self._MIN_ERROR)
            beta = epsilon / (1 - epsilon)

            self.list_of_weak_learners.append(weak_learner)
            self.learners_weights.append(beta)

            if is_perfect:
                break

            # AdaBoost.M1 re-weighting: shrink the weight of correctly
            # classified samples by beta (< 1), then renormalise so the
            # misclassified ones carry relatively more mass next round.
            w = np.where(misclassified, w, w * beta)
            w /= np.sum(w)

    def predict(self, X):
        """Predict labels as the sign of the weighted vote of all stumps.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Inputs to classify.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Float array holding -1.0 or +1.0 per sample; 0.0 where the vote
            is exactly tied or when no learner has been fitted.
        """
        predictions = np.zeros(len(X))
        for weight, weak_learner in zip(self.learners_weights, self.list_of_weak_learners):
            predictions += np.log(1 / weight) * weak_learner.predict(X)
        return np.sign(predictions)


In [21]:
def generate_hard_data(number_of_sample, number_of_features, random_state=None):
    """Draw standard-normal features with a radially separated binary label.

    Each sample is labelled +1 when its squared Euclidean norm exceeds the
    median of a chi-squared distribution with ``number_of_features`` degrees
    of freedom, and -1 otherwise.

    Parameters
    ----------
    number_of_sample : int
        Number of rows to generate.
    number_of_features : int
        Number of feature columns (also the chi-squared degrees of freedom).
    random_state : None, int, or numpy.random.Generator, optional
        Seed or generator for a reproducible draw. With the default ``None``
        the samples come from NumPy's global random state via
        ``np.random.randn``, so ``np.random.seed`` still governs the output.

    Returns
    -------
    X : numpy.ndarray of shape (number_of_sample, number_of_features)
        Standard-normal features.
    y : numpy.ndarray of shape (number_of_sample, 1)
        Labels in {-1, +1}. Note this is a column vector (``keepdims=True``);
        call ``y.ravel()`` if a flat label array is required.
    """
    if random_state is None:
        X = np.random.randn(number_of_sample, number_of_features)
    else:
        rng = np.random.default_rng(random_state)
        X = rng.standard_normal((number_of_sample, number_of_features))

    median_of_chi2 = chi2.median(number_of_features)
    sum_of_squares = np.sum(X**2, axis=1, keepdims=True)
    y = np.where(sum_of_squares > median_of_chi2, 1, -1)
    return X, y