In [64]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd

In [65]:
class TNAdaboost:
    """Binary AdaBoost classifier built from depth-1 decision trees (stumps).

    Labels must be encoded as -1 / +1: ``predict`` thresholds the
    alpha-weighted sum of the stumps' raw predictions, which is only
    meaningful for that encoding.

    Attributes (reset and repopulated by every call to ``fit``):
        n_stumps: number of boosting rounds requested (``None`` before fit).
        stumps:   fitted ``DecisionTreeClassifier`` stumps, one per round.
        alphas:   vote weight of each stump.
        errors:   weighted training error of each stump.
    """

    def __init__(self):
        self.n_stumps = None
        self.stumps = []
        self.alphas = []
        self.errors = []

    @staticmethod
    def computeError(y, y_pred, w):
        """Return the weighted misclassification rate of ``y_pred`` against ``y``.

        The result is the total weight of the misclassified samples divided
        by the total weight, so ``w`` does not have to be normalized.
        """
        missed = np.not_equal(y, y_pred).astype(int)
        return np.sum(w * missed) / np.sum(w)

    @staticmethod
    def computeAlpha(error, epsilon=0.01):
        """Return the vote weight ``log((1 - error + epsilon) / (error + epsilon))``.

        ``epsilon`` keeps the ratio finite and non-zero when ``error`` is
        exactly 0 or 1.
        """
        return np.log((1 - error + epsilon) / (error + epsilon))

    @staticmethod
    def updateWeight(w, alpha, y, y_pred):
        """Re-weight the samples after a boosting round and renormalize.

        Misclassified samples are multiplied by ``exp(alpha)``, correctly
        classified ones by ``exp(-alpha)``; the result is scaled to sum to 1.
        """
        # +1 where the stump was wrong, -1 where it was right.
        direction = np.not_equal(y, y_pred).astype(int) * 2 - 1
        w = w * np.exp(alpha * direction)
        return w / np.sum(w)

    def fit(self, X, y, M=100):
        """Train ``M`` stumps sequentially on ``(X, y)``.

        Any previously fitted stumps, alphas and errors are discarded.

        Args:
            X: training features, passed as-is to ``DecisionTreeClassifier.fit``.
            y: training labels; every value must be -1 or +1.
            M: number of boosting rounds (stumps) to train.

        Raises:
            ValueError: if ``y`` contains a label other than -1 or +1.
        """
        y = np.asarray(y)
        # Any other encoding (e.g. 0/1) would train without complaint but make
        # the sign-based vote in ``predict`` meaningless, so fail early.
        if not np.all((y == -1) | (y == 1)):
            raise ValueError("TNAdaboost expects labels encoded as -1 / +1")

        self.stumps = []
        self.alphas = []
        self.errors = []
        self.n_stumps = M

        # The first round uses uniform weights; later rounds use the weights
        # produced from the previous round's mistakes.
        w_m = np.ones(len(y)) / len(y)
        for _ in range(self.n_stumps):
            stump = DecisionTreeClassifier(max_depth=1)
            stump.fit(X, y, sample_weight=w_m)
            y_pred = stump.predict(X)

            error_m = TNAdaboost.computeError(y, y_pred, w_m)
            alpha_m = TNAdaboost.computeAlpha(error_m)

            self.stumps.append(stump)
            self.errors.append(error_m)
            self.alphas.append(alpha_m)

            w_m = TNAdaboost.updateWeight(w_m, alpha_m, y, y_pred)

    def predict(self, X):
        """Predict a -1 / +1 label for every row of ``X``.

        Each stump votes with its own prediction scaled by its alpha; the
        label is +1 when the summed vote is strictly positive and -1
        otherwise.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.n_stumps is None:
            raise RuntimeError("TNAdaboost.predict called before fit")

        votes = np.zeros((len(X), len(self.stumps)))
        for col, (stump, alpha) in enumerate(zip(self.stumps, self.alphas)):
            votes[:, col] = stump.predict(X) * alpha

        # np.sign would return 0 -- not a valid label -- on an exact tie, so
        # threshold explicitly and send ties to -1.
        return np.where(votes.sum(axis=1) > 0, 1, -1).astype(int)

### Load dataset

In [66]:
# Raw samples: the data file is read without a header row.
df = pd.read_csv('spambase/spambase.data', header=None)

# Feature names: skip the first 33 lines of the .names file and split the
# remaining lines on ':'; the text before the colon becomes the column name.
names = pd.read_csv(
    'spambase/spambase.names',
    sep=':',
    skiprows=range(33),
    header=None,
)

# The data file has one more column than the .names listing provides; it is
# labelled 'Spam' and used as the target further down.
col_names = [*names[0], 'Spam']
df.columns = col_names

### Encode labels as -1/+1 and split into train and test sets

In [67]:
# Re-encode the target as -1/+1, which is what TNAdaboost's sign-based vote
# needs. np.where makes the cell idempotent: re-running it leaves labels that
# are already -1/+1 untouched, whereas `* 2 - 1` would turn them into -3/+1.
# NOTE(review): assumes the raw 'Spam' column is 0/1 -- confirm against the data.
df['Spam'] = np.where(df['Spam'] > 0, 1, -1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Spam').values,
    df['Spam'].values,
    test_size=0.2,
    random_state=2,
)

### Using the user-defined AdaBoost (`TNAdaboost`)

In [68]:
# Train the hand-written booster with its default number of stumps.
ad_clf = TNAdaboost()
ad_clf.fit(X_train, y_train)

In [69]:
# Score the hand-written booster on the held-out split.
# accuracy_score's signature is (y_true, y_pred); the value is the same either
# way, but the documented order keeps the call unambiguous for readers.
y_pred = ad_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9229098805646037

### Using scikit-learn's AdaBoost implementation

In [70]:
# Baseline: scikit-learn's AdaBoost with default settings on the same split.
# NOTE(review): no random_state is passed, so the score may vary slightly
# between runs -- consider fixing a seed if exact reproducibility matters.
sklearn_ad_clf = AdaBoostClassifier()
sklearn_ad_clf.fit(X_train, y_train)

# accuracy_score's signature is (y_true, y_pred).
y_pred = sklearn_ad_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9435396308360477