In [1]:
import numpy as np

In [2]:
class SingleDecisionTree:
    """Decision stump that thresholds a single feature.

    A sample ``x`` is labelled ``1`` when ``x[axis] >= threshold`` and
    ``-1`` otherwise.

    The historical, misspelled method names ``preditct`` / ``preditctArr``
    are kept because existing callers use them; ``predict`` / ``predictArr``
    are correctly spelled aliases for the same methods.
    """

    def __init__(self, axis=0, threshold=0):
        # Index of the feature that is compared against the threshold.
        self.axis = axis
        # Samples whose feature value is >= threshold are labelled 1.
        self.threshold = threshold

    def __repr__(self):
        return '%s(axis=%r, threshold=%r)' % (
            type(self).__name__, self.axis, self.threshold)

    def preditct(self, x):
        """Return 1 if ``x[self.axis] >= self.threshold``, else -1."""
        return 1 if x[self.axis] >= self.threshold else -1

    def preditctArr(self, dataSet):
        """Return a list with the label of every sample in ``dataSet``."""
        # Reuse the single-sample rule so both entry points always agree.
        return [self.preditct(x) for x in dataSet]

    # Correctly spelled aliases of the two methods above.
    predict = preditct
    predictArr = preditctArr

class AdaBoost:
    """AdaBoost binary classifier built from single-feature decision stumps.

    Labels must be ``1`` / ``-1``. Each boosting round picks the stump
    (``SingleDecisionTree``) with the lowest weighted training error, gives
    it a vote weight ``alpha`` and re-weights the samples so that the
    misclassified ones count more in the next round.

    Parameters
    ----------
    epsilon : training stops once the ensemble's training error rate is
        ``<= epsilon``.
    max_rounds : upper bound on the number of boosting rounds, so that
        ``fit`` always terminates.
    step : spacing between candidate thresholds tried on every feature.
    """

    # Keeps the weighted error away from 0 and 1 so that alpha stays finite
    # (an error of exactly 0 would otherwise give log(1 / 0)).
    _ERROR_CLIP = 1e-10

    def __init__(self, epsilon=0.01, max_rounds=100, step=0.5):
        self.epsilon = epsilon
        self.max_rounds = max_rounds
        self.step = step
        # List of (alpha, stump) pairs, filled by fit().
        self.funList = []

    def fit(self, dataSet, labels):
        """Train the ensemble on ``dataSet`` (2-D, one row per sample).

        Raises ValueError when the input is empty / not 2-D, when the
        number of labels does not match the number of samples, or when no
        candidate stump exists.
        """
        X = np.asarray(dataSet, dtype=float)
        y = np.asarray(labels).ravel()
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError('dataSet must be a non-empty 2-D array-like')
        if y.shape[0] != X.shape[0]:
            raise ValueError('labels must contain one entry per sample')

        n_samples = X.shape[0]
        self.funList = []
        # Start from a uniform weight distribution over the samples.
        weights = np.full(n_samples, 1.0 / n_samples)
        # Running weighted vote of the ensemble on the training samples.
        votes = np.zeros(n_samples)

        for _ in range(self.max_rounds):
            # The search starts from scratch every round: the best stump
            # depends on the current weights, so a minimum carried over
            # from an earlier round would be stale.
            error, tree = self._best_stump(X, y, weights)
            if tree is None:
                raise ValueError(
                    'no candidate threshold found (all features constant?)')

            clipped = float(
                np.clip(error, self._ERROR_CLIP, 1.0 - self._ERROR_CLIP))
            alpha = 0.5 * float(np.log((1.0 - clipped) / clipped))
            self.funList.append((alpha, tree))

            predicted = np.asarray(tree.preditctArr(X))
            votes += alpha * predicted
            # Same decision rule as predict(): positive vote -> 1, else -1.
            train_error = np.mean(np.where(votes > 0, 1, -1) != y)
            # alpha == 0 means the best stump is no better than chance; the
            # weights would not change, so further rounds cannot help.
            if train_error <= self.epsilon or alpha == 0.0:
                break

            # Increase the weight of misclassified samples, then renormalise.
            weights = weights * np.exp(-alpha * y * predicted)
            weights /= weights.sum()

    def _best_stump(self, X, y, weights):
        """Return ``(error, stump)`` with the lowest weighted error.

        ``stump`` is None when there is no candidate threshold at all.
        """
        best_error = np.inf
        best_tree = None
        for axis in range(X.shape[1]):
            column = X[:, axis]
            # np.arange is half-open: thresholds cover [min, max).
            for threshold in np.arange(column.min(), column.max(), self.step):
                tree = SingleDecisionTree(axis=axis, threshold=threshold)
                error = self.__calc_em(weights, tree, X, y)
                if error < best_error:
                    best_error = error
                    best_tree = tree
        return best_error, best_tree

    def score(self, X_test, y_test):
        """Print and return the fraction of correctly classified samples."""
        total = len(y_test)
        if total == 0:
            raise ValueError('cannot score an empty test set')
        correct = 0
        for X, y in zip(X_test, y_test):
            if self.predict(X) == y:
                correct += 1
        accuracy = correct / total
        print('right rate: %.3f' % accuracy)
        return accuracy

    def predict(self, x):
        """Return 1 if the alpha-weighted stump vote for ``x`` is > 0, else -1."""
        vote = 0
        for alpha, tree in self.funList:
            vote += alpha * tree.preditct(x)
        return 1 if vote > 0 else -1

    def __calc_em(self, D, Gm, dataSet, labels):
        """Return the weighted error: sum of weights ``D`` of samples ``Gm`` misclassifies."""
        predicted = np.asarray(Gm.preditctArr(dataSet))
        missed = predicted != np.asarray(labels).ravel()
        return float(np.dot(np.asarray(D, dtype=float).ravel(), missed))


In [3]:
from sklearn.datasets import load_digits
from sklearn.model_selection import  train_test_split

# Load the two-class subset of the scikit-learn digits data set.
data = load_digits(n_class=2)
X_data = data.data
y_data = data.target
# Relabel class 0 as -1 (in place); the classifier compares its
# predictions of 1 / -1 directly against these labels.
inds = np.flatnonzero(y_data == 0)
y_data[inds] = -1

# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, random_state=42, test_size=0.3)

# Train on the training split and report accuracy on the held-out split.
adaboost = AdaBoost()
adaboost.fit(X_train, y_train)
adaboost.score(X_test, y_test)

right rate: 1.000
