# Mnist
- 训练集：60000
- 测试集：10000
- 层数： 40

In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
# load data
def load_data(fileName):
    """
    加载数据
    """
    dataArr = []
    labelArr = []
    
    fr = open(fileName, 'r')
    
    for line in tqdm(fr.readlines()):
        curLine = line.strip().split(',')
        
        # 二值化处理
        dataArr.append([int(int(num) > 128) for num in curLine[1: ]])
        
        if int(curLine[0]) == 0:
            labelArr.append(1)
        else:
            labelArr.append(-1)
            
    return dataArr, labelArr

In [15]:
# 

class AdaBoost():
    """
    AdaBoost
    """
    
    def calc_e_Gx(self, trainDataArr, trainLabelArr, n, div, rule, D):
        """
        计算分类错误率
        """
        
        e = 0
        
        x = trainDataArr[:, n]
        y = trainLabelArr
        
        predict = []
        
        if rule == 'LisOne':
            L = 1
            H = -1
        else:
            L = -1
            H = 1
            
        for i in range(trainDataArr.shape[0]):
            if x[i] < div:
                predict.append(L)
                if y[i] != L:
                    e += D[i]
            elif x[i] >= div:
                predict.append(H)
                if y[i] != H:
                    e += D[i]
                    
        return np.array(predict), e
    
    def createSigleBoostingTree(self, trainDataArr, trainLabelArr, D):
        """
        创建单层提升树
        """
        
        m, n = np.shape(trainDataArr)
        
        sigleBoostTree = {}
        
        sigleBoostTree['e'] = 1
        
        for i in range(n):
            for div in [-0.5, 0.5, 1.5]:
                for rule in ['LisOne', 'HisOne']:
                    Gx, e = self.calc_e_Gx(trainDataArr, trainLabelArr, i, div, rule, D)
                    if e < sigleBoostTree['e']:
                        
                        sigleBoostTree['e'] = e
                        sigleBoostTree['div'] = div
                        sigleBoostTree['rule'] = rule
                        sigleBoostTree['Gx'] = Gx
                        sigleBoostTree['feature'] = i
                        
        return sigleBoostTree
    
    
    def createBoostingTree(self, trainDataList, trainLabelList, treeNum = 50):
        """
        创建提升树
        """
        
        trainDataArr = np.array(trainDataList)
        trainLabelArr = np.array(trainLabelList)
        
        finallpredict = [0] * len(trainLabelArr)
        
        m, n = np.shape(trainDataArr)
        
        D = [1 / m] * m
        
        tree = []
        
        for i in range(treeNum):
            curTree = self.createSigleBoostingTree(trainDataArr, trainLabelArr, D)
            alpha = 1/2 * np.log((1 - curTree['e']) / curTree['e'])
            Gx = curTree['Gx']

            D = np.multiply(D, np.exp(-1 * alpha * np.multiply(trainLabelArr, Gx))) / sum(D)
            
            curTree['alpha'] = alpha
            tree.append(curTree)
            
            finallpredict += alpha * Gx
            
            error = sum([1 for i in range(len(trainDataList)) if np.sign(finallpredict[i]) != trainLabelArr[i]])
            
            finallError = error / len(trainDataList)
            
            if finallError == 0:
                return tree
            
            print("iter : %d: %d, sigle error: %.4f, finall error: %.4f" % (i, treeNum, curTree['e'], finallError))
        
        return tree
        
    def predict(self, x, div, rule, feature):
        """
        预测标签
        """
        if rule == 'LisOne':
            L = 1
            H = -1
        else:
            L = -1
            H = 1
            
        if x[feature] < div:
            return L 
        else:
            return H
        
    def model_test(testDataList, testLabelList, tree):
        """
        测试
        """
        error_count = 0
        for i in range(len(testDataList)):
            result = 0
            for curTree in tree:
                div = curTree['div']
                rule = curTree['rule']
                feature = curTree['feature']
                alpha = curTree['alpha']
                result += alpha * self.predict(testDataList[i], div, rule, feature)
            
            if np.sign(result) != testLabelList[i]:
                error_count += 1
        
        return 1 - error_count / len(testDataList)

    

In [5]:
trainDataList, trainLabelList = load_data('./mnist/mnist_train.csv')
testDataList, testLabelList = load_data('./mnist/mnist_test.csv')

100%|███████████████████████████████████| 60000/60000 [00:11<00:00, 5173.02it/s]
100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 5248.14it/s]


In [16]:
clf = AdaBoost()
clf.createBoostingTree(trainDataList[0: 1000], trainLabelList[0: 1000], 40)

iter : 0: 40, sigle error: 0.0780, finall error: 0.0780
iter : 1: 40, sigle error: 0.1289, finall error: 0.0780
iter : 2: 40, sigle error: 0.1737, finall error: 0.0940
iter : 3: 40, sigle error: 0.1825, finall error: 0.0730
iter : 4: 40, sigle error: 0.2495, finall error: 0.0720
iter : 5: 40, sigle error: 0.2099, finall error: 0.0640
iter : 6: 40, sigle error: 0.1967, finall error: 0.0480
iter : 7: 40, sigle error: 0.2306, finall error: 0.0480
iter : 8: 40, sigle error: 0.2498, finall error: 0.0480
iter : 9: 40, sigle error: 0.2383, finall error: 0.0310
iter : 10: 40, sigle error: 0.2282, finall error: 0.0350
iter : 11: 40, sigle error: 0.2508, finall error: 0.0310
iter : 12: 40, sigle error: 0.2006, finall error: 0.0270
iter : 13: 40, sigle error: 0.2596, finall error: 0.0250
iter : 14: 40, sigle error: 0.2529, finall error: 0.0210
iter : 15: 40, sigle error: 0.2574, finall error: 0.0120
iter : 16: 40, sigle error: 0.2984, finall error: 0.0160
iter : 17: 40, sigle error: 0.3060, final

[{'e': 0.07800000000000006,
  'div': 0.5,
  'rule': 'HisOne',
  'Gx': array([-1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,
          1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        

In [None]:
tree = clf.createBoostingTree(trainDataList[0: 1000], trainLabelList[0: 1000], 40)

In [None]:
acc = clf.model_test(testDataList[0: 1000], testLabelList[0: 1000], tree)