# 导入模型

In [1]:
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import seaborn as sns
sns.set_style('white')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# 导入数据

In [3]:
# import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancers = load_breast_cancer()
x = cancers.data[:,:]     #获取特征值
y = cancers.target     #获取标签
print(x.shape)         #查看特征形状
print(y.shape)         #查看标签形状
print(x)
print(y,np.sum(y))

(569, 30)
(569,)
[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0

# 使用sklearn的AdaBoostClassifier

# 决策树作为基分类器

In [57]:
boost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 1, max_leaf_nodes=2), algorithm='SAMME', n_estimators=10, learning_rate=0.8)
boost.fit(x,y)
print(boost.score(x,y))
prediction = cross_val_predict(boost, x, y.astype('int'), cv = 10)

0.9806678383128296


# 计算评价指标

In [21]:
acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

0.945518453427065 | 0.9527777777777777 | 0.9607843137254902 | 0.9567642956764296


# SVC作为基分类器

In [22]:
boost = AdaBoostClassifier(algorithm='SAMME', base_estimator = SVC(kernel='rbf',C=0.1,random_state=32,max_iter = 5000), n_estimators=10, learning_rate=0.8)
boost.fit(x,y)
boost.score(x,y)
prediction = cross_val_predict(boost, x, y.astype('int'), cv = 10)
acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

0.6274165202108963 | 0.6274165202108963 | 1.0 | 0.7710583153347732


In [55]:
boost = AdaBoostClassifier(algorithm='SAMME', base_estimator = LinearSVC(C=0.1,random_state=32,max_iter = 5000), n_estimators=10, learning_rate=0.8)
boost.fit(x,y)
boost.score(x,y)
prediction = cross_val_predict(boost, x, y.astype('int'), cv = 10)
acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)







0.9367311072056239 | 0.9373297002724795 | 0.9635854341736695 | 0.9502762430939227




###### AdaBoostClassifier

基学习器|精度|查准率|查全率|F1
-|-|-|-|-
DecisionTree | 0.945518453427065 | 0.9527777777777777 | 0.9607843137254902 | 0.9567642956764296
SVC(rbf) | 0.6274165202108963 | 0.6274165202108963 | 1.0 | 0.7710583153347732
LinearSVC | 0.9367311072056239 | 0.9373297002724795 | 0.9635854341736695 | 0.9502762430939227

# 实现MyAdaBoost

In [4]:
class MyAdaBoost(object):
    def fit(self, X, y, base_estimator = DecisionTreeClassifier(max_depth = 1, max_leaf_nodes=2), M = 10, learning_rate = 1):
        #Initialization of utility variables
        N = len(y)
        self.n_estimators = M
        estimator_list, y_predict_list, estimator_error_list, estimator_weight_list, sample_weight_list = [], [],[],[],[]

        #Initialize the sample weights
        sample_weight = np.ones(N) / N
        sample_weight_list.append(sample_weight.copy())

        #For m = 1 to M
        for m in range(M):   
            #Fit a classifier
            estimator = base_estimator
            estimator.fit(X, y, sample_weight=sample_weight)
            y_predict = estimator.predict(X)

            #Misclassifications
            incorrect = (y_predict != y)

            #Estimator error
            estimator_error = np.mean( np.average(incorrect, weights=sample_weight, axis=0))

            #Boost estimator weights
            estimator_weight =  learning_rate * np.log((1. - estimator_error) / estimator_error)

            #Boost sample weights
            sample_weight *= np.exp(estimator_weight * incorrect * ((sample_weight > 0) | (estimator_weight < 0)))

            #Save iteration values
            estimator_list.append(estimator)
            y_predict_list.append(y_predict.copy())
            estimator_error_list.append(estimator_error.copy())
            estimator_weight_list.append(estimator_weight.copy())
            sample_weight_list.append(sample_weight.copy())

        #Convert to np array for convenience   
        self.estimator_list = np.asarray(estimator_list)
        self.estimator_weight_list = np.asarray(estimator_weight_list)
        sample_weight_list = np.asarray(sample_weight_list)

        return self.estimator_list, self.estimator_weight_list, sample_weight_list
    
    def predict(self, X):
        y_pred = np.empty((self.n_estimators, len(X)))  # 预测结果二维数组，其中每一行代表一个弱学习器的预测结果
        for i in range(self.n_estimators):
            y_pred[i,:] = self.estimator_list[i].predict(X)
        preds = (np.array([np.sign((y_pred[:,point] * self.estimator_weight_list).sum()) for point in range(len(X))]))
        return preds
    
    def score(self, y, pred):
        score = (pred == y).sum() / len(y)
        print('Accuracy = ',score ) 
        return score

In [6]:
myboost = MyAdaBoost()
estimator_list, estimator_weight_list, sample_weight_list  = myboost.fit(x, y, M=10, learning_rate = 0.8)
print(estimator_list,estimator_weight_list)

[DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
 DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)] [1.9833669  1.68282903 1.1132973  0.97067468 1.05279478 0.69901679
 0.75874963 0.80797006 0.50906092 0.58058414]


In [8]:
prediction = myboost.predict(x)
myboost.score(y, prediction)
#print(prediction)

Accuracy =  0.8787346221441125


0.8787346221441125

In [9]:
# myboost.fit(x[56:,:], y[56:], M=10, learning_rate = 0.8)
# pred = myboost.predict(x[0:56,:])
# print(x[0:56,:])
# print(y[0:56])
# myboost.score(y[0:56], pred)
# print(pred)

# 实现十折交叉验证

In [5]:
def K_flod(model, base, X, Y, k_fold):
    lenx = len(X)
    leny = len(Y)
    preds = []
    leni = int((lenx/k_fold))
    for i in range (k_fold):
        if i==0:
            trainX = X[0:leni*(k_fold-1),:]
            trainY = Y[0:leni*(k_fold-1)]
            testX = X[leni*(k_fold-1):,:]
            testY = Y[leni*(k_fold-1):]
        else:
            trainX = X[0:lenx-leni,:]
            trainY = Y[0:lenx-leni]
            testX = X[lenx-leni:,:]
            testY = Y[lenx-leni:]
        print('trian data:',trainX.shape,'test_data',testX.shape)
        model.fit(trainX,trainY, M=10, learning_rate = 0.8, base_estimator = base)
        predi = model.predict(testX)
        preds = np.append(predi,preds)
        #print(predi)
        model.score(testY,predi)
        
        if i == 0:
            X = np.vstack((X[leni*(k_fold-1):,:], X[0:leni*(k_fold-1),:]))
            Y = np.hstack((Y[leni*(k_fold-1):], Y[0:leni*(k_fold-1)]))
        else:
            X = np.vstack((X[lenx-leni:,:], X[0:lenx-leni,:]))
            Y = np.hstack((Y[lenx-leni:], Y[0:lenx-leni]))
        #print(X)
    
    
    return preds

In [11]:
base = DecisionTreeClassifier(max_depth = 1, max_leaf_nodes=2)
prediction = K_flod(myboost,base,x,y,10)
#print(prediction)

trian data: (504, 30) test_data (65, 30)
Accuracy =  0.5692307692307692
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.7321428571428571
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.375
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6071428571428571
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.8928571428571429
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.7321428571428571
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.7321428571428571
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.625
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.8035714285714286
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.5178571428571429


# 计算评价指标

In [12]:
acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

0.6572934973637962 | 0.7120418848167539 | 0.7619047619047619 | 0.7361299052774019


# 使用MyAdaBoost

# 打乱数据集

In [51]:
c = load_breast_cancer()
c1 = np.hstack((c.data,c.target.reshape([569,1])))
#打乱数组顺序
np.random.shuffle(c1)
#print(np.sum(c1[:,30]))

x = c1[:,0:30]
y = c1[:,30].astype('int')
print(y)

[1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 0 0 1 1 0 1 1 1 1 0 1 0 0 0 1 1 1 1
 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0
 0 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1
 1 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1
 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 0
 1 0 1 0 0 1 1 1 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1 0 0
 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1
 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 0 0 0
 1 0 0 0 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 0 1
 1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 0
 1 1 0 0 1 1 1 1 0 0 1 1 

# 决策树作为基分类器

In [52]:
base = DecisionTreeClassifier(max_depth = 1, max_leaf_nodes=2)
prediction = K_flod(myboost,base, x,y,10)
#print(prediction)

trian data: (504, 30) test_data (65, 30)
Accuracy =  0.8923076923076924
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6607142857142857
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.7678571428571429
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6964285714285714
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.8392857142857143
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.75
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.8214285714285714
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.75
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.75
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.5357142857142857


In [53]:
acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

0.7486818980667839 | 0.7757731958762887 | 0.8431372549019608 | 0.8080536912751678


# SVM作为基分类器

In [19]:
base = SVC(kernel='rbf',C=0.1,random_state=32,max_iter = 5000)
prediction = K_flod(myboost,base, x,y,10)


acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

trian data: (504, 30) test_data (65, 30)
Accuracy =  0.6153846153846154
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.5535714285714286
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.625
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6607142857142857
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.5
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6428571428571429
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6785714285714286
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.7321428571428571
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.6428571428571429
trian data: (513, 30) test_data (56, 30)
Accuracy =  0.625
0.6274165202108963 | 0.6274165202108963 | 1.0 | 0.7710583153347732


In [54]:
base = LinearSVC(C=0.1,random_state=32,max_iter = 5000)
prediction = K_flod(myboost,base, x,y,10)


acc = accuracy_score(y,prediction)
pre = precision_score(y,prediction)
rec = recall_score(y,prediction)
f1 = f1_score(y,prediction)

print(acc,"|",pre,"|",rec,"|",f1)

trian data: (504, 30) test_data (65, 30)




Accuracy =  0.9846153846153847
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9107142857142857
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9285714285714286
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9107142857142857
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9464285714285714
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9821428571428571
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9107142857142857
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.875
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9107142857142857
trian data: (513, 30) test_data (56, 30)




Accuracy =  0.9464285714285714
0.9314586994727593 | 0.9491525423728814 | 0.9411764705882353 | 0.9451476793248945




###### Myadaboost

基学习器|精度|查准率|查全率|F1
-|-|-|-|-
DecisionTree | 0.7486818980667839 | 0.7757731958762887 | 0.8431372549019608 | 0.8080536912751678
SVC(rbf) | 0.6274165202108963 | 0.6274165202108963 | 1.0 | 0.7710583153347732
LinearSVC |0.9314586994727593 | 0.9491525423728814 | 0.9411764705882353 | 0.9451476793248945