###  本程序比较了sklearn中不同集成学习算法
1. Bagging (Base Classifier： Decision Tree)
2. RandomForest
3. ExtraTrees
4. Boosting
 - 4.1 AdaBoost (Base Classifier： Decision Tree)
 - 4.2 Stochastic Gradient Boosting  (Base Classifier： Decision Tree)
5. Voting

In [13]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier  # GBDT
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
import math as ma

def get_dataset_by_uci(url, names, n_features=8):
    """Load a header-less CSV and split it into features and labels.

    Args:
        url: Path, URL or file-like object accepted by ``pandas.read_csv``.
        names: Column names assigned to the CSV columns.
        n_features: Number of leading columns used as features. The column
            at index ``n_features`` is taken as the label. Defaults to 8.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the 2-D array holding the first
        ``n_features`` columns, ``Y`` is the 1-D array of the label column.

    Raises:
        IndexError: If the data has no column at index ``n_features``.
    """
    df = pd.read_csv(url, names=names)
    array = df.values
    X = array[:, :n_features]
    Y = array[:, n_features]
    return X, Y

In [14]:
# Pima Indians diabetes data: eight feature columns followed by the class label.
# NOTE(review): confirm this UCI URL is still being served before relying on it.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
url = r'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
X, Y = get_dataset_by_uci(url, names)

In [15]:
# 1. Bagged decision trees for classification

seed = 7
# 10-fold cross-validation. No random_state is given because shuffling is off:
# the seed never affected the folds, and recent scikit-learn releases raise a
# ValueError when random_state is set while shuffle=False.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%0.4f' % (results.mean() * 100))

77.0745


In [16]:
# 2. Random forest classification

seed = 7
num_trees = 100
# Square-root heuristic over the number of feature columns actually fed to the
# model (the label column listed in `names` is not a feature). int() keeps the
# value an integer count even where round() returns a float.
max_features = int(round(ma.sqrt(X.shape[1])))  # max_features = 3
# 10-fold cross-validation without shuffling; random_state is omitted because
# it has no effect then and newer scikit-learn releases reject it.
kfold = model_selection.KFold(n_splits=10)
# Seed the forest so the reported score is reproducible.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%0.4f' % (results.mean() * 100))

75.9057


In [17]:
# 3. Extra trees classification

seed = 7
num_trees = 100
max_features = 7
# 10-fold cross-validation without shuffling; random_state is omitted because
# it has no effect then and newer scikit-learn releases reject it.
kfold = model_selection.KFold(n_splits=10)
# Seed the ensemble so the reported score is reproducible.
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%0.4f' % (results.mean() * 100))

76.1671


In [18]:
# 4.1 AdaBoost

seed = 7
num_trees = 30
# 10-fold cross-validation without shuffling; random_state is omitted because
# it has no effect then and newer scikit-learn releases reject it.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%0.4f' % (results.mean() * 100))

76.0458


In [19]:
# 4.2 Stochastic gradient boosting classification

seed = 7
num_trees = 100
# 10-fold cross-validation without shuffling; random_state is omitted because
# it has no effect then and newer scikit-learn releases reject it.
kfold = model_selection.KFold(n_splits=10)
# Pass the tree count and seed that this cell defines so they actually apply.
# NOTE(review): `subsample` is left at its default, so no row subsampling is
# configured here - set subsample < 1.0 if stochastic boosting is intended.
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%0.4f' % (results.mean() * 100))

76.5584


In [20]:
# 5. Voting

seed = 7
# 10-fold cross-validation without shuffling; random_state is omitted because
# it has no effect then and newer scikit-learn releases reject it.
kfold = model_selection.KFold(n_splits=10)

# Create the sub-models combined by the voting ensemble.
model1 = LogisticRegression()
# Seed the tree so the ensemble score is reproducible.
model2 = DecisionTreeClassifier(random_state=seed)
model3 = SVC()
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Create the ensemble model and cross-validate it.
ensemble = VotingClassifier(estimators=estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
# Mean cross-validation score, shown as a percentage.
print('%.4f' % (results.mean() * 100))

73.8158


---
####  

In [21]:
# Bagging over a k-nearest-neighbours base classifier

from sklearn.neighbors import KNeighborsClassifier

# BaggingClassifier accepts different base classifiers. max_samples is the
# sampling ratio and max_features the feature ratio used for each base
# estimator; both are set to 0.5 here.
bagging = BaggingClassifier(
    KNeighborsClassifier(),
    max_samples=0.5,
    max_features=0.5,
)


In [22]:
from sklearn.datasets import make_blobs
# cross_val_score is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated and later removed.
from sklearn.model_selection import cross_val_score

# Synthetic classification data: 10000 samples, 10 features, 100 blob centres.
X, Y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)


In [32]:
# Decision tree
#
# max_depth=None means no maximum depth is set.
# min_samples_split=1 is not accepted; scikit-learn reports:
#   min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1
# so the smallest valid integer, 2, is used here.

clf = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
score = cross_val_score(clf, X, Y)
print(score.mean())

0.979408793821


### min_samples_split 参数初始化问题
#### 诸多模型在构造时都有 min_samples_split 参数，如 RandomForest、DecisionTree
#### 该参数的取值要求：大于 1 的整数，或 (0.0, 1.0] 区间内的浮点数


In [51]:
# Random forest: several random trees, each trained on its own sampled set.
# min_samples_split is given in its float form here (0.05).

clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=0.05,
    random_state=0,
)
score = cross_val_score(clf, X, Y)
print(score.mean())



0.975831847891


In [39]:
# Extra trees: several random trees trained from the full sample.
# min_samples_split is given in its float form here (0.2).

clf = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=0.2,
    random_state=0,
)
score = cross_val_score(clf, X, Y)
print(score.mean())

0.999898989899


In [53]:
# AdaBoost evaluated on the iris data set
from sklearn.datasets import load_iris

iris = load_iris()
X, Y = iris.data, iris.target

# 100 boosting rounds, scored with cross_val_score's default splitting.
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X, Y)
print(scores.mean())


0.959967320261


In [54]:
# Gradient Boosted Regression Trees (GBRT) used as a classifier
#
# n_estimators sets the number of weak learners; the size of each tree is set
# through either max_depth or max_leaf_nodes. learning_rate is usually chosen
# in (0.0, 1.0] and controls over-fitting through shrinkage.
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
# The first 2000 rows form the training split; the remainder is held out.
X_train, y_train = X[:2000], y[:2000]
X_test, y_test = X[2000:], y[2000:]

clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf.fit(X_train, y_train)
# Kept as the last expression so the notebook displays the test-set score.
clf.score(X_test, y_test)


0.91300000000000003

In [57]:
# GBRT regression
#
# GradientBoostingRegressor supports several loss functions; the default for
# regression is least squares. The loss is left at that default rather than
# spelled as 'ls', because that alias was renamed to 'squared_error' and later
# removed in newer scikit-learn releases.

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
# The first 200 rows form the training split; the remaining 1000 are held out.
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=0,
).fit(X_train, y_train)
# Kept as the last expression so the notebook displays the test-set MSE.
mean_squared_error(y_test, est.predict(X_test))

5.0091548599603213