In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier();
svm_clf = SVC();

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard' # or soft
)

voting_clf.fit(X_train, y_train)

0,1,2
,estimators,"[('lr', ...), ('rf', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [7]:
from sklearn.metrics import accuracy_score

In [8]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.896
VotingClassifier 0.888


# Bagging and pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)

In [11]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [12]:
y_pred

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [13]:
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.904


In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred, y_test)

array([[57,  8],
       [ 4, 56]])

# Out of Bag Evaluation

In [15]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True
)

In [16]:
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.896

In [17]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [18]:
bag_clf.oob_decision_function_

array([[0.35227273, 0.64772727],
       [0.37297297, 0.62702703],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.09090909, 0.90909091],
       [0.30319149, 0.69680851],
       [0.01744186, 0.98255814],
       [0.98445596, 0.01554404],
       [0.9875    , 0.0125    ],
       [0.73      , 0.27      ],
       [0.        , 1.        ],
       [0.78857143, 0.21142857],
       [0.86243386, 0.13756614],
       [0.98170732, 0.01829268],
       [0.05464481, 0.94535519],
       [0.        , 1.        ],
       [0.95721925, 0.04278075],
       [0.93975904, 0.06024096],
       [1.        , 0.        ],
       [0.01694915, 0.98305085],
       [0.31794872, 0.68205128],
       [0.92090395, 0.07909605],
       [1.        , 0.        ],
       [0.97894737, 0.02105263],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.61325967, 0.38674033],
       [0.

# Random Forests

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,16
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
y_pred_rf = rnd_clf.predict(X_test)

In [22]:
print(rnd_clf.__class__.__name__, accuracy_score(y_test, y_pred_rf))

RandomForestClassifier 0.912


In [23]:
confusion_matrix(y_pred_rf, y_test)

array([[59,  9],
       [ 2, 55]])

# Extra Tree Classifier

In [24]:
from sklearn.ensemble import ExtraTreesClassifier

In [25]:
ex_tree_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
ex_tree_clf.fit(X_train, y_train)


0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,16
,min_impurity_decrease,0.0
,bootstrap,False


In [26]:
y_pred_ex_tree_clf = ex_tree_clf.predict(X_test)

In [27]:
print(ex_tree_clf.__class__.__name__, accuracy_score(y_test, y_pred_ex_tree_clf))

ExtraTreesClassifier 0.92


# Feature Importance

In [28]:
from sklearn.datasets import load_iris

In [29]:
iris = load_iris()

In [30]:
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_): # zip creates a tuple of both of its inputs
    print(name, score)

sepal length (cm) 0.09629250025960623
sepal width (cm) 0.023856455049013417
petal length (cm) 0.4296899997931066
petal width (cm) 0.45016104489827374


# Boosting

In [32]:
from sklearn.ensemble import AdaBoostClassifier

In [33]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm='SAMME',
    learning_rate=0.5
)

In [34]:
ada_clf.fit(X_train, y_train)



0,1,2
,estimator,DecisionTreeC...r(max_depth=1)
,n_estimators,200
,learning_rate,0.5
,algorithm,'SAMME'
,random_state,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


# Gradient Boosting

In [35]:
from sklearn.tree import DecisionTreeRegressor

In [36]:
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [43]:
# Train a second DecisionTreeRegressor on the residual errors made by first predictor
y2_train = y_train - tree_reg1.predict(X_train) # residual errors

In [44]:
y_train

array([1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,

In [45]:
y2_train

array([ 0.14795918, -0.85204082, -0.06040268,  0.14795918,  0.14795918,
        0.14795918, -0.85204082,  0.14795918, -0.06040268, -0.85204082,
       -0.85204082,  0.14795918, -0.06040268, -0.06040268, -0.06040268,
        0.14795918,  0.14795918, -0.06040268,  0.14795918,  0.        ,
        0.14795918,  0.14795918, -0.06040268, -0.06040268, -0.06040268,
        0.14795918,  0.        , -0.06040268,  0.14795918, -0.06040268,
        0.14795918, -0.06040268,  0.14795918,  0.        ,  0.14795918,
       -0.06040268,  0.14795918,  0.14795918,  0.14795918, -0.06040268,
        0.14795918,  0.14795918, -0.06040268, -0.06040268,  0.14795918,
       -0.06040268, -0.06040268,  0.14795918, -0.06040268,  0.14795918,
       -0.06040268,  0.93959732, -0.06040268,  0.14795918,  0.14795918,
        0.14795918,  0.        ,  0.14795918,  0.14795918,  0.14795918,
        0.        , -0.85204082, -0.06040268,  0.93959732,  0.14795918,
       -0.06040268, -0.06040268, -0.06040268,  0.        , -0.06

In [47]:
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [48]:
y3_train = y2_train - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [49]:
y_pred_gb = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

NameError: name 'X_new' is not defined

# GBRT sklearn

In [50]:
from sklearn.ensemble import GradientBoostingRegressor

In [51]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1)
gbrt.fit(X_train, y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,3
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0
