In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier();
svm_clf = SVC();

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard' # or soft
)

voting_clf.fit(X_train, y_train)

0,1,2
,estimators,"[('lr', ...), ('rf', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [5]:
from sklearn.metrics import accuracy_score

In [6]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.92
SVC 0.896
VotingClassifier 0.904


# Bagging and pasting

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [8]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)

In [9]:
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
y_pred

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [11]:
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.912


In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred, y_test)

array([[58,  8],
       [ 3, 56]])

# Out of Bag Evaluation

In [13]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True
)

In [14]:
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8986666666666666

In [15]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.888

In [16]:
bag_clf.oob_decision_function_

array([[0.38068182, 0.61931818],
       [0.36458333, 0.63541667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.10194175, 0.89805825],
       [0.40625   , 0.59375   ],
       [0.02298851, 0.97701149],
       [1.        , 0.        ],
       [0.95266272, 0.04733728],
       [0.78238342, 0.21761658],
       [0.00534759, 0.99465241],
       [0.7606383 , 0.2393617 ],
       [0.85245902, 0.14754098],
       [0.96685083, 0.03314917],
       [0.03626943, 0.96373057],
       [0.        , 1.        ],
       [0.99425287, 0.00574713],
       [0.95169082, 0.04830918],
       [0.99479167, 0.00520833],
       [0.01923077, 0.98076923],
       [0.36931818, 0.63068182],
       [0.94736842, 0.05263158],
       [1.        , 0.        ],
       [0.96590909, 0.03409091],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.64705882, 0.35294118],
       [0.

# Random Forests

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,16
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
y_pred_rf = rnd_clf.predict(X_test)

In [16]:
print(rnd_clf.__class__.__name__, accuracy_score(y_test, y_pred_rf))

RandomForestClassifier 0.912


In [17]:
confusion_matrix(y_pred_rf, y_test)

array([[59,  9],
       [ 2, 55]])

# Extra Tree Classifier

In [49]:
from sklearn.ensemble import ExtraTreesClassifier

In [50]:
ex_tree_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
ex_tree_clf.fit(X_train, y_train)


0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,16
,min_impurity_decrease,0.0
,bootstrap,False


In [51]:
y_pred_ex_tree_clf = ex_tree_clf.predict(X_test)

In [53]:
print(ex_tree_clf.__class__.__name__, accuracy_score(y_test, y_pred_ex_tree_clf))

ExtraTreesClassifier 0.912


# Feature Importance

In [55]:
from sklearn.datasets import load_iris

In [56]:
iris = load_iris()

In [57]:
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_): # zip creates a tuple of both of its inputs
    print(name, score)

sepal length (cm) 0.0923423159388423
sepal width (cm) 0.025886715016786317
petal length (cm) 0.4276274892570669
petal width (cm) 0.45414347978730457


# Boosting

In [59]:
from sklearn.ensemble import AdaBoostClassifier

In [67]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm='SAMME',
    learning_rate=0.5
)

In [68]:
ada_clf.fit(X_train, y_train)



0,1,2
,estimator,DecisionTreeC...r(max_depth=1)
,n_estimators,200
,learning_rate,0.5
,algorithm,'SAMME'
,random_state,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0
