In [48]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, auc
import numpy as np

In [12]:
data = load_iris()

In [13]:
df = pd.concat([pd.DataFrame(data.data, columns= data.feature_names), pd.DataFrame(data.target, columns= ['target'])], axis=1)

In [14]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [16]:
X, y = df.drop('target', axis=1).values, df.target.values

## Stratified Kfold validation

In [44]:
skf = StratifiedKFold(10)

In [28]:
from pprint import pprint

#### RandomForest

In [45]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=6, min_samples_split=5, n_jobs=-1)

In [52]:
scores = []

for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]

    y_train, y_test = y[train_index], y[test_index] 
    
    rf_clf.fit(X_train, y_train)
    
    pred = rf_clf.predict(X_test)
    
    score = accuracy_score(y_test, pred)
    
    scores.append(score)
print(np.mean(scores), np.std(scores))

0.9533333333333334 0.059999999999999984


In [65]:
rf_cv =cross_val_score(estimator=rf_clf, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1, verbose=1)
rf_cv.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.9s finished


0.9533333333333334

In [72]:
rf_pred = cross_val_predict(estimator=rf_clf, X=X, y=y, cv=10, n_jobs=-1)

#### ExtraTreesClassifier

In [50]:
xt_clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_leaf=5, min_samples_split=4, bootstrap=True, verbose=1)

In [53]:
scores = []

for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]

    y_train, y_test = y[train_index], y[test_index] 
    
    xt_clf.fit(X_train, y_train)
    
    pred = rf_clf.predict(X_test)
    
    score = accuracy_score(y_test, pred)
    
    scores.append(score)
    
print(np.mean(scores), np.std(scores))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

0.9666666666666666 0.033333333333333326


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [67]:
xt_cv =cross_val_score(estimator=xt_clf, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1, verbose=1)
xt_cv.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


0.9533333333333334

In [73]:
xt_pred = cross_val_predict(estimator=xt_clf, X=X, y=y, cv=10, n_jobs=-1)

#### GradientBoostingClassifier

In [58]:
gbc_clf = GradientBoostingClassifier(n_estimators=1000, max_depth=10, min_samples_leaf=5, min_samples_split=4, verbose=1)

In [59]:
scores = []

for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]

    y_train, y_test = y[train_index], y[test_index] 
    
    gbc_clf.fit(X_train, y_train)
    
    pred = rf_clf.predict(X_test)
    
    score = accuracy_score(y_test, pred)
    
    scores.append(score)
    
print(np.mean(scores), np.std(scores))

      Iter       Train Loss   Remaining Time 
         1         125.3093            3.99s
         2         106.9426            3.99s
         3          92.0511            3.99s
         4          79.9348            3.98s
         5          69.7843            3.98s
         6          61.1890            4.64s
         7          53.9747            5.11s
         8          47.7660            5.46s
         9          42.5132            5.29s
        10          37.8211            5.15s
        20          13.0282            4.12s
        30           5.1219            3.36s
        40           2.2403            2.98s
        50           1.0361            2.66s
        60           0.4629            2.51s
        70           0.2340            2.34s
        80           0.1289            2.16s
        90           0.0694            2.14s
       100           0.0374            2.09s
       200           0.0176            1.28s
       300           0.0176            0.92s
       40

In [68]:
gbc_cv =cross_val_score(estimator=gbc_clf, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1, verbose=1)
gbc_cv.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.7s finished


0.96

In [74]:
gbc_pred = cross_val_predict(estimator=gbc_clf, X=X, y=y, cv=10, n_jobs=-1)

# AdaBoostClassifier

In [60]:
abc_clf = AdaBoostClassifier(n_estimators=500)

In [61]:
scores = []

for train_index, test_index in skf.split(X, y):
    
    X_train, X_test = X[train_index], X[test_index]

    y_train, y_test = y[train_index], y[test_index] 
    
    abc_clf.fit(X_train, y_train)
    
    pred = rf_clf.predict(X_test)
    
    score = accuracy_score(y_test, pred)
    
    scores.append(score)
    
print(np.mean(scores), np.std(scores))

0.9666666666666666 0.033333333333333326


In [69]:
abc_cv =cross_val_score(estimator=abc_clf, X=X, y=y, scoring='accuracy', cv=10, n_jobs=-1, verbose=1)
abc_cv.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.8s finished


0.9466666666666667

In [75]:
abc_pred = cross_val_predict(estimator=abc_clf, X=X, y=y, cv=10, n_jobs=-1)

In [87]:
pred_df = pd.DataFrame(np.stack([rf_pred, xt_pred, gbc_pred, abc_pred, df.target.values], axis=1), columns=['rf', 'xt', 'gbc', 'abc', 'target'])

In [90]:
X_pred, y_pred = pred_df.drop('target', axis=1).values, pred_df.target.values

In [91]:
gbc_pred_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=1000)

In [92]:
gbc_pred_clf.fit(X_pred, y_pred)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [93]:
gbc_stack_pred = gbc_pred_clf.predict(X_pred)

In [94]:
accuracy_score(y_pred, gbc_stack_pred)

0.96