[http://ml-ensemble.com/info/tutorials/start.html](http://ml-ensemble.com/info/tutorials/start.html)

In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import load_iris

# Fix the global NumPy seed so the shuffle below is reproducible; the same
# `seed` is also handed to the estimators built in later cells.
seed = 2017
np.random.seed(seed)

# Load the iris data and shuffle it. Features and labels are indexed with the
# same permutation, so every sample keeps its own label.
data = load_iris()
# Size the permutation from the loaded data instead of hard-coding 150.
idx = np.random.permutation(data.target.shape[0])
X = data.data[idx]
y = data.target[idx]

In [9]:
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# --- Build ---
# Passing a scoring function makes the ensemble compute cv scores during
# fitting. The scorer should be a simple function that accepts two vectors
# and returns a scalar.
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed, verbose=2)

# First layer: the base learners.
base_learners = [RandomForestClassifier(random_state=seed), SVC()]
ensemble.add(base_learners)

# Final meta estimator, attached on top of the first layer.
meta_learner = LogisticRegression()
ensemble.add_meta(meta_learner)

# --- Use ---
# Fit on the first 75 shuffled samples ...
X_fit, y_fit = X[:75], y[:75]
ensemble.fit(X_fit, y_fit)

# ... and predict the remaining ones.
preds = ensemble.predict(X[75:])


Fitting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Fit complete                        | 00:00:00

Predicting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:00


In [10]:
# Show the statistics the ensemble recorded while fitting.
# The value is wrapped in a one-element tuple so `%` always treats it as a
# single argument.
print("Fit data:\n%r" % (ensemble.data,))

Fit data:
                                   ft-m  ft-s  pt-m  pt-s  score-m  score-s
layer-1  randomforestclassifier    0.03  0.00  0.00  0.00     0.84     0.06
layer-1  svc                       0.00  0.00  0.00  0.00     0.89     0.05



In [12]:
# Accuracy of the ensemble predictions on the samples that were not used for
# fitting. scikit-learn metrics take (y_true, y_pred), so the true labels go
# first; accuracy is symmetric, so the printed value is unaffected.
print("Prediction score: %.3f" % accuracy_score(y[75:], preds))
# NOTE(review): a commented-out `roc_auc_score(preds, y[75:])` line used to
# sit here. It had the same swapped argument order, and `preds` presumably
# holds hard class labels rather than scores -- confirm that AUC is meaningful
# for this target before re-enabling it.

Prediction score: 0.960


#### Multi-layer ensembles

In [14]:
# Build a deeper ensemble: every `add` call contributes one more layer
# (reported as layer-1 and layer-2 in the fit data), and `add_meta` attaches
# the final estimator.
ensemble = SuperLearner(scorer=accuracy_score, random_state=seed)

# Layer 1
layer_one = [RandomForestClassifier(random_state=seed), LogisticRegression()]
ensemble.add(layer_one)

# Layer 2
layer_two = [LogisticRegression(), SVC()]
ensemble.add(layer_two)

# Final meta estimator
ensemble.add_meta(SVC())

# Fit on the first 75 samples, predict the rest, then show the fit statistics.
X_fit, y_fit = X[:75], y[:75]
ensemble.fit(X_fit, y_fit)
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % (ensemble.data,))

Fit data:
                                   ft-m  ft-s  pt-m  pt-s  score-m  score-s
layer-1  logisticregression        0.01  0.01  0.00  0.00     0.74     0.16
layer-1  randomforestclassifier    0.03  0.01  0.00  0.00     0.84     0.06
layer-2  logisticregression        0.00  0.00  0.00  0.00     0.67     0.12
layer-2  svc                       0.00  0.00  0.00  0.00     0.89     0.00



In [15]:
# Accuracy of the multi-layer ensemble on the samples not used for fitting.
# scikit-learn metrics take (y_true, y_pred), so the true labels go first;
# accuracy is symmetric, so the printed value is unaffected.
print("Prediction score: %.3f" % accuracy_score(y[75:], preds))

Prediction score: 0.960


### Model selection guide

In [17]:
from scipy.stats import randint
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from mlens.metrics import make_scorer
from mlens.model_selection import Evaluator

# Wrap the plain accuracy function into a scorer object for the Evaluator.
accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)

# Candidate estimators as (name, estimator) pairs; the names are ours and are
# what the parameter mapping below refers to.
ests = [('gnb', GaussianNB()), ('knn', KNeighborsClassifier())]

# Parameter distributions keyed by estimator name. Only 'knn' is tuned;
# scipy's randint(2, 20) draws integers from 2 up to, but excluding, 20.
pars = {'n_neighbors': randint(2, 20)}
params = {'knn': pars}

# Evaluate every candidate with 10-fold cv and n_iter=10.
evaluator = Evaluator(accuracy_scorer, cv=10, random_state=seed, verbose=1)
evaluator.fit(X, y, ests, params, n_iter=10)

  "settings.".format(key))


Launching job
Job           done | 00:00:00


<mlens.model_selection.model_selection.Evaluator at 0x7fb79b668e80>

In [21]:
# Show the evaluation table for each estimator.
# NOTE(review): "comparision" and "founds" are typos in the printed label;
# they are left as-is here so the recorded output still matches the code.
print("Score comparision with best params founds:\n\n%r" % (evaluator.results,))

Score comparision with best params founds:

       test_score-m  test_score-s  pred_time-m  pred_time-s  train_score-m  train_score-s  fit_time-m  fit_time-s               params
gnb           0.960         0.033        0.005        0.008          0.957          0.006       0.003       0.001                     
knn           0.967         0.033        0.007        0.004          0.980          0.005       0.001       0.001  {'n_neighbors': 15}



#### Preprocessing

In [23]:
from mlens.preprocessing import Subset
from sklearn.preprocessing import StandardScaler

# Preprocessing cases: each key names a case and each value is the list of
# transformers applied in that case.
#   none - no preprocessing
#   sc   - standard scaling
#   sub  - Subset([0, 1]); presumably keeps only feature columns 0 and 1
#          (confirm against the mlens documentation)
preprocess_cases = dict(
    none=[],
    sc=[StandardScaler()],
    sub=[Subset([0, 1])],
)

# Hand only the preprocessing cases to the evaluator; the estimators are
# evaluated in a separate call.
evaluator.fit(X, y, preprocessing=preprocess_cases)

Launching job
Job           done | 00:00:00


<mlens.model_selection.model_selection.Evaluator at 0x7fb79b668e80>

In [24]:
# Evaluate the estimators again. No preprocessing argument is passed here, yet
# the results table is indexed by 'none' / 'sc' / 'sub', so the evaluator
# appears to reuse the cases it was given in the preceding call.
evaluator.fit(X, y, ests, params, n_iter=10)

# NOTE(review): "Comparsion" is a typo in the printed label; it is left as-is
# so the recorded output still matches the code.
print("\nComparsion across preprocessing pipelines:\n\n%r" % (evaluator.results,))

  "settings.".format(key))
  "settings.".format(key))
  "settings.".format(key))


Launching job
Job           done | 00:00:00

Comparsion across preprocessing pipelines:

             test_score-m  test_score-s  pred_time-m  pred_time-s  train_score-m  train_score-s  fit_time-m  fit_time-s               params
none  gnb           0.960         0.033        0.003        0.004          0.957          0.006       0.003       0.003                     
none  knn           0.967         0.033        0.004        0.006          0.980          0.005       0.002       0.005  {'n_neighbors': 15}
sc    gnb           0.960         0.033        0.002        0.001          0.957          0.006       0.003       0.001                     
sc    knn           0.960         0.044        0.003        0.004          0.965          0.003       0.001       0.001   {'n_neighbors': 8}
sub   gnb           0.780         0.133        0.001        0.001          0.791          0.020       0.006       0.007                     
sub   knn           0.800         0.126        0.004        0.004

#### Map different estimators to different preprocessing cases, and a different parameter distribution to each case

In [27]:
# Different estimators can be mapped to different preprocessing cases:
# 'sc' and 'sub' evaluate both gnb and knn, while 'none' evaluates knn only.
ests_1 = [('gnb', GaussianNB()), ('knn', KNeighborsClassifier())]
ests_2 = [('knn', KNeighborsClassifier())]
estimators = dict(sc=ests_1, none=ests_2, sub=ests_1)

# Parameter distributions are keyed by '<case>.<estimator name>'. The scaled
# case draws n_neighbors from [20, 30); the other two cases share [2, 10).
pars_1 = {'n_neighbors': randint(20, 30)}
pars_2 = {'n_neighbors': randint(2, 10)}
params = {
    'sc.knn': pars_1,
    'none.knn': pars_2,
    'sub.knn': pars_2,
}

evaluator.fit(X, y, estimators, params, n_iter=10)
print("\nComparison with different parameter dists:\n\n%r" % (evaluator.results,))

  "settings.".format(key))
  "settings.".format(key))


Launching job
Job           done | 00:00:00

Comparison with different parameter dists:

             test_score-m  test_score-s  pred_time-m  pred_time-s  train_score-m  train_score-s  fit_time-m  fit_time-s               params
none  knn           0.967         0.045        0.003        0.006          0.961          0.007       0.002       0.003   {'n_neighbors': 3}
sc    gnb           0.960         0.033        0.001        0.002          0.957          0.006       0.001       0.001                     
sc    knn           0.940         0.055        0.005        0.004          0.963          0.006       0.002       0.002  {'n_neighbors': 20}
sub   gnb           0.780         0.133        0.001        0.001          0.791          0.020       0.001       0.001                     
sub   knn           0.800         0.126        0.002        0.002          0.837          0.015       0.003       0.004   {'n_neighbors': 9}



In [28]:
# NOTE(review): `ensemble` is not rebuilt in this cell. As far as this
# notebook shows, it is still the multi-layer ensemble fitted earlier, which
# is why the fit data printed here repeats the earlier table.
preds = ensemble.predict(X[75:])
print("Fit data:\n%r" % (ensemble.data,))

Fit data:
                                   ft-m  ft-s  pt-m  pt-s  score-m  score-s
layer-1  logisticregression        0.01  0.01  0.00  0.00     0.74     0.16
layer-1  randomforestclassifier    0.03  0.01  0.00  0.00     0.84     0.06
layer-2  logisticregression        0.00  0.00  0.00  0.00     0.67     0.12
layer-2  svc                       0.00  0.00  0.00  0.00     0.89     0.00

