# Super Learner Ensembles in Python
https://machinelearningmastery.com/super-learner-ensemble-in-python/

# Manually Develop a Super Learner With scikit-learn

# Super Learner for Regression

In [2]:
# create the inputs and outputs
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
# fixed seeds keep the synthetic dataset and the train/validation split
# identical on every run of this cell
X, y = make_regression(n_samples=1000, n_features=100, noise=0.5, random_state=1)
# split: half for training, half held out; X and y are rebound to the training half
X, X_val, y, y_val = train_test_split(X, y, test_size=0.50, random_state=1)
print('Train', X.shape, y.shape, '\nTest', X_val.shape, y_val.shape)


Train (500, 100) (500,) 
Test (500, 100) (500,)


In [3]:
from sklearn.linear_model import LinearRegression # for building linear Regression model
from sklearn.linear_model import ElasticNet # for building Elasticnet model
from sklearn.svm import SVR # for building SVR model
from sklearn.tree import DecisionTreeRegressor  # for building Decision Tree Regressor model
from sklearn.neighbors import KNeighborsRegressor # for building K Nearest Neighbors Regressor model
from sklearn.ensemble import AdaBoostRegressor # for building Adaboost Regressor model
from sklearn.ensemble import BaggingRegressor # for building Bagging Regressor model
from sklearn.ensemble import RandomForestRegressor # for building RandomForest Regressor model
from sklearn.ensemble import ExtraTreesRegressor # for building ExtraTree Regressor model

# create a list of base-models
def get_models():
	"""Return a fresh list of unfitted base regressors.

	Returns:
		list: nine estimators, in this order: LinearRegression, ElasticNet, SVR,
		DecisionTreeRegressor, KNeighborsRegressor, AdaBoostRegressor,
		BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor.
	"""
	return [
		LinearRegression(),
		ElasticNet(),
		SVR(gamma='scale'),
		DecisionTreeRegressor(),
		KNeighborsRegressor(),
		AdaBoostRegressor(),
		# the last three ensembles are each built with 10 estimators
		BaggingRegressor(n_estimators=10),
		RandomForestRegressor(n_estimators=10),
		ExtraTreesRegressor(n_estimators=10),
	]

In [22]:
from sklearn.model_selection import KFold
import numpy as np
# collect out-of-fold predictions from k-fold cross-validation
def get_out_of_fold_predictions(X, y, models, cv=None):
	"""Build the meta-model training set from out-of-fold predictions.

	For every fold, each model is refit on the training part and predicts the
	held-out part, so every row of the result was produced by models that were
	not fitted on that row.

	Args:
		X: feature matrix, indexed with integer index arrays (``X[ix]``).
		y: targets aligned with ``X`` and indexed the same way.
		models: estimators exposing ``fit(X, y)`` and ``predict(X)``; they are
			refit in place once per fold.
		cv: optional splitter whose ``split(X)`` yields ``(train_ix, test_ix)``
			pairs. Defaults to ``KFold(n_splits=10, shuffle=True)``; pass a
			seeded splitter to get reproducible folds.

	Returns:
		tuple: ``(meta_X, meta_y)``. ``meta_X`` has one column per model and one
		row per held-out sample; ``meta_y`` holds the matching targets. Rows are
		stacked fold by fold, not in the original order of ``X``.
	"""
	if cv is None:
		# original behaviour: 10 shuffled folds, unseeded
		cv = KFold(n_splits=10, shuffle=True)
	meta_X, meta_y = list(), list()
	for train_ix, test_ix in cv.split(X):
		train_X, test_X = X[train_ix], X[test_ix]
		train_y, test_y = y[train_ix], y[test_ix]
		# targets are collected in the same fold order as the prediction rows
		meta_y.extend(test_y)
		fold_yhats = list()
		for model in models:
			model.fit(train_X, train_y)
			yhat = model.predict(test_X)
			# one column per model
			fold_yhats.append(yhat.reshape(len(yhat), 1))
		meta_X.append(np.hstack(fold_yhats))
	return np.vstack(meta_X), np.asarray(meta_y)

In [23]:
# get models: a fresh list of unfitted base regressors
models = get_models()
# get out of fold predictions; the models in `models` are fitted in place while this runs
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
# meta_X has one column per base model, meta_y the matching targets
print('Meta ', meta_X.shape, meta_y.shape)

Meta  (500, 9) (500,)


In [22]:

# example of a super learner model for regression
from math import sqrt
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
 
# create a list of base-models
def get_models():
	"""Construct the base regressors used by the super learner.

	Returns:
		list: nine new, unfitted estimators; the six single models come first,
		followed by the bagging, random forest and extra-trees ensembles.
	"""
	models = [
		LinearRegression(),
		ElasticNet(),
		SVR(gamma='scale'),
		DecisionTreeRegressor(),
		KNeighborsRegressor(),
		AdaBoostRegressor(),
	]
	# these three share the same n_estimators=10 setting
	ten_estimator_ensembles = (BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor)
	models.extend(factory(n_estimators=10) for factory in ten_estimator_ensembles)
	return models
 
# collect out-of-fold predictions from k-fold cross-validation
def get_out_of_fold_predictions(X, y, models, cv=None):
	"""Collect out-of-fold base-model predictions as meta-model training data.

	Each model is refit on the training indices of every fold and predicts the
	held-out indices, so no prediction comes from a model fitted on that row.

	Args:
		X: feature matrix, indexed with integer index arrays (``X[ix]``).
		y: targets aligned with ``X`` and indexed the same way.
		models: estimators exposing ``fit(X, y)`` and ``predict(X)``; they are
			refit in place once per fold.
		cv: optional splitter whose ``split(X)`` yields ``(train_ix, test_ix)``
			pairs. Defaults to ``KFold(n_splits=10, shuffle=True)``; pass a
			seeded splitter to get reproducible folds.

	Returns:
		tuple: ``(meta_X, meta_y)``. ``meta_X`` has one column per model and one
		row per held-out sample; ``meta_y`` holds the matching targets. Rows are
		stacked fold by fold, not in the original order of ``X``.
	"""
	if cv is None:
		# original behaviour: 10 shuffled folds, unseeded
		cv = KFold(n_splits=10, shuffle=True)
	meta_X, meta_y = list(), list()
	for train_ix, test_ix in cv.split(X):
		train_X, test_X = X[train_ix], X[test_ix]
		train_y, test_y = y[train_ix], y[test_ix]
		# targets are collected in the same fold order as the prediction rows
		meta_y.extend(test_y)
		fold_yhats = list()
		for model in models:
			model.fit(train_X, train_y)
			yhat = model.predict(test_X)
			# one column per model
			fold_yhats.append(yhat.reshape(len(yhat), 1))
		meta_X.append(hstack(fold_yhats))
	return vstack(meta_X), asarray(meta_y)
 
# fit all base models on the training dataset
def fit_base_models(X, y, models):
	"""Fit every base model in place on the same training data.

	Args:
		X: training features passed unchanged to each ``fit`` call.
		y: training targets passed unchanged to each ``fit`` call.
		models: iterable of estimators exposing ``fit(X, y)``.

	Returns:
		None. The estimators in ``models`` are updated in place.
	"""
	for estimator in models:
		estimator.fit(X, y)
 
# fit a meta model
def fit_meta_model(X, y):
	"""Fit the meta-model, a plain ``LinearRegression``, on the given data.

	Args:
		X: meta-features to fit on.
		y: targets aligned with the rows of ``X``.

	Returns:
		The fitted ``LinearRegression`` instance.
	"""
	# scikit-learn's fit() returns the estimator itself, so the call can be chained
	return LinearRegression().fit(X, y)
 
# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
	"""Print the RMSE of each already-fitted model on a dataset.

	Args:
		X: features to predict on.
		y: true targets aligned with ``X``.
		models: iterable of fitted estimators exposing ``predict(X)``.

	Returns:
		None. One line per model is printed as ``<ClassName>: RMSE <value>``.
	"""
	for estimator in models:
		rmse_value = sqrt(mean_squared_error(y, estimator.predict(X)))
		label = estimator.__class__.__name__
		print('%s: RMSE %.3f' % (label, rmse_value))
 
# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
	"""Predict with the stacked ensemble.

	Each base model predicts on ``X``; the predictions become the columns of a
	meta-feature matrix that is handed to ``meta_model.predict``.

	Args:
		X: features to predict on.
		models: fitted base estimators exposing ``predict(X)``; their order
			fixes the column order of the meta-features.
		meta_model: fitted estimator exposing ``predict`` on the meta-features.

	Returns:
		Whatever ``meta_model.predict`` returns for the stacked predictions.
	"""
	base_predictions = (estimator.predict(X) for estimator in models)
	# reshape every prediction vector into a single column before stacking
	columns = [preds.reshape(len(preds), 1) for preds in base_predictions]
	return meta_model.predict(hstack(columns))
 
# create the inputs and outputs
# fixed seeds keep the dataset and the train/validation split identical across
# runs; the base learners and the KFold shuffling are still unseeded, so the
# reported scores can vary slightly between runs
X, y = make_regression(n_samples=1000, n_features=100, noise=0.5, random_state=1)
# split: X and y are rebound to the training half, X_val / y_val are held out
X, X_val, y, y_val = train_test_split(X, y, test_size=0.50, random_state=1)
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)
# NOTE(review): X[[1,1]] fancy-indexes row 1 twice, so the same 100-value row is
# printed two times here and in every preview below — confirm whether X[1]
# (or a short slice) was intended
print('X', X[[1,1]])
print('y', y[1:10])
# get models
models = get_models()
# get out of fold predictions
print('===================get out of fold=========================')
print('Shape of X and y', X.shape, y.shape)
print('X', X[[1,1]])
print('y', y[1:10])
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)
# fit base models on the whole training half
print('===================fit base models=========================')
print('Shape of X and y', X.shape, y.shape)
print('X', X[[1,1]])
print('y', y[1:10])
fit_base_models(X, y, models)
# fit the meta model; the banner is printed before the fit, like every other stage
print('=====================fit the meta models=======================')
print('Shape of meta_X and meta_y', meta_X.shape, meta_y.shape)
print('meta_X', meta_X[[1,1]])
print('meta_y', meta_y[1:10])
meta_model = fit_meta_model(meta_X, meta_y)
# evaluate base models on the held-out half
print('=====================evaluate base models=======================')
print('Shape of X_val and y_val', X_val.shape, y_val.shape)
print('X_val', X_val[[1,1]])
print('y_val', y_val[1:10])
evaluate_models(X_val, y_val, models)
# evaluate meta model on the held-out half
print('=====================evaluate meta models=======================')
print('Shape of X_val', X_val.shape)
print('X_val', X_val[[1,1]])
yhat = super_learner_predictions(X_val, models, meta_model)
print('Super Learner: RMSE %.3f' % (sqrt(mean_squared_error(y_val, yhat))))

Train (500, 100) (500,) Test (500, 100) (500,)
X [[-1.45412587  0.08008638  1.08519412 -0.11204679  0.29468594  0.84492951
  -1.00811116  1.62160639 -0.20228665 -0.69050357 -1.21407836 -1.43345206
  -1.35024259  0.84127711  0.91664021 -1.47066424  1.44626011 -0.73186133
   0.01483951 -0.4863752  -1.78476461  0.94430145 -0.21079608 -0.28556668
   0.43884436  1.05015894 -0.43188818  1.02402773 -0.43862051 -0.23308659
   0.12886743  0.30232832 -0.75358659  1.61284737  1.46670653 -1.13486625
  -0.2023176  -1.35092154  1.10786344 -0.72280319 -0.69040814 -0.68819222
   0.49421152  2.32401083  0.18345993 -0.4445231   2.49940396 -0.273991
  -0.2918462   0.52700002 -2.70403663  0.35069929  0.23660702  2.02505685
   0.95695629 -0.3640541  -0.74261116 -0.73875395  0.52848976  0.05788144
   0.50223642  0.07417785 -0.11082769 -0.44532347  0.02634377  0.72540289
  -0.80219861  0.12737085 -0.24921959  0.84343955 -0.93032247  0.79961265
   0.05713767  0.12553157  1.03636252 -0.03898067  0.93443905  1.

# Build Super Learner Regression Model with mlens library

In [24]:
!pip install mlens

Collecting mlens
  Downloading mlens-0.2.3-py2.py3-none-any.whl (227 kB)
Installing collected packages: mlens
Successfully installed mlens-0.2.3


In [25]:

# example of a super learner for regression using the mlens library
from math import sqrt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from mlens.ensemble import SuperLearner
 
# create a list of base-models
def get_models():
	"""Build the base regressors that form the first layer of the ensemble.

	Returns:
		list: nine new, unfitted estimators in a fixed order, from
		LinearRegression through ExtraTreesRegressor.
	"""
	single_models = [
		LinearRegression(),
		ElasticNet(),
		SVR(gamma='scale'),
		DecisionTreeRegressor(),
		KNeighborsRegressor(),
		AdaBoostRegressor(),
	]
	# bagging-style ensembles, each built with 10 estimators
	small_ensembles = [
		BaggingRegressor(n_estimators=10),
		RandomForestRegressor(n_estimators=10),
		ExtraTreesRegressor(n_estimators=10),
	]
	return single_models + small_ensembles
 
# cost function for base models
def rmse(yreal, yhat):
	"""Return the root mean squared error between true and predicted values.

	Args:
		yreal: true target values.
		yhat: predicted values aligned with ``yreal``.

	Returns:
		float: square root of ``mean_squared_error(yreal, yhat)``.
	"""
	mse = mean_squared_error(yreal, yhat)
	return sqrt(mse)
 
# create the super learner
def get_super_learner(X):
	"""Assemble an unfitted mlens ``SuperLearner`` for regression.

	Args:
		X: training features; only ``len(X)`` is used, as ``sample_size``.

	Returns:
		SuperLearner: ensemble scored with ``rmse``, configured with
		``folds=10`` and ``shuffle=True``, holding the models from
		``get_models()`` as its layer and a ``LinearRegression`` meta-model.
	"""
	ensemble = SuperLearner(
		scorer=rmse,
		folds=10,
		shuffle=True,
		sample_size=len(X),
	)
	# base models first, then the meta-model that combines their predictions
	ensemble.add(get_models())
	ensemble.add_meta(LinearRegression())
	return ensemble
 
# create the inputs and outputs
# fixed seeds keep the dataset and the train/validation split identical across
# runs; the ensemble itself is still unseeded, so scores can vary slightly
X, y = make_regression(n_samples=1000, n_features=100, noise=0.5, random_state=1)
# split: X and y are rebound to the training half, X_val / y_val are held out
X, X_val, y, y_val = train_test_split(X, y, test_size=0.50, random_state=1)
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)
# create the super learner
ensemble = get_super_learner(X)
# fit the super learner
ensemble.fit(X, y)
# summarize base learners (per-model scores and timings collected during fit)
print(ensemble.data)
# evaluate meta model on the held-out half
yhat = ensemble.predict(X_val)
print('Super Learner: RMSE %.3f' % (rmse(y_val, yhat)))

[MLENS] backend: threading


Train (500, 100) (500,) Test (500, 100) (500,)
                                  score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  adaboostregressor          93.37    14.18  0.79  0.02  0.05  0.01
layer-1  baggingregressor          107.92    13.90  0.32  0.03  0.01  0.00
layer-1  decisiontreeregressor     152.68    16.43  0.04  0.00  0.00  0.00
layer-1  elasticnet                 61.29    10.28  0.01  0.00  0.00  0.00
layer-1  extratreesregressor       104.00    16.33  0.19  0.02  0.02  0.01
layer-1  kneighborsregressor       144.17    18.33  0.00  0.00  0.16  0.06
layer-1  linearregression            0.61     0.05  0.02  0.01  0.00  0.00
layer-1  randomforestregressor     110.37    14.50  0.28  0.03  0.00  0.00
layer-1  svr                       161.13    24.01  0.02  0.00  0.01  0.00

Super Learner: RMSE 0.540


# Build Super Learner Classification Model with mlens library

In [26]:

# example of a super learner for classification using the mlens library
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from mlens.ensemble import SuperLearner
 
# create a list of base-models
def get_models():
	"""Return a fresh list of unfitted base classifiers.

	Returns:
		list: nine estimators, in this order: LogisticRegression,
		DecisionTreeClassifier, SVC, GaussianNB, KNeighborsClassifier,
		AdaBoostClassifier, BaggingClassifier, RandomForestClassifier,
		ExtraTreesClassifier.
	"""
	return [
		LogisticRegression(solver='liblinear'),
		DecisionTreeClassifier(),
		SVC(gamma='scale', probability=True),
		GaussianNB(),
		KNeighborsClassifier(),
		AdaBoostClassifier(),
		# the last three ensembles are each built with 10 estimators
		BaggingClassifier(n_estimators=10),
		RandomForestClassifier(n_estimators=10),
		ExtraTreesClassifier(n_estimators=10),
	]
 
# create the super learner
def get_super_learner(X):
	"""Assemble an unfitted mlens ``SuperLearner`` for classification.

	Args:
		X: training features; only ``len(X)`` is used, as ``sample_size``.

	Returns:
		SuperLearner: ensemble scored with ``accuracy_score``, configured with
		``folds=10`` and ``shuffle=True``, holding the models from
		``get_models()`` as its layer and a ``LogisticRegression(solver='lbfgs')``
		meta-model.
	"""
	ensemble = SuperLearner(
		scorer=accuracy_score,
		folds=10,
		shuffle=True,
		sample_size=len(X),
	)
	# base models first, then the meta-model that combines their predictions
	ensemble.add(get_models())
	ensemble.add_meta(LogisticRegression(solver='lbfgs'))
	return ensemble
 
# create the inputs and outputs
# fixed seeds keep the dataset and the train/validation split identical across
# runs; the ensemble itself is still unseeded, so scores can vary slightly
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20, random_state=1)
# split: X and y are rebound to the training half, X_val / y_val are held out
X, X_val, y, y_val = train_test_split(X, y, test_size=0.50, random_state=1)
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)
# create the super learner
ensemble = get_super_learner(X)
# fit the super learner
ensemble.fit(X, y)
# summarize base learners (per-model scores and timings collected during fit)
print(ensemble.data)
# make predictions on hold out set; accuracy is reported as a percentage
yhat = ensemble.predict(X_val)
print('Super Learner: %.3f' % (accuracy_score(y_val, yhat) * 100))

Train (500, 100) (500,) Test (500, 100) (500,)
                                   score-m  score-s  ft-m  ft-s  pt-m  pt-s
layer-1  adaboostclassifier           0.91     0.03  0.71  0.02  0.07  0.01
layer-1  baggingclassifier            0.85     0.07  0.27  0.01  0.01  0.00
layer-1  decisiontreeclassifier       0.71     0.08  0.04  0.00  0.00  0.00
layer-1  extratreesclassifier         0.84     0.07  0.11  0.02  0.01  0.00
layer-1  gaussiannb                   0.98     0.02  0.01  0.00  0.00  0.00
layer-1  kneighborsclassifier         0.95     0.04  0.00  0.00  0.20  0.02
layer-1  logisticregression           0.97     0.03  0.01  0.00  0.00  0.00
layer-1  randomforestclassifier       0.83     0.04  0.19  0.03  0.01  0.00
layer-1  svc                          0.98     0.02  0.09  0.01  0.01  0.01

Super Learner: 98.800
