In [64]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.datasets import make_classification
from sklearn.metrics import r2_score
from scipy.optimize import minimize 
from scipy.optimize import nnls 
# NOTE: this code only works for uni-dimensional output (regression or
# classification).

# Experiment configuration.
k = 10           # number of cross-validation folds
N = 1000         # number of synthetic samples to generate
output = "cls"   # 'cls' for classification, 'proba' for probabilities, 'reg' for regression


In [157]:
# Synthetic classification data: N samples with 8 features, of which 4 are
# informative and none are redundant. The generator is seeded
# (random_state=0).
x, y = make_classification(
    n_samples=N,
    n_features=8,
    n_informative=4,
    n_redundant=0,
    n_clusters_per_class=4,
    class_sep=0.1,
    hypercube=False,
    shuffle=True,
    random_state=0,
)

def fn(x, A, b):
    """Objective for the combiner: norm of the residual ``A.dot(x) - b``.

    Parameters
    ----------
    x : candidate weight vector.
    A : array whose ``dot`` method is applied to ``x``.
    b : target values subtracted from ``A.dot(x)``.

    Returns
    -------
    The value of ``np.linalg.norm`` (default arguments) on the residual.
    """
    residual = A.dot(x) - b
    return np.linalg.norm(residual)

def combiner_solve(x, y):
    """Find non-negative weights that sum to one and minimise ``fn(beta, x, y)``.

    The weights are initialised with the unconstrained-sum non-negative least
    squares solution (``nnls``) and then refined with SLSQP under the bounds
    ``beta >= 0`` and the equality constraint ``sum(beta) == 1``.

    Parameters
    ----------
    x : 2-D array with one column per estimator (``x.shape[1]`` weights are
        fitted).
    y : 1-D array of targets, one per row of ``x``.

    Returns
    -------
    beta : ndarray of length ``x.shape[1]``.

    Warns
    -----
    RuntimeWarning
        If the SLSQP optimiser reports that it did not converge; the last
        iterate is still returned.
    """
    # adapted from https://stackoverflow.com/questions/33385898/how-to-include-constraint-to-scipy-nnls-function-solution-so-that-it-sums-to-1/33388181
    beta_0, _ = nnls(x, y)  # starting point; the residual norm is not needed

    # The lambda argument is named `beta` so it does not shadow the design
    # matrix `x` of the enclosing function.
    cons = {'type': 'eq', 'fun': lambda beta: np.sum(beta) - 1}
    bounds = [[0.0, None]] * x.shape[1]

    minout = minimize(fn, beta_0, args=(x, y), method='SLSQP',
                      bounds=bounds, constraints=cons)

    if not minout.success:
        # Surface optimiser failures instead of silently returning weights
        # that may violate the constraints or be far from the optimum.
        import warnings
        warnings.warn(
            'combiner_solve: SLSQP did not converge (%s)' % (minout.message,),
            RuntimeWarning,
        )

    return minout.x

class SuperLearner(object):
    """Weighted ensemble of estimators with cross-validated combination weights.

    Intended call order: ``train_combiner`` -> ``train_superlearner`` ->
    ``estimation``.

    Only a single output column is combined: the ``predict`` output in
    regression mode, or the second ``predict_proba`` column in classification
    mode.

    Parameters
    ----------
    output : str
        'cls' or 'proba' combine ``predict_proba`` outputs; any other value
        (intended: 'reg') combines ``predict`` outputs.
    est_dict : dict
        Maps a name to an estimator object exposing ``fit`` and ``predict``
        (plus ``predict_proba`` for 'cls' / 'proba').
    k : int
        Number of cross-validation folds used by ``train_combiner``.
    """

    def __init__(self, output, est_dict, k):
        self.k = k  # number of cross validation folds
        self.beta = None  # combination weights, set by train_combiner
        self.trained_superlearner = None  # fitted estimators, set by train_superlearner
        self.output = output
        self.est_dict = est_dict  # dictionary of learners/algos

    def _is_classification(self):
        """Return True when probabilities (not raw predictions) are combined."""
        return self.output in ('cls', 'proba')

    def train_combiner(self, x, y):
        """Estimate the combination weights from out-of-fold predictions.

        Every estimator is fitted on k-1 folds and predicts the held-out fold;
        the stacked out-of-fold predictions are then passed to
        ``combiner_solve`` together with the matching targets.

        Parameters
        ----------
        x : array supporting integer-array row indexing (``x[idx]``).
        y : 1-D array of targets supporting integer-array indexing.

        Returns
        -------
        beta : the weights returned by ``combiner_solve`` (also stored on
            ``self.beta``).

        Raises
        ------
        ValueError
            If ``est_dict`` is empty.
        """
        if not self.est_dict:
            raise ValueError('est_dict must contain at least one estimator')

        classification = self._is_classification()
        if classification:
            n_unique = len(np.unique(y))
            # Binary problems are reduced to one probability column.
            num_classes = n_unique if n_unique > 2 else 1
        else:
            num_classes = 1

        # Use the fold count given to the constructor (not a notebook global).
        # The fixed seed means every estimator sees the same splits, so the
        # splits and the held-out targets are computed once.
        kf = KFold(n_splits=self.k, shuffle=True, random_state=0)
        folds = list(kf.split(x))
        # Targets in fold order, matching the row order of the predictions.
        gts = np.concatenate([y[test_index] for _, test_index in folds])

        all_preds = np.zeros((len(y), num_classes, len(self.est_dict)))

        for i, est in enumerate(self.est_dict.values()):
            fold_outputs = []
            for train_index, test_index in folds:
                est.fit(x[train_index], y[train_index])
                if classification:
                    fold_outputs.append(est.predict_proba(x[test_index]))
                else:
                    fold_outputs.append(est.predict(x[test_index]))

            stacked = np.concatenate(fold_outputs)
            if classification and num_classes == 1:
                # keep only the probability of the second class
                stacked = stacked[:, 1]
            all_preds[:, :, i] = stacked.reshape(len(y), -1)

        # NOTE: only output column 0 is used to fit the weights. With more
        # than two classes this is the first class's probability, whereas
        # `estimation` uses column 1 -- multi-class targets are not supported.
        beta = combiner_solve(all_preds[:, 0, :], gts)
        self.beta = beta
        return beta

    def train_superlearner(self, x, y):
        """Refit every estimator in ``est_dict`` on all of ``x``, ``y``.

        The estimator objects are fitted in place, so the returned dictionary
        holds the same objects as ``est_dict``.

        Returns
        -------
        dict mapping each name in ``est_dict`` to its fitted estimator (also
        stored on ``self.trained_superlearner``).
        """
        assert self.beta is not None, 'Train combiner first using SuperLearner.train_combiner(x,y)'
        # now we have the coefficients we can retrain the networks on all the data and apply this weighting
        trained_superlearner = {}
        for key, est in self.est_dict.items():
            est.fit(x, y)
            trained_superlearner[key] = est
        self.trained_superlearner = trained_superlearner
        return trained_superlearner

    def estimation(self, x, y=None):
        """Return the ``beta``-weighted combination of the fitted estimators.

        Parameters
        ----------
        x : samples to predict for.
        y : optional; only its length is used (to size the prediction
            buffer). When omitted the number of rows is taken from ``x``.

        Returns
        -------
        ndarray of shape (n_samples, 1): weighted second-class probabilities
        in 'cls' / 'proba' mode, weighted predictions otherwise.
        """
        assert self.trained_superlearner is not None, \
            'Train superlearner first using SuperLearner.train_superlearner(x,y)'

        n_samples = len(y) if y is not None else np.shape(x)[0]
        all_preds = np.zeros((n_samples, len(self.trained_superlearner)))

        classification = self._is_classification()
        for i, est in enumerate(self.trained_superlearner.values()):
            if classification:
                all_preds[:, i] = est.predict_proba(x)[:, 1]
            else:
                all_preds[:, i] = est.predict(x)

        weighted_preds = np.dot(all_preds, self.beta)
        return weighted_preds.reshape(-1, 1)




In [158]:
# Baseline: a "super learner" built from logistic regression alone.
est_dict = {'LR': LogisticRegression()}

SL = SuperLearner(est_dict=est_dict, output='cls', k=k)
SL.train_combiner(x, y)       # fit the combination weights from out-of-fold predictions
SL.train_superlearner(x, y)   # refit the learner on all of the data

# Round the weighted probabilities to 0/1 labels. Note that the score below is
# computed on the same x, y that the model was fitted on.
preds = np.round(SL.estimation(x, y))
print(SL.beta, balanced_accuracy_score(y, preds))



[1.] 0.552


In [159]:

# Two learners: logistic regression plus an SVC with probability outputs
# enabled (the combiner needs predict_proba).
est_dict = {
    'LR': LogisticRegression(),
    'SVC': SVC(probability=True),
}
SL = SuperLearner(est_dict=est_dict, output='cls', k=k)
SL.train_combiner(x, y)       # fit the combination weights from out-of-fold predictions
SL.train_superlearner(x, y)   # refit every learner on all of the data

# Round the weighted probabilities to 0/1 labels. Note that the score below is
# computed on the same x, y that the models were fitted on.
preds = np.round(SL.estimation(x, y))
print(SL.beta, balanced_accuracy_score(y, preds))


[3.65888855e-17 1.00000000e+00] 0.781


In [160]:

# Three learners: logistic regression, SVC and a random forest.
# NOTE(review): RandomForestClassifier() is created without a random_state, so
# the printed weights/score presumably vary between runs -- confirm.
est_dict = {
    'LR': LogisticRegression(),
    'SVC': SVC(probability=True),
    'RF': RandomForestClassifier(),
}
SL = SuperLearner(est_dict=est_dict, output='cls', k=k)
SL.train_combiner(x, y)       # fit the combination weights from out-of-fold predictions
SL.train_superlearner(x, y)   # refit every learner on all of the data

# Round the weighted probabilities to 0/1 labels. Note that the score below is
# computed on the same x, y that the models were fitted on.
preds = np.round(SL.estimation(x, y))
print(SL.beta, balanced_accuracy_score(y, preds))


[1.68957552e-16 8.37470055e-01 1.62529945e-01] 0.839
