In [46]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import statsmodels.api as sm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.datasets import make_classification
from sklearn.metrics import r2_score
# NOTE: THIS CODE ONLY WORKS FOR UNI-DIMENSIONAL OUTPUT (REGRESSION OR CLASSIFICATION)

N = 1000
output = 'cls'  # 'cls' for classification 'proba' for probabilities, 'reg' for regression
k = 10


In [60]:
x, y = make_classification(n_samples=N, n_features=8, n_clusters_per_class=4,
                            n_informative=4, n_redundant=0, hypercube=False,
                            random_state=0, shuffle=True, class_sep=0.1)


def train_combiner(x, y, est_dict, output):
    if (output == 'cls') or (output=='proba'):
        num_classes = np.unique(y)
        if len(num_classes) == 2:
            num_classes = 1
        elif len(num_classes) > 2:
            num_classes = len(num_classes)
    else:
        num_classes = 1

    all_preds = np.zeros((len(y), num_classes, len(est_dict)))
    all_gts = np.zeros(len(y))
    risks = []

    i = 0
    for key in est_dict.keys():

        kf = KFold(n_splits=k, shuffle=True, random_state=0)

        probs = []
        preds = []
        gts = []

        for train_index, test_index in kf.split(x):
            x_train = x[train_index]
            x_test = x[test_index]
            y_train = y[train_index]
            y_test = y[test_index]

            est = est_dict[key]
            est.fit(x_train, y_train)

            if output == 'proba' or (output == 'cls'):
                p = est.predict(x_test)
                p_robs = est.predict_proba(x_test)
                preds.append(p)
                probs.append(p_robs)
            elif (output == 'reg'):
                p = est.predict(x_test)
                preds.append(p)
            gts.append(y_test)

        preds = np.concatenate(preds)
        probs = np.concatenate(probs)
        if num_classes == 1:
            probs = probs[:,1].reshape(-1,1)
            preds = preds.reshape(-1,1)

        gts = np.concatenate(gts)

        if (output == 'cls') or (output == 'proba'):
            all_preds[:,:,i] = probs
        elif output == 'reg':
            all_preds[:,:,i] = preds

        if (output == 'cls') or (output == 'proba'):
            risks.append(balanced_accuracy_score(gts, preds))
        elif output == 'reg':
            risks.append(r2_score(gts, preds))       
        i += 1


    weighted_preds = np.zeros((len(y),num_classes))
    weighted_probs = np.zeros((len(y),num_classes))
    risks = []

    if (output == 'cls') or (output == 'proba'):
        combiner = LogisticRegression(fit_intercept=False)
    elif (output == 'reg'):
        combiner = LinearRegression(fit_intercept=False)
    combiner.fit(all_preds[:,0,:], gts)
    coefs = combiner.coef_

    preds = combiner.predict(all_preds[:,0,:])
    probs = combiner.predict_proba(all_preds[:,0,:])

    weighted_preds[:,0] = preds
    weighted_probs[:, 0] = probs[:,1]

    if (output == 'cls') or (output == 'proba'):
        risk = balanced_accuracy_score(gts, preds)
    elif output == 'reg':
        risk = r2_score(gts, preds)  
    risks.append(risk)

    return combiner 


def train_superlearner(x, y, est_dict):
    # now we have the coefficients we can retrain the networks on all the data and apply this weighting
    risks = []
    trained_superlearner = {}
    
    for key in est_dict.keys():

        est = est_dict[key]
        est.fit(x, y)
        trained_superlearner[key] = est
        
    return trained_superlearner

  
def estimation(x, y, combiner, trained_superlearner, proba=False):
    
    all_preds = np.zeros((len(y), len(trained_superlearner)))
    i = 0
    for key in trained_superlearner.keys():
        est = trained_superlearner[key]
        
        if proba == True:
            preds = est.predict_proba(x)[:, 1]
        else:
            preds =est.predict(x)
        all_preds[:, i] = preds
        
        i += 1
    
    if proba:
        weighted_preds = combiner.predict_proba(all_preds)
    else:
        weighted_preds = combiner.predict(all_preds)
        
    return weighted_preds
        


In [55]:

est_dict = {'LR':LogisticRegression()}

combiner =  train_combiner(x, y, est_dict, output)
trained_superlearner = train_superlearner(x, y, est_dict)
preds = estimation(x, y, combiner, trained_superlearner, proba=False)
balanced_accuracy_score(y, preds)

0.5

In [56]:

est_dict = {'LR':LogisticRegression(), 'SVC':SVC(probability=True)}

combiner =  train_combiner(x, y, est_dict)
trained_superlearner = train_superlearner(x, y, est_dict)
preds = estimation(x, y, combiner, trained_superlearner, proba=False)

balanced_accuracy_score(y, preds)

0.784

In [59]:

est_dict = {'LR':LogisticRegression(), 'SVC':SVC(probability=True), 'RF':RandomForestClassifier()}

combiner =  train_combiner(x, y, est_dict)
trained_superlearner = train_superlearner(x, y, est_dict)
preds = estimation(x, y, combiner, trained_superlearner, proba=False)

balanced_accuracy_score(y, preds)

0.915