<img src = "../../Data/bgsedsc_0.jpg">

# Project: Ensembles

In [25]:
## Set up ----
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns

import random
import time
import scipy
import datetime
import pandas as pd
import numpy as np
import sklearn
import pandas as pd
import numpy as np
import time

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.preprocessing import StandardScaler, OneHotEncoder,  scale
import category_encoders as ce
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# kernel approximators
from sklearn.kernel_approximation import Nystroem, RBFSampler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import gc
from sklearn.metrics import accuracy_score, roc_auc_score

# Random state
rand_state = 1111
# Later cells (base learners, meta-learner, SuperLearner) refer to the seed as
# SEED, so expose it under that name as well to keep a single source of truth.
SEED = rand_state
np.random.seed(rand_state) # impose random seed for reproducibility

# Some base models
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import mlens
from mlens.visualization import corrmat
from mlens.ensemble import SuperLearner, BlendEnsemble, Subsemble, SequentialEnsemble, TemporalEnsemble

# Raw MIMIC extracts: the training table and the test table for the LOS task.
data = pd.read_csv('../Data/mimic_train.csv')
data_test = pd.read_csv('../Data/mimic_test_los.csv')

## Pre-processing

The pre-processing lives in a separate notebook because it is quite extensive and, kept separate, it can be shared across prediction problems. That notebook saves its output to disk, and the cells below read the saved files so that pre-processing does not have to be re-run each time. If need be, pre-processing can be re-run by uncommenting the code in the cell below.

In [27]:
#%run ./preproc.ipynb

In [28]:
# Target comes from the raw training table; features come from the files
# written by the pre-processing notebook.
y = data['LOS']
X = pd.read_csv("../Data/los/X_preproc.csv")
X_test = pd.read_csv("../Data/los/X_test_preproc.csv")
# Rebind `data` to target + pre-processed features, joined column-wise.
data = pd.concat([y, X], axis=1)

In [29]:
# Sub-sampling:
#data = data.sample(
#    frac=0.1, random_state=rand_state
#)

In [30]:
# Sanity check: dimensions of train features, test features and target.
print(X.shape, X_test.shape, y.shape, sep="\n")

(20885, 41)
(5221, 41)
(20885,)


# Prediction

In [31]:
# Re-check the dimensions right before modelling.
print(*(obj.shape for obj in (X, X_test, y)), sep="\n")

(20885, 41)
(5221, 41)
(20885,)


In [None]:
def get_models(seed=None):
    """Generate a library of simple base learners.

    Parameters
    ----------
    seed : int, optional
        Random state passed to the stochastic learners (SVC, logistic
        regression, random forest). When omitted, the notebook-level
        ``rand_state`` is used, so ``get_models()`` keeps working unchanged.

    Returns
    -------
    dict
        Maps a display name ('svm', 'knn', 'random forest', 'logistic') to an
        unfitted scikit-learn classifier.
    """
    if seed is None:
        # Resolved at call time so the function can be defined before the
        # configuration cell has been (re)run.
        seed = rand_state

    # probability=True is needed because downstream code calls predict_proba.
    svc = SVC(C=100, probability=True, gamma='scale', random_state=seed)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=seed, solver='lbfgs', max_iter=1000)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=seed)

    models = {'svm': svc,
              'knn': knn,
              'random forest': rf,
              'logistic': lr,
              }

    return models


def train_predict(model_list, xtrain=None, ytrain=None, xtest=None, ytest=None):
    """Fit every model on the training set and collect test-set probabilities.

    Parameters
    ----------
    model_list : dict
        Maps a display name to an estimator exposing ``fit`` and
        ``predict_proba``.
    xtrain, ytrain : array-like, optional
        Training features and target.
    xtest : array-like, optional
        Features to predict on.
    ytest : array-like, optional
        Test target; only its length is used, to size the output.

    Any data argument left as ``None`` is looked up by name in the notebook
    namespace at call time. (Previously the defaults were evaluated when the
    function was defined, which fails if the data does not exist yet.)

    Returns
    -------
    pandas.DataFrame
        One row per test observation and one column per model, holding the
        second column of ``predict_proba``.

    Raises
    ------
    NameError
        If a data argument is omitted and no notebook variable of that name
        exists.
    """
    namespace = globals()
    resolved = {}
    for arg_name, value in (("xtrain", xtrain), ("ytrain", ytrain),
                            ("xtest", xtest), ("ytest", ytest)):
        if value is None:
            if arg_name not in namespace:
                raise NameError(
                    "train_predict: '%s' was not passed and is not defined" % arg_name
                )
            value = namespace[arg_name]
        resolved[arg_name] = value
    xtrain, ytrain = resolved["xtrain"], resolved["ytrain"]
    xtest, ytest = resolved["xtest"], resolved["ytest"]

    predictions = {}
    # Iterate the argument, not the global `models`, so the function scores
    # exactly the learners it was given.
    for name, m in model_list.items():
        print("%s..." % name, end=" ", flush=False)
        m.fit(xtrain, ytrain)
        predictions[name] = m.predict_proba(xtest)[:, 1]
        print("done")

    # The explicit index keeps the original row count contract (len(ytest)) and
    # makes pandas raise if a model returns a different number of rows.
    P = pd.DataFrame(predictions, index=pd.RangeIndex(ytest.shape[0]))
    print("Done.\n")
    return P


def score_models(P, y):
    """Score each column of predicted probabilities with ROC-AUC.

    Parameters
    ----------
    P : pandas.DataFrame
        One column of predicted scores per model.
    y : array-like
        True labels aligned with the rows of ``P``.

    Returns
    -------
    tuple
        ``(P.columns, scores)`` where ``scores`` is a list of ROC-AUC values
        in column order.

    Notes
    -----
    Relies on ``roc_auc_score`` from ``sklearn.metrics`` being imported in the
    notebook's import cell.
    """
    print("Scoring models.")
    scores = []
    for name in P.columns:
        auc = roc_auc_score(y, P.loc[:, name])
        scores.append(auc)
        # Left-align the model name in a 26-character field for a tidy table.
        print("%-26s: %.3f" % (name, auc))
    return P.columns, scores

In [None]:
# Build the base learners, fit them, and score their test-set probabilities.
# NOTE(review): xtrain / ytrain / xtest / ytest are not defined in any cell
# visible above — confirm a train/test split is created before this cell runs.
models = get_models()
P = train_predict(models, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest)
my_models, my_scores = score_models(P=P, y=ytest)

In [None]:
# Pairwise correlation between the base learners' predicted probabilities.
sns.heatmap(data=P.corr(), annot=True)
plt.show()

In [None]:
# Start the results table: one (model name, AUC) pair per base learner.
results = list(zip(my_models, my_scores))
results

### Averaging

In [None]:
# Simple average: row-wise mean of the base learners' probabilities,
# scored like a single extra model named 'Avg'.
AvgPred = P.mean(axis=1).to_frame(name='Avg')
my_model_avg, my_scores_avg = score_models(AvgPred, ytest)

In [None]:
# Append the averaged ensemble's score to the results list.
# Re-running this cell appends the same entry again.
results = [*results, *zip(my_model_avg, my_scores_avg)]

### Stacking

In [None]:
# Meta-learner that combines the base learners' out-of-fold probabilities.
# The seed is `rand_state`, the random state defined in the set-up cell.
meta_learner = ExtraTreesClassifier(
    n_estimators=50,
    bootstrap=True,
    max_features=0.7,
    random_state=rand_state
)

# Instantiate the ensemble with 5 folds (stacking meta-learner)
sl = SuperLearner(
    folds=5,
    random_state=rand_state,
    verbose=2,
    backend="multiprocessing",
    n_jobs=3
)

# Add the base learners and the meta learner
sl.add(list(models.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set. Uses the same hold-out split (xtest / ytest) as the
# base-learner and averaging cells so the scores are comparable.
# NOTE(review): xtrain / ytrain / xtest / ytest are not defined in any visible
# cell — confirm the split exists before running.
p_sl = sl.predict_proba(xtest)
scoreStackXtres = roc_auc_score(ytest, p_sl[:, 1])
print("\nSuper Learner ROC-AUC score: %.3f" % scoreStackXtres)

In [None]:
## save my model
#filename = 'stacking_model.sav'
#pickle.dump(sl, open(filename, 'wb'))
# 
## check load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#loaded_model.predict_proba(xtest2)

In [None]:
# Record the stacked ensemble's score alongside the earlier entries.
results = [*results, ('StackingExtraTrees', scoreStackXtres)]
results

### Feature propagation

In [None]:
## Select features to propagate
#to_propagate=['pclass','sex_male','age'] # you can add the most important according to random forest, for example
#pointer= [i for i,x in enumerate(df.columns) if x in to_propagate]
#df.columns[pointer]
#sl2 = SuperLearner(
#    folds=5,
#    random_state=SEED,
#    verbose=2,
#    backend="multiprocessing",
#    n_jobs=3
#)
#
## Add the base learners and the meta learner
#sl2.add(list(models.values()), proba=True, propagate_features=pointer)
#sl2.add_meta(meta_learner, proba=True)
#
## Train the ensemble
#sl2.fit(xtrain, ytrain)
#
## Predict the test set
#p_sl2 = sl2.predict_proba(xtest)
#scoreStackXtres2=roc_auc_score(ytest, p_sl2[:, 1])
#print("\nSuper Learner 2 ROC-AUC score: %.3f" % scoreStackXtres2)

In [None]:
## Save results
#results=results+[('StackingExtraTrees2',scoreStackXtres2)]
#results

### Temporal Ensemble

In [None]:
#tl = TemporalEnsemble()
#tl.add(list(models.values()), proba=True)
#tl.add_meta(meta_learner, proba=True)
## Train the ensemble
#tl.fit(xtrain, ytrain)
#p_tl = tl.predict_proba(xtest)
#scoreTempEns=roc_auc_score(ytest, p_tl[:, 1])
#print("\nTemp ROC-AUC score: %.3f" % scoreTempEns)
## Save results
#results=results+[('TemporalEnsemble',scoreTempEns)]
#results

### Blending

In [None]:
#bl =  BlendEnsemble(test_size=0.2, random_state=SEED,n_jobs=3)
#
## Add the base learners and the meta learner
#bl.add(list(models.values()), proba=True, propagate_features=pointer)
#bl.add_meta(meta_learner, proba=True)
#
## Train the ensemble
#bl.fit(xtrain, ytrain)
#
## Predict the test set
#p_bl = bl.predict_proba(xtest)
#scoreBlendXtrees=roc_auc_score(ytest, p_bl[:, 1])
#print("\nBlend ROC-AUC score: %.3f" % scoreBlendXtrees)

In [None]:
## Save results
#results=results+[('BlendXtrees',scoreBlendXtrees)]
#results

### Subsemble

In [None]:
#sub = Subsemble(partitions=3, folds=4,partition_estimator=KMeans(3, random_state=SEED),
#               random_state=SEED)
#
#sub.add(list(models.values()), proba=True, propagate_features=pointer)
#sub.add_meta(meta_learner, proba=True)
#
## Train the ensemble
#sub.fit(xtrain, ytrain)
#
## Predict the test set
#p_sub = sub.predict_proba(xtest)
#scoreSubS=roc_auc_score(ytest, p_sub[:, 1])
#print("\nSubSamble ROC-AUC score: %.3f" % scoreSubS)

In [None]:
## Save results
#results=results+[('SubSambleXtrees',scoreSubS)]
#results

### Multi-layer

In [None]:
#ensemble = SequentialEnsemble(random_state=SEED)
#
## The initial layer is a blended layer, same as a layer in the BlendEnsemble
#ensemble.add('blend',
#             list(models.values()), proba=True, propagate_features=pointer,random_state=SEED)
#
## The second layer is a stacked layer, same as a layer of the SuperLearner
#ensemble.add('stack', [meta_learner, meta_learner2], proba=True,random_state=SEED)
#
## The third layer is a subsembled layer, same as a layer of the Subsemble
#ensemble.add('subsemble', [meta_learner, meta_learner2], proba=True,random_state=SEED)
#
## The meta estimator is added as in any other ensemble
#ensemble.add_meta(LogisticRegression(), proba=True)
## Train the ensemble
#ensemble.fit(xtrain, ytrain)
#
## Predict the test set
#p_multi = ensemble.predict_proba(xtest)
#scoreMultiE=roc_auc_score(ytest, p_multi[:, 1])
#print("\nMulti-Ensemble ROC-AUC score: %.3f" % scoreMultiE)

In [None]:
## Save results
#results=results+[('Multi-Layer',scoreMultiE)]
#results

## Report

In [None]:
# Turn the (method, score) pairs into a table, best score first.
results_df = (
    pd.DataFrame(results)
    .set_axis(['Method', 'AUC_score'], axis=1)
    .sort_values(by='AUC_score', ascending=False)
)
results_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the AUC per method, in the order of results_df.
sns.set(style="whitegrid")
ax = sns.barplot(x="Method", y="AUC_score", data=results_df)
# Rotate the tick labels through tick_params so `ax` stays an Axes object;
# assigning the result of set_xticklabels replaced it with a list of labels.
ax.tick_params(axis="x", labelrotation=90)

# Zoom the y-axis on the 0.65-0.85 band; bars outside it are clipped.
ax.set_ylim(0.65, 0.85);