# Ensemble Stacking

NOTE: Functions and Example added to NPML

In [155]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split

In [156]:
def get_out_of_fold(model, X_train, y_train, X_test, k=5, random_state=42):
    """
    Gets Out-Of-Fold Predictions for a model
    
    Parameters
    ----------
    model : list of pandas.DataFrame
        Model to get Out-Of-Fold predictions for
    X_train : numpy.ndarray or pandas.DataFrame
        Features of Training Set
    y_train : numpy.ndarray or pandas.DataFrame
        Target of Training Set
    X_test : numpy.ndarray or pandas.DataFrame
        Featrues of Testing Set
    k : int
        Number of folds
    random_state : int
        The seed of the pseudo random number generator to use when shuffling the data.
        
    Returns
    -------
    numpy.array
        Out-Of-Fold X Train
    numpy.array
        Out-Of-Fold X Test
        
    Raises
    ------
    TypeError
        when X_train, y_train, or X_test isn't a Numpy ndarray or a Pandas DataFrame
    """
    # Check Input
    if (not isinstance(X_train, (np.ndarray, pd.Series, pd.DataFrame))
            or not isinstance(y_train, (np.ndarray, pd.Series, pd.DataFrame))
            or not isinstance(X_test, (np.ndarray, pd.Series, pd.DataFrame))):
        raise TypeError('Input Data must be either a Numpy ndarray, Pandas Series, or Pandas DataFrame')
    
    # Convert to Numpy Array
    if isinstance(X_train, (pd.Series, pd.DataFrame)):
        X_train = X_train.values
    if isinstance(y_train, (pd.Series, pd.DataFrame)):
        y_train = y_train.values
    if isinstance(X_test, (pd.Series, pd.DataFrame)):
        X_test = X_test.values
    
    # Create Folds
    kf = KFold(n_splits=k, random_state=random_state)
    
    # Init oof predictions arrays
    oof_train = np.zeros((len(X_train),))
    oof_test = np.zeros((len(X_test),))
    
    # Create matrix to hold X_test predictions across folds
    # The oof esimations you apply to train, you need to apply to X_test as well. 
    oof_test_folds = np.empty((k, len(X_test)))
    
    # For each fold, create predictions on the out fold (out-of-fold predictions)
    for i, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train)):
        # Train on "in-folds"
        X_in_folds = X_train[train_idx]
        y_in_folds = y_train[train_idx]
        
        # Predict "Out Folds"
        X_out_fold = X_train[test_idx]
        
        # Fit Model
        model.fit(X_in_folds, y_in_folds)
        
        # Make Out-Of-Fold Predictions
        oof_train[test_idx] = model.predict(X_out_fold)
        oof_test_folds[i, :] = model.predict(X_test)
        
    # Take the mean of test across all folds    
    oof_test[:] = oof_test_folds.mean(axis=0)
    
    # Return new X_train, and X_test as numpy arrays for the model. 
    return oof_train, oof_test


def stack(models, X_train, y_train, X_test, k=5, random_state=42):
    """
    Gets Out-Of-Fold predictions for a list of models
    
    Parameters
    ----------
    models : list of pandas.core.frame.DataFrame
        List of models to stack
    X_train : pandas.core.frame.DataFrame
        Features of Training Set
    y_train : pandas.core.frame.DataFrame
        Target of Training Set
    X_test : pandas.core.frame.DataFrame
        Featrues of Testing Set
    k : int
        Number of folds
    random_state : int
        The seed of the pseudo random number generator to use when shuffling the data.
    
    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of the predictions of each of the models
    """
    # Init stacked dicts
    X_train_stacked = {}
    X_test_stacked = {}
    
    # for each model, create out of fold predictions 
    for i, model in enumerate(models):
        X_train_oof, X_test_oof = get_out_of_fold(model, X_train, y_train, X_test, k=k, random_state=random_state)
        
        X_train_stacked[f'model_{i+1}'] = X_train_oof
        X_test_stacked[f'model_{i+1}'] = X_test_oof
        
    return pd.DataFrame(X_train_stacked), pd.DataFrame(X_test_stacked)

In [157]:
# Generate a sample dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=7, n_redundant=1, n_classes=2)

# Train Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# List of Models
models = [
    LogisticRegression(solver='lbfgs'),
    RandomForestClassifier(n_estimators=100)
]

# Get Stacked X
X_train_l2, X_test_l2 = stack(models, X_train, y_train, X_test)

In [158]:
# Individual Accuracy
print('--Individual Accuracy--')
for model in models:
    print(f'{model.__class__.__name__} Accuracy: {model.fit(X_train, y_train).score(X_test, y_test):0.4f}')
    
#Stacked Accuracy
print('\n--Stacked Accuracy--')
stacked_model = LogisticRegression(solver='lbfgs')
print(f'Stacked Accuracy: {stacked_model.fit(X_train_l2, y_train).score(X_test_l2, y_test):0.4f}')

--Individual Accuracy--
LogisticRegression Accuracy: 0.9533
RandomForestClassifier Accuracy: 0.9533

--Stacked Accuracy--
Stacked Accuracy: 0.9667
