# Stacking([Stacking Classifier](https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/#example-1-simple-stacked-classification))

## Load Package

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

In [25]:
#one hot encoder
from  sklearn  import  metrics
def one_hot_encoder(df, label, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if col != label and (df[col].dtype == 'object')]
    #categorical_columns = [col for col in df.columns if col != label and (df[col].dtype == 'object' or len(df[col].unique().tolist()) < 20)]
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    #replace NAs with mean
    df = df.fillna(df.mean())
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns, categorical_columns

# Split to feature and label 
def split_train_test(df, label,key = None, seed = 7, test_size = 0.3):
    from sklearn import cross_validation
    
    #setting
    seed = seed
    test_size = test_size
    
    #give label y
    y = df[label]
    
    #give feature X
    try:
        cols = [col for col in df.columns if col not in [label, key]]
        X = one_hot_encoder(df = df[cols], label = label)[0]
        categorical_columns = one_hot_encoder(df = df[cols], label = label)[2]
    except:
        X = one_hot_encoder(df = df.loc[:, df.columns != label], label = label)[0]
        categorical_columns = one_hot_encoder(df = df.loc[:, df.columns != label], label = label)[2]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)
    return X_train, X_test, y_train, y_test, categorical_columns

# Function for Measure Performance
def measure_performance(X,y,clf, show_accuracy=False, show_classification_report=False, show_confusion_matrix=False, show_roc_auc = False, show_mae = False):
    y_pred = clf.predict(X)
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred))),"\n"

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred)),"\n"
        
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred)),"\n"  
        
    if show_roc_auc:
        print("ROC AUC Score:{0:.3f}".format(metrics.roc_auc_score(y,clf.predict_proba(X)[:,1]))),"\n"
        
    if show_mae:
        print("Mean Absolute Error:{0:.3f}".format(metrics.mean_absolute_error(y, y_pred, multioutput='raw_values')[0])),"\n"

## Load Iris Dataset

In [7]:
#load iris data
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

In [8]:
clf1 = XGBClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
lr = LogisticRegression() 
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, verbose= 1)

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['XGBoost', 
                       'Random Forest', 
                       'Extra Tree',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

Accuracy: 0.95 (+/- 0.02) [XGBoost]
Accuracy: 0.92 (+/- 0.03) [Random Forest]
Accuracy: 0.93 (+/- 0.02) [Extra Tree]
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Accuracy: 0.96 (+/- 0.03) [StackingClassifier]


In [12]:
sclf.fit(X,y)

Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)


StackingClassifier(average_probas=False,
          classifiers=[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=...stimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=1)

## Load Home Credit Dataset

In [143]:
# read data
default_path = "/Users/mayritaspring/Desktop/Github/Home-Credit-Default-Risk/"
import os
os.chdir(default_path)

#use function split_train_test can help to 1.set label and dataset 2.One-hot encoding
#training
application_train = pd.read_csv('../Kaggle data/application_train.csv')
application_train = one_hot_encoder(df = application_train, label = 'TARGET')[0]

#testing
application_test = pd.read_csv('../Kaggle data/application_test.csv')
application_test = one_hot_encoder(df = application_test, label = 'TARGET')[0]

#combine training and testing to resolve one hot enscoding problem
application_df = one_hot_encoder(df = pd.concat([application_train,application_test],keys=[0,1]), label = 'TARGET')[0]

In [144]:
application_train, application_test = application_df.xs(0), application_df.xs(1)
X= application_train.drop('TARGET', axis=1)
y= application_train.TARGET

In [145]:
from sklearn import cross_validation
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)

In [147]:
clf1 = XGBClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
lr = LogisticRegression() 
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, verbose= 1)

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['XGBoost', 
                       'Random Forest', 
                       'Extra Tree',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

Accuracy: 0.92 (+/- 0.00) [XGBoost]
Accuracy: 0.92 (+/- 0.00) [Random Forest]
Accuracy: 0.92 (+/- 0.00) [Extra Tree]
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Accuracy: 0.92 (+/- 0.00) [StackingClassifier]


In [148]:
sclf.fit(X_train,y_train)

Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)


StackingClassifier(average_probas=False,
          classifiers=[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=...stimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=1)

In [161]:
measure_performance(X_test, y_test, sclf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True)
print(measure_performance)

Accuracy:0.919
Classification report
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96     84807
        1.0       0.35      0.01      0.01      7447

avg / total       0.87      0.92      0.88     92254

Confusion matrix
[[84715    92]
 [ 7398    49]]
<function measure_performance at 0x109cae510>


In [150]:
test_df = application_test.drop('TARGET', axis=1)

In [157]:
#Stacking
out_stacking = pd.DataFrame({"SK_ID_CURR":test_df.SK_ID_CURR, "TARGET":sclf.predict_proba(test_df)[:,1]})
out_stacking.to_csv("submissions_toy_stacking.csv", index=False)