## 1. Package and Function

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

In [3]:
#one hot encoder
from  sklearn  import  metrics
def one_hot_encoder(df, label, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if col != label and (df[col].dtype == 'object')]
    #categorical_columns = [col for col in df.columns if col != label and (df[col].dtype == 'object' or len(df[col].unique().tolist()) < 20)]
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    #replace NAs with mean
    df = df.fillna(df.mean())
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns, categorical_columns

# Split to feature and label 
def split_train_test(df, label,key = None, seed = 7, test_size = 0.3):
    from sklearn import cross_validation
    
    #setting
    seed = seed
    test_size = test_size
    
    #give label y
    y = df[label]
    
    #give feature X
    try:
        cols = [col for col in df.columns if col not in [label, key]]
        X = one_hot_encoder(df = df[cols], label = label)[0]
        categorical_columns = one_hot_encoder(df = df[cols], label = label)[2]
    except:
        X = one_hot_encoder(df = df.loc[:, df.columns != label], label = label)[0]
        categorical_columns = one_hot_encoder(df = df.loc[:, df.columns != label], label = label)[2]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)
    return X_train, X_test, y_train, y_test, categorical_columns

# Function for Measure Performance
def measure_performance(X,y,clf, show_accuracy=False, show_classification_report=False, show_confusion_matrix=False, show_roc_auc = False, show_mae = False):
    y_pred = clf.predict(X)
    if show_accuracy:
        print ("Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred))),"\n"

    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y,y_pred)),"\n"
        
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y,y_pred)),"\n"  
        
    if show_roc_auc:
        print("ROC AUC Score:{0:.3f}".format(metrics.roc_auc_score(y,clf.predict_proba(X)[:,1]))),"\n"
        
    if show_mae:
        print("Mean Absolute Error:{0:.3f}".format(metrics.mean_absolute_error(y, y_pred, multioutput='raw_values')[0])),"\n"

## 2. Example
### 2.1 StackingClassifier([Reference](https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/#example-1-simple-stacked-classification))
- **Iris Dataset**

In [3]:
#load iris data
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

In [8]:
clf1 = XGBClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
lr = LogisticRegression() 
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, verbose= 1)

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['XGBoost', 
                       'Random Forest', 
                       'Extra Tree',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

Accuracy: 0.95 (+/- 0.02) [XGBoost]
Accuracy: 0.92 (+/- 0.03) [Random Forest]
Accuracy: 0.93 (+/- 0.02) [Extra Tree]
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Accuracy: 0.96 (+/- 0.03) [StackingClassifier]


In [12]:
sclf.fit(X,y)

Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)


StackingClassifier(average_probas=False,
          classifiers=[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=...stimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=1)

- **Home Credit Dataset**

In [4]:
# read data
default_path = "/Users/mayritaspring/Desktop/Github/Home-Credit-Default-Risk/"
import os
os.chdir(default_path)

#use function split_train_test can help to 1.set label and dataset 2.One-hot encoding
#training
application_train = pd.read_csv('../Kaggle data/application_train.csv')
application_train = one_hot_encoder(df = application_train, label = 'TARGET')[0]

#testing
application_test = pd.read_csv('../Kaggle data/application_test.csv')
application_test = one_hot_encoder(df = application_test, label = 'TARGET')[0]

#combine training and testing to resolve one hot enscoding problem
application_df = one_hot_encoder(df = pd.concat([application_train,application_test],keys=[0,1]), label = 'TARGET')[0]

In [5]:
application_train, application_test = application_df.xs(0), application_df.xs(1)
X= application_train.drop('TARGET', axis=1)
y= application_train.TARGET

In [6]:
from sklearn import cross_validation
seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)

In [147]:
clf1 = XGBClassifier()
clf2 = RandomForestClassifier()
clf3 = ExtraTreesClassifier()
lr = LogisticRegression() 
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr, verbose= 1)

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['XGBoost', 
                       'Random Forest', 
                       'Extra Tree',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

Accuracy: 0.92 (+/- 0.00) [XGBoost]
Accuracy: 0.92 (+/- 0.00) [Random Forest]
Accuracy: 0.92 (+/- 0.00) [Extra Tree]
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)
Accuracy: 0.92 (+/- 0.00) [StackingClassifier]


In [148]:
sclf.fit(X_train,y_train)

Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)
Fitting classifier2: randomforestclassifier (2/3)
Fitting classifier3: extratreesclassifier (3/3)


StackingClassifier(average_probas=False,
          classifiers=[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=...stimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)],
          meta_classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=1)

In [161]:
measure_performance(X_test, y_test, sclf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True)
print(measure_performance)

Accuracy:0.919
Classification report
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96     84807
        1.0       0.35      0.01      0.01      7447

avg / total       0.87      0.92      0.88     92254

Confusion matrix
[[84715    92]
 [ 7398    49]]
<function measure_performance at 0x109cae510>


In [150]:
test_df = application_test.drop('TARGET', axis=1)

In [157]:
#Stacking
out_stacking = pd.DataFrame({"SK_ID_CURR":test_df.SK_ID_CURR, "TARGET":sclf.predict_proba(test_df)[:,1]})
out_stacking.to_csv("submissions_toy_stacking.csv", index=False)

## 2.2 Define Function
- **Sklearn Dataset**

In [53]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

#Split to Training and Testing
from sklearn import cross_validation
seed = 7
test_size = 0.3
X, y = make_classification(n_samples=1000, n_features=4,
                            n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=seed)

In [8]:
from sklearn.model_selection import KFold

In [95]:
NFOLDS = 5
ntrain = X_train.shape[0] #700
ntest = X_test.shape[0] #300
kf = KFold(n_splits = NFOLDS, random_state=0)
print('Training data is', ntrain)
print('Testing data is', ntest) 
print(kf)
print(kf.get_n_splits(X))

Training data is 215257
Testing data is 92254
KFold(n_splits=5, random_state=0, shuffle=False)
5


In [96]:
def get_oof(clf, X_train, y_train, X_test):
    oof_train = np.zeros((ntrain,)) #700 * 1; 長度為700的全0 array
    oof_test = np.zeros((ntest,)) #300 * 1; 長度為300的全0 array
    oof_test_skf = np.empty((NFOLDS, ntest)) #一個用隨機值填充的5*300的矩陣，用來存放5次交叉驗證後的預測結果
    for i, (train_index, test_index) in enumerate(kf.split(X_train)): #700 * 5
        #5次交叉，5次循環
        #kf實際上是一个迭代器，是從700個樣本中分成了5組訓練集和測試集的索引號
        X_tr = X_train[train_index] #560 * 4(n_features=4); 當前循環，當前實驗的訓練數據
        y_tr = y_train[train_index] #560 * 1; 當前循環的訓練數據標籤
        X_te = X_train[test_index] #140 * 4; d當前循環的測試數據
        clf.fit(X_tr, y_tr) #用模型去fit數據，也就是訓練預測模型
        oof_train[test_index] = clf.predict(X_te) #把140 * 1; 測試數據的預測標籤按照對應索引，放到oof_train對應索引處，做完5次交叉驗證會補齊成為700 * 1
        oof_test_skf[i, :] = clf.predict(X_test) #300 * 1; 用當前的模型，預測所有測試數據的標籤，並放到oof_test_skf的一行中
        
        #5次實驗做完，把5次得到的结果求平均
        oof_test[:] = oof_test_skf.mean(axis=0) #300 * 1
        return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) #oof_train.reshape(-1, 1): 700 * 1; oof_test.reshape(-1, 1): 300 * 1

In [97]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [None]:
rpt1, rpt2 = get_oof(clf, X_train, y_train, X_test)     

# 2.3 Ensembling & Stacking models
- **Python Classes**

def init : Python standard for invoking the default constructor for the class. This means that when you want to create an object (classifier), you have to give it the parameters of clf (what sklearn classifier you want), seed (random seed) and params (parameters for the classifiers).

In [9]:
from sklearn.model_selection import KFold
# Some useful parameters which will come in handy later on
ntrain = X_train.shape[0]
ntest = X_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
#kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
kf = KFold(n_splits = NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

- **Out-of-Fold Predictions**

In [10]:
def get_oof(clf, X_train, y_train, X_test):
    oof_train = np.zeros((ntrain,)) #700 * 1; 長度為700的全0 array
    oof_test = np.zeros((ntest,)) #300 * 1; 長度為300的全0 array
    oof_test_skf = np.empty((NFOLDS, ntest)) #一個用隨機值填充的5*300的矩陣，用來存放5次交叉驗證後的預測結果
    for i, (train_index, test_index) in enumerate(kf.split(X_train)): #700 * 5
        #5次交叉，5次循環
        #kf實際上是一个迭代器，是從700個樣本中分成了5組訓練集和測試集的索引號
        X_tr = X_train[train_index] #560 * 4(n_features=4); 當前循環，當前實驗的訓練數據
        y_tr = y_train[train_index] #560 * 1; 當前循環的訓練數據標籤
        X_te = X_train[test_index] #140 * 4; d當前循環的測試數據
        clf.fit(X_tr, y_tr) #用模型去fit數據，也就是訓練預測模型
        oof_train[test_index] = clf.predict(X_te) #把140 * 1; 測試數據的預測標籤按照對應索引，放到oof_train對應索引處，做完5次交叉驗證會補齊成為700 * 1
        oof_test_skf[i, :] = clf.predict(X_test) #300 * 1; 用當前的模型，預測所有測試數據的標籤，並放到oof_test_skf的一行中
        
        #5次實驗做完，把5次得到的结果求平均
        oof_test[:] = oof_test_skf.mean(axis=0) #300 * 1
        return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) #oof_train.reshape(-1, 1): 700 * 1; oof_test.reshape(-1, 1): 300 * 1

In [11]:
X_train.shape
y_train.shape
X_test.shape

(92254, 261)

### 2.3.1 Generating our Base First-Level Models

===
Set parameters for 5 classifiers


In [12]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [13]:
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

In [14]:
# Create 5 objects that represent our 5 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

===
Creating NumPy arrays out of our train and test sets

In [15]:
# Create Numpy arrays of train, test and target (Survived) dataframes to feed into our models
y_train = y_train.ravel()
y_test  = y_test.ravel()
X_train = X_train.values 
X_test = X_test.values 

In [16]:
print("X_train is",X_train.shape)
print("y_train is",y_train.shape)
print("X_test is",X_test.shape)
print("y_test is",y_test.shape)

X_train is (215257, 261)
y_train is (215257,)
X_test is (92254, 261)
y_test is (92254,)


===
Output of the First level Predictions

In [None]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,X_train, y_train, X_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,X_train, y_train, X_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,X_train, y_train, X_test) # Support Vector Classifier

print("Training is complete")

## 2.3.2 Second-Level Predictions from the First-level Output

In [None]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()

In [None]:
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Viridis',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')

In [None]:
X_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
X_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

> Second level learning model via XGBoost

In [None]:
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1)
gbm.fit(X_train, y_train)
predictions = gbm.predict(X_test)