# In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split


class SklearnHelper(object):
    '''
    Thin wrapper that instantiates an estimator class with a fixed seed and
    forwards the usual fit / predict calls to it.

    - clf: estimator class (not an instance); it is called as clf(**params)
    - seed: value passed to the estimator as `random_state`; it overrides any
      `random_state` already present in `params`
    - params: optional dict of keyword arguments for the estimator
    '''
    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: the caller's dict must not gain a 'random_state' key,
        # and the default params=None has to behave like "no extra arguments".
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, xtrain, ytrain):
        '''Fit the wrapped estimator; returns nothing.'''
        self.clf.fit(xtrain, ytrain)

    def predict(self, x):
        '''Return whatever the wrapped estimator's predict(x) returns.'''
        return self.clf.predict(x)

    def predict_proba(self, x):
        '''Return whatever the wrapped estimator's predict_proba(x) returns.'''
        return self.clf.predict_proba(x)

    def fit(self, x, y):
        '''Fit the wrapped estimator and return the result of its fit call.'''
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        '''
        Refit the wrapped estimator on (x, y) and return the
        `feature_importances_` attribute of the object returned by fit.
        '''
        return self.clf.fit(x, y).feature_importances_
    
def train_multi_clf(clfs, xtrain, ytrain, xtest, n=5):
    '''
    Produce first-level (stacking) features from several classifiers.

    For every classifier, an n-fold stratified split of the training data is
    used: the model is fit on the training part of each fold, its predictions
    on the held-out part fill the out-of-fold training column, and its
    predictions on xtest are averaged over the n folds.

    - clfs: sequence of objects exposing fit(x, y) and predict(x)
      (predict_proba(x) is used instead of predict when available)
    - xtrain, ytrain: training features / labels (pandas objects or arrays
      that expose `.shape` and accept integer-array row indexing)
    - xtest: test features
    - n: number of folds
    - returns: (train_pred, test_pred) numpy arrays of shape
      (xtrain.shape[0], len(clfs)) and (xtest.shape[0], len(clfs))
    '''
    skf = StratifiedKFold(n_splits=n)
    # Splits are not shuffled, so they are identical for every classifier;
    # compute them once instead of once per classifier.
    folds = list(skf.split(xtrain, ytrain))
    train_pred = np.zeros((xtrain.shape[0], len(clfs)))
    test_pred = np.zeros((xtest.shape[0], len(clfs)))

    for i, clf in enumerate(clfs):
        #for each classifier, perform kfold validation
        print('training using [%s]' % clf.__class__.__name__)
        test_pred_i = np.zeros((xtest.shape[0], n))
        for j, (train_index, cv_index) in enumerate(folds):
            print('Fold {0}, size 0f train is {1}, size of test is {2}'.format(j, len(train_index), len(cv_index)))
            # Use the function's own arguments (the previous code read
            # notebook globals xtrain1 / ytrain1 / xtest1 instead).
            x_train, x_cv = _take_rows(xtrain, train_index), _take_rows(xtrain, cv_index)
            y_train = _take_rows(ytrain, train_index)

            clf.fit(x_train, y_train)

            train_pred[cv_index, i] = _positive_class_scores(clf, x_cv)
            test_pred_i[:, j] = _positive_class_scores(clf, xtest)
        # average the per-fold test predictions into one column
        test_pred[:, i] = test_pred_i.mean(1)
    return train_pred, test_pred  #numpy array


def _take_rows(data, index):
    '''Select rows by position from a pandas object (.iloc) or an array.'''
    if hasattr(data, 'iloc'):
        return data.iloc[index]
    return data[index]


def _positive_class_scores(clf, data):
    '''
    Return one score per row of `data`.

    Prefers clf.predict_proba when the object has it, otherwise clf.predict.
    A 2-D result is reduced to its second column (index 1); a 1-D result
    (e.g. hard labels) is used as-is instead of failing on `[:, 1]`.
    '''
    scorer = getattr(clf, 'predict_proba', None)
    raw = np.asarray(scorer(data) if callable(scorer) else clf.predict(data))
    return raw[:, 1] if raw.ndim == 2 else raw

def pred_test_using_multi_clfs(clf, train_pred_comb, test_pred_comb):
    '''
    Placeholder for the second-level prediction step; not implemented yet.

    The arguments are meant to be predictions produced by train_multi_clf,
    combined column-wise beforehand, e.g.:
    train_pred_comb=np.concatenate((et_train, rf_train), axis=1)

    Currently does nothing with its arguments and returns None.
    '''
    return None

def feat_imp(clfs, xtrain, ytrain):
    '''
    Collect and plot the feature importances of several models.

    - clfs: sequence of objects exposing feature_importances(x, y)
    - xtrain: DataFrame of features; its column names label the rows
    - ytrain: labels passed through to feature_importances
    - returns: DataFrame with a 'features' column plus one column per model
    '''
    feat_imp_dict = {'features': xtrain.columns.values}
    for clf in clfs:
        # Wrappers that keep the real estimator in `.clf` all share one class
        # name, which made every model overwrite the same column. Label by the
        # wrapped estimator's class instead, falling back to the object itself.
        base_name = getattr(clf, 'clf', clf).__class__.__name__
        # Two models of the same class still need distinct column names.
        column, suffix = base_name, 1
        while column in feat_imp_dict:
            suffix += 1
            column = '%s_%d' % (base_name, suffix)
        feat_imp_dict[column] = clf.feature_importances(xtrain, ytrain)
    feat_df = pd.DataFrame.from_dict(feat_imp_dict)
    # x='features' labels each bar group with the feature name rather than
    # the integer row index.
    feat_df.plot(x='features', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return feat_df
        
        
        
def plot_conf_roc(test_true, test_pred, pred_proba):
    '''
    Draw two figures: a confusion matrix, then a ROC curve with its AUC.

    - test_true: true values
    - test_pred: predicted values, compared with test_true for the confusion matrix
    - pred_proba: predicted probabilities, used with test_true for the roc-auc curve
    '''
    # First figure: confusion matrix rendered by the external helper.
    confusion = metrics.confusion_matrix(test_true, test_pred)
    plt.figure()
    plot_confusion_matrix(
        confusion,
        classes=[0, 1],
        normalize=True,
        title='Confusion matrix',
    )
    plt.show()

    print('\n-----------roc curve-------------')
    # Second figure: ROC curve; the thresholds returned by roc_curve are unused.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test_true, pred_proba)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)
    curve_label = 'AUC = %0.2f' % area_under_curve
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=curve_label)
    plt.legend(loc='lower right')
    # dashed red diagonal from (0, 0) to (1, 1) for reference
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

# Script entry point. The guard previously compared against 'main', which is
# never the value of __name__, so none of this ran.
# NOTE(review): x, y, RandomForestClassifier, ExtraTreesClassifier and xgb are
# not defined in this section; they are presumably provided elsewhere — confirm.
if __name__=='__main__':
    rf_params = {'n_jobs':-1, 'n_estimators':500, 'max_depth':6, 'max_features':'sqrt', 'verbose':0}
    # 'min_samples_leaf' is the estimator's parameter name ('min_sample_leaf'
    # would be rejected as an unexpected keyword argument).
    et_params = {'n_jobs':-1, 'n_estimators':500, 'max_depth': 8, 'min_samples_leaf': 2, 'verbose':0}
    rf = SklearnHelper(clf=RandomForestClassifier, seed=0, params=rf_params)
    et = SklearnHelper(clf=ExtraTreesClassifier, seed=0, params=et_params)
    xtrain, xtest, ytrain, ytest=train_test_split(x, y, test_size=0.3)
    # first level: one column of predictions per base classifier
    clf_train, clf_test = train_multi_clf([rf, et], xtrain, ytrain, xtest, n=5)
    #et_train, et_test = train_multi_clf(et, xtrain, ytrain, xtest, n=5)

    #perform the second level learning model via XGBoost
    gbm = xgb.XGBClassifier(
        n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9, subsample=0.8,
        colsample_bytree=0.8, objective='binary:logistic', nthread=-1, scale_pos_weight=1
    ).fit(clf_train, ytrain)
    # The meta-model was fit on the stacked first-level predictions, so it has
    # to predict from the matching stacked test features, not the raw xtest.
    predictions=gbm.predict(clf_test)
    # Generate a dataframe to check the distribution of the base predictions:
    # base_predictions_train=pd.DataFrame({'RandomForest': rf_train.ravel(),
    # 'ExtraTrees': et_train.ravel(),
    # 'AdaBoost':ada_train.ravel(),
    # 'GradientBoost':gb_train.ravel()})
    # -----------------------------------------------
    # |AdaBoost|ExtraTrees|GradientBoost|RandomForest|
    # |0.0     |0.0       |0.0          |0.0
    # |1.0     |1.0       |1.0          |1.0
    # |1.0     |0.0       |1.0          |0.0
    # |1.0     |1.0       |1.0          |1.0