In [1360]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# set some options for better view
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# import plotting library built on top of matplotlib
import seaborn as sns

# set some settings related to style of plots that will render
sns.set_style("whitegrid")
sns.set_context("poster")

import warnings
warnings.filterwarnings('ignore')

In [1452]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

In [795]:
# load training and test examples

loan_train = pd.read_csv('./data/train_u6lujuX.csv', index_col='Loan_ID')
loan_test = pd.read_csv('./data/test_Y3wMUE5.csv', index_col='Loan_ID')

In [796]:
# take a sneak peak at some of the examples
loan_train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


In [797]:
# number of training examples
print 'Number of training examples {0} '.format(loan_train.shape[0])

Number of training examples 614 


In [798]:
# number of test examples
print 'Number of test examples {0} '.format(loan_test.shape[0])

Number of test examples 367 


In [799]:
# class distribution
loan_train.Loan_Status.value_counts()

Y    422
N    192
dtype: int64

** Most of the applications for loan were accepted. **

In [1226]:
loan_train.Self_Employed.value_counts()

No     500
Yes     82
dtype: int64

## Importing external scripts

In [1501]:
%run scripts/helper.py
%run scripts/model.py
%run scripts/data.py

In [1414]:
data = Data(loan_train, loan_test, 'Loan_Status')

# preprocessing includes replacing missing values with meaninigful substitutions
data.pre_processing()

In [1483]:
train_df = data.get_train_X()
train_target = data.get_train_Y()
test_df = data.get_test_X()

In [1416]:
# split into training and hold out set
X_train, X_test, y_train, y_test = split_dataset(train_df, train_target)

In [1417]:
# encode variables
X_train_encode, X_test_encode = encode(X_train, X_test)

In [1418]:
# one hot encoding
X_train_hot, X_test_hot = vectorizer(X_train, X_test)

## Logistic Regression

In [1420]:
lr = LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.1, 1, 10, 100]}

model = Model()

best_model, best_params, best_score, grid_scores = model.grid_search(X_train_encode, y_train, lr, parameters, 'accuracy')

In [1421]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0) 
 best params are {'C': 0.0001} 
 best score is 0.680652680653 
 and grid scores are [mean: 0.68065, std: 0.00400, params: {'C': 0.0001}, mean: 0.67832, std: 0.00550, params: {'C': 0.001}, mean: 0.67599, std: 0.00942, params: {'C': 0.1}, mean: 0.67832, std: 0.01176, params: {'C': 1}, mean: 0.67832, std: 0.01176, params: {'C': 10}, mean: 0.67599, std: 0.01403, params: {'C': 100}] 


In [1422]:
lr = LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.1, 1, 10, 100]}

model = Model()

best_model, best_params, best_score, grid_scores = model.grid_search(X_train_hot, y_train, lr, parameters, 'accuracy')

In [1423]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0) 
 best params are {'C': 0.1} 
 best score is 0.694638694639 
 and grid scores are [mean: 0.68298, std: 0.00293, params: {'C': 0.0001}, mean: 0.68298, std: 0.00293, params: {'C': 0.001}, mean: 0.69464, std: 0.01372, params: {'C': 0.1}, mean: 0.68298, std: 0.01166, params: {'C': 1}, mean: 0.67832, std: 0.02195, params: {'C': 10}, mean: 0.66900, std: 0.01327, params: {'C': 100}] 


## Random Forest Classifier

In [1396]:
rf = RandomForestClassifier()
parameters = {"n_estimators": [50, 100, 200, 500, 1000],
              "min_samples_leaf": [1, 2, 3, 5],
              "criterion": ['gini', 'entropy']}

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_encode, y_train, rf, parameters, 'accuracy')

In [1397]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 
 best params are {'n_estimators': 500, 'criterion': 'entropy', 'min_samples_leaf': 3} 
 best score is 0.808857808858 
 and grid scores are [mean: 0.76923, std: 0.02914, params: {'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.78089, std: 0.04450, params: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.78555, std: 0.03450, params: {'n_estimators': 200, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.78555, std: 0.04020, params: {'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.78089, std: 0.04444, params: {'n_estimators': 1000, 'criterio

In [1424]:
rf = RandomForestClassifier()
parameters = {"n_estimators": [50, 100, 200, 500, 1000],
              "min_samples_leaf": [1, 2, 3, 5],
              "criterion": ['gini', 'entropy']}

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_hot, y_train, rf, parameters, 'accuracy')

In [1425]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 
 best params are {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 3} 
 best score is 0.689976689977 
 and grid scores are [mean: 0.66200, std: 0.03882, params: {'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.66434, std: 0.03410, params: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.65501, std: 0.02986, params: {'n_estimators': 200, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.67599, std: 0.02957, params: {'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.66667, std: 0.02993, params: {'n_estimators': 1000, 'criterion': 'g

## Gradient Boosting Classifier

In [1426]:
gbc = GradientBoostingClassifier()
parameters = {"n_estimators": [200],
              "learning_rate": [0.01],
              "subsample": [.7, .8, .9]
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_encode, y_train, gbc, parameters, 'accuracy')

In [1427]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False) 
 best params are {'n_estimators': 200, 'subsample': 0.8, 'learning_rate': 0.01} 
 best score is 0.785547785548 
 and grid scores are [mean: 0.78322, std: 0.04221, params: {'n_estimators': 200, 'subsample': 0.7, 'learning_rate': 0.01}, mean: 0.78555, std: 0.04026, params: {'n_estimators': 200, 'subsample': 0.8, 'learning_rate': 0.01}, mean: 0.78555, std: 0.04026, params: {'n_estimators': 200, 'subsample': 0.9, 'learning_rate': 0.01}] 


In [1429]:
gbc = GradientBoostingClassifier()
parameters = {"n_estimators": [200],
              "learning_rate": [0.01],
              "subsample": [.7, .8, .9]
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_hot.toarray(), y_train, 
                                                                     gbc, parameters, 'accuracy')

In [1430]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False) 
 best params are {'n_estimators': 200, 'subsample': 0.8, 'learning_rate': 0.01} 
 best score is 0.69696969697 
 and grid scores are [mean: 0.68998, std: 0.01430, params: {'n_estimators': 200, 'subsample': 0.7, 'learning_rate': 0.01}, mean: 0.69697, std: 0.00264, params: {'n_estimators': 200, 'subsample': 0.8, 'learning_rate': 0.01}, mean: 0.69697, std: 0.01148, params: {'n_estimators': 200, 'subsample': 0.9, 'learning_rate': 0.01}] 


## Extra Trees Classifier

In [1407]:
etr = ExtraTreesClassifier()
parameters = {"n_estimators": [10, 50, 100, 500],
             "min_samples_leaf": [1, 2, 3],
             "criterion": ['gini', 'entropy']
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_encode, y_train, etr, parameters, 'accuracy')

In [1408]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False) 
 best params are {'n_estimators': 100, 'criterion': 'entropy', 'min_samples_leaf': 2} 
 best score is 0.783216783217 
 and grid scores are [mean: 0.70163, std: 0.04474, params: {'n_estimators': 10, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.73193, std: 0.02523, params: {'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.72727, std: 0.03709, params: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.73893, std: 0.02216, params: {'n_estimators': 500, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.74359, std: 0.02156, params: {'n_estimators': 10, 'criterion': 'gini', 'min_sam

In [1431]:
etr = ExtraTreesClassifier()
parameters = {"n_estimators": [10, 50, 100],
             "min_samples_leaf": [1, 2, 3],
             "criterion": ['gini', 'entropy']
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_hot, y_train, etr, parameters, 'accuracy')

In [1432]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False) 
 best params are {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 2} 
 best score is 0.687645687646 
 and grid scores are [mean: 0.60373, std: 0.06356, params: {'n_estimators': 10, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.64103, std: 0.03288, params: {'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.64336, std: 0.03423, params: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}, mean: 0.66900, std: 0.03568, params: {'n_estimators': 10, 'criterion': 'gini', 'min_samples_leaf': 2}, mean: 0.67599, std: 0.03214, params: {'n_estimators': 50, 'criterion': 'gini', 'min_samples_le

## Extreme Gradient Boosting

In [1579]:
xgb_est = xgb.XGBClassifier()
parameters = {"n_estimators": [100, 300],
             "subsample": [0.8, 0.9],
             "colsample_bytree": [0.6, 0.7, 0.8],
             "learning_rate": [0.01],
             "min_child_weight": [1, 5, 10],
             "max_depth": [3, 5, 7],
             "gamma": [1, 2, 3],
             "nthread": [4]
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train_encode, y_train, xgb_est, parameters, 'accuracy')

In [1580]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is XGBClassifier(base_score=0.5, colsample_bytree=0.6, gamma=1,
       learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=5, missing=None, n_estimators=300, nthread=4,
       objective='binary:logistic', seed=0, silent=True, subsample=0.8) 
 best params are {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'nthread': 4, 'min_child_weight': 5, 'n_estimators': 300, 'subsample': 0.8, 'max_depth': 3, 'gamma': 1} 
 best score is 0.804195804196 
 and grid scores are [mean: 0.79953, std: 0.02792, params: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'nthread': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8, 'max_depth': 3, 'gamma': 1}, mean: 0.79720, std: 0.02406, params: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'nthread': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9, 'max_depth': 3, 'gamma': 1}, mean: 0.79487, std: 0.02309, params: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'nthread': 4, 'min_child_weight'

## Train Models

In [1581]:
# train 3 models on the training examples
lr_final_enc = LogisticRegression(C=0.001).fit(X_train_encode, y_train)
lr_final_hot = LogisticRegression(C=0.1).fit(X_train_hot, y_train)

rf_final_enc = RandomForestClassifier(n_estimators=500, criterion='entropy', min_samples_leaf=3).fit(X_train_encode, y_train)
rf_final_hot = RandomForestClassifier(n_estimators=100, criterion='entropy', min_samples_leaf=3).fit(X_train_hot, y_train)

etc_final_enc = ExtraTreesClassifier(n_estimators=100, criterion='entropy', min_samples_leaf=2).fit(X_train_encode, y_train)
etc_final_hot = ExtraTreesClassifier(n_estimators=100, criterion='gini', min_samples_leaf=2).fit(X_train_hot, y_train)

gbc_final_enc = GradientBoostingClassifier(n_estimators=200, learning_rate=.01, subsample=0.8).fit(X_train_encode, y_train)
gbc_final_hot = GradientBoostingClassifier(n_estimators=200, learning_rate=.01, subsample=0.8).fit(X_train_hot.toarray(), y_train)

xgb_est_final_enc = xgb.XGBClassifier(n_estimators=300, learning_rate=.01, min_child_weight=5, 
                                  subsample=0.7, colsample_bytree=0.6).fit(X_train_encode, y_train)

In [1582]:
lr_predict_enc = lr_final_enc.predict(X_train_encode).reshape(-1, 1)
lr_predict_hot = lr_final_hot.predict(X_train_hot).reshape(-1, 1)

rf_predict_enc = rf_final_enc.predict(X_train_encode).reshape(-1, 1)
rf_predict_hot = rf_final_hot.predict(X_train_hot).reshape(-1, 1)

etc_predict_enc = etc_final_enc.predict(X_train_encode).reshape(-1, 1)
etc_predict_hot = etc_final_hot.predict(X_train_hot).reshape(-1, 1)

gbc_predict_enc = gbc_final_enc.predict(X_train_encode).reshape(-1, 1)
gbc_predict_hot = gbc_final_hot.predict(X_train_hot.toarray()).reshape(-1, 1)

xgb_predict_enc = xgb_est_final_enc.predict(X_train_encode).reshape(-1, 1)

In [1470]:
# lr_predict_enc = lr_final_enc.predict_proba(X_train_encode)
# lr_predict_hot = lr_final_hot.predict_proba(X_train_hot)

# rf_predict_enc = rf_final_enc.predict_proba(X_train_encode)
# rf_predict_hot = rf_final_hot.predict_proba(X_train_hot)

# etc_predict_enc = etc_final_enc.predict_proba(X_train_encode)
# etc_predict_hot = etc_final_hot.predict_proba(X_train_hot)

# gbc_predict_enc = gbc_final_enc.predict_proba(X_train_encode)
# gbc_predict_hot = gbc_final_hot.predict_proba(X_train_hot.toarray())

# xgb_predict_enc = xgb_est_final_enc.predict_proba(X_train_encode)

In [1589]:
all_preds = np.hstack([rf_predict_enc, gbc_predict_enc, xgb_predict_enc])

## Second Stage Classifier

In [1613]:
lr_meta = LogisticRegression(C=0.01)
lr_meta.fit(all_preds, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [1614]:
lr_predict_test_enc = lr_final_enc.predict(X_test_encode).reshape(-1, 1)
lr_predict_test_hot = lr_final_hot.predict(X_test_hot).reshape(-1, 1)

rf_predict_test_enc = rf_final_enc.predict(X_test_encode).reshape(-1, 1)
rf_predict_test_hot = rf_final_hot.predict(X_test_hot).reshape(-1, 1)

etc_predict_test_enc = etc_final_enc.predict(X_test_encode).reshape(-1, 1)
etc_predict_test_hot = etc_final_hot.predict(X_test_hot).reshape(-1, 1)

gbc_predict_test_enc = gbc_final_enc.predict(X_test_encode).reshape(-1, 1)
gbc_predict_test_hot = gbc_final_hot.predict(X_test_hot.toarray()).reshape(-1, 1)

xgb_est_predict_test_enc = xgb_est_final_enc.predict(X_test_encode).reshape(-1, 1)

In [1615]:
# lr_predict_test_enc = lr_final_enc.predict_proba(X_test_encode)
# lr_predict_test_hot = lr_final_hot.predict_proba(X_test_hot)

# rf_predict_test_enc = rf_final_enc.predict_proba(X_test_encode)
# rf_predict_test_hot = rf_final_hot.predict_proba(X_test_hot)

# etc_predict_test_enc = etc_final_enc.predict_proba(X_test_encode)
# etc_predict_test_hot = etc_final_hot.predict_proba(X_test_hot)

# gbc_predict_test_enc = gbc_final_enc.predict_proba(X_test_encode)
# gbc_predict_test_hot = gbc_final_hot.predict_proba(X_test_hot.toarray())

# xgb_est_predict_test_enc = xgb_est_final_enc.predict_proba(X_test_encode)

In [1616]:
all_preds_test = np.hstack([rf_predict_test_enc, gbc_predict_test_enc, xgb_est_predict_test_enc])

In [1617]:
# scores of individual model on the hold out set
print 'score of logistic regression %f ' %(lr_final_enc.score(X_test_encode, y_test))
print 'score of logistic regression %f ' %(lr_final_hot.score(X_test_hot, y_test))

print 'score of random forest classifier %f ' %(rf_final_enc.score(X_test_encode, y_test))
print 'score of random forest classifier %f ' %(rf_final_hot.score(X_test_hot, y_test))

print 'score of extra trees classifier %f ' %(etc_final_enc.score(X_test_encode, y_test))
print 'score of extra trees classifier %f ' %(etc_final_hot.score(X_test_hot, y_test))

print 'score of gradient boosting classifier %f ' %(gbc_final_enc.score(X_test_encode, y_test))
print 'score of gradient boosting classifier %f ' %(gbc_final_hot.score(X_test_hot.toarray(), y_test))

print 'score of extreme gradient boosting classifier %f ' %(xgb_est_final_enc.score(X_test_encode, y_test))


score of logistic regression 0.702703 
score of logistic regression 0.691892 
score of random forest classifier 0.805405 
score of random forest classifier 0.670270 
score of extra trees classifier 0.767568 
score of extra trees classifier 0.637838 
score of gradient boosting classifier 0.805405 
score of gradient boosting classifier 0.670270 
score of extreme gradient boosting classifier 0.832432 


In [1618]:
print 'score on the hold out examples %f ' %(lr_meta.score(all_preds_test, y_test))

score on the hold out examples 0.821622 


In [1560]:
all_preds_test_ensemble = .25 * all_preds_test[:, 1] + .15 * all_preds_test[:, 3] + \
                          .25 * all_preds_test[:, 5] + .35 * all_preds_test[:, 7]

In [1561]:
def pred_with_threshold(preds, threshold=0.40):
    return [1.0 if pred >= threshold else 0.0 for pred in preds]

In [1562]:
all_preds_test_ensemble_binary = pred_with_threshold(all_preds_test_ensemble)

In [1563]:
print 'score on the hold out example %f ' %(accuracy_score(y_test, all_preds_test_ensemble_binary))

score on the hold out example 0.821622 


In [1484]:
# Train Extreme Gradient Boosting Classifier on full training dataset
train_df_encode, test_df_encode = encode(train_df, test_df)

In [1620]:
rf_est_whole = RandomForestClassifier(n_estimators=500, 
                                      criterion='entropy', min_samples_leaf=3).fit(train_df_encode, train_target)
etc_est_whole = ExtraTreesClassifier(n_estimators=100, 
                                     criterion='entropy', min_samples_leaf=2).fit(train_df_encode, train_target)
gbc_est_whole = GradientBoostingClassifier(n_estimators=200, 
                                           learning_rate=.01, subsample=0.8).fit(train_df_encode, train_target)
xgb_est_whole = xgb.XGBClassifier(n_estimators=300, learning_rate=.01, min_child_weight=5,
                                  subsample=0.7, colsample_bytree=0.6).fit(train_df_encode, train_target)

In [1626]:
rf_pred_second_stage = rf_est_whole.predict(train_df_encode).reshape(-1, 1)
etc_pred_second_stage = etc_est_whole.predict(train_df_encode).reshape(-1, 1)
gbc_pred_second_stage = gbc_est_whole.predict(train_df_encode).reshape(-1, 1)
xgb_pred_second_stage = xgb_est_whole.predict(train_df_encode).reshape(-1, 1)

In [1627]:
second_stage_preds = np.hstack([rf_pred_second_stage, gbc_pred_second_stage, xgb_pred_second_stage])

In [1629]:
lr_meta.fit(second_stage_preds, train_target)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [1630]:
rf_pred = rf_est_whole.predict(test_df_encode).reshape(-1, 1)
etc_pred = etc_est_whole.predict(test_df_encode).reshape(-1, 1)
gbc_pred = gbc_est_whole.predict(test_df_encode).reshape(-1, 1)
xgb_pred = xgb_est_whole.predict(test_df_encode).reshape(-1, 1)

In [1631]:
second_stage_test_preds = np.hstack([rf_pred, gbc_pred, xgb_pred])

In [1633]:
l2_classifier_preds = lr_meta.predict(second_stage_test_preds)

In [1564]:
# ensemble_preds = .25 * rf_pred + .15 * etc_pred + .25 * gbc_pred + 0.35 * xgb_pred

In [1565]:
# ensemble_preds_binary = pred_with_threshold(ensemble_preds)

In [1635]:
xgb_predictions_labels = map(inverse_mapping, l2_classifier_preds)

In [1636]:
create_submissions(loan_test.index.values, xgb_predictions_labels, 'l2_classifier_preds_lr_meta.csv')