In [1]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# set some options for better view
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

# import plotting library built on top of matplotlib
import seaborn as sns

# set some settings related to style of plots that will render
sns.set_style("whitegrid")
sns.set_context("poster")

In [1245]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [795]:
# load training and test examples

loan_train = pd.read_csv('./data/train_u6lujuX.csv', index_col='Loan_ID')
loan_test = pd.read_csv('./data/test_Y3wMUE5.csv', index_col='Loan_ID')

In [796]:
# take a sneak peak at some of the examples
loan_train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


In [797]:
# number of training examples
print 'Number of training examples {0} '.format(loan_train.shape[0])

Number of training examples 614 


In [798]:
# number of test examples
print 'Number of test examples {0} '.format(loan_test.shape[0])

Number of test examples 367 


In [799]:
# class distribution
loan_train.Loan_Status.value_counts()

Y    422
N    192
dtype: int64

** Most of the applications for loan were accepted. **

In [1226]:
loan_train.Self_Employed.value_counts()

No     500
Yes     82
dtype: int64

## Importing external scripts

In [1248]:
%run scripts/helper.py
%run scripts/model.py
%run scripts/data.py

In [1237]:
data = Data(loan_train, loan_test, 'Loan_Status')

# preprocessing incluedes replacing missing values with mean of the values
# and encoding categorical variables
data.pre_processing()

In [1238]:
train_df = data.get_train_X()
train_target = data.get_train_Y()
test_df = data.get_test_X()

In [1240]:
# split into training and hold out set
X_train, X_test, y_train, y_test = split_dataset(train_df, train_target) 

## Logistic Regression

In [1249]:
lr = LogisticRegression()
parameters = {"C": [0.0001, 0.001, 0.1, 1, 10, 100]}

model = Model()

best_model, best_params, best_score, grid_scores = model.grid_search(X_train, y_train, lr, parameters, 'accuracy')

In [1251]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0) 
 best params are {'C': 0.0001} 
 best score is 0.701631701632 
 and grid scores are [mean: 0.70163, std: 0.00368, params: {'C': 0.0001}, mean: 0.69930, std: 0.00397, params: {'C': 0.001}, mean: 0.69464, std: 0.00651, params: {'C': 0.1}, mean: 0.68998, std: 0.01164, params: {'C': 1}, mean: 0.68065, std: 0.01737, params: {'C': 10}, mean: 0.69231, std: 0.00705, params: {'C': 100}] 


## Random Forest Classifier

In [1252]:
rf = RandomForestClassifier()
parameters = {"n_estimators": [50, 100, 200, 500, 1000]}

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train, y_train, rf, parameters, 'accuracy')

In [1253]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 
 best params are {'n_estimators': 200} 
 best score is 0.799533799534 
 and grid scores are [mean: 0.77622, std: 0.02521, params: {'n_estimators': 50}, mean: 0.78555, std: 0.02880, params: {'n_estimators': 100}, mean: 0.79953, std: 0.02280, params: {'n_estimators': 200}, mean: 0.79021, std: 0.03251, params: {'n_estimators': 500}, mean: 0.79021, std: 0.02552, params: {'n_estimators': 1000}] 


## Gradient Boosting Classifier

In [1255]:
gbc = GradientBoostingClassifier()
parameters = {"n_estimators": [100, 300, 500, 700, 1000],
              "learning_rate": [0.3, 0.1, 0.05, 0.01],
              "subsample": [.7, .8, .9]
             }

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(X_train, y_train, gbc, parameters, 'accuracy')

In [1256]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=0.7, verbose=0,
              warm_start=False) 
 best params are {'n_estimators': 100, 'subsample': 0.7, 'learning_rate': 0.01} 
 best score is 0.825174825175 
 and grid scores are [mean: 0.77389, std: 0.03999, params: {'n_estimators': 100, 'subsample': 0.7, 'learning_rate': 0.3}, mean: 0.78788, std: 0.05166, params: {'n_estimators': 100, 'subsample': 0.8, 'learning_rate': 0.3}, mean: 0.76923, std: 0.05807, params: {'n_estimators': 100, 'subsample': 0.9, 'learning_rate': 0.3}, mean: 0.78788, std: 0.06645, params: {'n_estimators': 300, 'subsample': 0.7, 'learning_rate': 0.3}, mean: 0.76923, std: 0.05589, params: {'n_estimators': 300, 'subsample': 0.8, 'learning_rate': 0.3}, m

In [1257]:
# train 3 models on the training examples
lr_final = LogisticRegression(C=0.0001).fit(X_train, y_train)
rf_final = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
gbc_final = GradientBoostingClassifier(n_estimators=100, learning_rate=.01, subsample=0.7).fit(X_train, y_train)

In [1274]:
lr_preds = lr_final.predict(X_train).reshape(-1, 1)
rf_preds = rf_final.predict(X_train).reshape(-1, 1)
gbc_preds = gbc_final.predict(X_train).reshape(-1, 1)

In [1275]:
all_preds = np.hstack([lr_preds, rf_preds, gbc_preds])

## Second Stage Classifier

In [1278]:
rf_meta = RandomForestClassifier()
parameters = {"n_estimators": [100]}

model = Model()
best_model, best_params, best_score, grid_scores = model.grid_search(all_preds, y_train, rf_meta, parameters, 'accuracy')

In [1279]:
print 'best model is %s \n best params are %s \n best score is %s \n and grid scores are %s ' %(best_model, best_params,
                                                                                               best_score, grid_scores)

best model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 
 best params are {'n_estimators': 100} 
 best score is 1.0 
 and grid scores are [mean: 1.00000, std: 0.00000, params: {'n_estimators': 100}] 


In [1282]:
lr_predict_test = lr_final.predict(X_test).reshape(-1, 1)
rf_predict_test = rf_final.predict(X_test).reshape(-1, 1)
gbc_predict_test = gbc_final.predict(X_test).reshape(-1, 1)

all_preds_test = np.hstack([lr_predict_test, rf_predict_test, gbc_predict_test])

In [1284]:
# scores of individual model on the hold out set
print 'score of logistic regression %f ' %(lr_final.score(X_test, y_test))
print 'score of random forest classifier %f ' %(rf_final.score(X_test, y_test))
print 'score of gradient boosting classifier %f ' %(gbc_final.score(X_test, y_test))

score of logistic regression 0.654054 
score of random forest classifier 0.735135 
score of gradient boosting classifier 0.762162 


In [1283]:
print 'score on the hold out examples %f ' %(best_model.score(all_preds_test, y_test))

score on the hold out examples 0.735135 


In [1194]:
# create instance of logistic regression model
log_reg = LogRegression(loan_train, loan_test, 'Loan_Status')

In [1195]:
# do the require preprocessing
log_reg.pre_processing()

# split dataset
log_reg.split_dataset()

# feature selection
log_reg.feature_selection()

In [1196]:
# model
est = LogisticRegression(C=10.)

In [1197]:
# cross validation scores
scores = log_reg.get_cross_validation_scores(est)

In [1198]:
# print min, max and mean of scores
print 'Logistic Regression Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean())

Logistic Regression Minimum 0.716666666667, Maximum 0.783333333333 and Mean 0.746666666667 


In [1199]:
# Train the model
logreg = log_reg.fit_model(est)

In [1200]:
# lets test it out on the hold out examples
test_preds_lr, test_score_lr = log_reg.test(logreg)
print 'Accuracy on the hold out set is ', test_score_lr

Accuracy on the hold out set is  0.67027027027


In [1163]:
# Train on whole dataset
final_model = log_reg.train_model(est, 'Loan_Status')

In [1164]:
predictions_lr = log_reg.predict(final_model)

In [276]:
# create submission for logistic regression model
create_submissions(loan_test.index.values, predictions, 'logistic_regression.csv')

## Random Forest Classifier

In [1201]:
rf_model = RandomForestModel(loan_train, loan_test, 'Loan_Status')

In [1202]:
# do the require preprocessing
rf_model.pre_processing()

# split dataset
rf_model.split_dataset()

# feature selection
rf_model.feature_selection()

In [1203]:
# model
est = RandomForestClassifier(n_estimators=500, min_samples_leaf=10, n_jobs=-1)

In [1204]:
# cross validation scores
scores = rf_model.get_cross_validation_scores(est)

In [1205]:
print 'Random Forest Classifier Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean())

Random Forest Classifier Minimum 0.716666666667, Maximum 0.783333333333 and Mean 0.746666666667 


In [1206]:
# Train model
rf = rf_model.fit_model(est)

In [1207]:
# lets test it out on the hold out examples
test_preds_rf, test_score_rf = rf_model.test(rf)
print 'Accuracy on the hold out set is ', test_score_rf

Accuracy on the hold out set is  0.778378378378


In [1079]:
# Train on full dataset
final_model = rf_model.train_model(est, 'Loan_Status')

In [879]:
predictions_rf = rf_model.predict(final_model)

In [435]:
# create submission for random forest model
create_submissions(loan_test.index.values, predictions, 'random_forest_200_trees.csv')

## Gradient Boosting Classifier

In [1208]:
gbm_model = GradientBoostingModel(loan_train, loan_test, 'Loan_Status')

In [1209]:
# preprocessing
gbm_model.pre_processing()

# split dataset
gbm_model.split_dataset()

# feature selection
gbm_model.feature_selection()

In [1210]:
est = GradientBoostingClassifier(learning_rate=0.01, min_samples_leaf=3, 
                                 n_estimators=500, subsample=0.9, min_weight_fraction_leaf=0.005)

In [1211]:
# cross validation scores
scores = gbm_model.get_cross_validation_scores(est)

In [1212]:
print 'Gradient Boosting Classifier Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean())

Gradient Boosting Classifier Minimum 0.75, Maximum 0.85 and Mean 0.79 


In [1213]:
# Fit the model
gbc = gbm_model.fit_model(est)

In [1215]:
# lets test it out on the hold out examples
test_preds_gbc, test_scores_gbc = gbm_model.test(gbc)
print 'Accuracy on the hold out set is ', test_scores_gbc

Accuracy on the hold out set is  0.837837837838


In [1095]:
# Train on full dataset
final_model = gbm_model.train_model(est, 'Loan_Status')

In [1096]:
predictions_gbc = gbm_model.predict(final_model)

In [1097]:
create_submissions(loan_test.index.values, predictions_gbc, 'gbm_point4_threshold.csv')

## Extreme Gradient Boosting

In [936]:
import xgboost as xgb

In [937]:
xgb_model = GradientBoostingModel(loan_train, loan_test, 'Loan_Status')

In [938]:
# preprocessing
xgb_model.pre_processing()

# split dataset
xgb_model.split_dataset()

# feature selection
xgb_model.feature_selection()

In [989]:
param = {'bst:max_depth':6, 'bst:eta':.01,
         'silent':1, 'objective':'binary:logistic',
         'bst:subsample': 0.9, 'bst:colsample_bytree': 0.8,
         'bst:gamma': 5, 'bst:min_child_weight': 5}
param['nthread'] = 4
param['eval_metric'] = 'error'

In [990]:
# training and validation
dtrain = xgb.DMatrix(xgb_model.X_train, xgb_model.y_train)
dval = xgb.DMatrix(xgb_model.X_val, xgb_model.y_val)

In [991]:
evallist  = [(dval,'eval'), (dtrain,'train')]

In [992]:
num_round = 150
bst = xgb.train( param, dtrain, num_round, evallist )

[0]	eval-error:0.194595	train-error:0.188811
[1]	eval-error:0.194595	train-error:0.188811
[2]	eval-error:0.194595	train-error:0.188811
[3]	eval-error:0.194595	train-error:0.188811
[4]	eval-error:0.194595	train-error:0.188811
[5]	eval-error:0.194595	train-error:0.188811
[6]	eval-error:0.194595	train-error:0.188811
[7]	eval-error:0.194595	train-error:0.188811
[8]	eval-error:0.194595	train-error:0.188811
[9]	eval-error:0.194595	train-error:0.188811
[10]	eval-error:0.194595	train-error:0.188811
[11]	eval-error:0.194595	train-error:0.188811
[12]	eval-error:0.194595	train-error:0.188811
[13]	eval-error:0.194595	train-error:0.188811
[14]	eval-error:0.194595	train-error:0.188811
[15]	eval-error:0.194595	train-error:0.188811
[16]	eval-error:0.194595	train-error:0.188811
[17]	eval-error:0.194595	train-error:0.188811
[18]	eval-error:0.194595	train-error:0.188811
[19]	eval-error:0.194595	train-error:0.188811
[20]	eval-error:0.194595	train-error:0.188811
[21]	eval-error:0.194595	train-error:0.18881

In [1218]:
dval_preds_probs = bst.predict(dval)

In [1219]:
test_preds_xgb = binary_from_prob(dval_preds_probs)

In [996]:
# full dataset
X_full = xgb_model.train_df[xgb_model.train_df.columns.drop('Loan_Status')]
y_full = xgb_model.train_df.Loan_Status

X_test_full = xgb_model.test_df


dfull = xgb.DMatrix(X_full, y_full)
dtest = xgb.DMatrix(X_test_full)

In [997]:
final_model = xgb.train(param, dfull, num_round)

In [998]:
xgb_predictions = final_model.predict(dtest)

In [1006]:
xgb_predictions_binary = binary_from_prob(xgb_predictions)

In [1008]:
create_submissions(loan_test.index.values, xgb_predictions_binary, 'xgboost.csv')

## Meta Classifier

In [1223]:
X_second_stage = np.hstack([test_preds_lr, test_preds_rf, test_preds_gbc, test_preds_xgb])
y_second_stage = gbm_model.X_val

In [1224]:
c = np.hstack([a, b])

In [1225]:
c

array([[1, 2, 3, 3, 4, 5],
       [7, 8, 9, 1, 2, 3]])

## Exploratory Data Analysis

In [495]:
# Relationship with Gender and Loan Status
loan_train.groupby(['Gender', 'Loan_Status']).size()

Gender  Loan_Status
Female  N               37
        Y               75
Male    N              150
        Y              339
dtype: int64

** 50% of the loan applications for both gender were rejected **

In [498]:
# Relationship between Applicant Income and Loan Status
loan_train.groupby('Loan_Status')['ApplicantIncome'].mean()

Loan_Status
N    5446.078125
Y    5384.068720
Name: ApplicantIncome, dtype: float64

In [499]:
# Relationship between Co-applicant income and Loan Status
loan_train.groupby('Loan_Status')['CoapplicantIncome'].mean()

Loan_Status
N    1877.807292
Y    1504.516398
Name: CoapplicantIncome, dtype: float64