In [1]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# pandas rendering options: wider output, more visible columns, HTML tables in the notebook
for option_name, option_value in (
        ('display.width', 500),
        ('display.max_columns', 100),
        ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)

# import plotting library built on top of matplotlib
import seaborn as sns

# appearance of every plot rendered by this notebook
PLOT_STYLE = "whitegrid"
PLOT_CONTEXT = "poster"
sns.set_style(PLOT_STYLE)
sns.set_context(PLOT_CONTEXT)

In [277]:
# read the training and test examples; rows are indexed by the Loan_ID column
loan_train, loan_test = [
    pd.read_csv(csv_path, index_col='Loan_ID')
    for csv_path in ('./data/train_u6lujuX.csv', './data/test_Y3wMUE5.csv')
]

In [278]:
# take a sneak peek at the first five training examples
loan_train.head(n=5)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360,1,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360,1,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360,1,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0,141.0,360,1,Urban,Y


In [279]:
# number of training examples (rows of loan_train)
# single-argument print(...) runs unchanged on Python 2 and Python 3
print('Number of training examples {0} '.format(loan_train.shape[0]))

Number of training examples 614 


In [280]:
# number of test examples (rows of loan_test)
# single-argument print(...) runs unchanged on Python 2 and Python 3
print('Number of test examples {0} '.format(loan_test.shape[0]))

Number of test examples 367 


In [281]:
# class distribution: number of training rows per Loan_Status value
loan_train['Loan_Status'].value_counts()

Y    422
N    192
dtype: int64

**Most of the loan applications were accepted (422 of 614, roughly 69%).**

In [480]:
# execute the project scripts in this kernel so their top-level names become available here;
# presumably model.py defines LogRegression / RandomForestModel / GradientBoostingModel and
# helper.py defines create_submissions -- TODO confirm against the scripts
%run scripts/model.py
%run scripts/helper.py

In [380]:
# logistic regression model, built from the training and test frames;
# 'Loan_Status' is presumably the name of the target column -- confirm in scripts/model.py
log_reg = LogRegression(loan_train, loan_test, 'Loan_Status')

In [381]:
# do the required preprocessing before training
# (fills NaN values, per the original note; the implementation is not visible in this notebook)

log_reg.pre_processing()

In [382]:
# train a logistic regression model on every column except the target
feature_cols = loan_train.columns.drop('Loan_Status')
est = log_reg.train_model(feature_cols, 'Loan_Status')

In [383]:
# display the estimator returned by train_model (its repr lists the hyper-parameters)
est

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [384]:
# cross validation scores for the estimator, over every column except the target
feature_cols = loan_train.columns.drop('Loan_Status')
scores = log_reg.get_cross_validation_scores(est, feature_cols, 'Loan_Status')

In [385]:
# print min, max and mean of the cross-validation scores
# single-argument print(...) runs unchanged on Python 2 and Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.670588235294, Maximum 0.717647058824 and Mean 0.694621129928 


In [386]:
# lets test it out on the hold out examples
holdout_accuracy = log_reg.test(est, loan_train.columns.drop('Loan_Status'))
# '{0} {1}' keeps the separator space the Python 2 print statement put between its
# two operands, so the rendered line is unchanged; print(...) with one argument
# runs on both Python 2 and Python 3
print('{0} {1}'.format('Accuracy on the hold out set is ', holdout_accuracy))

Accuracy on the hold out set is  0.648648648649


In [273]:
# predict with the logistic-regression estimator, passing every column of loan_test;
# presumably yields one prediction per test row -- confirm in scripts/model.py
predictions = log_reg.predict(est, loan_test.columns)

In [276]:
# create submission for logistic regression model, identified by the Loan_ID index values
loan_ids = loan_test.index.values
create_submissions(loan_ids, predictions, 'logistic_regression.csv')

## Random Forest Classifier

In [427]:
# random forest model, built from the same training and test frames;
# 'Loan_Status' is presumably the name of the target column -- confirm in scripts/model.py
rf_model = RandomForestModel(loan_train, loan_test, 'Loan_Status')

In [428]:
# preprocessing step before training; presumably fills NaN values as noted for the
# logistic-regression model -- confirm in scripts/model.py
rf_model.pre_processing()

In [429]:
# train the random forest on every column except the target
# (this rebinds `est`, which earlier held the logistic-regression estimator)
feature_cols = loan_train.columns.drop('Loan_Status')
est = rf_model.train_model(feature_cols, 'Loan_Status')

In [430]:
# display the estimator returned by train_model (its repr lists the hyper-parameters)
est

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [431]:
# cross validation scores for the random forest, over every column except the target
feature_cols = loan_train.columns.drop('Loan_Status')
scores = rf_model.get_cross_validation_scores(est, feature_cols, 'Loan_Status')

In [432]:
# print min, max and mean of the random forest cross-validation scores
# single-argument print(...) runs unchanged on Python 2 and Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.720930232558, Maximum 0.813953488372 and Mean 0.78094564209 


In [433]:
# lets test it out on the hold out examples
# NOTE(review): the recorded result is 1.0 while the cross-validation mean is ~0.78;
# a perfect score suggests the evaluated rows may overlap with the rows the forest was
# fitted on -- verify how RandomForestModel.test selects its hold-out rows
holdout_accuracy = rf_model.test(est, loan_train.columns.drop('Loan_Status'))
# '{0} {1}' keeps the separator space the Python 2 print statement put between its
# two operands, so the rendered line is unchanged; print(...) with one argument
# runs on both Python 2 and Python 3
print('{0} {1}'.format('Accuracy on the hold out set is ', holdout_accuracy))

Accuracy on the hold out set is  1.0


In [434]:
# predict with the random forest estimator, passing every column of loan_test;
# presumably yields one prediction per test row -- confirm in scripts/model.py
predictions = rf_model.predict(est, loan_test.columns)

In [435]:
# create submission for random forest model, identified by the Loan_ID index values
loan_ids = loan_test.index.values
create_submissions(loan_ids, predictions, 'random_forest_200_trees.csv')

## Gradient Boosting Classifier

In [481]:
# gradient boosting model, built from the same training and test frames;
# 'Loan_Status' is presumably the name of the target column -- confirm in scripts/model.py
gbm_model = GradientBoostingModel(loan_train, loan_test, 'Loan_Status')

In [482]:
# run the model's preprocessing first, then train on every column except the target
# (this rebinds `est`, which earlier held the random forest estimator)
gbm_model.pre_processing()
feature_cols = loan_train.columns.drop('Loan_Status')
est = gbm_model.train_model(feature_cols, 'Loan_Status')

In [483]:
# display the estimator returned by train_model (its repr lists the hyper-parameters)
est

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False)

In [484]:
# cross validation scores for the gradient boosting model, over every column except the target
feature_cols = loan_train.columns.drop('Loan_Status')
scores = gbm_model.get_cross_validation_scores(est, feature_cols, 'Loan_Status')

In [485]:
# print min, max and mean of the gradient boosting cross-validation scores
# single-argument print(...) runs unchanged on Python 2 and Python 3
print('Minimum {0}, Maximum {1} and Mean {2} '.format(scores.min(), scores.max(), scores.mean()))

Minimum 0.767441860465, Maximum 0.835294117647 and Mean 0.801959840873 


In [486]:
# lets test it out on the hold out examples
holdout_accuracy = gbm_model.test(est, loan_train.columns.drop('Loan_Status'))
# '{0} {1}' keeps the separator space the Python 2 print statement put between its
# two operands, so the rendered line is unchanged; print(...) with one argument
# runs on both Python 2 and Python 3
print('{0} {1}'.format('Accuracy on the hold out set is ', holdout_accuracy))

Accuracy on the hold out set is  0.832432432432


In [487]:
# predict with the gradient boosting estimator, passing every column of loan_test;
# presumably yields one prediction per test row -- confirm in scripts/model.py
predictions = gbm_model.predict(est, loan_test.columns)

In [488]:
# create submission for gradient boosting model, identified by the Loan_ID index values
loan_ids = loan_test.index.values
create_submissions(loan_ids, predictions, 'gbm.csv')