# Machine Learning Model Approach

In [1]:
import os
import re
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../data/train.csv')

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
print('{} rows and {} columns'.format(*data.shape))

614 rows and 13 columns


In [5]:
print('List of columns is:', list(data.columns))

List of columns is: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']


`Loan_status` is the target feature, all the others are `predictor features`.

In [6]:
for col in data.columns:
    print('The number of missing values in {} -> {}'.format(col, data[col].isnull().sum()))

The number of missing values in Loan_ID -> 0
The number of missing values in Gender -> 13
The number of missing values in Married -> 3
The number of missing values in Dependents -> 15
The number of missing values in Education -> 0
The number of missing values in Self_Employed -> 32
The number of missing values in ApplicantIncome -> 0
The number of missing values in CoapplicantIncome -> 0
The number of missing values in LoanAmount -> 22
The number of missing values in Loan_Amount_Term -> 14
The number of missing values in Credit_History -> 50
The number of missing values in Property_Area -> 0
The number of missing values in Loan_Status -> 0


In [7]:
missing_features = list(data.columns[data.isnull().sum() > 0])
missing_features.remove('LoanAmount')

for missing in missing_features:
    print('Unique labels for {}:'.format(missing), data[missing].unique())

Unique labels for Gender: ['Male' 'Female' nan]
Unique labels for Married: ['No' 'Yes' nan]
Unique labels for Dependents: ['0' '1' '2' '3+' nan]
Unique labels for Self_Employed: ['No' 'Yes' nan]
Unique labels for Loan_Amount_Term: [360. 120. 240.  nan 180.  60. 300. 480.  36.  84.  12.]
Unique labels for Credit_History: [ 1.  0. nan]


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\
            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']

In [10]:
X_train, X_val, y_train, y_val = train_test_split(data[pred_var], data['Loan_Status'], test_size=.25, random_state=42)

In [11]:
X_train['Dependents'].fillna('0', inplace=True)
X_train['Self_Employed'].fillna('No', inplace=True)
X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean(), inplace=True)
X_train['Credit_History'].fillna(0, inplace=True)
X_train['Married'].fillna('No', inplace=True)
X_train['Gender'].fillna('Male', inplace=True)
X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean(), inplace=True)

In [12]:
for missing in missing_features:
    print('Unique labels for {}:'.format(missing), X_train[missing].unique())

Unique labels for Gender: ['Male' 'Female']
Unique labels for Married: ['Yes' 'No']
Unique labels for Dependents: ['2' '0' '3+' '1']
Unique labels for Self_Employed: ['No' 'Yes']
Unique labels for Loan_Amount_Term: [360.          60.         180.          84.         300.
 480.         339.10022272 120.         240.          12.
  36.        ]
Unique labels for Credit_History: [1. 0.]


In [13]:
gender_values = {'Female' : 0, 'Male' : 1, 'Missing' : 2} 
married_values = {'No' : 0, 'Yes' : 1, 'Missing' : 2}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1, 'Missing' : 2}
property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
dependent_values = {'Missing' : 4, '3+': 3, '0': 0, '2': 2, '1': 1}
X_train.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values,
                 'Self_Employed': employed_values, 'Property_Area': property_values, 'Dependents': dependent_values}, inplace=True)

In [14]:
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
92,1,1,2,1,0,3273,1820.0,81.0,360.0,1.0,1
304,1,0,0,0,0,4000,2500.0,140.0,360.0,1.0,0
68,1,1,3,1,1,7100,0.0,125.0,60.0,1.0,1
15,1,0,0,0,0,4950,0.0,125.0,360.0,1.0,1
211,1,1,3,0,0,3430,1250.0,128.0,360.0,0.0,2


In [15]:
X_train.dtypes

Gender                object
Married               object
Dependents             int64
Education              int64
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object

In [16]:
for col in X_train.columns:
    print('The number of missing values in {} -> {}'.format(col, X_train[col].isnull().sum()))

The number of missing values in Gender -> 0
The number of missing values in Married -> 0
The number of missing values in Dependents -> 0
The number of missing values in Education -> 0
The number of missing values in Self_Employed -> 0
The number of missing values in ApplicantIncome -> 0
The number of missing values in CoapplicantIncome -> 0
The number of missing values in LoanAmount -> 0
The number of missing values in Loan_Amount_Term -> 0
The number of missing values in Credit_History -> 0
The number of missing values in Property_Area -> 0


In [17]:
X_train = X_train.as_matrix()
X_train.shape

(460, 11)

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessing(BaseEstimator, TransformerMixin):
    """
        Custom Pre-Processing estimator
    """
    
    def __init__(self):
        pass
    
    def transform(self, df):
        """Regular transform() that is a help for training, validation & testing datasets
           (NOTE: The operations performed here are the ones that we did prior to this cell)
        """
        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',
                    'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']
        
        df = df[pred_var]
        
        df['Dependents'] = df['Dependents'].fillna(0)
        df['Self_Employed'] = df['Self_Employed'].fillna('No')
        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)
        df['Credit_History'] = df['Credit_History'].fillna(1)
        df['Married'] = df['Married'].fillna('No')
        df['Gender'] = df['Gender'].fillna('Male')
        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)
        
        gender_values = {'Female' : 0, 'Male' : 1} 
        married_values = {'No' : 0, 'Yes' : 1}
        education_values = {'Graduate' : 0, 'Not Graduate' : 1}
        employed_values = {'No' : 0, 'Yes' : 1}
        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}
        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
        df.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values,
                    'Self_Employed': employed_values, 'Property_Area': property_values,
                    'Dependents': dependent_values}, inplace=True)
        
        return df.as_matrix()

    def fit(self, df, y=None, **fit_params):
        """Fitting the Training dataset & calculating the required values from train
           e.g: We will need the mean of X_train['Loan_Amount_Term'] that will be used in
                transformation of X_test
        """
        
        self.term_mean_ = df['Loan_Amount_Term'].mean()
        self.amt_mean_ = df['LoanAmount'].mean()
        return self

In [19]:
X_train, X_val, y_train, y_val = train_test_split(data[pred_var], data['Loan_Status'],
                                                    test_size=0.25, random_state=42)

In [20]:
X_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
92,Male,Yes,2,Not Graduate,No,3273,1820.0,81.0,360.0,1.0,Urban
304,Male,No,0,Graduate,No,4000,2500.0,140.0,360.0,1.0,Rural
68,Male,Yes,3+,Not Graduate,Yes,7100,0.0,125.0,60.0,1.0,Urban
15,Male,No,0,Graduate,No,4950,0.0,125.0,360.0,1.0,Urban
211,Male,Yes,3+,Graduate,No,3430,1250.0,128.0,360.0,0.0,Semiurban


In [21]:
for col in X_train.columns:
    print('The number of missing values in {} -> {}'.format(col, X_train[col].isnull().sum()))

The number of missing values in Gender -> 11
The number of missing values in Married -> 1
The number of missing values in Dependents -> 11
The number of missing values in Education -> 0
The number of missing values in Self_Employed -> 20
The number of missing values in ApplicantIncome -> 0
The number of missing values in CoapplicantIncome -> 0
The number of missing values in LoanAmount -> 16
The number of missing values in Loan_Amount_Term -> 11
The number of missing values in Credit_History -> 36
The number of missing values in Property_Area -> 0


In [22]:
preprocess = PreProcessing()

In [23]:
preprocess

PreProcessing()

In [24]:
preprocess.fit(X_train)

PreProcessing()

In [25]:
X_train_transformed = preprocess.transform(X_train)

In [26]:
X_train_transformed.shape

(460, 11)

In [27]:
X_val_transformed = preprocess.transform(X_val)

In [28]:
X_val_transformed.shape

(154, 11)

In [29]:
y_val = y_val.replace({'Y':1, 'N':0}).as_matrix()

In [30]:
y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()

In [31]:
params={
    'xgbclassifier__max_depth__': [3,4,5,6],
    'xgbclassifier__subsample__': [0.4,0.5,0.6,0.7],
    'xgbclassifier__colsample_bytree__': [0.5,0.6,0.7,0.8],
    'xgbclassifier__n_estimators__': [1000,2000,3000],
    'xgbclassifier__reg_alpha__': [0.01, 0.02, 0.03, 0.04]
}

In [32]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

pipe = make_pipeline(PreProcessing(),
                     XGBClassifier(n_estimators=1000))

In [33]:
pipe

Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [34]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

In [35]:
grid

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, ...       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'xgbclassifier__max_depth__': [3, 4, 5, 6], 'xgbclassifier__subsample__': [0.4, 0.5, 0.6, 0.7], 'xgbclassifier__colsample_bytree__': [0.5, 0.6, 0.7, 0.8], 'xgbclassifier__n_estimators__': [1000, 2000, 3000], 'xgbclassifier__reg_alpha__': [0.01, 0.02, 0.03, 0.04]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [36]:
X_train, X_val, y_train, y_val = train_test_split(data[pred_var], data['Loan_Status'], test_size=0.25, random_state=42)

In [37]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 768 candidates, totalling 3840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 3840 out of 3840 | elapsed: 12.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', PreProcessing()), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, ...       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'xgbclassifier__max_depth__': [3, 4, 5, 6], 'xgbclassifier__subsample__': [0.4, 0.5, 0.6, 0.7], 'xgbclassifier__colsample_bytree__': [0.5, 0.6, 0.7, 0.8], 'xgbclassifier__n_estimators__': [1000, 2000, 3000], 'xgbclassifier__reg_alpha__': [0.01, 0.02, 0.03, 0.04]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [38]:
print("Best parameters: {}".format(grid.best_params_))

Best parameters: {'xgbclassifier__colsample_bytree__': 0.5, 'xgbclassifier__max_depth__': 3, 'xgbclassifier__n_estimators__': 1000, 'xgbclassifier__reg_alpha__': 0.01, 'xgbclassifier__subsample__': 0.4}


In [39]:
print("Test set score: {:.2f}".format(grid.score(X_val, y_val)))

Test set score: 0.75
