In [19]:
# Importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
#import category_encoders as ce
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import iqr,skew 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate,cross_val_predict,KFold, StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler,PowerTransformer
from sklearn.linear_model import LogisticRegression,BayesianRidge,Lasso
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,ExtraTreesRegressor
from sklearn.tree import DecisionTreeClassifier, export_graphviz,DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.feature_selection import SelectPercentile, f_classif,RFECV,SelectKBest,chi2,mutual_info_classif,SelectFromModel
from sklearn.pipeline import Pipeline,make_pipeline
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS,SequentialFeatureSelector as SFS
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE,ADASYN,RandomOverSampler,BorderlineSMOTE,SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler,TomekLinks
from imblearn.combine import SMOTETomek,SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import  Pipeline as Imb_pipeline
import pickle

import time
import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns',15)
pd.set_option('display.float_format', lambda x: '{:.5f}'.format(x)) #Limiting floats output to 5 decimal points

In [20]:
#Importing Data sets
file_path='C:/Users/lcast/Desktop/DS/GitHub/Loan-Approval-Prediction/data/processed/'

loan_train_prepro=pd.read_pickle(file_path+ 'loan_train_prepro.pkl')
loan_test_prepro=pd.read_pickle(file_path+ 'loan_test_prepro.pkl')

In [21]:
y=loan_train_prepro.Loan_Status_Y
X=loan_train_prepro.copy()
X.drop(['Loan_Status_Y'],axis=1,inplace=True)

Since we have seen that the proportion of the approved class is significantly higher than the rejected class, we will apply a stratifield-kfold to keep the proportion in the splits.

In [22]:
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=42) 
#Creating a dataframe to store the results we are obtaining after applying different methods and alternatives. 
Results=pd.DataFrame(columns=['Algorithm', 'Model', 'Features','Best_Score_train','Best_Score_test','Case','Case_Descript'])

##### 3.1 Baseline: No Hyperparameter tuning, no Feature Engineering, no Feature Selection

In [23]:
#Algorithms
alg1=LogisticRegression()
alg2= SVC(probability=True)
alg3=KNeighborsClassifier()
alg4= DecisionTreeClassifier()
alg5= RandomForestClassifier()
alg6= xgb.XGBClassifier()

#Scaling the features for the logistic regression, SVC, KNeighbor and xgb. No need to scale for the tree based algorithms
pipe1 = Pipeline([('std', StandardScaler()),('classifier', alg1)])
pipe2 = Pipeline([('std', StandardScaler()),('classifier', alg2)])
pipe3 = Pipeline([('std', StandardScaler()),('classifier', alg3)])
pipe4 = Pipeline([('classifier', alg4)])
pipe5 = Pipeline([('classifier', alg5)])
pipe6 = Pipeline([('std', StandardScaler()),('classifier', alg6)])

pipelist= [pipe1,pipe2,pipe3,pipe4,pipe5,pipe6]

In [24]:
for i,j in zip(pipelist,('Logreg','SVM', 'KNN', 'DTree','RF','XGB')):
     results_kfold = cross_validate(i, X, y, cv=kf,return_train_score=True,scoring='accuracy')
     Results.loc[len(Results)] = [j, results_kfold,'All original features',results_kfold['train_score'].mean(),
                                  results_kfold['test_score'].mean(),'Algorithm only','Original features. Log transform. No tunning, \
                                  No feature selection']

In [25]:
Results

Unnamed: 0,Algorithm,Model,Features,Best_Score_train,Best_Score_test,Case,Case_Descript
0,Logreg,"{'fit_time': [0.0049860477447509766, 0.0049884...",All original features,0.80981,0.80807,Algorithm only,"Original features. Log transform. No tunning, ..."
1,SVM,"{'fit_time': [0.03795957565307617, 0.038833379...",All original features,0.82067,0.80648,Algorithm only,"Original features. Log transform. No tunning, ..."
2,KNN,"{'fit_time': [0.0039904117584228516, 0.0039887...",All original features,0.8194,0.76073,Algorithm only,"Original features. Log transform. No tunning, ..."
3,DTree,"{'fit_time': [0.004987239837646484, 0.00597834...",All original features,1.0,0.69383,Algorithm only,"Original features. Log transform. No tunning, ..."
4,RF,"{'fit_time': [0.01794910430908203, 0.018945217...",All original features,0.9886,0.74296,Algorithm only,"Original features. Log transform. No tunning, ..."
5,XGB,"{'fit_time': [0.04983019828796387, 0.053163528...",All original features,0.86193,0.80325,Algorithm only,"Original features. Log transform. No tunning, ..."


Model with best test scores are Logreg, SVM and XGB. It also can be appreciated how Decission Tree and Random Forest tend to overfit (tran score are 100% and 98% while test scores are 69% and 74%). Next we will do a GridSearch to fine tune the algorithms parameters.

##### 3.2 Model with Hyperparameter tuning (Gridsearch), no Feature Engineering, no Feature Selection

In [26]:
#Gridsearch
Gridsearch={}

#Pipelines for GridSearch
pipegrid1 = Pipeline([('std', StandardScaler()),
                  ('alg1', alg1)])
pipegrid2 = Pipeline([('std', StandardScaler()),
                  ('alg2', alg2)])
pipegrid3 = Pipeline([('std', StandardScaler()),
                  ('alg3', alg3)])
pipegrid4=Pipeline([('alg4', alg4)])
pipegrid5= Pipeline([('alg5', alg5)])  
pipegrid6 = Pipeline([('std', StandardScaler()),
                     ('alg6', alg6)])
#Hyper paramaeter grids for the gridsearch.    
param_grid1 = {'alg1__solver':['lbfgs', 'liblinear'],'alg1__penalty': ['l2'],'alg1__C': [0.001,0.01,0.1,1,10,100]} #np.power(10., np.arange(-4, 4)
param_grid2 = {'alg2__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'alg2__C': [0.001,0.01,0.1,1,10,100],
                'alg2__gamma': np.power(10., np.arange(-5, 0)),'alg2__decision_function_shape' : ['ovo', 'ovr']}
param_grid3 = {'alg3__n_neighbors': list(range(1, 10)),'alg3__p': [1, 2]}
param_grid4 = {'alg4__max_depth': list(range(1, 10)) + [None],'alg4__criterion': ['gini', 'entropy']}
param_grid5 = {'alg5__max_depth': list(range(1, 20, 2)), 'alg5__n_estimators': 
    list(range(1, 200, 20)),'alg5__max_features' : list(range(1,4,1))}
param_grid6 = {'alg6__subsample': np.arange(.05, 1, .05),
                  'alg6__max_depth': np.arange(3,20,1),
                  'alg6__colsample_bytree': np.arange(.1,1.05,.05)}

pipegridlist= [pipegrid1,pipegrid2,pipegrid3,pipegrid4,pipegrid5,pipegrid6]
paramgridlist=[param_grid1,param_grid2,param_grid3,param_grid4,param_grid5,param_grid6]

In [27]:
start_time = time.time()
for i,j,k in zip(pipegridlist,paramgridlist,('Logreg','SVM', 'KNN', 'DTree','RF','XGB')):
    grid = GridSearchCV(i, j, cv=kf,scoring='accuracy',return_train_score='True')
    grid.fit(X,y)
    Gridsearch[k]= grid
print("--- %s seconds ---" % (time.time() - start_time))

--- 5256.585534572601 seconds ---


In [28]:
for algorithm,gridresults in Gridsearch.items():
    Results.loc[len(Results)] = [algorithm, gridresults,'All original features',
                                     gridresults.cv_results_['mean_train_score'][gridresults.best_index_],
                                     gridresults.cv_results_['mean_test_score'][gridresults.best_index_],'Gridsearch','Original features.\
                                     Log transform. No tunning, No feature selection']

In [29]:
Results

Unnamed: 0,Algorithm,Model,Features,Best_Score_train,Best_Score_test,Case,Case_Descript
0,Logreg,"{'fit_time': [0.0049860477447509766, 0.0049884...",All original features,0.80981,0.80807,Algorithm only,"Original features. Log transform. No tunning, ..."
1,SVM,"{'fit_time': [0.03795957565307617, 0.038833379...",All original features,0.82067,0.80648,Algorithm only,"Original features. Log transform. No tunning, ..."
2,KNN,"{'fit_time': [0.0039904117584228516, 0.0039887...",All original features,0.8194,0.76073,Algorithm only,"Original features. Log transform. No tunning, ..."
3,DTree,"{'fit_time': [0.004987239837646484, 0.00597834...",All original features,1.0,0.69383,Algorithm only,"Original features. Log transform. No tunning, ..."
4,RF,"{'fit_time': [0.01794910430908203, 0.018945217...",All original features,0.9886,0.74296,Algorithm only,"Original features. Log transform. No tunning, ..."
5,XGB,"{'fit_time': [0.04983019828796387, 0.053163528...",All original features,0.86193,0.80325,Algorithm only,"Original features. Log transform. No tunning, ..."
6,Logreg,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...
7,SVM,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...
8,KNN,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80149,0.78339,Gridsearch,Original features. ...
9,DTree,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...


After tunning the algorithm hyper parameters,  the accuracy of all model has increased, most notably in the cases of the Decission Tree Random Forest. Since the gridsearch has performed and exhaustive search of the hyper parameters, it has taken quite time to get the results. Let's try next a Randomized search to see if we can get similar results with a reduced computation time. 

##### 3.3 Model with Hyperparameter tuning (Randomized Search), no Feature Engineering, no Feature Selection

In [31]:
Randomizedsearch={}

start_time = time.time()
for i,j,k in zip(pipegridlist,paramgridlist,('Logreg','SVM', 'KNN', 'DTree','RF','XGB')):
    rgrid = RandomizedSearchCV(i, j, cv=kf,n_iter=10,n_jobs=-1,return_train_score='True',scoring='accuracy',refit=True)
    rgrid.fit(X,y)
    Randomizedsearch[k]= rgrid
print("--- %s seconds ---" % (time.time() - start_time))

--- 17.941561222076416 seconds ---


In [32]:
for algorithm,gridresults in Randomizedsearch.items():
    Results.loc[len(Results)] = [algorithm, gridresults,'All original features',
                                     gridresults.cv_results_['mean_train_score'][gridresults.best_index_],
                                     gridresults.cv_results_['mean_test_score'][gridresults.best_index_],'Randomizedsearch','Original features.\
                                     Log transform. No tunning, No feature selection']  

In [33]:
Results

Unnamed: 0,Algorithm,Model,Features,Best_Score_train,Best_Score_test,Case,Case_Descript
0,Logreg,"{'fit_time': [0.0049860477447509766, 0.0049884...",All original features,0.80981,0.80807,Algorithm only,"Original features. Log transform. No tunning, ..."
1,SVM,"{'fit_time': [0.03795957565307617, 0.038833379...",All original features,0.82067,0.80648,Algorithm only,"Original features. Log transform. No tunning, ..."
2,KNN,"{'fit_time': [0.0039904117584228516, 0.0039887...",All original features,0.8194,0.76073,Algorithm only,"Original features. Log transform. No tunning, ..."
3,DTree,"{'fit_time': [0.004987239837646484, 0.00597834...",All original features,1.0,0.69383,Algorithm only,"Original features. Log transform. No tunning, ..."
4,RF,"{'fit_time': [0.01794910430908203, 0.018945217...",All original features,0.9886,0.74296,Algorithm only,"Original features. Log transform. No tunning, ..."
5,XGB,"{'fit_time': [0.04983019828796387, 0.053163528...",All original features,0.86193,0.80325,Algorithm only,"Original features. Log transform. No tunning, ..."
6,Logreg,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...
7,SVM,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...
8,KNN,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80149,0.78339,Gridsearch,Original features. ...
9,DTree,"GridSearchCV(cv=StratifiedKFold(n_splits=10, r...",All original features,0.80945,0.80945,Gridsearch,Original features. ...


Randomized search has jut taken less than a minute and the accuracy is similar to the one obtained with the Gridsearch, except from the XGB.  
Best models so far are the RF and the XGB with the following parameters:

In [40]:
#Getting the parameters of the best model
Results['Model'][10].best_estimator_.steps[0][1],Results['Model'][11].best_estimator_.steps[1][1]

(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=3, max_features=3, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=41,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.20000000000000004, gamma=0,
               learning_rate=0.1, max_delta_step=0, max_depth=3,
               min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
               nthread=None, objective='binary:logistic', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
               silent=None, subsample=0.2, verbosity=1))

In [45]:
Results.to_pickle('C:/Users/lcast/Desktop/DS/GitHub/Loan-Approval-Prediction/results')

PermissionError: [Errno 13] Permission denied: 'C:/Users/lcast/Desktop/DS/GitHub/Loan-Approval-Prediction/results'