In [1]:
import pandas as pd

data = pd.read_csv(r'data/train.csv')
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


#### To select model structure I will use basic dataset with dropped all NA values. This approach should give enough information about what is worth trying. 

In [2]:
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [3]:
data['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

So we have unbalanced target classes. We have 3 basic options:
    * undersample (I dont like this method)
    * oversample through SMOOTE
    * set class_weight parameter

From these options class weight is the most simple one so it should be enough to determine type of model.

In [7]:
df = data.copy()
df.drop('Loan_ID', axis=1, inplace=True)
df.dropna(inplace=True)

x = df.drop('Loan_Status', axis=1)
y = df['Loan_Status'].map({'Y':1, 'N':0})

x = x.drop(['Self_Employed', 'ApplicantIncome', 'Loan_Amount_Term', 'Gender', 'Dependents', 'LoanAmount'], axis=1)

X = pd.get_dummies(x)
X.head(3)

Unnamed: 0,CoapplicantIncome,Credit_History,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,1508.0,1.0,0,1,1,0,1,0,0
2,0.0,1.0,0,1,1,0,0,0,1
3,2358.0,1.0,0,1,0,1,0,0,1


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [9]:
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.15, random_state=5)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

rf = RandomForestClassifier(n_estimators=300, max_depth=6, class_weight='balanced').fit(x_tr, y_tr)
log = LogisticRegression(max_iter=1000, class_weight='balanced').fit(x_tr, y_tr)
svc = SVC(class_weight='balanced').fit(x_tr, y_tr)
nb = GaussianNB().fit(x_tr, y_tr)

In [11]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
pred_rf = rf.predict(x_te)
pred_log = log.predict(x_te)
pred_svc = svc.predict(x_te)
pred_nb = nb.predict(x_te)

print(
    'RF - F1/acc: ',f1_score(y_te, pred_rf),'/',accuracy_score(y_te, pred_rf),
    '\nLog - F1/acc: ',f1_score(y_te, pred_log),'/',accuracy_score(y_te, pred_log),
    '\nSVC - F1/acc: ',f1_score(y_te, pred_svc),'/',accuracy_score(y_te, pred_svc),
    '\nNB - F1/acc: ',f1_score(y_te, pred_nb),'/',accuracy_score(y_te,pred_nb),
)

RF - F1/acc:  0.8351648351648351 / 0.7916666666666666 
Log - F1/acc:  0.8282828282828283 / 0.7638888888888888 
SVC - F1/acc:  0.8260869565217391 / 0.7777777777777778 
NB - F1/acc:  0.854368932038835 / 0.7916666666666666


RF - F1/acc:  0.8653846153846154 / 0.8055555555555556 
Log - F1/acc:  0.8761904761904762 / 0.8194444444444444 
SVC - F1/acc:  0.8761904761904762 / 0.8194444444444444 
NB - F1/acc:  0.8431372549019609 / 0.7777777777777778

In [13]:
svc = SVC(C=0.1, kernel='linear', class_weight='balanced').fit(x_tr, y_tr)
pred = svc.predict(x_te)
print('F1/acc: ',f1_score(y_te, pred),'/',accuracy_score(y_te, pred))

F1/acc:  0.8761904761904762 / 0.8194444444444444


In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

estimator = XGBClassifier(n_jobs=-1, class_weight='balanced')
params = {
    'learning_rate': [0.01, 0.1],
    'gamma':[0, 0.1, 1],
    'n_estimators':[100,300,500], 
    'max_depth':[2,4,6,8],
    'reg_alpha': [0,0.1,1,10],
    'reg_lambda': [0,0.1,1,10],
}
# CV only 3 because of lots of combinations
GS = GridSearchCV(estimator, params, cv=3, n_jobs=-1, verbose=2)
GS.fit(X, y)
GS.best_estimator_

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed:  8.3min finished


XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=10, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [15]:
xgb = XGBClassifier(learning_rate=0.01, gamma=0, n_estimators=300, max_depth=2, 
                    reg_alpha=0, reg_lambda=10, class_weight='balanced').fit(x_tr, y_tr) #
pred = svc.predict(x_te)
print('F1/acc: ',f1_score(y_te, pred),'/',accuracy_score(y_te, pred))

F1/acc:  0.8761904761904762 / 0.8194444444444444


This experiment showed that this problem is well suited for linear models such as SVM with linear kernel or Logistic Regression. Maybe Tree methods can beat them in more exhaustive hyperparameter search. Its performed by script named gridsearch.py