In [49]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [50]:
data = pd.read_csv( "train_u6lujuX_CVtuZ9i.csv" )

In [51]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [52]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


## Find missing values

In [53]:
n = len(data)

for c in data.columns:
    
    cnt = data[c].count()
    
    if (cnt < 614):
        print( f"{c}: {data[c].count()},  {data[c].dtype}" )

Gender: 601,  object
Married: 611,  object
Dependents: 599,  object
Self_Employed: 582,  object
LoanAmount: 592,  float64
Loan_Amount_Term: 600,  float64
Credit_History: 564,  float64


## Convert categorical values to integers

In [54]:
catColumns = [ "Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area", "Loan_Status" ] 

for c in catColumns:
    data[c] = data[c].astype('category')

data[ catColumns ] = data[ catColumns ].apply( lambda x: x.cat.codes )

## Fill Nans

In [55]:
data["Dependents"] = data["Dependents"].fillna( 1 )
data["LoanAmount"] = data["LoanAmount"].fillna( data.LoanAmount.mean() )
data["Loan_Amount_Term"] = data["Loan_Amount_Term"].fillna( data.Loan_Amount_Term.mean() )
data["Credit_History"] = data["Credit_History"].fillna( data.Credit_History.mean() )

In [56]:
data.corr()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Gender,1.0,0.277459,0.109018,0.068774,-0.031116,-0.024689,0.073642,0.022241,-0.060394,0.032801,-0.008142,0.02904
Married,0.277459,1.0,0.345269,0.02253,0.030171,0.047691,0.08327,0.141196,-0.103747,-0.003329,-0.00337,0.075547
Dependents,0.109018,0.345269,1.0,0.051414,0.048221,0.115686,0.033955,0.162864,-0.102874,-0.024511,-0.009085,0.014263
Education,0.068774,0.02253,0.051414,1.0,0.00082,-0.14076,-0.06229,-0.166998,-0.077242,-0.077936,-0.065243,-0.085884
Self_Employed,-0.031116,0.030171,0.048221,0.00082,1.0,0.092967,-0.052384,0.082292,-0.017594,-0.035999,-0.02873,-0.011332
ApplicantIncome,-0.024689,0.047691,0.115686,-0.14076,0.092967,1.0,-0.116605,0.56562,-0.045242,-0.014477,-0.0095,-0.00471
CoapplicantIncome,0.073642,0.08327,0.033955,-0.06229,-0.052384,-0.116605,1.0,0.187828,-0.059675,-0.001665,0.010522,-0.059187
LoanAmount,0.022241,0.141196,0.162864,-0.166998,0.082292,0.56562,0.187828,1.0,0.038801,-0.007738,-0.044776,-0.036416
Loan_Amount_Term,-0.060394,-0.103747,-0.102874,-0.077242,-0.017594,-0.045242,-0.059675,0.038801,1.0,0.001395,-0.07762,-0.020974
Credit_History,0.032801,-0.003329,-0.024511,-0.077936,-0.035999,-0.014477,-0.001665,-0.007738,0.001395,1.0,-0.00188,0.540483


## Extract features and targets

In [65]:
features = data[ [ "Gender", "Married", "Dependents", "Education", "Self_Employed", "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term","Credit_History", "Property_Area"] ]
targets = data["Loan_Status"]

In [69]:
params = { "n_estimators" : [2, 3, 4, 5, 6, 7, 8, 9, 10 ,12, 16, 18, 20, 30 ],
           "max_depth" : [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20 ,30 ],
           "min_samples_leaf" : [1, 2, 4, 5, 10 ,15, 20] }

forest = RandomForestClassifier()

gridForest = GridSearchCV( forest, params, cv = 10, n_jobs = -1 )
gridForest.fit( features, targets)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 18, 20, 30], 'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30], 'min_samples_leaf': [1, 2, 4, 5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
gridForest.best_score_

0.8127035830618893

In [71]:
gridForest.best_params_

{'max_depth': 6, 'min_samples_leaf': 1, 'n_estimators': 20}

In [73]:
model = RandomForestClassifier( **gridForest.best_params_ )
model.fit( features, targets )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
pred = model.predict( features )
accuracy_score(pred, targets)

0.8273615635179153