In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training set; the path is resolved relative to the working directory.
train_csv_path = 'Train_Loan_Home.csv'
data = pd.read_csv(train_csv_path)

In [3]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Select the target and the feature columns by name rather than by position,
# so the selection cannot silently shift if the CSV gains, loses, or reorders
# columns (for example a persisted index column). `Index.drop` raises KeyError
# when a label is absent, so a schema change fails loudly right here.
columnsY = 'Loan_Status'
# Loan_ID is excluded from the features together with the target column.
columnsX = data.columns.drop(['Loan_ID', columnsY])

In [5]:
# Display the name of the target column selected in the previous cell.
columnsY

'Loan_Status'

In [6]:
# Feature groups; each list is routed to its own branch of the preprocessing
# ColumnTransformer defined further down.

# Imputed with a constant label and one-hot encoded.
catCol = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Dependents',
]

# Median-imputed and standardised.
numWithScalingCol = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Imputed with a constant and passed through without scaling.
numCol = [
    'Credit_History',
]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import ensemble

In [8]:
# Count missing values per column to decide which features need imputation.
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
# Scaled numeric branch: fill gaps with the column median, then standardise.
numeric_transformer_scaling = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Unscaled numeric branch: missing entries are replaced by the constant 99.
# NOTE(review): 99 is presumably a sentinel outside the feature's normal
# values so the model can treat "missing" as its own case — confirm.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=99)),
    ]
)

# Categorical branch: missing entries become the literal category 'missing',
# then everything is one-hot encoded; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its branch. With ColumnTransformer's default
# `remainder='drop'`, columns not named in any group are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_scaling, numWithScalingCol),
        ('num2', numeric_transformer, numCol),
        ('cat', categorical_transformer, catCol),
    ]
)

In [10]:
# End-to-end model: preprocessing followed by a random forest classifier.
# The forest is stochastic (bootstrap samples and feature sub-sampling), so a
# fixed `random_state` is set to make the fitted model, its scores, and the
# pickled artefact reproducible when the notebook is re-run from a fresh kernel.
modelPipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ensemble.RandomForestClassifier(random_state=42)),
])

In [11]:
# Fit the preprocessing steps and the classifier together on the training frame.
trainFeatures = data[columnsX]
trainTarget = data[columnsY]
modelPipeline.fit(trainFeatures, trainTarget)

In [12]:
# Predicted probability of Loan_Status == 'Y' for the training rows.
# The column is looked up in `classes_` instead of assuming it is the last
# one, so the result does not depend on how the labels happen to sort; a
# missing 'Y' class raises ValueError instead of returning the wrong column.
# NOTE: these are in-sample scores (same rows the model was fitted on), and
# `scores` is reassigned by the test-set scoring cell before it is used.
approvedIdx = list(modelPipeline.classes_).index('Y')
scores = modelPipeline.predict_proba(data[columnsX])[:, approvedIdx]

In [14]:
# Load the frame to be scored; the path is resolved relative to the working directory.
test_csv_path = 'Test_Loan_Home.csv'
data2 = pd.read_csv(test_csv_path)

In [15]:
# Predicted probability of Loan_Status == 'Y' for every row of the test frame.
# The column is looked up in `classes_` instead of assuming it is the last
# one, so the result does not depend on how the labels happen to sort; a
# missing 'Y' class raises ValueError instead of returning the wrong column.
approvedIdx = list(modelPipeline.classes_).index('Y')
scores = modelPipeline.predict_proba(data2[columnsX])[:, approvedIdx]

In [16]:
# Map each Loan_ID to its score; if an ID repeats, the last occurrence wins.
scoreOutput = dict(zip(data2['Loan_ID'], scores))

In [20]:
# Display the Loan_ID -> score mapping.
scoreOutput

{'LP001002': 0.99,
 'LP001003': 0.25,
 'LP001005': 0.97,
 'LP001006': 0.95,
 'LP001008': 0.98,
 'LP001011': 0.87,
 'LP001013': 0.95,
 'LP001014': 0.06,
 'LP001018': 0.91,
 'LP001020': 0.09,
 'LP001024': 0.97,
 'LP001027': 0.96,
 'LP001028': 0.86,
 'LP001029': 0.27,
 'LP001030': 0.88,
 'LP001032': 0.99,
 'LP001034': 0.89,
 'LP001036': 0.07,
 'LP001038': 0.25,
 'LP001041': 0.98,
 'LP001043': 0.07,
 'LP001046': 0.89,
 'LP001047': 0.09,
 'LP001050': 0.06,
 'LP001052': 0.27,
 'LP001066': 0.92,
 'LP001068': 1.0,
 'LP001073': 0.97,
 'LP001086': 0.23,
 'LP001087': 0.92,
 'LP001091': 0.3,
 'LP001095': 0.28,
 'LP001097': 0.2,
 'LP001098': 1.0,
 'LP001100': 0.21,
 'LP001106': 1.0,
 'LP001109': 0.05,
 'LP001112': 0.95,
 'LP001114': 0.79,
 'LP001116': 0.96,
 'LP001119': 0.23,
 'LP001120': 0.87,
 'LP001123': 0.88,
 'LP001131': 0.99,
 'LP001136': 0.87,
 'LP001137': 0.93,
 'LP001138': 0.87,
 'LP001144': 0.9,
 'LP001146': 0.11,
 'LP001151': 0.95,
 'LP001155': 1.0,
 'LP001157': 0.97,
 'LP001164': 0.39,


In [17]:
import joblib 

In [18]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# The file is a pickle: only load it back from a trusted location.
modelPath = 'modelPipeline.pkl'
joblib.dump(modelPipeline, modelPath)

['modelPipeline.pkl']