# Loan Prediction 04 - Data Imputation With Random Forest

Let us try to improve the previous results by imputing missing data with a Random Forest

In [1]:
import sys

sys.path.append('utils')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
plt.style.use('seaborn')

from missingpy import MissForest
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import dataframe_utils



### Loading original dataset

In [2]:
df_import = pd.read_csv('dataset/train_loan_new_variables.csv')
df_import.drop(columns=['Loan_ID'],inplace = True)
df_import.shape

(614, 14)

### Counting missing values by column

In [3]:
df_import.isnull().sum()

Gender                   13
Married                   3
Dependents               15
Education                 0
Self_Employed            32
ApplicantIncome           0
CoapplicantIncome         0
LoanAmount               22
Loan_Amount_Term         14
Credit_History           50
Property_Area             0
Loan_Status               0
Base_Loan_Installment    36
Remaining_Income         36
dtype: int64

### Counting missing values by row

In [5]:
nulls = (df_import.isnull().sum(axis = 1) > 0)
df_null_rows = df_import.loc[nulls,:]
df_null_rows.shape[0]

134

## Replacing missing values with MissForest

Let us prepare the dataset in order to execute the MissForest algorithm

In [6]:
def encode_with_nan(df_input,categorical_columns,ordinal_encoder):
    df = df_input.copy()
    for category,col in zip(ordinal_encoder.categories_,categorical_columns):
        for index, label in enumerate(category):
            df.loc[df[col] == label,col] = index
    return df

In [10]:
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.categories_ = np.load('utils/variable_encoder_categories.npy', allow_pickle= True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area','Credit_History','Loan_Amount_Term','Loan_Status']
df_encoded_nans = encode_with_nan(df_import,categorical_columns,ordinal_encoder)

Although Credit_History and Loan_Amount_Term are represented as numerical values, we will set them as categorical variables. 

That is because these variables have limited options, as shown below.

In [11]:
dataframe_utils.show_column_options(df_import[['Credit_History','Loan_Amount_Term']])

Column Values:
Credit_History : [ 1.  0. nan]
Loan_Amount_Term : [360. 120. 240.  nan 180.  60. 300. 480.  36.  84.  12.]


In [12]:
categorical_index = [0,1,2,3,4,8,9,10]
X = df_encoded_nans.copy().drop(columns = ['Loan_Status'])
y = df_encoded_nans.copy()[['Loan_Status']]
imputer = MissForest()
imputer.fit(X,y,cat_vars = categorical_index)

MissForest(bootstrap=True, class_weight=None, copy=True,
           criterion=('mse', 'gini'), decreasing=False, max_depth=None,
           max_features='auto', max_iter=10, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, missing_values=nan, n_estimators=100,
           n_jobs=-1, oob_score=False, random_state=None, verbose=0,
           warm_start=False)

In [13]:
X_filled = imputer.transform(X)
df_filled = pd.DataFrame(X_filled,columns = X.columns)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [14]:
df_imputed = df_filled.join(y)
df_imputed.head(20)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Base_Loan_Installment,Remaining_Income,Loan_Status
0,1.0,0.0,0.0,0.0,0.0,5849.0,0.0,98.22,360.0,1.0,2.0,275.000556,0.948233,Y
1,1.0,1.0,1.0,0.0,0.0,4583.0,1508.0,128.0,360.0,1.0,0.0,359.111111,0.941042,N
2,1.0,1.0,0.0,0.0,1.0,3000.0,0.0,66.0,360.0,1.0,2.0,185.166667,0.938278,Y
3,1.0,1.0,0.0,1.0,0.0,2583.0,2358.0,120.0,360.0,1.0,2.0,336.666667,0.931863,Y
4,1.0,0.0,0.0,0.0,0.0,6000.0,0.0,141.0,360.0,1.0,2.0,395.583333,0.934069,Y
5,1.0,1.0,2.0,0.0,1.0,5417.0,4196.0,267.0,360.0,1.0,2.0,749.083333,0.922076,Y
6,1.0,1.0,0.0,1.0,0.0,2333.0,1516.0,95.0,360.0,1.0,2.0,266.527778,0.930754,Y
7,1.0,1.0,3.0,0.0,0.0,3036.0,2504.0,158.0,360.0,0.0,1.0,443.277778,0.919986,N
8,1.0,1.0,2.0,0.0,0.0,4006.0,1526.0,168.0,360.0,1.0,2.0,471.333333,0.914799,Y
9,1.0,1.0,1.0,0.0,0.0,12841.0,10968.0,349.0,360.0,1.0,1.0,979.138889,0.958875,N


In [15]:
df_encoded_nans.head(20)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Base_Loan_Installment,Remaining_Income
0,1,0,0,0,0.0,5849,0.0,,360.0,1.0,2,Y,,
1,1,1,1,0,0.0,4583,1508.0,128.0,360.0,1.0,0,N,359.111111,0.941042
2,1,1,0,0,1.0,3000,0.0,66.0,360.0,1.0,2,Y,185.166667,0.938278
3,1,1,0,1,0.0,2583,2358.0,120.0,360.0,1.0,2,Y,336.666667,0.931863
4,1,0,0,0,0.0,6000,0.0,141.0,360.0,1.0,2,Y,395.583333,0.934069
5,1,1,2,0,1.0,5417,4196.0,267.0,360.0,1.0,2,Y,749.083333,0.922076
6,1,1,0,1,0.0,2333,1516.0,95.0,360.0,1.0,2,Y,266.527778,0.930754
7,1,1,3,0,0.0,3036,2504.0,158.0,360.0,0.0,1,N,443.277778,0.919986
8,1,1,2,0,0.0,4006,1526.0,168.0,360.0,1.0,2,Y,471.333333,0.914799
9,1,1,1,0,0.0,12841,10968.0,349.0,360.0,1.0,1,N,979.138889,0.958875


In [18]:
df_imputed.loc[df_imputed['Loan_Status'] == 'Y','Loan_Status'] = 1
df_imputed.loc[df_imputed['Loan_Status'] == 'N','Loan_Status'] = 0

df_imputed.to_csv('dataset/train_rf_imputed.csv',index = False)

We were able to impute all missing data with the MissForest algorithm.

Now, let us see how the models will perform with this new dataset in the next notebook.