# Loan Prediction 04 - Data Imputation With Random Forest

Let us try to improve the previous results by imputing missing data with a Random Forest.

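But first, the plan was to remove rows with more than one missing value. Note that the corresponding cell below is currently commented out, so the imputation runs on all rows of the dataset.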

In [1]:
import sys

sys.path.append('utils')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and removed in later releases. Prefer the new name when it is
# available and fall back to the legacy one on older Matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

from missingpy import MissForest
from sklearn.preprocessing import OrdinalEncoder

import dataframe_utils
import preprocess_utils



### Loading original dataset

In [30]:
# Read the training CSV and drop the Loan_ID column in a single expression.
df_import = pd.read_csv('dataset/train_loan.csv').drop(columns=['Loan_ID'])
df_import.shape

(614, 12)

### Counting missing values by column

In [31]:
# Number of missing entries in each column.
df_import.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Rows with one or no missing value

In [32]:
# NOTE: this filter (keep only rows with zero or one missing value) is commented
# out, so the cells below operate on every row of df_import.
# null_rows = df_import.isnull().sum(axis = 1)
# df_rows = df_import.loc[(null_rows == 1) | (null_rows == 0),:]
# df_rows

## Replacing missing values with MissForest

Let us prepare the dataset so that we can run the MissForest algorithm.

In [33]:
# Columns passed to the encoding helper as categorical.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Credit_History',
    'Loan_Amount_Term',
    'Loan_Status',
]

# The encoder is not fitted here: its categories are assigned from a saved array.
# NOTE(review): allow_pickle=True unpickles arbitrary objects — only load trusted files.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.categories_ = np.load(
    'saves/variable_encoder_categories.npy',
    allow_pickle=True,
)

# Encode the categorical columns via the project helper (presumably leaving
# missing entries as NaN, per its name — confirm in preprocess_utils).
df_encoded_nans = preprocess_utils.encode_with_nan(
    df_import,
    categorical_columns,
    ordinal_encoder,
)
df_encoded_nans

  result = method(y)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,,360.0,1.0,2,Y
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
610,1,1,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y
611,1,1,1,0,0,8072,240.0,253.0,360.0,1.0,2,Y
612,1,1,2,0,0,7583,0.0,187.0,360.0,1.0,2,Y


Although Credit_History and Loan_Amount_Term are stored as numbers, we will treat them as categorical variables.

That is because each of them takes only a small set of discrete values, as shown below.

In [34]:
# Print the distinct values found in the two numeric-looking columns.
discrete_like_columns = ['Credit_History', 'Loan_Amount_Term']
dataframe_utils.show_column_options(df_import[discrete_like_columns])

Column Values:
Credit_History : [ 1.  0. nan]
Loan_Amount_Term : [360. 120. 240.  nan 180.  60. 300. 480.  36.  84.  12.]


In [36]:
# Separate the features from the target column; only the features are imputed.
# drop() and column selection already return new objects, so no explicit
# copy() is needed.
X = df_encoded_nans.drop(columns=['Loan_Status'])
y = df_encoded_nans[['Loan_Status']]

# Positions of the categorical features inside X, derived from the column names
# instead of being hard-coded, so they stay correct if the column order changes.
# For the current column layout this yields [0, 1, 2, 3, 4, 8, 9, 10].
categorical_index = [
    position
    for position, column in enumerate(X.columns)
    if column in categorical_columns
]
assert categorical_index, 'no categorical columns found in X'

# random_state is fixed so the imputation is reproducible.
imputer = MissForest(oob_score=True, random_state=0, class_weight='balanced')
imputer.fit(X, y, cat_vars=categorical_index)

MissForest(bootstrap=True, class_weight='balanced', copy=True,
           criterion=('mse', 'gini'), decreasing=False, max_depth=None,
           max_features='auto', max_iter=10, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, missing_values=nan, n_estimators=100,
           n_jobs=-1, oob_score=True, random_state=0, verbose=0,
           warm_start=False)

In [37]:
# Impute the missing values with the fitted forest.
X_filled = imputer.transform(X)
# Rebuild a labelled frame from the imputer output, reusing both the column
# labels and the row index of X so rows stay aligned with the source data even
# if rows are filtered upstream.
df_fill = pd.DataFrame(X_filled, columns=X.columns, index=X.index)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3


In [38]:
# Show the feature table after imputation.
df_fill

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,0.0,0.0,0.0,0.0,5849.0,0.0,145.7,360.0,1.0,2.0
1,1.0,1.0,1.0,0.0,0.0,4583.0,1508.0,128.0,360.0,1.0,0.0
2,1.0,1.0,0.0,0.0,1.0,3000.0,0.0,66.0,360.0,1.0,2.0
3,1.0,1.0,0.0,1.0,0.0,2583.0,2358.0,120.0,360.0,1.0,2.0
4,1.0,0.0,0.0,0.0,0.0,6000.0,0.0,141.0,360.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0.0,0.0,0.0,0.0,2900.0,0.0,71.0,360.0,1.0,0.0
610,1.0,1.0,3.0,0.0,0.0,4106.0,0.0,40.0,180.0,1.0,0.0
611,1.0,1.0,1.0,0.0,0.0,8072.0,240.0,253.0,360.0,1.0,2.0
612,1.0,1.0,2.0,0.0,0.0,7583.0,0.0,187.0,360.0,1.0,2.0


In [39]:
# Put the target back next to the imputed features. A plain array is assigned,
# so values are matched by position rather than by index label.
df_fill['Loan_Status'] = y['Loan_Status'].to_numpy()

## Preprocessing after imputation

### Calculating Base_Loan_Installment and Remaining_Income

In [40]:
# Installment per term unit: LoanAmount is scaled by 1000 before dividing by the term
# (presumably LoanAmount is expressed in thousands — confirm against the data dictionary).
base_loan_installment = df_fill['LoanAmount'].mul(1000).div(df_fill['Loan_Amount_Term'])

# Combined income of applicant and co-applicant.
total_income = df_fill['ApplicantIncome'].add(df_fill['CoapplicantIncome'])

# Fraction of the combined income that is left after the installment.
remaining_income = total_income.sub(base_loan_installment).div(total_income)

# Store both derived features as new columns (in this order).
df_fill['Base_Loan_Installment'] = base_loan_installment
df_fill['Remaining_Income'] = remaining_income

### Removing outliers

In [41]:
# Continuous columns that are checked for outliers.
numerical_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Base_Loan_Installment',
    'Remaining_Income',
]

# threshold is presumably a z-score cutoff — TODO confirm in preprocess_utils.
df_fill_no_outlier = preprocess_utils.remove_outliers(
    df_fill,
    numerical_columns,
    threshold=3,
)

(582, 14)


### Encoding Loan_Status

In [42]:
# Map the target labels to integers: 'Y' -> 1 and 'N' -> 0.
# Both masks are computed up front; they are disjoint, so the order of the
# two assignments does not affect the result.
status_is_yes = df_fill_no_outlier['Loan_Status'] == 'Y'
status_is_no = df_fill_no_outlier['Loan_Status'] == 'N'
df_fill_no_outlier.loc[status_is_yes, 'Loan_Status'] = 1
df_fill_no_outlier.loc[status_is_no, 'Loan_Status'] = 0

In [43]:
# Write the imputed, outlier-filtered dataset to disk without the index column.
df_fill_no_outlier.to_csv('dataset/train_rf_imputed.csv',index = False)