# Loan Prediction 06 - Testing Final Dataset

In the real world, the input service may fail to provide all the information the model needs.

For these cases, we can skip the classification and either notify other systems about the inconsistency or fill in missing data.

For the test set of this challenge, variables other than Self_Employed, Loan_Amount_Term and CoapplicantIncome will be imputed with their mean or mode.

Variables:
- Creation of Base_Loan_Installment and Remaining_Income

Missing data treatment:
- Fill in Self_Employed with 'No'.
- Fill in Loan_Amount_Term with 360 for clients who have this value missing.
- Fill in the CoapplicantIncome with 0.

Dataset preprocess:
- Encode categorical variables
- Scale variables

In [11]:
import sys
import math
from joblib import dump, load
sys.path.append('utils')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')

from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder

import metrics_utils 
import model_utils
import preprocess_utils

In [80]:
# Load the raw test set; the bare name on the last line displays it in the notebook.
df_import = pd.read_csv('dataset/test_loan.csv')
df_import

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


### Filling Missing Data

In [81]:
# Fields with a fixed default for missing entries:
# loan term -> 360, self-employment flag -> 'No', co-applicant income -> 0.
df_import['Loan_Amount_Term'] = df_import['Loan_Amount_Term'].fillna(360)
df_import['Self_Employed'] = df_import['Self_Employed'].fillna('No')
df_import['CoapplicantIncome'] = df_import['CoapplicantIncome'].fillna(0)

### Calculating New Variables

In [82]:
def calculate_new_variables(df):
    """Add the derived columns Base_Loan_Installment and Remaining_Income.

    Base_Loan_Installment is ``LoanAmount * 1000 / Loan_Amount_Term``
    (presumably LoanAmount is expressed in thousands -- confirm against the
    dataset description). Remaining_Income is the fraction of the combined
    applicant + co-applicant income left after that installment.

    The frame is modified in place and the same object is returned. Rows
    with a missing LoanAmount end up with NaN in both derived columns.
    """
    installment = df['LoanAmount'].mul(1000).div(df['Loan_Amount_Term'])
    combined_income = df['ApplicantIncome'].add(df['CoapplicantIncome'])

    df['Base_Loan_Installment'] = installment
    df['Remaining_Income'] = combined_income.sub(installment).div(combined_income)
    return df

In [83]:
# Add Base_Loan_Installment and Remaining_Income to the raw test frame.
df_import = calculate_new_variables(df_import)

In [123]:
# Non-null count per column, to see which fields still contain missing values.
df_import.count()

Loan_ID                  367
Gender                   356
Married                  367
Dependents               357
Education                367
Self_Employed            367
ApplicantIncome          367
CoapplicantIncome        367
LoanAmount               362
Loan_Amount_Term         367
Credit_History           338
Property_Area            367
Base_Loan_Installment    362
Remaining_Income         362
dtype: int64

In [129]:
# Feature frame for the models: every column except the Loan_ID identifier.
# df_import itself keeps Loan_ID, which is used later to label the predictions.
X_test = df_import.drop(columns = ['Loan_ID']).copy()

### Encoding Variables

In [130]:
# The encoder is not fitted here: its categories_ attribute is assigned directly
# from a file saved to disk.
# NOTE(review): presumably these categories come from the encoder fitted on the
# training data in an earlier notebook -- confirm.
ordinal_encoder = OrdinalEncoder()
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load this file from a trusted location.
ordinal_encoder.categories_ = np.load('saves/variable_encoder_categories.npy', allow_pickle= True)
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area','Credit_History','Loan_Amount_Term']
# encode_with_nan is a project helper; judging by the output below, it leaves
# missing values as NaN -- TODO confirm against preprocess_utils.
df_encoded_nans = preprocess_utils.encode_with_nan(X_test, categorical_columns, ordinal_encoder)
df_encoded_nans

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Base_Loan_Installment,Remaining_Income
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,305.555556,0.946581
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,350.000000,0.923514
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,577.777778,0.915033
3,1,1,2,0,0,2340,2546,100.0,360.0,,2,277.777778,0.943148
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,216.666667,0.933862
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,1,1,3,1,1,4009,1777,113.0,360.0,1.0,2,313.888889,0.945750
363,1,1,0,0,0,4158,709,115.0,360.0,1.0,2,319.444444,0.934365
364,1,0,0,0,0,3250,1993,126.0,360.0,,1,350.000000,0.933244
365,1,1,0,0,0,5000,2393,158.0,360.0,1.0,0,438.888889,0.940635


### Scaling Test Set

In [139]:
# Load the scaler persisted with joblib and apply it as-is (transform only, no refit).
# NOTE(review): presumably fitted on the training set in an earlier notebook -- confirm.
scaler =  load('saves/minmax_scaler_manual_imputation.bin')

# Re-wrap the transformed values in a DataFrame carrying the original column names.
X_test_norm = pd.DataFrame(data=scaler.transform(df_encoded_nans),columns=df_encoded_nans.columns)
X_test_norm

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Base_Loan_Installment,Remaining_Income
0,1.0,1.0,0.000000,0.0,0.0,0.309187,0.000000,0.272237,0.666667,1.0,1.0,0.183971,0.796543
1,1.0,1.0,0.333333,0.0,0.0,0.162420,0.167038,0.315364,0.666667,1.0,1.0,0.213115,0.692344
2,1.0,1.0,0.666667,0.0,0.0,0.269220,0.200445,0.536388,0.666667,1.0,1.0,0.362477,0.654032
3,1.0,1.0,0.666667,0.0,0.0,0.121565,0.283519,0.245283,0.666667,,1.0,0.165756,0.781035
4,1.0,0.0,0.000000,1.0,0.0,0.173522,0.000000,0.185984,0.666667,1.0,1.0,0.125683,0.739090
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,1.0,1.0,1.000000,1.0,1.0,0.214210,0.197884,0.280323,0.666667,1.0,1.0,0.189435,0.792789
363,1.0,1.0,0.000000,0.0,0.0,0.222481,0.078953,0.285714,0.666667,1.0,1.0,0.193078,0.741361
364,1.0,0.0,0.000000,0.0,0.0,0.172079,0.221938,0.315364,0.666667,,0.5,0.213115,0.736297
365,1.0,1.0,0.000000,0.0,0.0,0.269220,0.266481,0.401617,0.666667,1.0,0.0,0.271403,0.769680


In [140]:
# Non-null count per column after scaling.
X_test_norm.count()

Gender                   356
Married                  367
Dependents               357
Education                367
Self_Employed            367
ApplicantIncome          367
CoapplicantIncome        367
LoanAmount               362
Loan_Amount_Term         367
Credit_History           338
Property_Area            367
Base_Loan_Installment    362
Remaining_Income         362
dtype: int64

### Marking missing values as -999 (only for XGBoost)

In [141]:
# Boolean mask of the missing cells in the scaled frame; it stays defined at
# notebook scope, exactly as before.
nulls = X_test_norm.isnull()
# XGBoost input: a new frame in which every missing cell holds the -999 sentinel.
X_test_xgb = X_test_norm.fillna(-999)

### Replacing Missing Values with Mean/Mode

In [151]:
# Start from the encoded but unscaled frame, so that imputed values go through
# the same scaler as every other value.
X_test_filled = df_encoded_nans.copy()

# Encoded categorical columns: replace missing entries with the column's most
# frequent value. The missing-value mask is derived from X_test_filled itself,
# so this cell does not depend on a mask built in another cell from a
# different frame.
categorical_columns_to_fill = ['Gender', 'Married', 'Dependents', 'Education', 'Property_Area', 'Credit_History']
for col in categorical_columns_to_fill:
    X_test_filled.loc[X_test_filled[col].isnull(), col] = X_test_filled[col].mode()[0]

# LoanAmount: replace missing entries with the column mean.
X_test_filled.loc[X_test_filled['LoanAmount'].isnull(), 'LoanAmount'] = X_test_filled['LoanAmount'].mean()
# Recompute the derived columns, which were NaN wherever LoanAmount was missing.
X_test_filled = calculate_new_variables(X_test_filled)

X_test_filled_norm = pd.DataFrame(data=scaler.transform(X_test_filled), columns=X_test_filled.columns)

In [152]:
# Summary statistics of the imputed, scaled features.
X_test_filled_norm.describe()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Base_Loan_Installment,Remaining_Income
count,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0
mean,0.809264,0.634877,0.268847,0.228883,0.100817,0.258429,0.174786,0.342675,0.618952,0.839237,0.53951,0.323627,0.636088
std,0.393417,0.482122,0.355059,0.420687,0.301498,0.272589,0.259937,0.164275,0.179607,0.367814,0.412168,0.914668,0.850912
min,0.0,0.0,0.0,0.0,0.0,-0.008326,0.0,0.051213,-0.316667,0.0,0.0,0.047359,-12.479868
25%,1.0,0.0,0.0,0.0,0.0,0.150652,0.0,0.247978,0.666667,1.0,0.0,0.173042,0.666104
50%,1.0,1.0,0.0,0.0,0.0,0.201832,0.114143,0.315364,0.666667,1.0,0.5,0.220401,0.725935
75%,1.0,1.0,0.666667,0.0,0.0,0.272551,0.270657,0.40027,0.666667,1.0,1.0,0.301457,0.777737
max,1.0,1.0,1.0,1.0,1.0,4.017707,2.672606,1.458221,1.0,1.0,1.0,14.191257,1.013945


### Preparing for submission

In [160]:
def make_final_prediction(model, data,id_tags):
    """Build a submission frame pairing each loan ID with a 'Y'/'N' status.

    Args:
        model: Object exposing ``predict``; it is called with ``data.values``.
        data: DataFrame of features, one row per loan.
        id_tags: Loan identifiers, one per row of ``data`` in the same order.

    Returns:
        DataFrame with columns ``Loan_ID`` and ``Loan_Status``. Predictions
        equal to 1 become 'Y' and predictions equal to 0 become 'N'; any
        other predicted value is left unchanged.
    """
    predictions = model.predict(data.values).ravel()

    # Relabel on an object-dtype copy. Writing strings into a numeric column
    # through .loc is an incompatible-dtype assignment, which recent pandas
    # versions warn about and plan to reject.
    labels = predictions.astype(object)
    labels[predictions == 1] = 'Y'
    labels[predictions == 0] = 'N'

    df = pd.DataFrame(columns = ['Loan_ID','Loan_Status'])
    df['Loan_ID'] = id_tags
    df['Loan_Status'] = labels
    return df

### Executing Classifications

In [171]:
# Logistic regression: load the saved model, predict on the imputed and scaled
# features, and write the submission CSV.
logistic_reg =  load('saves/logistic_regression_manual_imputation.bin')
df_pred_logistic_reg = make_final_prediction(logistic_reg,X_test_filled_norm, id_tags = df_import['Loan_ID'])
df_pred_logistic_reg.to_csv('submission/logistic_regression_prediction_20200215.csv',index = False)

In [172]:
# Ridge model: load the saved model, predict on the imputed and scaled features,
# and write the submission CSV.
# NOTE(review): make_final_prediction only relabels predictions equal to 0 or 1;
# confirm the saved ridge model emits class labels rather than continuous scores.
ridge_reg =  load('saves/ridge_regression_manual_imputation.bin')
df_pred_ridge_reg = make_final_prediction(ridge_reg,X_test_filled_norm, id_tags = df_import['Loan_ID'])
df_pred_ridge_reg.to_csv('submission/ridge_regression_prediction_20200215.csv',index = False)

In [173]:
# Random forest: load the saved model, predict on the imputed and scaled
# features, and write the submission CSV.
random_forest =  load('saves/random_forest_manual_imputation.bin')
df_pred_random_forest = make_final_prediction(random_forest,X_test_filled_norm, id_tags = df_import['Loan_ID'])
df_pred_random_forest.to_csv('submission/random_forest_prediction_20200215.csv',index = False)

In [174]:
# Gradient boosting: load the saved model, predict on the imputed and scaled
# features, and write the submission CSV.
gboost =  load('saves/gradient_boosting_manual_imputation.bin')
df_pred_gboost = make_final_prediction(gboost,X_test_filled_norm, id_tags = df_import['Loan_ID'])
df_pred_gboost.to_csv('submission/gboost_prediction_20200215.csv',index = False)

In [176]:
# XGBoost on X_test_xgb, the scaled frame in which missing values are marked
# with -999 instead of being imputed.
xgboost =  load('saves/extreme_gradient_boosting_manual_imputation.bin')
df_pred_xgb = make_final_prediction(xgboost,X_test_xgb, id_tags = df_import['Loan_ID'])
df_pred_xgb.to_csv('submission/xgb_no_fill_prediction_20200215.csv',index = False)

In [177]:
# Same saved XGBoost model file, this time applied to the mean/mode-imputed
# features. This rebinds both `xgboost` and `df_pred_xgb`, so the result of
# the -999 run survives only in its own CSV file.
xgboost =  load('saves/extreme_gradient_boosting_manual_imputation.bin')
df_pred_xgb = make_final_prediction(xgboost,X_test_filled_norm, id_tags = df_import['Loan_ID'])
df_pred_xgb.to_csv('submission/xgb_prediction_20200215.csv',index = False)

All the predictions were submitted at the following link:

https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/

