In [32]:
# import libraries
import numpy as np 
import pandas as pd
import plotly.express as px
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , StandardScaler , LabelEncoder , RobustScaler
from sklearn.impute import SimpleImputer , KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.model_selection import train_test_split , cross_validate
from sklearn.metrics import accuracy_score , recall_score , precision_score , f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [33]:
# load data
df = pd.read_pickle("Loan_Prediction_After_EDA.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Gender                 601 non-null    object 
 1   Married                611 non-null    object 
 2   Dependents             599 non-null    object 
 3   Education              614 non-null    object 
 4   Self_Employed          582 non-null    object 
 5   ApplicantIncome        614 non-null    int64  
 6   CoapplicantIncome      614 non-null    float64
 7   LoanAmount             592 non-null    float64
 8   Loan_Amount_Term       600 non-null    float64
 9   Credit_History         564 non-null    float64
 10  Property_Area          614 non-null    object 
 11  Loan_Status            614 non-null    object 
 12  Total_Income           614 non-null    float64
 13  Loan_Monthly_Paid      578 non-null    float64
 14  Income_After_Loan      578 non-null    float64
 15  log_To

In [34]:
# removing unnecessary columns
df.drop(['ApplicantIncome' , 'CoapplicantIncome' , 'Total_Income' , 'Loan_Monthly_Paid' , 'Income_After_Loan'] , axis = 1 , inplace = True)

In [35]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,log_Total_Income,log_Loan_Monthly_Paid,log_Income_After_Loan
0,Male,No,0,Graduate,No,,360.0,1.0,Urban,Y,8.674026,,
1,Male,Yes,1,Graduate,No,128000.0,360.0,1.0,Rural,N,8.714568,12.781437,
2,Male,Yes,0,Graduate,Yes,66000.0,360.0,1.0,Urban,Y,8.006368,12.119061,
3,Male,Yes,0,Not Graduate,No,120000.0,360.0,1.0,Urban,Y,8.505323,12.716898,
4,Male,No,0,Graduate,No,141000.0,360.0,1.0,Urban,Y,8.699515,12.878166,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,71000.0,360.0,1.0,Rural,Y,7.972466,12.192086,
610,Male,Yes,3+,Graduate,No,40000.0,180.0,1.0,Rural,Y,8.320205,12.311433,
611,Male,Yes,1,Graduate,No,253000.0,360.0,1.0,Urban,Y,9.025456,13.462796,
612,Male,Yes,2,Graduate,No,187000.0,360.0,1.0,Urban,Y,8.933664,13.160515,


In [36]:
# pipeline contains imputation for missing values in numerical and categorical columns, encoding, scaling, imbalanced learning

In [66]:
# we will use one hot encoder for all categorical columns,knn imputer for imputing numerical missing values, simple imputer for imputing categorical missing values

In [37]:
# pipeline for numerical data preprocessing using KNNImputer and standard scaler
num_steps = list()
num_steps.append(("Num_Imputer" , KNNImputer()))
num_steps.append(("Scaler" , StandardScaler()))
Num_Pipeline = Pipeline(steps = num_steps)

In [38]:
# pipeline for categorical data preprocessing using simple imputer, one hot encoder
cat_steps = list()
cat_steps.append(("Cat_Imputer" , SimpleImputer(strategy = "most_frequent")))
cat_steps.append(("Encoder" , OneHotEncoder(sparse_output = False , drop = "first")))
Cat_Pipeline = Pipeline(steps = cat_steps)

In [39]:
# splitting dataframe into feature and target variables
x = df.drop("Loan_Status" , axis = 1)
y = df['Loan_Status']

In [40]:
# encoding the target variable using label encoder
y = LabelEncoder().fit_transform(y)

In [41]:
# separating numerical and categorical columns
Num_Columns = x.select_dtypes(include = "number")
Cat_Columns = x.select_dtypes(include = "object_")

In [42]:
Num_Columns

Unnamed: 0,LoanAmount,Loan_Amount_Term,Credit_History,log_Total_Income,log_Loan_Monthly_Paid,log_Income_After_Loan
0,,360.0,1.0,8.674026,,
1,128000.0,360.0,1.0,8.714568,12.781437,
2,66000.0,360.0,1.0,8.006368,12.119061,
3,120000.0,360.0,1.0,8.505323,12.716898,
4,141000.0,360.0,1.0,8.699515,12.878166,
...,...,...,...,...,...,...
609,71000.0,360.0,1.0,7.972466,12.192086,
610,40000.0,180.0,1.0,8.320205,12.311433,
611,253000.0,360.0,1.0,9.025456,13.462796,
612,187000.0,360.0,1.0,8.933664,13.160515,


In [63]:
df['Property_Area']

0          Urban
1          Rural
2          Urban
3          Urban
4          Urban
         ...    
609        Rural
610        Rural
611        Urban
612        Urban
613    Semiurban
Name: Property_Area, Length: 614, dtype: object

In [62]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Loan_Status', 'log_Total_Income', 'log_Loan_Monthly_Paid',
       'log_Income_After_Loan'],
      dtype='object')

In [43]:
Cat_Columns

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,Male,No,0,Graduate,No,Urban
1,Male,Yes,1,Graduate,No,Rural
2,Male,Yes,0,Graduate,Yes,Urban
3,Male,Yes,0,Not Graduate,No,Urban
4,Male,No,0,Graduate,No,Urban
...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,Rural
610,Male,Yes,3+,Graduate,No,Rural
611,Male,Yes,1,Graduate,No,Urban
612,Male,Yes,2,Graduate,No,Urban


In [44]:
# creating a column transformer for numerical and categorical data preprocessing
Transformer = ColumnTransformer(transformers = [('Num' , Num_Pipeline , Num_Columns.columns) , ('Cat' , Cat_Pipeline , Cat_Columns.columns) ] , remainder='passthrough')

In [45]:
# creating a list of classification models for comparison
models = list()
models.append(("LR" , LogisticRegression()))
#models.append(("MNBA" , MultinomialNB()))
models.append(("GNB" , GaussianNB()))
models.append(("SVM" , SVC()))
models.append(("CART" , DecisionTreeClassifier()))
models.append(("RF" , RandomForestClassifier()))
models.append(("XG" , XGBClassifier()))
models.append(("KNN" , KNeighborsClassifier()))

In [46]:
# comparing the performance of different machine learning models using cross-validation and pipeline
for model in models:
    steps = list()
    steps.append(("Preprocessing" , Transformer))
    steps.append(("SmoteTomek" , SMOTETomek(smote = SMOTE(sampling_strategy = {0 : 220} , random_state = 24))))
    steps.append(model)
    pipeline = ImPipeline(steps = steps)
    scores = cross_validate(pipeline , x , y , scoring = "accuracy"  , cv = 5 , return_train_score = True)
    print(model[0])
    print("Train_accuracy" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_accuracy" , scores["test_score"].mean())
    print("-" * 20)
    print("\n") 

LR
Train_accuracy 0.8025267829053035
----------
Test_accuracy 0.7964014394242304
--------------------


GNB
Train_accuracy 0.802121934661302
----------
Test_accuracy 0.7948020791683327
--------------------


SVM
Train_accuracy 0.8310251188051595
----------
Test_accuracy 0.8012661602025857
--------------------


CART
Train_accuracy 0.9478788932492176
----------
Test_accuracy 0.6856190856990536
--------------------


RF
Train_accuracy 0.9543945490371399
----------
Test_accuracy 0.7769025723044116
--------------------


XG
Train_accuracy 0.9543928932161012
----------
Test_accuracy 0.7426496068239372
--------------------


KNN
Train_accuracy 0.8314324507807196
----------
Test_accuracy 0.7621618019458885
--------------------




In [47]:
# pipeline to chain the preprocessing, resampling, classification steps
steps = list()
steps.append(("Preprocessing" , Transformer))
steps.append(("SmoteTomek" , SMOTETomek(smote = SMOTE(sampling_strategy = {0 : 220} , random_state = 24))))
steps.append(("SVM" , SVC()))
pipeline = ImPipeline(steps = steps)
scores = cross_validate(pipeline , x , y , scoring = "accuracy"  , cv = 5 , return_train_score = True , return_estimator = True)
print("Train_accuracy" , scores["train_score"].mean() )
print("-" * 10)
print("Test_accuracy" , scores["test_score"].mean())
print("-" * 20)
print("\n")

Train_accuracy 0.8310251188051595
----------
Test_accuracy 0.8012661602025857
--------------------




In [48]:
# retrieving hyperparameters of the SVM estimator from cross validation results
scores["estimator"][0]["SVM"].get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [49]:
# pipeline["SVM"].get_params()

In [50]:
# tuning hyperparameters of SVM model using grid search cross-validation
params = [
        {'SVM__C' : [0.1 , 0.15 , 0.2 , 0.25 , 0.3 ] , 'SVM__kernel' : ['linear', 'sigmoid' , 'poly']},
        {'SVM__C' : [0.1 , 0.15 , 0.2 , 0.25 , 0.3 ] , 'SVM__kernel' : ['rbf'] , 'SVM__gamma' : [0.01 , 0.02 , 0.03 ,"scale"]},
        {'SVM__C' : [0.1 , 0.15 , 0.2 , 0.25 , 0.3 ] , 'SVM__kernel' : ['poly'] , "SVM__degree" : [2 , 3 , 4 , 5]}
         ]

In [51]:
steps = list()
steps.append(("Preprocessing" , Transformer))
steps.append(("SmoteTomek" , SMOTETomek(smote = SMOTE(sampling_strategy = {0 : 220} , random_state = 24))))
steps.append(("SVM" , SVC()))
pipeline = ImPipeline(steps = steps)

In [52]:
# performing grid search for hyperparameter tuning with cross-validation
grid_search = GridSearchCV(estimator = pipeline , param_grid =  params, scoring = 'accuracy', cv = 5 , n_jobs = -1 , return_train_score = True)
grid_search.fit(x, y)

In [53]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Mean train score: ", grid_search.cv_results_["mean_train_score"].mean())
print("Mean test score: ", grid_search.cv_results_["mean_test_score"].mean())

Best hyperparameters:  {'SVM__C': 0.15, 'SVM__gamma': 'scale', 'SVM__kernel': 'rbf'}
Mean train score:  0.814725050916497
Mean test score:  0.8071738577296353


In [54]:
# retrieving the best estimator
grid_search.best_estimator_

In [55]:
final_model = grid_search.best_estimator_

In [56]:
joblib.dump(final_model , "Model.pkl")
joblib.dump(x.columns,"Input.pkl")

['Input.pkl']

In [61]:
df['LoanAmount'].value_counts()

120000.0    20
110000.0    17
100000.0    15
160000.0    12
187000.0    12
            ..
240000.0     1
214000.0     1
59000.0      1
166000.0     1
253000.0     1
Name: LoanAmount, Length: 203, dtype: int64

In [57]:
x.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'log_Total_Income', 'log_Loan_Monthly_Paid', 'log_Income_After_Loan'],
      dtype='object')

In [64]:
%%writefile streamlit_app.py

import joblib
import pandas as pd
import numpy as np
import streamlit as st
import sklearn
import imblearn

# Load the pre-trained model 
Model = joblib.load("Model.pkl")
# Load the input data 
Inputs = joblib.load("Input.pkl")

def prediction(Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Log_Total_Income,log_Loan_Monthly_Paid,log_Income_After_Loan):
    # create an empty dataframe with column names specified by inputs
    df = pd.DataFrame(columns=Inputs)
    # assign the input values to the dataframe
    df.at[0,"Gender"] = Gender
    df.at[0,"Married"] = Married
    df.at[0,"Dependents"] = Dependents
    df.at[0,"Education"] = Education
    df.at[0,"Self_Employed"] = Self_Employed
    df.at[0,"LoanAmount"] = LoanAmount
    df.at[0,"Loan_Amount_Term"] = Loan_Amount_Term
    df.at[0,"Credit_History"] = Credit_History
    df.at[0,"Property_Area"] = Property_Area
    df.at[0,"Log_Total_Income"] = Log_Total_Income
    df.at[0,"log_Loan_Monthly_Paid"] = log_Loan_Monthly_Paid
    df.at[0,"log_Income_After_Loan"] = log_Income_After_Loan
    # make a prediction using the loaded model and the dataframe
    result = Model.predict(df)[0]
    return result

def Main():
    # set the title of the streamlit app
    st.title("Loan Approval Predictor")
    # create select boxes for user inputs
    Gender = st.selectbox("Gender",['Male' , 'Female'])
    Married = st.selectbox("Married",['No' , 'Yes'])
    Dependents = st.selectbox("Dependents",[ '0' ,  '1' ,  '2' ,  '3+'])
    Education = st.selectbox("Education",['Graduate', 'Not Graduate'])
    Self_Employed = st.selectbox("Self employed",['No' , 'Yes'])
    # create sliders for user inputs
    LoanAmount = st.slider("Loan amount" , min_value = 9000.0 , max_value = 700000.0 , step = 1000.0 , value = 10000.0)
    Loan_Amount_Term = st.slider("Loan amount term" , min_value = 12.0 , max_value = 480.0 , step = 1.0 , value = 10.0)
    Credit_History = st.selectbox("Credit history" , [1 , 0])
    Property_Area = st.selectbox("Property area", ['Urban' , 'Rural' , 'Semi urban'])
    ApplicantIncome = st.slider("Applicant income", min_value = 0.0 , max_value =  81000.0 , step = 5.0,value = 500.0)
    CoapplicantIncome = st.slider("Coapplicant income", min_value = 0.0 , max_value = 41667.0 , step = 5.0,value = 500.0)
    # calculate total income
    Total_Income = ApplicantIncome + CoapplicantIncome
    # calculate monthly loan payment
    Loan_Monthly_Paid = round((LoanAmount * 1000) / Loan_Amount_Term)
    # calculate income after deducting the loan payment
    Income_After_Loan = ApplicantIncome - Loan_Monthly_Paid
    # calculate logarithm of the total income
    Log_Total_Income = np.log(Total_Income)
    # calculate logarithm of the monthly loan payment
    log_Loan_Monthly_Paid = np.log(Loan_Monthly_Paid)
    # calculate logarithm of the income after loan deduction
    log_Income_After_Loan = np.log(Income_After_Loan)
    
    if st.button("Predict"):
        # call the prediction function with the user inputs
        result = prediction(Gender , Married , Dependents , Education , Self_Employed , LoanAmount , Loan_Amount_Term , Credit_History , Property_Area , Log_Total_Income , log_Loan_Monthly_Paid , log_Income_After_Loan)
        list_result = ["Rejected" , "Accepted"]
        st.text(f"Your Loan Is {list_result[result]}.")
Main()
 

Writing loan_app.py
