In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score


import pickle
import json

In [2]:
df = pd.read_csv("Loan.csv")
df.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
162,LP001565,Male,Yes,1,Graduate,No,3089,1280.0,121.0,360.0,0.0,Semiurban,N
103,LP001356,Male,Yes,0,Graduate,No,4652,3583.0,,360.0,1.0,Semiurban,Y
485,LP002544,Male,Yes,1,Not Graduate,No,1958,2436.0,131.0,360.0,1.0,Rural,Y
111,LP001387,Female,Yes,0,Graduate,,2929,2333.0,139.0,360.0,1.0,Semiurban,Y
81,LP001266,Male,Yes,1,Graduate,Yes,2395,0.0,,360.0,1.0,Semiurban,Y


In [3]:
df.shape

(614, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
# sns.countplot(df["Gender"]) --> Male
# sns.countplot(df["Married"]) --> Yes
# sns.countplot(df["Dependents"]) --> 0
# sns.countplot(df["Self_Employed"]) --> No

In [8]:
df["Gender"] = df["Gender"].fillna("Male")
df["Married"] = df["Married"].fillna("Yes")
df["Dependents"] = df["Dependents"].fillna("0")
df["Self_Employed"] = df["Self_Employed"].fillna("No")
df["ApplicantIncome"] = df["ApplicantIncome"].astype(int)
df["CoapplicantIncome"] = df["CoapplicantIncome"].astype(int)
df["LoanAmount"] = df["LoanAmount"].fillna(df["LoanAmount"].mean()).astype(int)
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(df["Loan_Amount_Term"].mean()).astype(int)
df["Credit_History"] = df["Credit_History"].fillna(df["Credit_History"].mean()).astype(int)

In [9]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
df.dtypes

Loan_ID              object
Gender               object
Married              object
Dependents           object
Education            object
Self_Employed        object
ApplicantIncome       int32
CoapplicantIncome     int32
LoanAmount            int32
Loan_Amount_Term      int32
Credit_History        int32
Property_Area        object
Loan_Status          object
dtype: object

In [11]:
# df['Gender'].value_counts().to_dict()                 #  {'Male': 502, 'Female': 112}
# df['Married'].value_counts().to_dict()                #  {'Yes': 401, 'No': 213}
# df['Dependents'].value_counts().to_dict()             #  {'0': 360, '1': 102, '2': 101, '3+': 51}
# df['Education'].value_counts().to_dict()              #  {'Graduate': 480, 'Not Graduate': 134}
# df['Self_Employed'].value_counts().to_dict()          #  {'No': 532, 'Yes': 82}
# df['Property_Area'].value_counts().to_dict()          #  {'Semiurban': 233, 'Urban': 202, 'Rural': 179}
# df['Loan_Status'].value_counts().to_dict()            #  {'Y': 422, 'N': 192}

In [12]:
New_df = df.copy()

In [13]:
New_df['Gender'].replace({'Male': 1, 'Female': 0}, inplace = True)
New_df['Married'].replace({'Yes': 1, 'No': 0}, inplace = True)
New_df['Dependents'].replace({'0': 0, '1': 1, '2': 2, '3+': 3}, inplace = True)
New_df['Education'].replace({'Graduate': 1, 'Not Graduate': 0}, inplace = True)
New_df['Self_Employed'].replace({'No': 0, 'Yes': 1}, inplace = True)
New_df['Property_Area'].replace({'Semiurban': 0, 'Urban': 1, 'Rural': 2}, inplace = True)
New_df['Loan_Status'].replace({'Y': 1, 'N': 0}, inplace = True)
New_df.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
556,LP002794,0,0,0,1,0,2667,1625,84,360,0,1,1
271,LP001891,1,1,0,1,0,11146,0,136,360,1,1,1
325,LP002067,1,1,1,1,1,8666,4983,376,360,0,2,0
199,LP001673,1,0,0,1,1,11000,0,83,360,1,1,0
276,LP001903,1,1,0,1,0,3993,3274,207,360,1,0,1


In [14]:
X = New_df.drop(["Loan_ID","Loan_Status"],axis=1)
Y = New_df["Loan_Status"]

In [15]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [16]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((491, 11), (123, 11), (491,), (123,))

In [17]:
Logistic_Model = LogisticRegression()
Logistic_Model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [18]:
train_score = Logistic_Model.score(x_train,y_train)
print(train_score)

test_score = Logistic_Model.score(x_test,y_test)
print(test_score)

0.7678207739307535
0.7398373983739838


In [19]:
y_pred = Logistic_Model.predict(x_test)

In [20]:
accuracy_score(y_test,y_pred)

0.7398373983739838

In [21]:
# Evalution on Training Data Set 

def model_eval(x_data,y_data,Logistic_Model):
    y_pred = Logistic_Model.predict(x_data)

    con_matrix = confusion_matrix(y_data,y_pred)
    print(f"Confusion Matrix = \n{con_matrix}")
    print("*"*50)

    clf_report = classification_report(y_data,y_pred)
    print(f"Classification Report =\n{clf_report}")
    print("*"*50)

    acc_score = accuracy_score(y_data,y_pred)
    print(f"Accuracy Score = {acc_score}")
    print("*"*50)

    precision = precision_score(y_data,y_pred)
    print(f"Precision = {precision}")

    recall = recall_score(y_data,y_pred)
    print(f"Recall = {recall}")

    f1 = f1_score(y_data,y_pred)
    print(f"f1- Score = {f1}")
    print("*"*50)
    
    return "SUCCESS"

In [22]:
 model_eval(x_train,y_train,Logistic_Model)

Confusion Matrix = 
[[ 67  82]
 [ 32 310]]
**************************************************
Classification Report =
              precision    recall  f1-score   support

           0       0.68      0.45      0.54       149
           1       0.79      0.91      0.84       342

    accuracy                           0.77       491
   macro avg       0.73      0.68      0.69       491
weighted avg       0.76      0.77      0.75       491

**************************************************
Accuracy Score = 0.7678207739307535
**************************************************
Precision = 0.7908163265306123
Recall = 0.9064327485380117
f1- Score = 0.8446866485013624
**************************************************


'SUCCESS'

In [23]:
dict_file = {"Column_Names":X.columns.tolist(),
             "Gender":{'Male': 1, 'Female': 0},
             "Married":{'Yes': 1, 'No': 0},
             "Dependents":{'0': 0, '1': 1, '2': 2, '3+': 3},
             "Education":{'Graduate': 1, 'Not Graduate': 0},
             "Self_Employed":{'No': 0, 'Yes': 1},
             "Property_Area":{'Semiurban': 0, 'Urban': 1, 'Rural': 2},
             "Loan_Status":{'Y': 1, 'N': 0}
            }

with open("dict_file.json","w") as file:
    json.dump(dict_file,file)

In [24]:
Gender            = "Male"
Married           = "Yes"
Dependents        = "0"
Education         = "Graduate"
Self_Employed     = "No"
ApplicantIncome   = 11146
CoapplicantIncome = 0
LoanAmount        = 136
Loan_Amount_Term  = 360
Credit_History    = 1
Property_Area     = "Urban"

# Preprocessing 

if Gender == "Male":
    Gender = 1
else: 
    Gender = 0
    
if Married == "Yes":
    Married = 1
else: 
    Married = 0
    
if Dependents == "0":
    Dependents = 0
elif Dependents == "1": 
    Dependents = 1
elif Dependents == "2": 
    Dependents = 2
else: 
    Dependents = 3

if Education == "Graduate":
    Education = 1
else: 
    Education = 0

if Self_Employed == "Yes":
    Self_Employed = 1
else: 
    Self_Employed = 0

if Property_Area == "Semiurban":
    Property_Area = 0
elif Property_Area == "Urban": 
    Property_Area = 1
else: 
    Property_Area = 2



user_data = np.zeros(len(X.columns))
user_data[0]    = Gender
user_data[1]    = Married
user_data[2]    = Dependents
user_data[3]    = Education
user_data[4]    = Self_Employed
user_data[5]    = ApplicantIncome
user_data[6]    = CoapplicantIncome
user_data[7]    = LoanAmount
user_data[8]    = Loan_Amount_Term
user_data[9]    = Credit_History
user_data[10]   = Property_Area

result = Logistic_Model.predict([user_data])[0]
if result == 1:
    result = "Congratulation You Are Eligible For Loan"
else: 
    result = " Sorry! You Are Not Eligible For Loan"

result



'Congratulation You Are Eligible For Loan'

In [25]:
with open('Loan_Pred_Model.pkl','wb') as file: 
    pickle.dump(Logistic_Model,file)