# To predict loan status



### Business Objective:
To predict whether a customer's loan application will be approved (the `Loan_Status` column).

### Data Set Details:
It is not easy to get a loan from a bank: approval depends on a complex mix of factors, not the least of which is a steady income. This ML project aims **to create a model that classifies whether a user's loan application will be approved** based on various factors such as the user’s marital status, income, education, employment prospects, number of dependents, etc. The attached dataset provides details about all these factors, which are then used to build an ML model that predicts the loan status (`Loan_Status`).

In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and test splits from CSV files in the working directory.
loan_train=pd.read_csv('train.csv')
loan_test=pd.read_csv('test.csv')


In [2]:
# Preview the first rows of the training data.
loan_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Preview the first rows of the test data (it has no Loan_Status column).
loan_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [4]:
# Column dtypes and non-null counts for the training data.
loan_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
# Column dtypes and non-null counts for the test data.
loan_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


# Exploratory Data Analysis (EDA) on Train & Test Data

In [6]:
# (rows, columns) of the training data.
loan_train.shape

(614, 13)

In [7]:
# (rows, columns) of the test data.
loan_test.shape

(367, 12)

In [8]:
# dtype of every training column.
loan_train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [9]:
# dtype of every test column.
loan_test.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [10]:
# Summary statistics for the numeric training columns.
loan_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [None]:
# Summary statistics for the numeric test columns.
loan_test.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [None]:
# Number of missing values per training column.
loan_train.isnull().sum()

In [None]:
# Number of missing values per test column.
loan_test.isnull().sum()

In [None]:
# Column names of the training data.
loan_train.columns

In [None]:
# Column names of the test data.
loan_test.columns

In [None]:
# Number of distinct values in each training column.
loan_train.nunique()

In [None]:
# Number of distinct values in each test column.
loan_test.nunique()

In [None]:
# How many training columns there are of each dtype.
loan_train.dtypes.value_counts()

In [None]:
# How many test columns there are of each dtype.
loan_test.dtypes.value_counts()

In [None]:
# Unique classes in each categorical (object) column of the training data.
# Loan_ID is excluded by name (it is an identifier, not a category) and the
# columns are iterated directly, so the cell keeps working when the number of
# object columns changes or Loan_ID has already been dropped.
object_var = loan_train.select_dtypes('object').columns.drop('Loan_ID', errors='ignore')
unique_values = [loan_train[col].unique() for col in object_var]
object_var_table = pd.DataFrame({"Object_variable": object_var, "Unique_Values": unique_values})
object_var_table

In [None]:
# Unique classes in each categorical (object) column of the test data.
# Loan_ID is excluded by name (it is an identifier, not a category) and the
# columns are iterated directly, so the cell keeps working when the number of
# object columns changes or Loan_ID has already been dropped.
object_var = loan_test.select_dtypes('object').columns.drop('Loan_ID', errors='ignore')
unique_values = [loan_test[col].unique() for col in object_var]
object_var_table = pd.DataFrame({"Object_variable": object_var, "Unique_Values": unique_values})
object_var_table

In [None]:
# Describe the categorical (object-dtype) columns in train.
loan_train.describe(include=['O'])

In [None]:
# Describe the categorical (object-dtype) columns in test.
loan_test.describe(include=['O'])

# Data Visualization

In [None]:
import warnings 
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# pandas calls below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


In [None]:
# Univariate analysis on train: class counts for each categorical feature.
# The column is passed as x=...; seaborn only accepts `data` positionally, so
# the old positional form fails on current releases.
count_cols = ['Gender', 'Married', 'Credit_History', 'Education', 'Self_Employed', 'Property_Area']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, col in zip(axes.ravel(), count_cols):
    sns.countplot(x=col, data=loan_train, ax=ax)
plt.show()


In [None]:
# Univariate analysis on test: class counts for each categorical feature.
# The column is passed as x=...; seaborn only accepts `data` positionally, so
# the old positional form fails on current releases.
count_cols = ['Gender', 'Married', 'Credit_History', 'Education', 'Self_Employed', 'Property_Area']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, col in zip(axes.ravel(), count_cols):
    sns.countplot(x=col, data=loan_test, ax=ax)
plt.show()

In [None]:
# Number of applicants per Dependents class, split by loan outcome.
# The column is passed as x=... because seaborn only accepts `data` positionally.
sns.countplot(x='Dependents',hue='Loan_Status',data=loan_train)

Most applicants have no dependents, and most of those applicants had their loans approved.

In [None]:
# Bivariate analysis on train: categorical feature counts split by Loan_Status.
# The column is passed as x=... because seaborn only accepts `data` positionally.
count_cols = ['Gender', 'Married', 'Credit_History', 'Education', 'Self_Employed', 'Property_Area']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, col in zip(axes.ravel(), count_cols):
    sns.countplot(x=col, hue='Loan_Status', data=loan_train, ax=ax)
plt.show()

In [None]:
## On train data: histogram (left) and box plot (right) for each numeric feature.
# Columns are named explicitly with x=...: a Series passed positionally to
# boxplot is interpreted differently across seaborn versions.
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
fig, axes = plt.subplots(4, 2, figsize=(20, 15))
for (hist_ax, box_ax), col in zip(axes, num_cols):
    sns.histplot(x=col, data=loan_train, ax=hist_ax)
    sns.boxplot(x=col, data=loan_train, ax=box_ax)
plt.show()

In [None]:
## On test data: histogram (left) and box plot (right) for each numeric feature.
# Columns are named explicitly with x=...: a Series passed positionally to
# boxplot is interpreted differently across seaborn versions.
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
fig, axes = plt.subplots(4, 2, figsize=(20, 15))
for (hist_ax, box_ax), col in zip(axes, num_cols):
    sns.histplot(x=col, data=loan_test, ax=hist_ax)
    sns.boxplot(x=col, data=loan_test, ax=box_ax)
plt.show()

In [None]:
## Train dataset: raw distribution (left) vs. log-transformed distribution (right).
# histplot(kde=True) replaces the deprecated distplot, and log1p (= log(x + 1))
# is used for every column so that zero values are safe and the train plots
# match the test plots.
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
fig, axes = plt.subplots(4, 2, figsize=(20, 15))
for (raw_ax, log_ax), col in zip(axes, num_cols):
    sns.histplot(loan_train[col], kde=True, stat='density', ax=raw_ax)
    sns.histplot(np.log1p(loan_train[col]), kde=True, stat='density', ax=log_ax)
    log_ax.set_xlabel('log1p({})'.format(col))
plt.show()

In [None]:
## Test dataset: raw distribution (left) vs. log-transformed distribution (right).
# histplot(kde=True) replaces the deprecated distplot; log1p is log(x + 1),
# which stays finite for the zero incomes present in the test data.
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
fig, axes = plt.subplots(4, 2, figsize=(20, 15))
for (raw_ax, log_ax), col in zip(axes, num_cols):
    sns.histplot(loan_test[col], kde=True, stat='density', ax=raw_ax)
    sns.histplot(np.log1p(loan_test[col]), kde=True, stat='density', ax=log_ax)
    log_ax.set_xlabel('log1p({})'.format(col))
plt.show()

In [None]:
## Train data: pairwise scatter plots of the income / loan-amount features.
# Axis labels are set explicitly; scatter() does not label axes from `data` keys.
fig, (amount_ax, income_ax) = plt.subplots(1, 2, figsize=(15, 10))
amount_ax.scatter('LoanAmount', 'ApplicantIncome', data=loan_train)
amount_ax.set(xlabel='LoanAmount', ylabel='ApplicantIncome')
income_ax.scatter('ApplicantIncome', 'CoapplicantIncome', data=loan_train)
income_ax.set(xlabel='ApplicantIncome', ylabel='CoapplicantIncome')
plt.show()

In [None]:
## Test data: pairwise scatter plots of the income / loan-amount features.
# Axis labels are set explicitly; scatter() does not label axes from `data` keys.
fig, (amount_ax, income_ax) = plt.subplots(1, 2, figsize=(15, 10))
amount_ax.scatter('LoanAmount', 'ApplicantIncome', data=loan_test)
amount_ax.set(xlabel='LoanAmount', ylabel='ApplicantIncome')
income_ax.scatter('ApplicantIncome', 'CoapplicantIncome', data=loan_test)
income_ax.set(xlabel='ApplicantIncome', ylabel='CoapplicantIncome')
plt.show()

In [None]:
# Target variable: check whether the two Loan_Status classes are balanced.
# The column is passed as x=... with data=..., the form seaborn supports across
# versions; the stray subplot(122) that left half of the figure empty is gone.
plt.figure(figsize=(6, 4))
sns.countplot(x='Loan_Status', data=loan_train)
plt.show()
print(loan_train['Loan_Status'].value_counts())

The class counts show that the data is imbalanced, so we need to balance it before modelling.

# Preprocessing on train Data

In [None]:
# Drop the 'Loan_ID' column from both frames: it is a row identifier, not a feature.
# A new frame is bound instead of mutating in place, and errors='ignore' keeps
# the cell re-runnable once the column is already gone.
loan_train = loan_train.drop(columns='Loan_ID', errors='ignore')
loan_test = loan_test.drop(columns='Loan_ID', errors='ignore')

In [None]:
# Check whether the training data contains any fully duplicated rows.
loan_train.duplicated().any()


In [None]:
# Check whether the test data contains any fully duplicated rows.
loan_test.duplicated().any()

In [None]:
# Heat map of the null mask of the training data (rows = records, columns = features).
plt.figure(figsize=(20,5))
sns.heatmap(loan_train.isnull(), cbar=False);plt.show()



In [None]:
# Missing values per training column, before imputation.
loan_train.isna().sum()

In [None]:
# Heat map of the null mask of the test data (rows = records, columns = features).
plt.figure(figsize=(20,5))
sns.heatmap(loan_test.isnull(), cbar=False);plt.show()

In [None]:
# Missing values per test column, before imputation.
loan_test.isna().sum()

In [None]:
# Fill missing categorical values in train with each column's mode (most frequent value).
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on a column selection: that form mutates a temporary object under pandas
# copy-on-write and leaves the frame unchanged.
for col in ['Gender', 'Married', 'Self_Employed', 'Dependents', 'Credit_History']:
    loan_train[col] = loan_train[col].fillna(loan_train[col].mode()[0])

In [None]:
# Fill missing numeric values in train with the column mean.
# NOTE(review): the previous comment said "median" but the code has always used
# the mean; the median would be more robust to the outliers visible in the box
# plots — confirm which is intended before changing the statistic.
# The filled column is assigned back because fillna(..., inplace=True) on a column
# selection does not update the frame under pandas copy-on-write.
for col in ['LoanAmount', 'Loan_Amount_Term']:
    loan_train[col] = loan_train[col].fillna(loan_train[col].mean())



In [None]:
# Confirm that no missing values remain in train after imputation.
loan_train.isna().sum()

In [None]:
# Column dtypes of train after imputation.
loan_train.dtypes

In [None]:
# Fill missing categorical values in test with each column's mode (most frequent value).
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on a column selection: that form mutates a temporary object under pandas
# copy-on-write and leaves the frame unchanged.
# NOTE(review): the modes are computed on the test data itself; reusing the
# training modes would keep train and test preprocessing identical — confirm.
for col in ['Gender', 'Self_Employed', 'Dependents', 'Credit_History']:
    loan_test[col] = loan_test[col].fillna(loan_test[col].mode()[0])


In [None]:
# Fill missing numeric values in test with the column mean.
# NOTE(review): the previous comment said "median" but the code has always used
# the mean — confirm which statistic is intended.
# The filled column is assigned back because fillna(..., inplace=True) on a column
# selection does not update the frame under pandas copy-on-write.
for col in ['LoanAmount', 'Loan_Amount_Term']:
    loan_test[col] = loan_test[col].fillna(loan_test[col].mean())

In [None]:
# Confirm that no missing values remain in test after imputation.
loan_test.isna().sum()

In [None]:
# Column dtypes of test after imputation.
loan_test.dtypes

In [None]:
# Convert the categorical columns of train to integer codes with LabelEncoder.
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()

# The encoded array is assigned directly. Wrapping it in pd.DataFrame made the
# assignment align on the index, which silently produces NaN whenever the frame's
# index is not 0..n-1.
# `lb` is refit for every column and stays defined for the cells below.
for col in ['Gender', 'Married', 'Dependents', 'Education', 'Loan_Status', 'Self_Employed', 'Property_Area']:
    loan_train[col] = lb.fit_transform(loan_train[col])

In [None]:
# Inspect the first training rows after label encoding.
loan_train.head()

In [None]:
# Combine ApplicantIncome and CoapplicantIncome into a single TotalIncome feature (train).
loan_train['TotalIncome'] = loan_train['ApplicantIncome'] + loan_train['CoapplicantIncome']

In [None]:
# Inspect the first training rows with the new TotalIncome column.
loan_train.head()

In [None]:
# TotalIncome now carries the income information, so remove the two source columns from train.
income_parts = ['ApplicantIncome', 'CoapplicantIncome']
loan_train = loan_train.drop(columns=income_parts)

In [None]:
# Inspect the first training rows after dropping the separate income columns.
loan_train.head()

In [None]:
# Convert the categorical columns of test to integer codes with LabelEncoder.
# The encoded array is assigned directly. Wrapping it in pd.DataFrame made the
# assignment align on the index, which silently produces NaN whenever the frame's
# index is not 0..n-1.
# NOTE(review): `lb` is refit on the test data, so the integer codes match the
# training codes only if both splits contain the same set of categories — confirm.
for col in ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']:
    loan_test[col] = lb.fit_transform(loan_test[col])

In [None]:
# Inspect the first test rows after label encoding.
loan_test.head()

In [None]:
# Combine ApplicantIncome and CoapplicantIncome into a single TotalIncome feature (test).
loan_test['TotalIncome'] = loan_test['ApplicantIncome'] + loan_test['CoapplicantIncome']

In [None]:
# Inspect the first test rows with the new TotalIncome column.
loan_test.head()

In [None]:
# TotalIncome now carries the income information, so remove the two source columns from test.
income_parts = ['ApplicantIncome', 'CoapplicantIncome']
loan_test = loan_test.drop(columns=income_parts)

In [None]:
# Inspect the first test rows after dropping the separate income columns.
loan_test.head()

In [None]:
# Transform Loan_Amount_Term, LoanAmount and TotalIncome in the training data.
# Each column is passed through the natural logarithm twice.
# NOTE(review): log(log(x)) is NaN/-inf for x <= 1 and negative for x < e, and a
# single log is the usual skew correction — confirm the double log is intended.
# The test data receives the same transform in a later cell, so the two cells
# must be changed together.

loan_train['TotalIncome']=np.log(np.log(loan_train['TotalIncome']))
loan_train['LoanAmount']=np.log(np.log(loan_train['LoanAmount']))
loan_train['Loan_Amount_Term']=np.log(np.log(loan_train['Loan_Amount_Term']))


In [None]:
# Inspect the first training rows after the log transform.
loan_train.head()

In [None]:
# Transform Loan_Amount_Term, LoanAmount and TotalIncome in the test data.
# Each column is passed through the natural logarithm twice, mirroring the
# transform applied to the training data.
# NOTE(review): log(log(x)) is NaN/-inf for x <= 1. describe() on the test data
# reports a minimum of 0 for both ApplicantIncome and CoapplicantIncome, so
# TotalIncome may contain 0 here — verify that no NaN/inf reaches the model.

loan_test['TotalIncome']=np.log(np.log(loan_test['TotalIncome']))
loan_test['LoanAmount']=np.log(np.log(loan_test['LoanAmount']))
loan_test['Loan_Amount_Term']=np.log(np.log(loan_test['Loan_Amount_Term']))

In [None]:
# Inspect the first test rows after the log transform.
loan_test.head()

In [None]:
# Feature matrix for the unlabelled test data: every row and column of loan_test.
# NOTE(review): the name X_test is rebound by train_test_split in the XGBoost
# section further down — consider a distinct name for one of the two.
X_test=loan_test.iloc[:,0:]
X_test

In [None]:
# Training feature matrix: every column of loan_train except the target.
X=loan_train.drop('Loan_Status',axis=1)
X

In [None]:
# Target: y is the Loan_Status Series, Y is the same data wrapped in a one-column DataFrame.
y=loan_train['Loan_Status']
Y=pd.DataFrame(y)
Y

In [None]:
# Feature selection: score every feature against the target with the chi-squared test.
from sklearn.feature_selection import SelectKBest,chi2
from numpy import set_printoptions

# Selector that keeps the 6 highest-scoring features.
test=SelectKBest(score_func=chi2,k=6)
fit=test.fit(X,Y)

# Print the raw scores with three decimals, then keep the selected columns.
set_printoptions(precision=3)
print(fit.scores_)
features=fit.transform(X)

# Feature names next to their chi-squared scores, best first.
c=pd.DataFrame({'Features':X.columns,'chi2_scores':fit.scores_})
c.sort_values(by='chi2_scores',ascending=False)

# Balancing the data

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 
# Class counts before resampling.
print('Original data set shape %s' % Counter(y))
# Random over-sampling of the minority class only (sampling_strategy='minority'), seeded with 1.
imb = RandomOverSampler(sampling_strategy='minority',random_state=1)
X_res,Y_res = imb.fit_resample(X,y)
# Class counts after resampling.
# NOTE(review): resampling happens before any train/validation split, so copies of
# the same row can end up on both sides of later splits and inflate the scores.
print('Resample data set shape %s' % Counter(Y_res))

In [None]:
# Model validation method

# K-fold cross-validation of a logistic-regression baseline

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import warnings
# Silence every warning from here on.
# NOTE(review): this also hides convergence and deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# 10-fold cross-validation of a logistic-regression baseline on the resampled data.
# shuffle=True is required for random_state to have any effect; scikit-learn
# raises a ValueError when random_state is set without it.
kfold=KFold(n_splits=10,shuffle=True,random_state=7)
LR=LogisticRegression()
results=cross_val_score(LR,X_res,Y_res,cv=kfold)

In [None]:
# Mean cross-validation score of the logistic regression, as a percentage.
results.mean()*100


In [None]:
# Standard deviation of the cross-validation scores, as a percentage.
results.std()*100.0

In [None]:
# Random forest classification on the training data, scored with the same folds.
from sklearn.ensemble import RandomForestClassifier
# random_state pins the bootstrap samples and feature draws so the score is reproducible.
model_rf=RandomForestClassifier(n_estimators=100,max_features=3,random_state=7)
result_rf=cross_val_score(model_rf,X_res,Y_res,cv=kfold)
result_rf.mean()*100

In [None]:
# Fit the random forest on the full resampled training data.
model_rf.fit(X_res,Y_res)


In [None]:
# Predict Loan_Status for the unlabelled test data and show the predictions as a DataFrame.
y_pred=model_rf.predict(loan_test)
y_pred_test=pd.DataFrame(y_pred)
y_pred_test

In [None]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import cross_val_predict

# The test file has no Loan_Status column, so its predictions cannot be scored;
# comparing them with the 614 training labels fails on mismatched lengths.
# Instead, report on out-of-fold predictions for the labelled, resampled training data.
# NOTE(review): the folds are drawn after oversampling, so these scores are optimistic.
y_oof_rf=cross_val_predict(model_rf,X_res,Y_res,cv=10)
print(classification_report(Y_res,y_oof_rf))

In [None]:
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_predict

# Precision must be computed on labelled rows, with the true labels as the first
# argument. The unlabelled test predictions cannot be compared with the training labels.
# Out-of-fold predictions on the resampled training data are used instead.
y_oof_rf=cross_val_predict(model_rf,X_res,Y_res,cv=10)
print("Precision score: {}".format(precision_score(Y_res,y_oof_rf)))

In [None]:
# Using XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

# Hold out 33% of the ORIGINAL data first (stratified on the target), then
# oversample the minority class in the training part only. Splitting the already
# oversampled data puts copies of the same row in both parts and inflates the
# hold-out accuracy.
# Note: X_test now refers to this labelled hold-out set, not to the unlabelled test file.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.33,stratify=y)
X_train,y_train=RandomOverSampler(sampling_strategy='minority',random_state=1).fit_resample(X_train,y_train)

In [None]:
# Shapes of the train / hold-out feature matrices and label vectors.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Fit an XGBoost classifier with default hyper-parameters on the training split.
model_XG = XGBClassifier()
model_XG.fit(X_train, y_train)

In [None]:
# Make predictions for the hold-out split produced by train_test_split.
y_pred_xg = model_XG.predict(X_test)

y_pred_xg

In [None]:
# Evaluate the predictions: accuracy on the hold-out split, printed as a percentage.
accuracy = accuracy_score(y_test, y_pred_xg)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Exploratory Data Analysis-(EDA) on Test Data