#  **Loan Status Prediction Level 2**


## Importing Libraries

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import warnings
import sklearn
warnings.filterwarnings('ignore')


## Loading the Data Set

In [2]:
df=pd.read_csv("train_u6lujuX_CVtuZ9i (1).csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## Missing values checking

In [3]:

df.isnull().sum()*100/len(df)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [4]:
df=df.drop('Loan_ID',axis=1)
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## Handling the missing values

In [5]:
columns=['Gender',"Dependents","LoanAmount","Loan_Amount_Term","Self_Employed","Credit_History"]
df=df.dropna(subset=columns)
df.isnull().sum()*100/len(df)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

## Handling Categorical Columns

In [6]:
df["Dependents"].value_counts()

0     274
2      85
1      80
3+     41
Name: Dependents, dtype: int64

In [7]:
df["Dependents"].replace({'3+':4},inplace=True)

df['Dependents'].value_counts()

0    274
2     85
1     80
4     41
Name: Dependents, dtype: int64

In [8]:
df['Gender']=df["Gender"].map({"Male":1,"Female":0}).astype('int')
df['Married']=df["Married"].map({"Yes":1,"No":0}).astype('int')
df['Education']=df["Education"].map({"Graduate":1,"Not Graduate":0}).astype('int')
df['Self_Employed']=df["Self_Employed"].map({"No":0,"Yes":1}).astype('int')
df['Property_Area']=df["Property_Area"].map({"Urban":1,"Semiurban":2,"Rural":0}).astype('int')

In [9]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,N
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,Y
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,Y
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,Y
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1,Y


## Storing Feature Matrix in x and Target Variable in Vector y

In [10]:
x=df.drop("Loan_Status",axis=1)
y=df["Loan_Status"]

In [11]:
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,1


## Feature Scaling

In [12]:
cols=["ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term"]
from sklearn.preprocessing import StandardScaler
st=StandardScaler()
x[cols]=st.fit_transform(x[cols])

In [13]:
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.137970,-0.027952,-0.208089,0.275542,1.0,0
2,1,1,0,1,1,-0.417536,-0.604633,-0.979001,0.275542,1.0,1
3,1,1,0,0,0,-0.491180,0.297100,-0.307562,0.275542,1.0,1
4,1,0,0,1,0,0.112280,-0.604633,-0.046446,0.275542,1.0,1
5,1,1,2,1,1,0.009319,0.999978,1.520245,0.275542,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.435196,-0.604633,-0.916831,0.275542,1.0,0
610,1,1,4,1,0,-0.222210,-0.604633,-1.302286,-2.487549,1.0,0
611,1,1,1,1,0,0.478206,-0.512854,1.346168,0.275542,1.0,1
612,1,1,2,1,0,0.391846,-0.604633,0.525520,0.275542,1.0,1


## Splitting the Data into Training and Test Set and Applying K-fold Cross Validation

In [14]:
from sklearn.model_selection import cross_val_score
model_df={}
def model_val(model,x,y):
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.20)
    
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f"{model} accuracy is{accuracy_score(y_test,y_pred)}")

    score=cross_val_score(model,x,y,cv=5)
    print(f"{model} cross validation score is{np.mean(score)}")

    model_df[model]=round(np.mean(score)*100,2)

### Logistic Regression

In [15]:

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model_val(model,x,y)

LogisticRegression() accuracy is0.8229166666666666
LogisticRegression() cross validation score is0.8020833333333334


### SVC(Support Vector Classifier)

In [16]:
from sklearn import svm
model=svm.SVC()
model_val(model,x,y)

SVC() accuracy is0.8020833333333334
SVC() cross validation score is0.7979166666666667


### Decision Tree Classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model_val(model,x,y)

DecisionTreeClassifier() accuracy is0.7395833333333334
DecisionTreeClassifier() cross validation score is0.7166666666666666


### Randon Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model_val(model,x,y)

RandomForestClassifier() accuracy is0.8229166666666666
RandomForestClassifier() cross validation score is0.79375


### Gradient Boosting Classifier

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
model=GradientBoostingClassifier()
model_val(model,x,y)

GradientBoostingClassifier() accuracy is0.7916666666666666
GradientBoostingClassifier() cross validation score is0.7854166666666667


In [20]:
model_df

{LogisticRegression(): 80.21,
 SVC(): 79.79,
 DecisionTreeClassifier(): 71.67,
 RandomForestClassifier(): 79.38,
 GradientBoostingClassifier(): 78.54}

## Hyper Parameter Tuning

In [21]:
from sklearn.model_selection import RandomizedSearchCV

### Logistic Regression

In [22]:

log_params={"C":[1.0,2.0,3.0,4.5,5.5,5.0,8.0,8.3,2.2,20.0,15.0],
            "solver":['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            "max_iter":[100,200,500,1000]}
log_rand=RandomizedSearchCV(LogisticRegression(),param_distributions=log_params,n_iter=20,verbose=True,cv=10)
log_rand.fit(x,y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [23]:
print(log_rand.best_score_)
print(log_rand.best_params_)

0.8083333333333333
{'solver': 'lbfgs', 'max_iter': 200, 'C': 1.0}


### SVC

In [24]:
svc_params={"C":[1.0,2.0,3.0,4.5,5.5,5.0,8.0,8.3,2.2,20.0,15.0],
            "kernel":['linear', 'poly', 'rbf', 'sigmoid'],
            "gamma":['scale','auto']}
svc_rand=RandomizedSearchCV(svm.SVC(),param_distributions=svc_params,n_iter=20,verbose=2,cv=10)
svc_rand.fit(x,y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ......................C=8.3, gamma=auto, kernel=rbf; total time=   0.0s
[CV] END ...................C=4.5, gamma=auto, kernel=linear; total time=   0.0s
[CV] END ...................C=4.5, gamma=auto,

In [25]:
print(svc_rand.best_score_)
print(svc_rand.best_params_)

0.8083333333333332
{'kernel': 'linear', 'gamma': 'auto', 'C': 4.5}


### Random Forest Classifier

In [26]:
ran_params={"n_estimators":[100,200,500,1000,700],
            "criterion":['gini', 'entropy', 'log_loss'],
            "max_depth":[None,1,2,3,5],
            "min_samples_split":[0.5,1.0,2,3,5,3,1,8,10],
            "min_samples_leaf":[1,2,3,5],
            "max_features":["sqrt",'log2',None]}
ran_rand=RandomizedSearchCV(RandomForestClassifier(),param_distributions=ran_params,n_iter=20,verbose=2,cv=10)
ran_rand.fit(x,y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END criterion=entropy, max_depth=None, max_features=None, min_samples_leaf=2, mi

In [27]:
print(ran_rand.best_score_)
print(ran_rand.best_params_)

0.8083333333333332
{'n_estimators': 700, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 2, 'criterion': 'entropy'}


## Saving the Model


In [36]:
x=df.drop("Loan_Status",axis=1)
y=df["Loan_Status"]

In [37]:
rf=RandomForestClassifier(n_estimators= 700,
                          min_samples_split= 3, 
                          min_samples_leaf=1, 
                          max_features= 'log2', 
                          max_depth=2, 
                          criterion = 'entropy')

In [38]:
rf.fit(x,y)

In [39]:
import joblib
joblib.dump(rf,'loan_pred_status')


['loan_pred_status']

In [40]:
model=joblib.load('loan_pred_status')

In [41]:
data=pd.DataFrame({

    'Gender':1,
    'Married':1,
    'Dependents':0,
    'Education':0,
    'Self_Employed':1,
    'ApplicantIncome':2609,
    'CoapplicantIncome':3449,
    'LoanAmount':165,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':2
},index=[0])
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,1,2609,3449,165,180,0,2


In [42]:
result=model.predict(data)

if result=='Y':
  print('loan approved')
else:
  print("loan not approved")
  

loan not approved
