#### Make Necessary Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

#### Data Inspection and Cleaning

In [2]:
data= pd.read_csv('loan_data.csv')

In [4]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [5]:
data.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [6]:
data.dtypes

Unnamed: 0,0
Loan_ID,object
Gender,object
Married,object
Dependents,object
Education,object
Self_Employed,object
ApplicantIncome,int64
CoapplicantIncome,float64
LoanAmount,float64
Loan_Amount_Term,float64


In [7]:
data.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,5
Married,0
Dependents,8
Education,0
Self_Employed,21
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,11


In [16]:
#Replacing Null Values
def fill_missing():
    # handling all missing values.
    for col in data.columns:
        # missing values in numerical columns are replaced with mean
        if data[col].dtype == float:
            mean=data[col].mean()
            data[col]= data[col].fillna(mean)
        # missing values in discrete columns are replaced with median
        if data[col].dtype == int:
            median=data[col].median()
            data[col]= data[col].fillna(median)
        # missing values in categorical columns are replaced with mode
        if data[col].dtype == object:
            mode=data[col].mode().iloc[0]
            data[col]= data[col].fillna(mode)
    return data
data=fill_missing()

In [17]:
data.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [18]:
# Remove duplicates
data.drop_duplicates(inplace=True)

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             381 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         381 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      381 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   381 non-null    float64
 10  Credit_History     381 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [20]:
df = data.drop('Loan_ID', axis =1)

In [21]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [22]:
#Removing Unneccesary Features
df = df.dropna(subset = ['Gender', 'Dependents', 'Loan_Amount_Term'])

In [23]:
df.shape

(381, 12)

In [24]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [25]:
df['Dependents'].unique()
df['Dependents'].replace('3+', '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace=True)


In [26]:
df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [27]:
encoding = {
    'Gender': {'Male':1 , 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0':0, '1':1, '2': 2, '4': 4},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status': {'Y': 1, 'N': 0}
}

In [28]:
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             381 non-null    int64  
 1   Married            381 non-null    int64  
 2   Dependents         381 non-null    int64  
 3   Education          381 non-null    int64  
 4   Self_Employed      381 non-null    int64  
 5   ApplicantIncome    381 non-null    int64  
 6   CoapplicantIncome  381 non-null    float64
 7   LoanAmount         381 non-null    float64
 8   Loan_Amount_Term   381 non-null    float64
 9   Credit_History     381 non-null    float64
 10  Property_Area      381 non-null    int64  
 11  Loan_Status        381 non-null    int64  
dtypes: float64(4), int64(8)
memory usage: 35.8 KB


In [31]:
X= df.drop('Loan_Status', axis=1)
y= df['Loan_Status']

In [32]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1,1
2,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
3,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1,1
4,1,1,0,0,0,2333,1516.0,95.0,360.0,1.0,1,1


In [33]:
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [34]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,0.707469,0.098695,0.812575,0.283647,1.0,0
1,1,1,0,1,1,-0.408932,-0.546371,-1.376596,0.283647,1.0,1
2,1,1,0,0,0,-0.703019,0.462294,0.530102,0.283647,1.0,1
3,1,0,0,1,0,1.706799,-0.546371,1.271595,0.283647,1.0,1
4,1,1,0,0,0,-0.87933,0.102118,-0.352629,0.283647,1.0,1


In [35]:
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
def evaluate_model(model):
    X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.2, random_state = 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__} - Accuarcy : {accuracy: .2f} , Cross-Val-Score: {avg_cross_val: .2f}")
    return avg_cross_val

In [36]:
models= {
    LogisticRegression(): {},
    DecisionTreeClassifier(): {},
    RandomForestClassifier(): {},
    GradientBoostingClassifier(): {},
    svm.SVC(): {}
}

In [37]:
model_score= {model.__class__.__name__: evaluate_model(model) for model in models}

LogisticRegression - Accuarcy :  0.82 , Cross-Val-Score:  0.84
DecisionTreeClassifier - Accuarcy :  0.73 , Cross-Val-Score:  0.77
RandomForestClassifier - Accuarcy :  0.81 , Cross-Val-Score:  0.83
GradientBoostingClassifier - Accuarcy :  0.82 , Cross-Val-Score:  0.82
SVC - Accuarcy :  0.79 , Cross-Val-Score:  0.84


In [41]:
def tune_model(model, param_grid):
  tuner= RandomizedSearchCV(model, param_grid, cv=5, n_iter=20, verbose=True,random_state=42)
  tuner.fit(X,y)
  print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
  print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
  return tuner.best_estimator_

In [50]:
log_reg_grid= {'C': np.logspace(-4,4,20), "solver": ["liblinear"]}
svc_grid= {'C': [0.25, 0.50,0.75,1], "kernel": ['linear']}
rf_grid={
    'n_estimators':np.arange(10,1000,10),
    'max_features': ['log2', 'sqrt'],
    'min_samples_split': [2,5,20,50,100],
    'max_samples':[1,2,5,10]
}

In [47]:
best_log_req= tune_model(LogisticRegression(), log_reg_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression: 0.85
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': np.float64(0.615848211066026)}


In [48]:
best_svc_req= tune_model(svm.SVC(),svc_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score for SVC: 0.85
Best Parameter for SVC: {'kernel': 'linear', 'C': 0.25}


In [51]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier: 0.74
Best Parameter for RandomForestClassifier: {'n_estimators': np.int64(430), 'min_samples_split': 2, 'max_samples': 10, 'max_features': 'sqrt'}


In [52]:
final_model =best_rf

In [54]:
#Save Model
joblib.dump(final_model, 'loan_status_predictor.pkl')

['loan_status_predictor.pkl']

In [55]:
# Prediction System

sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [0],
    'Self_Employed': [0],
    'ApplicantIncome': [1000],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [150],
    'Loan_Amount_Term': [180],
    'Credit_History': [0],
    'Property_Area': [1]
})

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('loan_status_predictor.pkl')
prediction = loaded_model.predict(sample_data)

result = "Loan Approved" if prediction[0] == 1 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")


Prediction Result: Loan Not Approved


In [56]:
joblib.dump(scaler, 'vector.pkl')

['vector.pkl']