### Importing Necesarry Libraries

In [183]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


### Dataset is loaded using pandas

In [184]:
data = pd.read_csv('loan_approval_dataset.csv')
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


### Removing the leading space character from column names

In [185]:
data.columns = data.columns.str.strip()

### Checking the shape of dataset

In [186]:
data.shape

(4269, 13)

In [187]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
 12  loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


### Checking for missing values in the dataset

In [188]:
data.isnull().sum()

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [189]:
data.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


### Checking distribution of Loan Status

In [190]:
data['loan_status'].value_counts()

loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64

### Label Encoding
#### (Approved - Rejected) --> (1 - 0)
#### (Graduate - Not Graduate) --> (1 - 0)
#### (Yes - No) --> (1 - 0)

In [191]:
data['loan_status'] = data['loan_status'].map({' Approved':1, ' Rejected':0})
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [192]:
# Removing the leading space character from 'Graduate' and 'Not Graduate'

data['education'] = data['education'].astype(str)
data['education'] = data['education'].str.strip()
data['education'] = data['education'].map({'Graduate': 1, 'Not Graduate': 0})
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [193]:
# Removing the leading space character from 'Yes' and 'No'

data['self_employed'] = data['self_employed'].astype(str)
data['self_employed'] = data['self_employed'].str.strip()
data['self_employed'] = data['self_employed'].map({'Yes': 1, 'No': 0})
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,0,0,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,3,1,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,3,1,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,5,0,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [194]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   loan_id                   4269 non-null   int64
 1   no_of_dependents          4269 non-null   int64
 2   education                 4269 non-null   int64
 3   self_employed             4269 non-null   int64
 4   income_annum              4269 non-null   int64
 5   loan_amount               4269 non-null   int64
 6   loan_term                 4269 non-null   int64
 7   cibil_score               4269 non-null   int64
 8   residential_assets_value  4269 non-null   int64
 9   commercial_assets_value   4269 non-null   int64
 10  luxury_assets_value       4269 non-null   int64
 11  bank_asset_value          4269 non-null   int64
 12  loan_status               4269 non-null   int64
dtypes: int64(13)
memory usage: 433.7 KB


### Analyzing Mean Values by Loan Status

In [195]:
data.groupby('loan_status').mean()

Unnamed: 0_level_0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
loan_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2107.033478,2.538128,0.49907,0.50341,5113825.0,14946060.0,11.728456,429.468072,7592498.0,4926720.0,15306940.0,5004960.0
1,2151.984187,2.474774,0.504142,0.503765,5025904.0,15247250.0,10.39759,703.461973,7399812.0,5001355.0,15016600.0,4959526.0


### Defining Independent X and Dependent y Variables

In [196]:
X = data.drop(columns = ['loan_id', 'loan_status'])
y = data['loan_status']

### Splitting the Data into Training and Test Sets

In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Data Standardization

In [198]:
scaler = StandardScaler()

In [199]:
scaler.fit(X_train)

In [200]:
X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

In [201]:
print(X_train)

[[ 1.51250774 -1.00263891 -1.01504731 ...  2.04678575  0.07808278
   1.16041374]
 [-1.43500078 -1.00263891  0.98517575 ...  1.22311091  2.49843196
   0.88201987]
 [-0.84549907  0.99736803 -1.01504731 ... -0.8818359  -1.33923881
  -1.31419838]
 ...
 [ 0.92300603 -1.00263891  0.98517575 ...  1.29175048  1.47359943
   0.13963624]
 [-0.25599737 -1.00263891  0.98517575 ... -0.83607619  0.50327926
   1.4388076 ]
 [ 0.92300603  0.99736803 -1.01504731 ... -0.28695963  1.03750048
  -0.10782497]]


### Hyperparameter Tuning with GridSearchCV for Four Models

In [202]:
param_grids = {
    'Decision Tree': {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    'KNN': {
        'n_neighbors': [3],
        'weights': ['distance'],
        'metric': ['euclidean']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

models = {
    'Decision Tree': DecisionTreeClassifier(random_state = 42),
    'Random Forest': RandomForestClassifier(random_state = 42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(random_state = 42)
}

In [203]:
best_models = {}
results = {}

for name, model in models.items():
    grid_search = GridSearchCV(estimator = model,
                               param_grid = param_grids[name],
                               cv = 5,
                               scoring = 'accuracy',
                               n_jobs = -1)
    grid_search.fit(X_train, y_train)

    # Saving the best model
    best_models[name] = grid_search.best_estimator_

    # Testing performance
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Accuracy score for {name}: {accuracy}')
    print('---------------------')

Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10}
Accuracy score for Decision Tree: 0.9789227166276346
---------------------
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy score for Random Forest: 0.9800936768149883
---------------------
Best parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Accuracy score for KNN: 0.8934426229508197
---------------------
Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy score for SVM: 0.9402810304449649
---------------------


### Training Random Forest Model with Best Parameters

In [204]:
best_params_rf = {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}

rf_best = RandomForestClassifier(**best_params_rf, random_state = 42)
rf_best.fit(X_train, y_train)

# Predict training data
X_train_prediction = rf_best.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

# Predict test data
y_pred_rf = rf_best.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score of testing data: ', rf_accuracy)

Accuracy score of training data :  0.9976573938506589
Accuracy Score of testing data:  0.9402810304449649


### Building a Predictive System

In [205]:
# Loan Status : Rejected (3rd row of dataset)
input_data = [3, 1, 0, 9100000 ,29700000, 20, 506, 7100000, 4500000, 33300000, 12800000]

# Changing input data to a numpy array
input_data_as_array = np.asarray(input_data)

# Reshape the array
reshaped_input_data = input_data_as_array.reshape(1, -1)

# Standartized data
std_data = scaler.transform(reshaped_input_data)

prediction_input = rf_best.predict(std_data)
print(prediction_input)

if prediction_input[0] == 0:
    print('Loan Request is Rejected')
else:
    print('Loan Request is Approved')

[0]
Loan Request is Rejected
