In [11]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# importing required libraries for Machine learing models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Onboarding the data & create the data frame
df = pd.read_csv(r'E:\Python\AI&ML\SourceData\loan_data.csv')

In [14]:
# Exploratory Data Analysis (EDA)
# To see 1st five records of df
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [15]:
# To see lsst five records of df
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
376,LP002953,Male,Yes,3+,Graduate,No,5703,0.0,128.0,360.0,1.0,Urban,Y
377,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
378,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
379,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
380,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [16]:
# To see the non-null count and data type of each column in df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [17]:
# To see the dimension (r x c) of df
df.shape

(381, 13)

In [18]:
# To see the minimal statistical report of df
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,381.0,381.0,381.0,370.0,351.0
mean,3579.845144,1277.275381,104.986877,340.864865,0.837607
std,1419.813818,2340.818114,28.358464,68.549257,0.369338
min,150.0,0.0,9.0,12.0,0.0
25%,2600.0,0.0,90.0,360.0,1.0
50%,3333.0,983.0,110.0,360.0,1.0
75%,4288.0,2016.0,127.0,360.0,1.0
max,9703.0,33837.0,150.0,480.0,1.0


In [19]:
# To see column names of df
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [20]:
df = df[['Gender', 'Married', 'Dependents', 'ApplicantIncome', 'LoanAmount', 'Loan_Status']]
df # taking required columns to the Analysis based on business sense or droping the not required columns

Unnamed: 0,Gender,Married,Dependents,ApplicantIncome,LoanAmount,Loan_Status
0,Male,Yes,1,4583,128.0,N
1,Male,Yes,0,3000,66.0,Y
2,Male,Yes,0,2583,120.0,Y
3,Male,No,0,6000,141.0,Y
4,Male,Yes,0,2333,95.0,Y
...,...,...,...,...,...,...
376,Male,Yes,3+,5703,128.0,Y
377,Male,Yes,0,3232,108.0,Y
378,Female,No,0,2900,71.0,Y
379,Male,Yes,3+,4106,40.0,Y


In [21]:
# Checking for null values in each column
df.isnull().sum()

Gender             5
Married            0
Dependents         8
ApplicantIncome    0
LoanAmount         0
Loan_Status        0
dtype: int64

In [22]:
# Finding mode for gender column (0 will be for exact mode value)
Gen_Mode = df['Gender'].mode()[0]
Gen_Mode

'Male'

In [23]:
# Replace '+' with an empty string in the 'Dependents' column & converting it to numeric datatype
df['Dependents'] = df['Dependents'].str.replace('+', '', regex=False)
# With regex=False, '+' is treated as a plain character and will be replaced with blank from the column.
df['Dependents'] = pd.to_numeric(df['Dependents'])
#df['Dependents'] = df['Dependents'].astype(int)
df.dtypes

Gender              object
Married             object
Dependents         float64
ApplicantIncome      int64
LoanAmount         float64
Loan_Status         object
dtype: object

In [24]:

# To find Dependents column mean value
Dep_Mean =round(df['Dependents'].mean())
Dep_Mean

1

In [25]:
# Filling gender null values with it's mode (Most frequent occuring value in Gender column) & Dependents column with it's mean value
df['Gender'] = df['Gender'].fillna(Gen_Mode)
df['Dependents'] = df['Dependents'].fillna(Dep_Mean)
df.isnull().sum()

Gender             0
Married            0
Dependents         0
ApplicantIncome    0
LoanAmount         0
Loan_Status        0
dtype: int64

In [26]:
# converting Dependents & LoanAmount to integers
df['Dependents'] = df['Dependents'].astype(int)
df['LoanAmount'] = pd.to_numeric(df['LoanAmount']).astype(int)
df

Unnamed: 0,Gender,Married,Dependents,ApplicantIncome,LoanAmount,Loan_Status
0,Male,Yes,1,4583,128,N
1,Male,Yes,0,3000,66,Y
2,Male,Yes,0,2583,120,Y
3,Male,No,0,6000,141,Y
4,Male,Yes,0,2333,95,Y
...,...,...,...,...,...,...
376,Male,Yes,3,5703,128,Y
377,Male,Yes,0,3232,108,Y
378,Female,No,0,2900,71,Y
379,Male,Yes,3,4106,40,Y


In [27]:
# Reordering the columns
df = df[['Dependents', 'ApplicantIncome', 'LoanAmount', 'Gender', 'Married', 'Loan_Status']]
df

Unnamed: 0,Dependents,ApplicantIncome,LoanAmount,Gender,Married,Loan_Status
0,1,4583,128,Male,Yes,N
1,0,3000,66,Male,Yes,Y
2,0,2583,120,Male,Yes,Y
3,0,6000,141,Male,No,Y
4,0,2333,95,Male,Yes,Y
...,...,...,...,...,...,...
376,3,5703,128,Male,Yes,Y
377,0,3232,108,Male,Yes,Y
378,0,2900,71,Female,No,Y
379,3,4106,40,Male,Yes,Y


In [28]:
# Creating dummy variables # Data encoding
df.dtypes
df=pd.get_dummies(df, drop_first=True, dtype=int)
#df=pd.get_dummies(df, dtype=int) # dtype=int, To avoid booleans True or False
# with drop_first=True, Gender_Female, Married_No, Loan_Status_No will be droped.
df

Unnamed: 0,Dependents,ApplicantIncome,LoanAmount,Gender_Male,Married_Yes,Loan_Status_Y
0,1,4583,128,1,1,0
1,0,3000,66,1,1,1
2,0,2583,120,1,1,1
3,0,6000,141,1,0,1
4,0,2333,95,1,1,1
...,...,...,...,...,...,...
376,3,5703,128,1,1,1
377,0,3232,108,1,1,1
378,0,2900,71,0,0,1
379,3,4106,40,1,1,1


In [29]:
# Data Normalization (ApplicantIncome, LoanAmount) using standardscalar
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['ApplicantIncome', 'LoanAmount']] = scaler.fit_transform(df[['ApplicantIncome', 'LoanAmount']])
df

Unnamed: 0,Dependents,ApplicantIncome,LoanAmount,Gender_Male,Married_Yes,Loan_Status_Y
0,1,0.707469,0.812575,1,1,0
1,0,-0.408932,-1.376596,1,1,1
2,0,-0.703019,0.530102,1,1,1
3,0,1.706799,1.271595,1,0,1
4,0,-0.879330,-0.352629,1,1,1
...,...,...,...,...,...,...
376,3,1.497342,0.812575,1,1,1
377,0,-0.245316,0.106391,1,1,1
378,0,-0.479457,-1.200050,0,0,1
379,3,0.371067,-2.294635,1,1,1


In [30]:
# defining X and Y variables
X = df.drop('Loan_Status_Y', axis=1) # axis=1 tells pandas that 'Loan_Status' is a column, not a row.
Y = df['Loan_Status_Y']

In [31]:
# Split the X & Y datasets into train & test sets
# from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [32]:
#SVM model building
svc = SVC()
svc.fit(X_train, Y_train)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [33]:
# Predict the outcome with Test Data
Y_pred = svc.predict(X_test)

In [34]:
# Build the confusion matrix
# from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
cm

array([[ 0, 22],
       [ 0, 55]])

In [35]:
F1Score = svc.score(X_test, Y_test)
F1Score

# Key Targets:
# ≈ 50%	    No better than random guessing (binary classification)
# 60–70%	Weak / baseline model
# 70–85%	Reasonable / decent
# 85–95%	Very good
# > 95%	    Excellent (check for overfitting or data leakage)
# Note: High accuracy can be misleading if classes are imbalanced.

0.7142857142857143