In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as mc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import seaborn as sns

# Data Understanding & Preprocessing

In [5]:
data = pd.read_csv("train_csv.csv")
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [8]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
#sns.pairplot(data , diag_kind=None , kind= "scatter")

In [10]:
data['Gender'].fillna(data['Gender'].mode()[0] , inplace = True)
data['Married'].fillna(data['Married'].mode()[0] , inplace = True)
data['LoanAmount'].interpolate(method= 'linear' , inplace = True)
data['Loan_Amount_Term'].interpolate(method= 'linear' , inplace = True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0] , inplace = True)

In [11]:
data.drop(["Loan_ID"] ,axis =1 ,inplace= True)

In [12]:
data.isnull().sum()

Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            1
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
#sns.pairplot(data,diag_kind=None , kind= "scatter")

In [14]:
data.dropna(inplace= True)
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [15]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Categorical Data


In [16]:
data.Gender.unique()

array(['Male', 'Female'], dtype=object)

In [17]:
data.Married.unique()

array(['Yes', 'No'], dtype=object)

In [18]:
data.Education.unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [19]:
data.Self_Employed.unique()

array(['No', 'Yes'], dtype=object)

In [20]:
data.Property_Area.unique()

array(['Rural', 'Urban', 'Semiurban'], dtype=object)

In [21]:
data.Dependents.unique()

array(['1', '0', '2', '3+'], dtype=object)

In [22]:
data['Dependents'] = data['Dependents'].replace('3+',3)

In [23]:
le=LabelEncoder()

In [24]:
gen =le.fit_transform(data.Gender)
mar =le.fit_transform(data.Married)
edu =le.fit_transform(data.Education)
s_e =data.Self_Employed.map({'No':0 ,'Yes':1})
p_a =le.fit_transform(data.Property_Area)

In [25]:
data.Gender = gen
data.Married = mar
data.Education = edu
data.Self_Employed = s_e
data.Property_Area = p_a

In [26]:
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,2,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.0,360.0,1.0,0,Y
610,1,1,3,0,0,4106,0.0,40.0,180.0,1.0,0,Y
611,1,1,1,0,0,8072,240.0,253.0,360.0,1.0,2,Y
612,1,1,2,0,0,7583,0.0,187.0,360.0,1.0,2,Y


In [27]:
data.isna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 566 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Gender             566 non-null    bool 
 1   Married            566 non-null    bool 
 2   Dependents         566 non-null    bool 
 3   Education          566 non-null    bool 
 4   Self_Employed      566 non-null    bool 
 5   ApplicantIncome    566 non-null    bool 
 6   CoapplicantIncome  566 non-null    bool 
 7   LoanAmount         566 non-null    bool 
 8   Loan_Amount_Term   566 non-null    bool 
 9   Credit_History     566 non-null    bool 
 10  Property_Area      566 non-null    bool 
 11  Loan_Status        566 non-null    bool 
dtypes: bool(12)
memory usage: 11.1 KB


# Naive Bayes

In [28]:
x = data.iloc[ : , : -1]
y = data.iloc[:, -1]

In [29]:
x_train,x_test ,y_train,y_test =train_test_split(x,y ,test_size =.15)

In [30]:
GNB = GaussianNB()

In [31]:
GNB.fit(x_train , y_train)

GaussianNB()

In [32]:
y_GNB_predict = GNB.predict(x_test)

In [33]:
mc.accuracy_score(y_test , y_GNB_predict)

0.8

In [34]:
report = mc.classification_report(y_test, y_GNB_predict, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)

In [35]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
Y,0.8,0.967742,0.875912,62.0
accuracy,0.8,0.8,0.8,0.8
weighted avg,0.8,0.8,0.770095,85.0
macro avg,0.8,0.657784,0.68038,85.0
N,0.8,0.347826,0.484848,23.0


# SVM

In [36]:
from sklearn import svm

In [37]:
classifier = svm.SVC(kernel='linear')

In [38]:
classifier.fit(x_train,y_train)

SVC(kernel='linear')

In [39]:
y_svm_predict = classifier.predict(x_test)

In [40]:
mc.accuracy_score(y_test , y_svm_predict)

0.8

In [41]:
report = mc.classification_report(y_test, y_svm_predict, output_dict=True)
df_classification_report = pd.DataFrame(report).transpose()
df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)

In [42]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
Y,0.8,0.967742,0.875912,62.0
accuracy,0.8,0.8,0.8,0.8
weighted avg,0.8,0.8,0.770095,85.0
macro avg,0.8,0.657784,0.68038,85.0
N,0.8,0.347826,0.484848,23.0
