In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

#### Data Collection and Preprocessing

In [2]:
df = pd.read_csv('data/data.csv',index_col='Loan_ID')

In [3]:
df.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(614, 12)

In [5]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


In [7]:
columns = df.columns

#### Pipeline for Preprocessing

In [8]:
column_preprocessing = ColumnTransformer(transformers=[
    ('impute_married',SimpleImputer(strategy='constant',fill_value=0.0),['Married']),
    ('impute_dependents',SimpleImputer(strategy='most_frequent'),['Dependents']),
    ('impute_credit',SimpleImputer(strategy='constant',fill_value=1.0),['Credit_History']),
    ('one-hot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['Property_Area']),
    ('standard_scaler',StandardScaler(),['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term'])
    ],
    remainder='passthrough',
    n_jobs=-1)

In [9]:
class Cat_to_Num(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X.replace({
                    'Married':{'Yes':1.0,"No":0.0},
                    'Gender':{'Male':1,'Female':0},
                    'Self_Employed':{'Yes':1,'No':0},
                    'Education':{'Graduate':1,'Not Graduate':0},
                    'Loan_Status':{'N':0,'Y':1}
                })
    def get_feature_names_out(self, input_features=None):
        return columns

In [10]:
class Dependents(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X.replace(to_replace='3+',value=4)
    def get_feature_names_out(self, input_features=None):
        return columns

In [11]:
preprocessing = Pipeline(steps=[
    ('cat_to_num_values',Cat_to_Num()),
    ('depnedents',Dependents()),
    ('column_processing',column_preprocessing),
])

In [12]:
from sklearn import set_config

set_config(display='diagram')
display(preprocessing)

In [13]:
df2 = preprocessing.fit_transform(df)

  return X.replace({


In [14]:
df3 = pd.DataFrame(df2, columns=preprocessing.get_feature_names_out())

In [15]:
df4 = df3.dropna()

In [16]:
df4

Unnamed: 0,impute_married__Married,impute_dependents__Dependents,impute_credit__Credit_History,one-hot__Property_Area_Rural,one-hot__Property_Area_Semiurban,one-hot__Property_Area_Urban,standard_scaler__ApplicantIncome,standard_scaler__CoapplicantIncome,standard_scaler__LoanAmount,standard_scaler__Loan_Amount_Term,remainder__Gender,remainder__Education,remainder__Self_Employed,remainder__Loan_Status
1,1.0,1,1.0,1.0,0.0,0.0,-0.134412,-0.038732,-0.215309,0.276642,1.0,1.0,0.0,0.0
2,1.0,0,1.0,0.0,0.0,1.0,-0.393747,-0.554487,-0.940328,0.276642,1.0,1.0,1.0,1.0
3,1.0,0,1.0,0.0,0.0,1.0,-0.462062,0.25198,-0.30886,0.276642,1.0,0.0,0.0,1.0
4,0.0,0,1.0,0.0,0.0,1.0,0.097728,-0.554487,-0.063289,0.276642,1.0,1.0,0.0,1.0
5,1.0,2,1.0,0.0,0.0,1.0,0.002218,0.8806,1.410137,0.276642,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,0,1.0,1.0,0.0,0.0,-0.41013,-0.554487,-0.881859,0.276642,0.0,1.0,0.0,1.0
610,1.0,4,1.0,1.0,0.0,0.0,-0.212557,-0.554487,-1.244368,-2.489775,1.0,1.0,0.0,1.0
611,1.0,1,1.0,0.0,0.0,1.0,0.437174,-0.472404,1.246423,0.276642,1.0,1.0,0.0,1.0
612,1.0,2,1.0,0.0,0.0,1.0,0.357064,-0.554487,0.474628,0.276642,1.0,1.0,0.0,1.0


In [17]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 535 entries, 1 to 613
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   impute_married__Married             535 non-null    object
 1   impute_dependents__Dependents       535 non-null    object
 2   impute_credit__Credit_History       535 non-null    object
 3   one-hot__Property_Area_Rural        535 non-null    object
 4   one-hot__Property_Area_Semiurban    535 non-null    object
 5   one-hot__Property_Area_Urban        535 non-null    object
 6   standard_scaler__ApplicantIncome    535 non-null    object
 7   standard_scaler__CoapplicantIncome  535 non-null    object
 8   standard_scaler__LoanAmount         535 non-null    object
 9   standard_scaler__Loan_Amount_Term   535 non-null    object
 10  remainder__Gender                   535 non-null    object
 11  remainder__Education                535 non-null    object
 12 

In [18]:
df4 = df4.astype({
    'impute_married__Married':'int8',
    'impute_dependents__Dependents':'int8',
    'impute_credit__Credit_History':'int8',
    'one-hot__Property_Area_Rural':'int8',
    'one-hot__Property_Area_Semiurban':'int8',
    'one-hot__Property_Area_Urban':'int8',
    'standard_scaler__ApplicantIncome':'float32',
    'standard_scaler__CoapplicantIncome':'float32',
    'standard_scaler__LoanAmount':'float32',
    'standard_scaler__Loan_Amount_Term':'float32',
    'remainder__Gender':'int8',
    'remainder__Education':'int8',
    'remainder__Self_Employed':'int8',
    'remainder__Loan_Status':'int8'
})

In [19]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 535 entries, 1 to 613
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   impute_married__Married             535 non-null    int8   
 1   impute_dependents__Dependents       535 non-null    int8   
 2   impute_credit__Credit_History       535 non-null    int8   
 3   one-hot__Property_Area_Rural        535 non-null    int8   
 4   one-hot__Property_Area_Semiurban    535 non-null    int8   
 5   one-hot__Property_Area_Urban        535 non-null    int8   
 6   standard_scaler__ApplicantIncome    535 non-null    float32
 7   standard_scaler__CoapplicantIncome  535 non-null    float32
 8   standard_scaler__LoanAmount         535 non-null    float32
 9   standard_scaler__Loan_Amount_Term   535 non-null    float32
 10  remainder__Gender                   535 non-null    int8   
 11  remainder__Education                535 non-null  

In [26]:
df4

Unnamed: 0,impute_married__Married,impute_dependents__Dependents,impute_credit__Credit_History,one-hot__Property_Area_Rural,one-hot__Property_Area_Semiurban,one-hot__Property_Area_Urban,standard_scaler__ApplicantIncome,standard_scaler__CoapplicantIncome,standard_scaler__LoanAmount,standard_scaler__Loan_Amount_Term,remainder__Gender,remainder__Education,remainder__Self_Employed,remainder__Loan_Status
1,1,1,1,1,0,0,-0.134412,-0.038732,-0.215309,0.276642,1,1,0,0
2,1,0,1,0,0,1,-0.393747,-0.554487,-0.940328,0.276642,1,1,1,1
3,1,0,1,0,0,1,-0.462062,0.251980,-0.308860,0.276642,1,0,0,1
4,0,0,1,0,0,1,0.097728,-0.554487,-0.063289,0.276642,1,1,0,1
5,1,2,1,0,0,1,0.002218,0.880600,1.410137,0.276642,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,1,1,0,0,-0.410130,-0.554487,-0.881859,0.276642,0,1,0,1
610,1,4,1,1,0,0,-0.212557,-0.554487,-1.244368,-2.489775,1,1,0,1
611,1,1,1,0,0,1,0.437174,-0.472404,1.246423,0.276642,1,1,0,1
612,1,2,1,0,0,1,0.357064,-0.554487,0.474628,0.276642,1,1,0,1


In [20]:
X = df4.drop(columns=['remainder__Loan_Status']).values
y = df4['remainder__Loan_Status'].to_numpy(dtype=np.int8)

#### Splitting the data into Train and Test data

In [21]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=.2,stratify=y,random_state=2)

In [22]:
clf = svm.SVC(kernel='linear')
clf.fit(X_train,y_train)

In [23]:
clf.score(X_test,y_test)

0.822429906542056

In [24]:
clf2 = LogisticRegression()
clf2.fit(X_train,y_train)

In [25]:
clf2.score(X_test,y_test)

0.8130841121495327