In [1]:
# ------------------------------------------------------------------
# Build the Support Vector Classifier Model
# Predict the loan approval status based on 
# Gender, Marital Status, Credit History, Income and Loan Amount
# ------------------------------------------------------------------

# Import Libraries and read csv file
import pandas as pd
import numpy as np
data = pd.read_csv('01Exercise1.csv')

In [2]:
#find out columns with missing values
data.isnull().sum()
data.dtypes

gender      object
married     object
ch         float64
income       int64
loanamt    float64
status      object
dtype: object

In [3]:
# Replace Missing Values. Drop the rows.
data.dtypes
data.gender.fillna(data.gender.mode()[0], inplace = True)
data.married.fillna(data.married.mode()[0], inplace = True)
data.fillna(data.mean(), inplace = True)
data.isnull().sum()


gender     0
married    0
ch         0
income     0
loanamt    0
status     0
dtype: int64

In [4]:
# Drop irrelevant columns based on business sense
data.head()
data.drop('married', axis = 1)

Unnamed: 0,gender,ch,income,loanamt,status
0,Male,1.0,5849,146.412162,Y
1,Male,1.0,4583,128.000000,N
2,Male,1.0,3000,66.000000,Y
3,Male,1.0,2583,120.000000,Y
4,Male,1.0,6000,141.000000,Y
...,...,...,...,...,...
609,Female,1.0,2900,71.000000,Y
610,Male,1.0,4106,40.000000,Y
611,Male,1.0,8072,253.000000,Y
612,Male,1.0,7583,187.000000,Y


In [5]:
# Create Dummy variables
pd.get_dummies(data, drop_first = True)

Unnamed: 0,ch,income,loanamt,gender_Male,married_Yes,status_Y
0,1.0,5849,146.412162,1,0,1
1,1.0,4583,128.000000,1,1,0
2,1.0,3000,66.000000,1,1,1
3,1.0,2583,120.000000,1,1,1
4,1.0,6000,141.000000,1,0,1
...,...,...,...,...,...,...
609,1.0,2900,71.000000,0,0,1
610,1.0,4106,40.000000,1,1,1
611,1.0,8072,253.000000,1,1,1
612,1.0,7583,187.000000,1,1,1


In [6]:
# Normalize the data (Income and Loan Amount) Using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Income = scaler.fit_transform(data[['income']])
LAmount = scaler.fit_transform(data[['loanamt']])

In [7]:
# Create the X (Independent) and Y (Dependent) dataframes
X = pd.get_dummies(data, drop_first = True)
Y = X.iloc[:, -1]
X = X.iloc[:, :-1]

In [8]:
# Split the X and Y dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 11)

In [9]:
# Import and build Support Vector Classifier
from sklearn import svm
svc = svm.SVC(kernel = 'linear')
svc.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [10]:
# Predict the outcome using Test data
y_pred = svc.predict(x_test)

In [14]:
# Build the conufsion matrix and get the accuracy/score
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)
metrics.recall_score(y_test, y_pred)
metrics.precision_score(y_test, y_pred)

#confusion matrix
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.33      0.47        64
           1       0.73      0.97      0.83       121

    accuracy                           0.75       185
   macro avg       0.79      0.65      0.65       185
weighted avg       0.77      0.75      0.71       185

