In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load data

In [2]:
df = pd.read_csv('data/train_u6lujuX_CVtuZ9i (1).csv')
df = df.dropna()
df.drop(['Loan_ID'],axis=1, inplace=True)
le = LabelEncoder()
features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Status']

for feature in features:
    df[feature] = le.fit_transform(df[feature])
df = pd.get_dummies(df,prefix=['Property_Area'])
columns = list(df.columns)
columns.remove('Loan_Status')
columns.append('Loan_Status')
df = df[columns]
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,1,0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,0,0,1,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,0,0,1,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,0,0,1,1
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,0,0,1,1


In [3]:
print('shape: ', df.shape)

shape:  (480, 14)


# Split data into train,test

In [4]:
X = df.iloc[:,0:13].values
Y = df.iloc[:,13].values

In [5]:
X

array([[1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 0., 1.],
       [1., 1., 0., ..., 0., 0., 1.],
       ...,
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 1., 2., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [6]:
Y

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,

In [7]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=11)

In [8]:
X_train

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 1., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [1., 1., 2., ..., 0., 0., 1.],
       [1., 1., 2., ..., 0., 0., 1.],
       [1., 1., 2., ..., 1., 0., 0.]])

In [15]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = StratifiedKFold(n_splits=10, random_state=1)
	cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
	results.append(cv_results)
	names.append(name)
	print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

LR: 0.815753 (0.053320)
LDA: 0.821819 (0.052100)
KNN: 0.669998 (0.041725)
CART: 0.750596 (0.088315)
NB: 0.809782 (0.054184)
SVM: 0.711269 (0.013283)




# Predict

In [24]:
# Make predictions on validation dataset
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)



In [25]:
# Evaluate predictions
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))

0.7777777777777778
[[20 29]
 [ 3 92]]
              precision    recall  f1-score   support

           0       0.87      0.41      0.56        49
           1       0.76      0.97      0.85        95

    accuracy                           0.78       144
   macro avg       0.81      0.69      0.70       144
weighted avg       0.80      0.78      0.75       144

