# Author: Makayla McKibben
## Course: DSC550 Data Mining
## Exercise 9.2
## Date: 10.29.2024

In [2]:
# Import relevant libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Import dataset
loans = pd.read_csv('Loan_Train.csv')

In [4]:
# Check that it loaded correctly
loans.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Check that it loaded correctly
loans.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
# Drop Load_ID column
loans.drop('Loan_ID', inplace = True, axis = 1)
loans

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [7]:
# Make the df of categorical columns
to_dummy = loans.select_dtypes(include = 'object')
to_dummy

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,Urban,Y
1,Male,Yes,1,Graduate,No,Rural,N
2,Male,Yes,0,Graduate,Yes,Urban,Y
3,Male,Yes,0,Not Graduate,No,Urban,Y
4,Male,No,0,Graduate,No,Urban,Y
...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,Rural,Y
610,Male,Yes,3+,Graduate,No,Rural,Y
611,Male,Yes,1,Graduate,No,Urban,Y
612,Male,Yes,2,Graduate,No,Urban,Y


In [8]:
# Make list of categorical columns
colnames = to_dummy.columns.tolist()

In [9]:
# Drop categorical columns
for item in colnames:
    loans.drop(item, inplace = True, axis = 1)

In [10]:
# Get and add the dummies to the df then take a look
dummy = pd.get_dummies(to_dummy, drop_first = True)
loans = pd.concat([loans, dummy], axis = 1)
loans.head(18)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,,360.0,1.0,True,False,False,False,False,False,False,False,True,True
1,4583,1508.0,128.0,360.0,1.0,True,True,True,False,False,False,False,False,False,False
2,3000,0.0,66.0,360.0,1.0,True,True,False,False,False,False,True,False,True,True
3,2583,2358.0,120.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True
4,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,False,False,False,True,True
5,5417,4196.0,267.0,360.0,1.0,True,True,False,True,False,False,True,False,True,True
6,2333,1516.0,95.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True
7,3036,2504.0,158.0,360.0,0.0,True,True,False,False,True,False,False,True,False,False
8,4006,1526.0,168.0,360.0,1.0,True,True,False,True,False,False,False,False,True,True
9,12841,10968.0,349.0,360.0,1.0,True,True,True,False,False,False,False,True,False,False


In [11]:
# Drop ANY rows with missing data
loans.dropna(how = 'any', inplace = True)

In [12]:
# Set features and target variables
target = loans['Loan_Status_Y']
loans.drop('Loan_Status_Y', axis = 1, inplace = True)
features = loans

In [13]:
# Split into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 18)

In [14]:
# Create min max scaler object
minmax_scaler = preprocessing.MinMaxScaler()

In [15]:
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5).fit(features_train, target_train)

In [16]:
# Create pipeline
pipe = Pipeline([('minmax_scaler', minmax_scaler), ('knn', knn)])

In [17]:
# Fits KNN model to the training data
model1 = pipe.fit(features_train, target_train)
model1

In [18]:
# Makes predictions with the model
preds1 = model1.predict(features_test)
preds1

array([ True,  True, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [19]:
# Find vector that will tell us about correct number of guesses
acc1 = target_test * 1 + preds1 * 1

In [20]:
# Count number of correct responses
zeros1 = acc1[(acc1 == 0)].count()
twos1 = acc1[(acc1 == 2)].count()

In [21]:
# Print
print('The accuracy is: ', round(100 * (zeros1 + twos1) / len(acc1), 2))

The accuracy is:  79.25


In [22]:
# Create potential values of k
pot_k = [{'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

In [23]:
# Create gridsearch object
classifier = GridSearchCV(pipe, pot_k, cv = 5, verbose = 0).fit(features_train, target_train)

In [24]:
# View object
classifier

In [25]:
# Make predictions with model
preds2 = classifier.predict(features_test)

In [26]:
# Find the vector that will tell us about accuray
acc2 = target_test * 1 + preds2 * 1

In [27]:
# Find the number of correct predictions
zeros2 = acc2[(acc2 == 0)].count()
twos2 = acc2[(acc2 == 2)].count()

In [28]:
# Print
print('The tuned accuracy is: ', round(100 * (zeros2 + twos2) / len(acc2), 2))

The tuned accuracy is:  78.3


In [29]:
# Make knn classifier object
knn = KNeighborsClassifier()
# use 'liblinear' to prevent hyperparameter warnings 
log_reg = LogisticRegression(solver = 'liblinear') 
# Create random forest object
random_forest = RandomForestClassifier()

In [30]:
# Create pipeline
pipe = Pipeline([('minmax_scaler', minmax_scaler),
                    ('classifier', knn)])

In [31]:
# Create search space including logistic regression and trees
search_space = [{'classifier': [knn],
                 'classifier__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                {'classifier': [log_reg],
               'classifier__penalty': ['l1', 'l2'],
               'classifier__C': np.logspace(0,4,10)},
               {'classifier': [random_forest],
               'classifier__n_estimators': [10, 100, 1000],
               'classifier__max_features': [3, 5, 10, 20]},
               {'classifier__n_neighbors': list(range(1,11,1)),
                'classifier__weights': ['uniform', 'distance']}]

In [32]:
# Creates the GridSearchCV object
compared_classifiers = GridSearchCV(pipe, search_space, cv = 5, verbose = 0)

In [33]:
# Fits the  different models to the training set
best_model = compared_classifiers.fit(features_train, target_train)

In [34]:
# Find the best model type
best_model.best_estimator_

The logistic regression model produces the best outcome in this instance

In [36]:
# Finds the best hyperparameters
best_params = best_model.best_params_

In [37]:
# Displays the best hyperparameters
best_params

{'classifier': LogisticRegression(solver='liblinear'),
 'classifier__C': 1.0,
 'classifier__penalty': 'l1'}

In [38]:
# Fit the logistic regression to the feature matrix
log_reg.fit(features_train, target_train)

In [39]:
# Predict the results of the test data
pred_log_reg = log_reg.predict(features_test)
pred_log_reg

array([ True,  True, False, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True, False, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [40]:
# Create vector to find out how accurate the model is
counts = pred_log_reg * 1 + target_test * 1
counts

310    2
272    2
548    0
48     0
325    0
      ..
225    1
76     1
282    2
403    2
49     2
Name: Loan_Status_Y, Length: 106, dtype: int32

In [41]:
# Count number of correct predictions
zero = counts[counts == 0].count()
two = counts[counts == 2].count()

In [85]:
# Calculate and display accuracy
acc = (zero + two) / len(counts) * 100
print('Accuracy:', round(acc, 2), '%')

Accuracy: 82.08 %
