# Classification Project

In [3]:
import pandas as pd
import numpy as np

In [4]:
#import data and ensure it loaded properly
loan_df = pd.read_csv("Loan_Train.csv")
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
#create coded column for Loan_status
loan_df['Loan_Status_Numeric'] = loan_df['Loan_Status'].replace(to_replace = ['Y', 'N'], value=[1,0])
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Status_Numeric
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1


In [6]:
#check shape of dataframe for comparison
loan_df.shape

(614, 14)

In [7]:
#find count of na values
loan_df.isna().sum()

Loan_ID                 0
Gender                 13
Married                 3
Dependents             15
Education               0
Self_Employed          32
ApplicantIncome         0
CoapplicantIncome       0
LoanAmount             22
Loan_Amount_Term       14
Credit_History         50
Property_Area           0
Loan_Status             0
Loan_Status_Numeric     0
dtype: int64

In [8]:
#drop Loan_ID column
loan_df.drop('Loan_ID', axis=1, inplace = True)

In [9]:
#drop any rows with missing values
loan_df = loan_df.dropna()

In [10]:
#confirm rows were dropped
loan_df.shape

(480, 13)

In [11]:
#check dtypes for columns
loan_df.dtypes

Gender                  object
Married                 object
Dependents              object
Education               object
Self_Employed           object
ApplicantIncome          int64
CoapplicantIncome      float64
LoanAmount             float64
Loan_Amount_Term       float64
Credit_History         float64
Property_Area           object
Loan_Status             object
Loan_Status_Numeric      int64
dtype: object

In [12]:
#find categorical columns
cat_cols = loan_df.select_dtypes('object').columns
cat_cols = list(set(cat_cols))
cat_cols

['Property_Area',
 'Gender',
 'Loan_Status',
 'Self_Employed',
 'Married',
 'Dependents',
 'Education']

In [13]:
#get dummy variables for categorical columns
loan_df = pd.get_dummies(loan_df, columns = cat_cols)

In [14]:
#confirm dataframe is cleaned
print(loan_df.shape)
print(loan_df.isna().sum())
loan_df.head()

(480, 23)
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status_Numeric        0
Property_Area_Rural        0
Property_Area_Semiurban    0
Property_Area_Urban        0
Gender_Female              0
Gender_Male                0
Loan_Status_N              0
Loan_Status_Y              0
Self_Employed_No           0
Self_Employed_Yes          0
Married_No                 0
Married_Yes                0
Dependents_0               0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Graduate         0
Education_Not Graduate     0
dtype: int64


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status_Numeric,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Gender_Female,...,Self_Employed_No,Self_Employed_Yes,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,...,1,0,0,1,0,1,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,1,0,0,1,0,...,0,1,0,1,1,0,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,1,0,0,1,0,...,1,0,0,1,1,0,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,1,0,...,1,0,1,0,1,0,0,0,1,0
5,5417,4196.0,267.0,360.0,1.0,1,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0


In [15]:
#split data into training and test set with 80/20 split and Loan_status as target
from sklearn.model_selection import train_test_split
X = loan_df.drop(['Loan_Status_Numeric', 'Loan_Status_N', 'Loan_Status_Y'], axis=1)
y = loan_df['Loan_Status_Numeric']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [16]:
#import needed libraries
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [17]:
#create scaler
standardizer = StandardScaler()

In [18]:
#standardize the features
X_train_stand = standardizer.fit_transform(X_train)
X_test_stand = standardizer.transform(X_test)

In [19]:
#create KNN classifier
knn = KNeighborsClassifier()

In [20]:
#create pipeline 
pipe = Pipeline([('scaler', standardizer), ('knn', knn)])

In [21]:
#fitting the pipeline and getting accuracy score
from sklearn.metrics import accuracy_score
pipe.fit(X_train_stand, y_train)
pipe_pred = pipe.predict(X_test_stand)
accuracy_score(y_test, pipe_pred)

0.6770833333333334

In [22]:
#create search space with parameters 1-10
search_space = [{'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10]}]

In [23]:
#fit a grid search with pipe, search space and 5 fold CV
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(X_train_stand, y_train)

In [24]:
#find best value for n_neighbors
classifier.best_estimator_.get_params()['knn__n_neighbors']

9

In [25]:
#find accuracy for grid search on test set
classifier_pred = classifier.predict(X_test_stand)
accuracy_score(y_test, classifier_pred)

0.75

In [26]:
#create new pipeline and search space including Logistic regression and Random Forest Model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import liblinear

pipe2 = Pipeline([('scaler', standardizer), ('classifier', RandomForestClassifier())])

search_space2 = [{'classifier': [KNeighborsClassifier()], 'classifier__n_neighbors': [1,2,3,4,5,6,7,8,9,10]},
                 {'classifier': [LogisticRegression(max_iter=1000, solver='saga')], "classifier__penalty": ['l2'], 'classifier__C': np.logspace(0,4,10)},
                {'classifier': [RandomForestClassifier()], 'classifier__n_estimators': [10,100,1000], 'classifier__max_features': [1,2,3]}]

In [27]:
#create a grid search
classifier2 = GridSearchCV(pipe2, search_space2, cv=5, verbose=0).fit(X_train_stand, y_train)

In [28]:
#get best model and parameters
classifier2.best_estimator_.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('classifier', LogisticRegression(max_iter=1000, solver='saga'))],
 'verbose': False,
 'scaler': StandardScaler(),
 'classifier': LogisticRegression(max_iter=1000, solver='saga'),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'classifier__C': 1.0,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 1000,
 'classifier__multi_class': 'auto',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': None,
 'classifier__solver': 'saga',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [29]:
#predict and get accuracy score for best model
classifer2_pred = classifier2.predict(X_test_stand)
accuracy_score(y_test, classifer2_pred)

0.8333333333333334

## Summarize your results.

The accuracy of the model improved roughly from 67% to 75% after hyperparameter tuning the KNN classifier. It increased even further to 83% after expanding the grid search into other models with LogisticRegression being the best model for this dataset.