In [184]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Get the Data
** Read the 'KNN_Project_Data csv file into a dataframe **

In [185]:
df = pd.read_csv("hawaii.csv")


In [186]:
df.columns

Index(['loan_type', 'property_type', 'purpose', 'occupancy', 'amount',
       'status', 'sex', 'income'],
      dtype='object')

**Check the head of the dataframe.**

In [187]:
df.head()

Unnamed: 0,loan_type,property_type,purpose,occupancy,amount,status,sex,income
0,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,396,Loan originated,Female,152.0
1,VA-guaranteed,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,662,Loan originated,Male,111.0
2,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,461,Loan originated,Female,138.0
3,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Not owner-occupied as a principal dwelling,345,Loan originated,Female,184.0
4,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Not owner-occupied as a principal dwelling,210,Application withdrawn by applicant,Female,99.0


In [188]:
df.columns

Index(['loan_type', 'property_type', 'purpose', 'occupancy', 'amount',
       'status', 'sex', 'income'],
      dtype='object')

# EDA

In [189]:
# Remove null values
df = df.dropna()

### Status

In [190]:
# Keep only records that either granted or rejected loans
states = ['originated', 'approved', 'denied']
df = df[df.apply(lambda x: any(s in x.status for s in states), axis=1)]

In [191]:
df.status.value_counts()

Loan originated                                25429
Application denied by financial institution     5321
Application approved but not accepted           1164
Name: status, dtype: int64

In [192]:
# Merged approved and withdrawn in row status
def f(row):
    return 1 if ("approved" in row.status or "originated" in row.status) else 0
df['status'] = df.apply(f, axis=1)


In [193]:
df['status'].unique()

array([1, 0], dtype=int64)

In [194]:
# Making sure the counts added up
df.status.value_counts()

1    26593
0     5321
Name: status, dtype: int64

# Encode Categorical Features

In [195]:
TARGET = 'status'

In [196]:
df.dtypes

loan_type         object
property_type     object
purpose           object
occupancy         object
amount             int64
status             int64
sex               object
income           float64
dtype: object

In [197]:
categorical_columns = [col for col in df.columns if df[col].dtype == object and col != TARGET]
categorical_columns

['loan_type', 'property_type', 'purpose', 'occupancy', 'sex']

In [198]:
df_categorical = df.loc[:, categorical_columns].copy()
df_categorical

Unnamed: 0,loan_type,property_type,purpose,occupancy,sex
0,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Female
1,VA-guaranteed,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Male
2,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Female
3,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Not owner-occupied as a principal dwelling,Female
5,VA-guaranteed,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,Male
...,...,...,...,...,...
44863,VA-guaranteed,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,Male
44864,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Male
44865,FHA-insured,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Male
44866,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Female


In [199]:
df_non_categorical = df.loc[:, [col for col in df.columns if col not in categorical_columns]].copy()
df_non_categorical

Unnamed: 0,amount,status,income
0,396,1,152.0
1,662,1,111.0
2,461,1,138.0
3,345,1,184.0
5,563,0,116.0
...,...,...,...
44863,316,1,79.0
44864,186,0,65.0
44865,239,1,38.0
44866,540,1,125.0


In [200]:
from sklearn.preprocessing import OneHotEncoder

# Create the encoder.
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(df_categorical)

# Apply the encoder.
df_categorical = encoder.transform(df_categorical)
df_categorical

<31914x16 sparse matrix of type '<class 'numpy.float64'>'
	with 159570 stored elements in Compressed Sparse Row format>

In [201]:
df_categorical.shape

(31914, 16)

In [202]:
df_categorical = pd.DataFrame.sparse.from_spmatrix(df_categorical, columns=categorical_columns)

ValueError: Column length mismatch: 5 vs. 16

In [None]:
df_categorical.shape

(31914, 1)

In [None]:
df_non_categorical.shape

(31914, 3)

In [None]:
df = pd.merge(df_categorical, df_non_categorical)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

# Standardize the Variables

Time to standardize the variables.

** Import StandardScaler from Scikit learn.**

In [None]:
# from sklearn.preprocessing import StandardScaler

** Create a StandardScaler() object called scaler.**

In [None]:
# scaler = StandardScaler()

** Fit scaler to the features.**

In [None]:
# scaler.fit(df.drop('status', axis=1))

**Use the .transform() method to transform the features to a scaled version.**

In [None]:
# scaled_features = scaler.transform(df.drop('status', axis=1))


**Convert the scaled features to a dataframe and check the head of this dataframe to make sure the scaling worked.**

In [None]:
# df_features = pd.DataFrame(scaled_features, columns=[col for col in df.columns if col != 'status'])
# df_features.head()


# Train Test Split

**Use train_test_split to split your data into a training set and a testing set.**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('status', axis=1), df['status'], test_size=0.3, random_state=101)
X_train

Unnamed: 0,loan_type,property_type,purpose,occupancy,amount,sex,income
10375,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Not owner-occupied as a principal dwelling,900,Male,188.0
8872,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Not owner-occupied as a principal dwelling,967,Male,402.0
3531,Conventional,One-to-four family dwelling (other than manufa...,Home improvement,Owner-occupied as a principal dwelling,49,Male,101.0
43756,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,165,Male,51.0
31788,VA-guaranteed,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,412,Female,63.0
...,...,...,...,...,...,...,...
7955,VA-guaranteed,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,522,"Information not provided by applicant in mail,...",84.0
11062,FSA/RHS-guaranteed,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,222,Female,96.0
24722,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,360,Female,165.0
25011,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Not owner-occupied as a principal dwelling,525,Male,212.0


In [None]:
X_train.shape

(22339, 2228)

In [None]:
y_train.shape

(22339,)

# Using KNN

**Import KNeighborsClassifier from scikit learn.**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

**Create a KNN model instance with n_neighbors=1**

In [None]:
knn = KNeighborsClassifier(n_neighbors=15)

**Fit this KNN model to the training data.**

In [None]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=15)

# Predictions and Evaluations
Let's evaluate our KNN model!

**Use the predict method to predict values using your KNN model and X_test.**

In [None]:
prediction = knn.predict(X_test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


ValueError: could not convert string to float: 'Conventional'

** Create a confusion matrix and classification report.**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
print(confusion_matrix(y_test, prediction))

[[  48 1519]
 [  87 7921]]


In [None]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.36      0.03      0.06      1567
           1       0.84      0.99      0.91      8008

    accuracy                           0.83      9575
   macro avg       0.60      0.51      0.48      9575
weighted avg       0.76      0.83      0.77      9575



# Choosing a K Value
Let's go ahead and use the elbow method to pick a good K Value!

** Create a for loop that trains various KNN models with different k values, then keep track of the error_rate for each of these models with a list. Refer to the lecture if you are confused on this step.**

In [None]:
# error_rate = []
# for i in range(1, 50):
#     print(i)
#     knn = KNeighborsClassifier(n_neighbors=i)
#     knn.fit(X_train, y_train)
#     prediction = knn.predict(X_test)
#     error_rate.append(np.mean(prediction != y_test))

**Now create the following plot using the information from your for loop.**

In [None]:
# figure = plt.figure(figsize=(7,5))
# plt.plot(np.arange(1,50), error_rate, marker='o')
# plt.title('Error rate vs K Value')
# plt.xlabel('K Value')
# plt.ylabel('Error rate')

## Retrain with new K Value

**Retrain your model with the best K value (up to you to decide what you want) and re-do the classification report and the confusion matrix.**

In [None]:
knn = KNeighborsClassifier(n_neighbors=35)
knn.fit(X_train, y_train)
k_100_prediction = knn.predict(X_test)
print(confusion_matrix(y_test, k_100_prediction))


KeyboardInterrupt: 

In [None]:
print(classification_report(y_test, k_100_prediction))


              precision    recall  f1-score   support

           0       0.26      0.01      0.01      1567
           1       0.84      1.00      0.91      8008

    accuracy                           0.83      9575
   macro avg       0.55      0.50      0.46      9575
weighted avg       0.74      0.83      0.76      9575



In [None]:
knn.score(X_test, y_test)

0.8342558746736293

In [None]:
import pickle
knnPickle = open('model', 'wb')

# source, destination
pickle.dump(knn, knnPickle)

# close the file
knnPickle.close()
