# Data Science for Business - Predicting Credit Card Default with KNN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Workspace configuration
# Seed NumPy's global random number generator so random draws repeat across runs
np.random.seed(42)
# Render floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

## Load Data
Importing the dataset from a CSV file.

In [3]:
# Read the default dataset straight from the course repository (URL pinned to a specific commit)
df = pd.read_csv(
    'https://raw.githubusercontent.com/olivermueller/ds4b-2024/96d117a1f864c0a2701580f784645b1e409fb7b0/Session_01/default.csv'
)

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

Removing the `student` column, which is not used in this analysis.

In [5]:
# Drop the `student` column. errors='ignore' keeps this cell safe to re-run:
# without it, a second execution raises KeyError because the column is already gone.
df = df.drop(columns='student', errors='ignore')

## Summary Statistics
Generating summary statistics and a cross-tabulation of the `default` variable.

In [None]:
# Summary statistics of the numeric columns.
# Left as the cell's last expression so the notebook renders it as a rich table
# rather than plain printed text.
df.describe()

In [None]:
# Frequency table: how many observations fall into each `default` category
pd.crosstab(index=df['default'], columns='default')


## Visualizations
Use seaborn to visually explore the dataset.

In [None]:
# Scatterplot of income (y) against balance (x); points are colored by default status
sns.scatterplot(data=df, x='balance', y='income', hue='default', alpha=0.5).set(
    xlabel='Balance',
    ylabel='Income',
)
plt.show()

## Machine Learning - KNN
Training a K-Nearest Neighbors model to predict credit card default.

In [9]:
# Features (the two predictors) and outcome (the default label)
X = df.loc[:, ['balance', 'income']]
y = df.loc[:, 'default']

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardize the features.
# The scaler is fitted on the training data only; the test data is transformed
# with those same fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Build a K-Nearest Neighbors classifier that uses 5 neighbors, then fit it on the training set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [None]:
# Show the class labels stored on the fitted classifier.
# NOTE(review): presumably the order shown here is the column order of
# predict_proba's output — confirm against the scikit-learn documentation.
knn.classes_

In [13]:
# Making predictions: predicted probability of the 'Yes' (default) class for each test row.
# The column is looked up by label instead of assuming it sits at index 1, so a
# different label ordering cannot silently select the wrong class; a missing
# 'Yes' label fails loudly with ValueError.
yes_idx = list(knn.classes_).index('Yes')
y_prob = knn.predict_proba(X_test)[:, yes_idx]

In [None]:
# Peek at the first few predicted probabilities instead of displaying the whole array
y_prob[:10]

In [None]:
# Plot the distribution of predicted probabilities for default.
# Axis labels and a title let the figure stand on its own; plt.show() renders
# the plot without echoing the Axes object as cell output.
sns.histplot(y_prob, kde=True)
plt.xlabel('Predicted probability of default')
plt.ylabel('Count')
plt.title('Distribution of Predicted Default Probabilities')
plt.show()

In [16]:
# Convert predicted probabilities into class labels:
# strictly above the threshold -> 'Yes', otherwise -> 'No'
decision_threshold = 0.5
y_pred = np.where(np.greater(y_prob, decision_threshold), 'Yes', 'No')

In [None]:
# Plot a confusion matrix.
# The class order is fixed explicitly and reused for the tick labels, so rows and
# columns are shown as 'No'/'Yes' rather than as anonymous 0/1 positions.
class_labels = ['No', 'Yes']
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Text summary of classification metrics, comparing the test labels with the predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

## Your Turn!

Experiment with the above code and:

1.  Change `n_neighbors` and observe how the accuracy of the classifier changes.

2.  Change `decision_threshold` and observe how the accuracy of the classifier changes.