# Loan Default Prediction - Modelling

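In this notebook, we'll prepare the processed loan data for modelling (target encoding, train/test split, feature scaling and one-hot encoding) and compare different algorithms.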

At the end of this notebook we will

# Import Libraries

In [1]:
# Standard library
import warnings

# Data-analysis (EDA) and plotting libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

# Render floats in pandas objects with exactly 3 digits after the decimal point
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Default look of all plots: seaborn "ticks" style with the "Paired" palette
sb.set(style='ticks')
sb.set_palette('Paired')
# Hide the top and right axes borders
plt.rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Silence every warning category for the rest of the session
warnings.simplefilter(action="ignore", category=Warning)

# Load Data

## Mounting Google Drive

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/My Drive/Colab Notebooks/Capstone-Project-Loan-Default-Prediction

/content/drive/My Drive/Colab Notebooks/Capstone-Project-Loan-Default-Prediction


In [33]:
# Directory holding the processed datasets (relative to the current working directory)
data_path = './Data/Processed/'

# Read the processed HMEQ CSV file from that directory
data = pd.read_csv(data_path + 'hmeq_no_debtinc_no_null_no_outliers.csv')

# Work on a copy so the freshly loaded `data` frame stays untouched
df = data.copy()
# Preview the first five rows
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.367,1.0,9.0
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833,0.0,14.0
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.467,1.0,10.0
3,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333,0.0,14.0
4,1,1700,30548.0,40320.0,HomeImp,Other,9.0,0.0,0.0,101.466,1.0,8.0


# Transform Data

In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [34]:
# Split the column names by dtype: numeric columns vs. object (string) columns
num_features = [
    name for name in df.columns
    if pd.api.types.is_numeric_dtype(df[name])
]
cat_features = [
    name for name in df.columns
    if pd.api.types.is_object_dtype(df[name])
]

print(f'Categorical columns: {len(cat_features)}', cat_features)
print(f'Numeric columns: {len(num_features)}', num_features)

Categorical columns: 2 ['REASON', 'JOB']
Numeric columns: 10 ['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO']


In [36]:
# BAD is the prediction target, not a feature, so exclude it from the numeric
# feature list. Filtering (instead of list.remove) keeps the cell idempotent:
# re-running it does not raise ValueError once 'BAD' is already gone.
num_features = [name for name in num_features if name != 'BAD']

In [37]:
# Encode the target column: LabelEncoder maps the distinct values of BAD to
# consecutive integers starting at 0; the result is stored in a new column
lbl_encoder = LabelEncoder()
df['BAD Encoded'] = lbl_encoder.fit_transform(df['BAD'])
# Class counts of the encoded target (shows how imbalanced the classes are)
df['BAD Encoded'].value_counts()

BAD Encoded
0    3386
1     836
Name: count, dtype: int64

In [38]:
# Feature matrix: every column except the raw and the encoded target
x = df.drop(['BAD', 'BAD Encoded'], axis=1)
# Target vector
y = df['BAD Encoded']

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio equal
# in both parts; the fixed random_state makes the split (and every downstream
# result) identical on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

In [39]:
# One transformer per column type: standardise numeric columns and one-hot
# encode categorical ones (dropping the first level of each category)
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Route each group of columns to its transformer
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])

# Fit on the training data only and transform it in the same step
x_train_transformed = preprocessor.fit_transform(x_train)
# Re-use the already fitted transformers on the test data
x_test_transformed = preprocessor.transform(x_test)

x_train_transformed.shape

(3377, 15)

# Comparing Different Algorithms
In this section, we'll compare non-linear and ensemble algorithms using k-fold cross-validation.

❗️Prioritizing the