In [1]:
%matplotlib inline

In [2]:
import sys
import os
sys.path.append(os.path.abspath('.'))

In [5]:
from pages.A_Explore_Preprocess_Dataset import load_dataset
filepath = "Loan_default.csv"
df = load_dataset(filepath)
df.head()



Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [6]:
import numpy as np
# print basic statistics about dataset - number of features, distribution of target variable
print(f"Number of rows: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

print(f"Number of numeric features: {len(numeric_cols)}")
print(f"Number of categorical features: {len(categorical_cols)}")

num_rows_with_missing = df.isnull().any(axis=1).sum()
print(f"% of rows with missing values: {num_rows_with_missing/df.shape[0] * 100:.2f}%")

print("Counts of 0 and 1:", df['Default'].value_counts())

Number of rows: 255347
Number of features: 18
Number of numeric features: 10
Number of categorical features: 8
% of rows with missing values: 0.00%
Counts of 0 and 1: Default
0    225694
1     29653
Name: count, dtype: int64


In [7]:
# for each categorical column, print how many unique values it has

print(len(categorical_cols))
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts())
    print()

8
LoanID: 255347 unique values
LoanID
I38PQUQS96    1
WGB0GD3150    1
XATSFC5YHN    1
CQXDW5VBAG    1
6AIVUNAJG8    1
             ..
ZBK3GDL2LI    1
SP7XY2LPYA    1
Q2DO8ENMV1    1
5EXD8N4MT4    1
ZTH91CGL0B    1
Name: count, Length: 255347, dtype: int64

Education: 4 unique values
Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

EmploymentType: 4 unique values
EmploymentType
Part-time        64161
Unemployed       63824
Self-employed    63706
Full-time        63656
Name: count, dtype: int64

MaritalStatus: 3 unique values
MaritalStatus
Married     85302
Divorced    85033
Single      85012
Name: count, dtype: int64

HasMortgage: 2 unique values
HasMortgage
Yes    127677
No     127670
Name: count, dtype: int64

HasDependents: 2 unique values
HasDependents
Yes    127742
No     127605
Name: count, dtype: int64

LoanPurpose: 5 unique values
LoanPurpose
Business     51298
Home         51286
Education    51005
Other    

In [None]:
# balance classes (0 is overbalanced)
from sklearn.utils import resample
import pandas as pd

df_majority = df[df['Default'] == 0]
df_minority = df[df['Default'] == 1]

df_majority_downsampled = resample(
    df_majority,
    replace=False,                      # no bootstrapping
    n_samples=len(df_minority),         # match minority count
    random_state=42
)

df_balanced = pd.concat([df_majority_downsampled, df_minority])
df_balanced = df_balanced.sample(frac=1, random_state=42)  # shuffle

print("Class distribution after undersampling:")
print(df_balanced['Default'].value_counts())

Class distribution after undersampling:
Default
0    29653
1    29653
Name: count, dtype: int64


In [12]:
df = df.drop(columns="LoanID")

In [13]:
from pages.A_Explore_Preprocess_Dataset import one_hot_encode_feature 
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df = one_hot_encode_feature(df, categorical_cols)



In [14]:
print(df.columns)

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Default',
       'Education_Bachelor's', 'Education_High School', 'Education_Master's',
       'Education_PhD', 'EmploymentType_Full-time', 'EmploymentType_Part-time',
       'EmploymentType_Self-employed', 'EmploymentType_Unemployed',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'HasMortgage_No', 'HasMortgage_Yes',
       'HasDependents_No', 'HasDependents_Yes', 'LoanPurpose_Auto',
       'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Home',
       'LoanPurpose_Other', 'HasCoSigner_No', 'HasCoSigner_Yes'],
      dtype='object')


In [15]:
features = [col for col in df.columns if col != 'Default']
X = df[features]
y = df['Default']

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y.values, test_size=0.3, random_state=42
)

In [17]:
from pages.C_Test_Model import compute_accuracy, compute_precison_recall

def test_model(model, X_train, y_train, X_val, y_val):
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)

    train_acc = compute_accuracy(y_pred_train, y_train)
    train_precision, train_recall = compute_precison_recall(y_pred_train, y_train)

    val_acc = compute_accuracy(y_pred_val, y_val)
    val_precision, val_recall = compute_precison_recall(y_pred_val, y_val)

    print(f"Train accuracy: {train_acc * 100:.2f}%, Train precision: {train_precision * 100:.2f}%, Train recall: {train_recall * 100:.2f}%")
    print(f"Validation accuracy: {val_acc * 100:.2f}%, Validation precision: {val_precision * 100:.2f}%, Validation recall: {val_recall * 100:.2f}%")



In [None]:
from pages.B_Train_Model import LogisticRegression

log_model = LogisticRegression(learning_rate=0.001, num_iterations=500) # TODO: try diff learning rate, iterations
log_model.fit(X_train, y_train)

test_model(log_model, X_train, y_train, X_val, y_val)s

  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.exp(-z))
  logexp = np.log(1. + np.exp(-scores))
  y_pred = 1 / (1 + np.e

Train accuracy: 0.00%, Train precision: 0.00%, Train recall: 0.00%
Validation accuracy: 0.00%, Validation precision: 0.00%, Validation recall: 0.00%


In [19]:
from pages.B_Train_Model import NaiveBayes

bayes_model = NaiveBayes(classes=[0, 1])
bayes_model.fit(X_train, y_train)

test_model(bayes_model, X_train, y_train, X_val, y_val)

  return reduction(axis=axis, out=out, **passkwargs)


Train accuracy: 88.40%, Train precision: 0.00%, Train recall: 0.00%
Validation accuracy: 88.35%, Validation precision: 0.00%, Validation recall: 0.00%


In [20]:
from pages.B_Train_Model import SVM

svm_model = SVM() # TODO: try different hyperparameters
svm_model.fit(X_train, y_train)

test_model(svm_model, X_train, y_train, X_val, y_val)



Train accuracy: 11.60%, Train precision: 11.60%, Train recall: 100.00%
Validation accuracy: 11.65%, Validation precision: 11.65%, Validation recall: 100.00%


TODO: plot loss curves, plot AUC, plot precision-recall

TODO: cross val/grid search for best hyperparameter choices