In [1]:
# Importing necessary libraries
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
df = pd.read_csv('data/credit_risk1.csv')

In [3]:
df.head()

Unnamed: 0,borrower_id,income,employment_status,credit_score,debt_to_income_ratio,loan_amount_requested,age,education_level,marital_status,num_of_credit_inquiries,...,loan_type,loan_amount,interest_rate,loan_term,collateral_value,loan_to_value_ratio,repayment_schedule,current_balance,num_of_late_payments,loan_default
0,1,57450.7123,Self-employed,342,0.311864,30208.01581,69,Bachelor's,Divorced,7,...,Auto,4380.510427,3.038261,284,47643.93928,0.889981,Quarterly,46676.21564,3,0
1,2,47926.03548,Retired,684,0.196757,22258.35222,67,High School,Single,9,...,Auto,49794.31602,10.627914,146,41744.82252,0.738939,Monthly,26337.48749,4,0
2,3,59715.32807,Employed,734,0.300241,9055.870925,60,Master's,Married,8,...,Business,15151.68853,10.978312,243,84295.51999,0.714941,Quarterly,26891.89116,3,0
3,4,72845.44785,Retired,707,0.371848,6712.570653,46,High School,Single,0,...,Auto,8314.132621,11.232058,50,23385.04503,0.505514,Quarterly,39035.89979,0,0
4,5,46487.69938,Self-employed,782,0.130496,27788.08842,38,PhD,Married,8,...,Personal,12808.81664,11.334358,302,1069.232688,0.963279,Monthly,15734.44869,4,0


In [4]:
# Fill missing values for numerical variables with the mean
df.fillna(df.select_dtypes(include='number').mean(), inplace=True)

# Fill missing values for categorical variables with the mode
for column in df.select_dtypes(include='object').columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [5]:
# Step 1: Check for categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(f"Categorical Columns: {categorical_cols}")

# Step 2: Label Encoding for columns with ordinal relationship or a small number of categories
# You can select specific columns for label encoding
label_enc = LabelEncoder()

# Apply label encoding to all categorical columns (if appropriate)
for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])

# Step 3: One-Hot Encoding for categorical columns without an ordinal relationship
# This method is usually preferred for columns with no natural ordering

# Convert the categorical columns to one-hot encoding
data_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

Categorical Columns: Index(['employment_status', 'education_level', 'marital_status', 'loan_type',
       'repayment_schedule'],
      dtype='object')


In [6]:
# Features and target
X = df.drop(columns=["borrower_id", "loan_id", "loan_default"])  # Drop ID columns and the target column
y = df["loan_default"]

In [7]:
# Standardize the dataset for models like SVM, KNN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [9]:
# Step 2: Train and Evaluate 5 Models

# 1. Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
log_reg_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])
print(f"Logistic Regression ROC-AUC: {log_reg_auc}")

Logistic Regression ROC-AUC: 0.547597995873858


In [10]:
# 2. Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
rf_clf_auc = roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:, 1])
print(f"Random Forest ROC-AUC: {rf_clf_auc}")

Random Forest ROC-AUC: 0.5009824147755182


In [11]:
# 3. Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)
y_pred_gb = gb_clf.predict(X_test)
gb_clf_auc = roc_auc_score(y_test, gb_clf.predict_proba(X_test)[:, 1])
print(f"Gradient Boosting ROC-AUC: {gb_clf_auc}")

Gradient Boosting ROC-AUC: 0.4934669417428038


In [12]:
# 4. Support Vector Classifier
svc_clf = SVC(probability=True, random_state=42)
svc_clf.fit(X_train, y_train)
y_pred_svc = svc_clf.predict(X_test)
svc_clf_auc = roc_auc_score(y_test, svc_clf.predict_proba(X_test)[:, 1])
print(f"SVM ROC-AUC: {svc_clf_auc}")

SVM ROC-AUC: 0.4899302485509382


In [13]:
# 5. K-Nearest Neighbors Classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
knn_clf_auc = roc_auc_score(y_test, knn_clf.predict_proba(X_test)[:, 1])
print(f"KNN ROC-AUC: {knn_clf_auc}")

KNN ROC-AUC: 0.4720011788977305


In [14]:
# Step 3: Model Selection based on ROC-AUC
model_scores = {
    "Logistic Regression": log_reg_auc,
    "Random Forest": rf_clf_auc,
    "Gradient Boosting": gb_clf_auc,
    "SVM": svc_clf_auc,
    "KNN": knn_clf_auc
}

best_model_name = max(model_scores, key=model_scores.get)
best_model_score = model_scores[best_model_name]

print(f"\nBest Model: {best_model_name} with ROC-AUC: {best_model_score}")


Best Model: Logistic Regression with ROC-AUC: 0.547597995873858


In [15]:
# Step 4: Use the Best Model for Final Predictions
if best_model_name == "Logistic Regression":
    best_model = log_reg
elif best_model_name == "Random Forest":
    best_model = rf_clf
elif best_model_name == "Gradient Boosting":
    best_model = gb_clf
elif best_model_name == "SVM":
    best_model = svc_clf
elif best_model_name == "KNN":
    best_model = knn_clf

# Final Prediction on Test Data using the Best Model
final_predictions = best_model.predict(X_test)

# Print final predictions
print("Final Predictions:", final_predictions)

Final Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [16]:
# Step 1: Create a directory to store the model if it doesn't exist
model_directory = "model"
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

In [17]:
# Step 2: Save the best model to the folder
model_file_path = os.path.join(model_directory, "default_probability_model.pkl")
joblib.dump(best_model, model_file_path)
print(f"Best model saved to: {model_file_path}")

Best model saved to: model\default_probability_model.pkl
