Import Necessary Libraries


In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV  # Ensure this line is present and correct
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix


Load the Dataset

In [33]:
df = pd.read_csv('credit_risk_dataset.csv')


Explore the Dataset

In [34]:
print(df.head())


   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


Handle Missing Values

In [35]:
print(df.isnull().sum())  # Check for missing values
df = df.dropna()  # Drop missing values or use imputation as appropriate


person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


Encode Categorical Features

In [36]:
df = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade'], drop_first=True)


In [37]:
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map({'Y': 1, 'N': 0})


In [38]:
print(df['cb_person_default_on_file'].unique())  # It should print [1 0]


[1 0]


Split Data

In [39]:
from sklearn.model_selection import train_test_split

X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np

param_dist = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-3, 2, 6), 'kernel': ['rbf']}

# Using fewer iterations and all CPU cores
random_search = RandomizedSearchCV(SVC(), param_distributions=param_dist, n_iter=10, cv=5, verbose=2, n_jobs=-1)
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...................C=100.0, gamma=100.0, kernel=rbf; total time= 1.3min
[CV] END .....................C=0.1, gamma=100.0, kernel=rbf; total time=  53.0s
[CV] END .....................C=1.0, gamma=0.001, kernel=rbf; total time=  11.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=  11.3s
[CV] END ....................C=0.001, gamma=10.0, kernel=rbf; total time=  19.1s
[CV] END ....................C=0.001, gamma=10.0, kernel=rbf; total time=  18.9s
[CV] END ....................C=100.0, gamma=10.0, kernel=rbf; total time= 1.1min
[CV] END .......................C=1.0, gamma=0.1, kernel=rbf; total time=  50.3s
[CV] END .....................C=10.0, gamma=10.0, kernel=rbf; total time= 1.2min
[CV] END ......................C=1.0, gamma=10.0, kernel=rbf; total time= 1.0min
[CV] END ...................C=100.0, gamma=100.0, kernel=rbf; total time= 1.1min
[CV] END ......................C=1.0, gamma=0.01

Normalize Data

In [41]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Create a Sequential model
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])


Build the Model

In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(32, input_dim=X_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


Train the Model

In [44]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff0d1a496c0>

Evaluate the Model

In [45]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 91.55%


Confusion Matrix and Classification Report

In [46]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Predicting the Test set results
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)  # convert probabilities to binary output

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(cr)


Confusion Matrix:
[[4356   87]
 [ 397  888]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      4443
           1       0.91      0.69      0.79      1285

    accuracy                           0.92      5728
   macro avg       0.91      0.84      0.87      5728
weighted avg       0.92      0.92      0.91      5728



Feature Engineering

In [47]:
from sklearn.preprocessing import PolynomialFeatures

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)


Ensemble Modeling

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_poly, y_train)

# Predictions and Evaluation
rf_predictions = rf_model.predict(X_test_poly)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Model Accuracy: {rf_accuracy*100:.2f}%")


Random Forest Model Accuracy: 93.52%


In [57]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, rf_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, rf_predictions)
print("\nClassification Report:")
print(class_report)


Confusion Matrix:
[[4419   24]
 [ 347  938]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      4443
           1       0.98      0.73      0.83      1285

    accuracy                           0.94      5728
   macro avg       0.95      0.86      0.90      5728
weighted avg       0.94      0.94      0.93      5728



In [58]:
pip install shap

Note: you may need to restart the kernel to use updated packages.


In [None]:
import shap  # Make sure to install shap if you haven't already
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ... (the rest of your data loading and preprocessing code)

# Training the model and making predictions
rf_model.fit(X_train_poly, y_train)
rf_predictions = rf_model.predict(X_test_poly)

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, rf_predictions)*100:.2f}%")

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, rf_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Compute and plot SHAP values
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_poly)

# Plot summary_plot
shap.summary_plot(shap_values, X_test_poly, plot_type="bar")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions))


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
