In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report

In [204]:
# 1. Read the credit-risk records from the CSV file into a DataFrame.
DATA_FILE = 'credit_risk_dataset.csv'
df = pd.read_csv(DATA_FILE)

In [205]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,3.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [206]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [207]:
# Report the frame's dimensions as a (rows, columns) tuple.
row_count, column_count = df.shape
print((row_count, column_count))

(32581, 12)


In [208]:
# Tally the missing entries in each column (isna is the canonical name for isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [209]:
# Handle missing values: impute each incomplete column with its median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split performed later in this notebook, so test rows contribute
# to the imputed value.
median_emp_length = df['person_emp_length'].median()
median_int_rate = df['loan_int_rate'].median()

# Assign the filled Series back to the frame rather than calling
# fillna(..., inplace=True) on df[col]: the chained in-place form acts on an
# intermediate object, emits a FutureWarning on pandas 2.x, and leaves df
# unchanged once Copy-on-Write is enabled.
df['person_emp_length'] = df['person_emp_length'].fillna(median_emp_length)
df['loan_int_rate'] = df['loan_int_rate'].fillna(median_int_rate)

In [210]:
# Re-count the missing entries per column to confirm the imputation left none.
remaining_missing = df.isna().sum()
print(remaining_missing)

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


In [211]:
# One-hot encoding
# Identify all remaining text columns (object dtype).
categorical_cols = df.select_dtypes(include=['object']).columns
# Convert every text column into 0/1 indicator columns. drop_first=True omits
# the first level of each category so it acts as the baseline.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version; pandas >= 2.0 would otherwise produce bool (True/False).
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=np.uint8)



In [212]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,22,59000,3.0,35000,16.02,1,0.59,3,0,0,...,0,1,0,0,0,1,0,0,0,1
1,21,9600,5.0,1000,11.14,0,0.1,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,1,0,0,0,1,0,0,0,0,1


In [213]:
# Print column dtypes and non-null counts for the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   person_age                   32581 non-null  int64  
 1   person_income                32581 non-null  int64  
 2   person_emp_length            32581 non-null  float64
 3   loan_amnt                    32581 non-null  int64  
 4   loan_int_rate                32581 non-null  float64
 5   loan_status                  32581 non-null  int64  
 6   loan_percent_income          32581 non-null  float64
 7   cb_person_cred_hist_length   32581 non-null  int64  
 8   person_home_ownership_OTHER  32581 non-null  uint8  
 9   person_home_ownership_OWN    32581 non-null  uint8  
 10  person_home_ownership_RENT   32581 non-null  uint8  
 11  loan_intent_EDUCATION        32581 non-null  uint8  
 12  loan_intent_HOMEIMPROVEMENT  32581 non-null  uint8  
 13  loan_intent_MEDI

In [214]:
# Separate the dependent variable (loan_status) from the independent variables.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

Features (X) shape: (32581, 22)
Target (y) shape: (32581,)


In [215]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

Training features shape: (22806, 22)
Testing features shape: (9775, 22)


In [216]:
# Scaling the data: only the continuous columns listed here are standardised;
# every other column in X_train / X_test is left as-is.
numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]
# The scaler is fitted on the TRAINING rows only and then applied to both
# splits. fit() returns the scaler itself, so it can be chained onto the constructor.
# NOTE(review): this overwrites X_train / X_test in place, so re-running the
# cell would rescale data that is already scaled.
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("--- Data Scaling Complete ---")

--- Data Scaling Complete ---


In [217]:
# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the fitted estimator, so the two steps are chained).
log_reg_model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
# Hard 0/1 class predictions for the held-out rows, kept for the evaluation cell.
y_pred_log_reg = log_reg_model.predict(X_test)

In [218]:
# y_test holds the true labels (0 or 1).
# y_pred_log_reg holds the model's predicted labels (0 or 1).

# 1. Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_log_reg)
print("--- Confusion Matrix ---")
print(conf_matrix)

# 2. Classification report (precision, recall, F1-score per class)
class_report = classification_report(y_test, y_pred_log_reg)
print("\n--- Classification Report ---")
print(class_report)

# 3. ROC-AUC score, computed from the predicted probability of class 1
#    rather than from the hard 0/1 predictions.
roc_auc = roc_auc_score(y_test, log_reg_model.predict_proba(X_test)[:, 1])

# 4. Accuracy. This cell previously printed `accuracy` without computing it,
#    which raises NameError on a fresh kernel.
accuracy = accuracy_score(y_test, y_pred_log_reg)

print(f"\nROC-AUC Score: {roc_auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

--- Confusion Matrix ---
[[7248  365]
 [ 957 1205]]

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      7613
           1       0.77      0.56      0.65      2162

    accuracy                           0.86      9775
   macro avg       0.83      0.75      0.78      9775
weighted avg       0.86      0.86      0.86      9775


ROC-AUC Score: 0.8719
Accuracy: 0.8648


In [201]:
# Base estimator for the search. The parameter grid below targets tree-ensemble
# hyperparameters, and no `rf_model` is defined in any earlier cell, so it is
# created here to let the cell run on a fresh kernel.
# NOTE(review): confirm these constructor settings match the model originally tuned.
rf_model = RandomForestClassifier(random_state=42)

# Define the set of parameters to test (the 'grid')
param_grid = {
    'n_estimators': [100, 200],      # Number of trees
    'max_depth': [10, 20],           # Max depth of each tree
    'min_samples_split': [5, 10]     # Minimum samples required to split a node
}

# Initialize Grid Search with 3-fold cross-validation (cv=3).
# 'roc_auc' is the scoring metric; n_jobs=-1 uses all available CPU cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           scoring='roc_auc', cv=3, n_jobs=-1, verbose=2)

# Run the search (this may take a few minutes)
grid_search.fit(X_train, y_train)

# Get the best parameters found
best_params = grid_search.best_params_
print("\n--- Best Parameters Found ---")
print(best_params)

Fitting 3 folds for each of 8 candidates, totalling 24 fits

--- Best Parameters Found ---
{'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}


In [219]:
# Raw (un-encoded, un-scaled) attributes of the applicant to score.
new_person = {
    "person_age": 40,
    "person_income": 75000,
    "person_home_ownership": "MORTGAGE",
    "person_emp_length": 15.0,
    "loan_intent": "HOMEIMPROVEMENT",
    "loan_grade": "A",
    "loan_amnt": 10000,
    "loan_int_rate": 8.5,
    "loan_percent_income": 0.13,
    "cb_person_default_on_file": "N",
    "cb_person_cred_hist_length": 12,
}

# NOTE(review): predict_risk is not defined in any visible cell; it is expected
# to return the predicted status (0 or 1) for the dict above. Confirm where it lives.
status = predict_risk(new_person)

print("\n--- NEW PREDICTION RESULT ---")
# A status of 0 is reported as eligible; any other value is reported as not eligible.
result_line = (
    "Prediction (0/1): 0 - ELIGIBLE"
    if status == 0
    else "Prediction (0/1): 1 - NOT ELIGIBLE"
)
print(result_line)


--- NEW PREDICTION RESULT ---
Prediction (0/1): 1 - NOT ELIGIBLE


  X_template.loc[:] = 0


In [223]:
# Rank features by the magnitude of their logistic-regression coefficient.
# NOTE(review): optimized_log_reg_model is not defined in any visible cell —
# confirm which fitted model this refers to.
coefficients = np.abs(optimized_log_reg_model.coef_[0])
feature_names = X_train.columns

# One row per feature: its name and the absolute value of its coefficient.
feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': coefficients}
)

# Keep the five features with the largest absolute coefficients.
ranked_features = feature_importance.sort_values(by='Importance', ascending=False)
top_5_features = ranked_features.head(5)

print("\n--- Top 5 Features (Highest Impact on Prediction) ---")
print(top_5_features.to_markdown(index=False, numalign='left', stralign='left'))


--- Top 5 Features (Highest Impact on Prediction) ---
| Feature                   | Importance   |
|:--------------------------|:-------------|
| loan_grade_G              | 6.11677      |
| loan_grade_F              | 3.31029      |
| loan_grade_E              | 2.81032      |
| loan_grade_D              | 2.61766      |
| person_home_ownership_OWN | 1.69403      |


In [229]:
!pip install joblib

