In [4]:
import pandas as pd

In [5]:
df=pd.read_csv("credit_risk_dataset.csv")

In [6]:
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [7]:
df['loan_status'].value_counts(normalize=True)

loan_status
0    0.781836
1    0.218164
Name: proportion, dtype: float64

In [8]:
from sklearn.model_selection import train_test_split

X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [9]:
X_train.shape, X_test.shape

((26064, 11), (6517, 11))

In [10]:
# One-hot encode categorical features
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Make sure columns match
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [11]:
X_train_encoded.shape

(26064, 22)

In [12]:
df.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [13]:
# Fill numeric columns with median
X_train_encoded['person_emp_length'].fillna(X_train_encoded['person_emp_length'].median(), inplace=True)
X_train_encoded['loan_int_rate'].fillna(X_train_encoded['loan_int_rate'].median(), inplace=True)

X_test_encoded['person_emp_length'].fillna(X_train_encoded['person_emp_length'].median(), inplace=True)
X_test_encoded['loan_int_rate'].fillna(X_train_encoded['loan_int_rate'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_encoded['person_emp_length'].fillna(X_train_encoded['person_emp_length'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_encoded['loan_int_rate'].fillna(X_train_encoded['loan_int_rate'].median(), inplace=True)
The behavior will change in pa

In [14]:
X_train_encoded.isnull().sum()


person_age                     0
person_income                  0
person_emp_length              0
loan_amnt                      0
loan_int_rate                  0
loan_percent_income            0
cb_person_cred_hist_length     0
person_home_ownership_OTHER    0
person_home_ownership_OWN      0
person_home_ownership_RENT     0
loan_intent_EDUCATION          0
loan_intent_HOMEIMPROVEMENT    0
loan_intent_MEDICAL            0
loan_intent_PERSONAL           0
loan_intent_VENTURE            0
loan_grade_B                   0
loan_grade_C                   0
loan_grade_D                   0
loan_grade_E                   0
loan_grade_F                   0
loan_grade_G                   0
cb_person_default_on_file_Y    0
dtype: int64

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [16]:
model = LogisticRegression(
    max_iter=500,          # fewer iterations
    solver='saga',         # faster for medium datasets
    class_weight='balanced',
    random_state=42
)

model.fit(X_train_encoded, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,500


In [17]:
y_pred = model.predict(X_test_encoded)
y_proba = model.predict_proba(X_test_encoded)[:, 1]

In [18]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[3640 1455]
 [ 546  876]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.71      0.78      5095
           1       0.38      0.62      0.47      1422

    accuracy                           0.69      6517
   macro avg       0.62      0.67      0.63      6517
weighted avg       0.76      0.69      0.72      6517


ROC-AUC Score: 0.7253330186374497


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=200,          
    max_depth=10,              
    class_weight='balanced',   
    random_state=42,
    n_jobs=-1                  
)
rf_model.fit(X_train_encoded, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
y_pred_rf = rf_model.predict(X_test_encoded)
y_proba_rf = rf_model.predict_proba(X_test_encoded)[:,1]

In [21]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_proba_rf))

Confusion Matrix:
 [[4844  251]
 [ 349 1073]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      5095
           1       0.81      0.75      0.78      1422

    accuracy                           0.91      6517
   macro avg       0.87      0.85      0.86      6517
weighted avg       0.91      0.91      0.91      6517


ROC-AUC Score: 0.924798035635168


In [22]:
import pickle

with open("credit_risk_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)
with open("feature_columns.pkl", "wb") as f:
    pickle.dump(list(X_train_encoded.columns), f)
