In [1]:
!pip install pandas scikit-learn imbalanced-learn



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

In [3]:
df = pd.read_csv("GiveMeSomeCredit-training.csv")

In [4]:
print("First few rows of data:")
print(df.head())

First few rows of data:
   Unnamed: 0  SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines  age  \
0           1                 1                              0.766127   45   
1           2                 0                              0.957151   40   
2           3                 0                              0.658180   38   
3           4                 0                              0.233810   30   
4           5                 0                              0.907239   49   

   NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
0                                     2   0.802982         9120.0   
1                                     0   0.121876         2600.0   
2                                     1   0.085113         3042.0   
3                                     0   0.036050         3300.0   
4                                     1   0.024926        63588.0   

   NumberOfOpenCreditLinesAndLoans  NumberOfTimes90DaysLate  \
0                            

In [5]:
if 'Unnamed: 0' in df.columns:
    df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [8]:
df.fillna({
    'MonthlyIncome': df['MonthlyIncome'].median(),
    'NumberOfDependents': df['NumberOfDependents'].median()
}, inplace=True)

In [9]:
print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [10]:
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df['SeriousDlqin2yrs']


In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [13]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [14]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_resampled, y_resampled)

In [15]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]


In [16]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC AUC Score:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[26821  1223]
 [ 1257   699]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96     28044
           1       0.36      0.36      0.36      1956

    accuracy                           0.92     30000
   macro avg       0.66      0.66      0.66     30000
weighted avg       0.92      0.92      0.92     30000


ROC AUC Score: 0.8272818582776291


In [17]:
customer_scores = pd.DataFrame({
    "Customer_ID": np.arange(len(y_test)),
    "Predicted_Default": y_pred,
    "Risk_Score_Probability": y_proba
})

In [18]:
print("\nTop 10 High-Risk Customers:")
print(customer_scores.sort_values(by="Risk_Score_Probability", ascending=False).head(10))


Top 10 High-Risk Customers:
       Customer_ID  Predicted_Default  Risk_Score_Probability
27350        27350                  1                1.000000
17447        17447                  1                1.000000
4593          4593                  1                1.000000
7974          7974                  1                1.000000
23733        23733                  1                0.990000
22307        22307                  1                0.990000
9304          9304                  1                0.990000
8431          8431                  1                0.987719
22561        22561                  1                0.986938
23767        23767                  1                0.986182


In [19]:
import joblib

In [20]:
joblib.dump(model, "credit_risk_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [21]:
import streamlit as st
import joblib
import numpy as np


In [26]:
from sklearn.preprocessing import StandardScaler
import joblib

In [27]:
scaler = StandardScaler()
scaler.fit(X)

In [28]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']