In [1]:
!pip install scikit-learn pandas numpy




In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


In [3]:
# Expecting data.csv in Colab root
df = pd.read_csv("/content/data.csv")

print("Shape:", df.shape)
df.head()


Shape: (6819, 96)


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [4]:
df = df.dropna()

df['Bankrupt?'] = df['Bankrupt?'].astype(int)

X = df.drop(columns=['Bankrupt?'])
y = df['Bankrupt?']

print("Target distribution:")
print(y.value_counts(normalize=True))


Target distribution:
Bankrupt?
0    0.967737
1    0.032263
Name: proportion, dtype: float64


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [6]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)


In [7]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")


Accuracy: 0.9729
AUC: 0.9508


In [8]:
predictions_log = X_test.copy()

predictions_log['actual'] = y_test.values
predictions_log['predicted'] = y_pred
predictions_log['predicted_probability'] = y_prob

predictions_log['model_name'] = "Bankruptcy_RF"
predictions_log['model_version'] = "v1.0"
predictions_log['prediction_time'] = datetime.now()

# Simulated inference latency
predictions_log['latency_ms'] = np.random.randint(25, 90, size=len(predictions_log))

predictions_log['status'] = "SUCCESS"

predictions_log.head()


Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income Flag,Equity to Liability,actual,predicted,predicted_probability,model_name,model_version,prediction_time,latency_ms,status
1244,0.503242,0.556149,0.552278,0.609399,0.609399,0.999043,0.797445,0.809349,0.303466,0.781609,...,1,0.022839,0,0,0.005905,Bankruptcy_RF,v1.0,2025-12-18 21:16:17.449691,47,SUCCESS
6303,0.485984,0.547754,0.530649,0.598812,0.598812,0.999002,0.797424,0.809331,0.303516,0.781591,...,1,0.035415,0,0,0.000995,Bankruptcy_RF,v1.0,2025-12-18 21:16:17.449691,72,SUCCESS
4712,0.55438,0.618513,0.609562,0.607857,0.607886,0.999043,0.797758,0.809638,0.304012,0.781922,...,1,0.034892,0,0,0.003398,Bankruptcy_RF,v1.0,2025-12-18 21:16:17.449691,26,SUCCESS
3273,0.66392,0.692324,0.709246,0.630097,0.630097,0.999314,0.797808,0.809651,0.303533,0.781935,...,1,0.061798,0,0,5.8e-05,Bankruptcy_RF,v1.0,2025-12-18 21:16:17.449691,70,SUCCESS
6430,0.450641,0.509976,0.495423,0.650492,0.650226,0.998738,0.797212,0.809081,0.303697,0.78124,...,1,0.20713,0,0,0.001175,Bankruptcy_RF,v1.0,2025-12-18 21:16:17.449691,59,SUCCESS


In [9]:
predictions_log.to_csv(
    "predictions_log.csv",
    index=False
)

print("Saved: predictions_log.csv")


Saved: predictions_log.csv


In [10]:
performance_metrics = pd.DataFrame({
    "date": [datetime.now().date()],
    "model_name": ["Bankruptcy_RF"],
    "model_version": ["v1.0"],
    "accuracy": [accuracy],
    "auc": [auc],
    "prediction_volume": [len(predictions_log)]
})

performance_metrics


Unnamed: 0,date,model_name,model_version,accuracy,auc,prediction_volume
0,2025-12-18,Bankruptcy_RF,v1.0,0.972874,0.950809,1364


In [11]:
performance_metrics.to_csv(
    "performance_metrics.csv",
    index=False
)

print("Saved: performance_metrics.csv")


Saved: performance_metrics.csv


In [12]:
def calculate_psi(expected, actual, buckets=10):
    breakpoints = np.percentile(expected, np.arange(0, 100, 100 / buckets))
    psi = 0

    for i in range(len(breakpoints) - 1):
        expected_perc = np.mean(
            (expected >= breakpoints[i]) & (expected < breakpoints[i + 1])
        )
        actual_perc = np.mean(
            (actual >= breakpoints[i]) & (actual < breakpoints[i + 1])
        )

        psi += (expected_perc - actual_perc) * np.log(
            (expected_perc + 1e-6) / (actual_perc + 1e-6)
        )

    return psi


In [13]:
drift_data = []

for feature in X.columns:
    psi_value = calculate_psi(X_train[feature], X_test[feature])

    drift_data.append({
        "date": datetime.now().date(),
        "feature_name": feature,
        "psi": psi_value,
        "drift_status": (
            "Healthy" if psi_value < 0.1 else
            "Warning" if psi_value < 0.25 else
            "Severe"
        )
    })

feature_drift = pd.DataFrame(drift_data)
feature_drift.head()


Unnamed: 0,date,feature_name,psi,drift_status
0,2025-12-18,ROA(C) before interest and depreciation befor...,0.010194,Healthy
1,2025-12-18,ROA(A) before interest and % after tax,0.009274,Healthy
2,2025-12-18,ROA(B) before interest and depreciation after...,0.012634,Healthy
3,2025-12-18,Operating Gross Margin,0.004382,Healthy
4,2025-12-18,Realized Sales Gross Margin,0.004839,Healthy


In [14]:
feature_drift.to_csv(
    "feature_drift.csv",
    index=False
)

print("Saved: feature_drift.csv")


Saved: feature_drift.csv
