In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from scipy.stats import f_oneway, chi2_contingency
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, log_loss, roc_curve, auc)

In [22]:
df = pd.read_csv('data.csv') 

df.head()


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [23]:
# class imbalance
print("Class distribution before resampling:\n", Counter(df['Bankrupt?']))


Class distribution before resampling:
 Counter({0: 6599, 1: 220})


In [24]:
X = df.drop(['Bankrupt?'], axis=1)
y = df['Bankrupt?']


In [25]:
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Class distribution after SMOTE:", Counter(y_resampled))


Class distribution after SMOTE: Counter({0: 6599, 1: 3299})


In [26]:
correlation_matrix = X_resampled.corr().abs()
high_correlation_drop = []
threshold = 0.70

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if correlation_matrix.iloc[i, j] > threshold:
            feature_name = correlation_matrix.columns[i]
            if feature_name not in high_correlation_drop:
                high_correlation_drop.append(feature_name)

X_resampled_reduced = X_resampled.drop(columns=high_correlation_drop)
print("Features after dropping correlated ones:", X_resampled_reduced.columns.tolist())


Features after dropping correlated ones: [' ROA(C) before interest and depreciation before interest', ' Operating Gross Margin', ' Operating Profit Rate', ' Non-industry income and expenditure/revenue', ' Operating Expense Rate', ' Research and development expense rate', ' Cash flow rate', ' Interest-bearing debt interest rate', ' Tax rate (A)', ' Net Value Per Share (B)', ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)', ' Realized Sales Gross Profit Growth Rate', ' Operating Profit Growth Rate', ' After-tax Net Profit Growth Rate', ' Continuous Net Profit Growth Rate', ' Total Asset Growth Rate', ' Net Value Growth Rate', ' Total Asset Return Growth Rate Ratio', ' Cash Reinvestment %', ' Current Ratio', ' Quick Ratio', ' Interest Expense Ratio', ' Total debt/Total net worth', ' Debt ratio %', ' Long-term fund suitability ratio (A)', ' Borrowing dependency', ' Contingent liabilities/Net worth', ' Total Asset Turnover', ' Accounts Receivable Turnover', ' Average Collection Days', 

In [27]:
def replace_outliers_iqr(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), df[column].median(), df[column])
    return df

numerical_features = X_resampled_reduced.select_dtypes(include=[np.number]).columns.tolist()
X_resampled_cleaned = replace_outliers_iqr(X_resampled_reduced, numerical_features)


In [28]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled_cleaned)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)


In [30]:
# PCA 
pca = PCA(n_components=0.95)
original_feature_names = X_resampled_cleaned.columns.tolist()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

pca_feature_names = [f'PC{i+1}' for i in range(X_train_pca.shape[1])]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=pca_feature_names)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pca_feature_names)


In [31]:
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
rf = RandomForestClassifier(class_weight='balanced')
nb = GaussianNB()

# Hyperparameter tuning 
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train_pca, y_train)
print("Best parameters for Random Forest:", grid_search.best_params_)


Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [32]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    
    # Predict
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics 
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_test_pred)
    
    # Print metrics
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


In [33]:
# Evaluate
print("Logistic Regression:")
evaluate_model(lr, X_train_pca, X_test_pca, y_train, y_test)

print("\nRandom Forest:")
evaluate_model(grid_search.best_estimator_, X_train_pca, X_test_pca, y_train, y_test)

print("\nNaive Bayes:")
evaluate_model(nb, X_train_pca, X_test_pca, y_train, y_test)


Logistic Regression:
Train Accuracy: 0.8978
Test Accuracy: 0.8899
Precision: 0.7757
Recall: 0.9282
F1 Score: 0.8452
ROC AUC Score: 0.8999

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.87      0.91      1339
           1       0.78      0.93      0.85       641

    accuracy                           0.89      1980
   macro avg       0.87      0.90      0.88      1980
weighted avg       0.90      0.89      0.89      1980


Random Forest:
Train Accuracy: 1.0000
Test Accuracy: 0.9444
Precision: 0.8876
Recall: 0.9485
F1 Score: 0.9170
ROC AUC Score: 0.9455

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.94      0.96      1339
           1       0.89      0.95      0.92       641

    accuracy                           0.94      1980
   macro avg       0.93      0.95      0.94      1980
weighted avg       0.95      0.94      0.94      1980


Naive Bayes:
Train Accur